KVM: nSVM: split out __nested_vmcb_check_controls
[linux-2.6-block.git] / arch / x86 / kvm / svm / svm.c
CommitLineData
44a95dae
SS
1#define pr_fmt(fmt) "SVM: " fmt
2
edf88417
AK
3#include <linux/kvm_host.h>
4
85f455f7 5#include "irq.h"
1d737c8a 6#include "mmu.h"
5fdbf976 7#include "kvm_cache_regs.h"
fe4c7b19 8#include "x86.h"
66f7b72e 9#include "cpuid.h"
25462f7f 10#include "pmu.h"
e495606d 11
6aa8b732 12#include <linux/module.h>
ae759544 13#include <linux/mod_devicetable.h>
9d8f549d 14#include <linux/kernel.h>
6aa8b732
AK
15#include <linux/vmalloc.h>
16#include <linux/highmem.h>
ef0f6496 17#include <linux/amd-iommu.h>
e8edc6e0 18#include <linux/sched.h>
af658dca 19#include <linux/trace_events.h>
5a0e3ad6 20#include <linux/slab.h>
5881f737 21#include <linux/hashtable.h>
00089c04 22#include <linux/objtool.h>
e9df0942 23#include <linux/psp-sev.h>
1654efcb 24#include <linux/file.h>
89c50580
BS
25#include <linux/pagemap.h>
26#include <linux/swap.h>
33af3a7e 27#include <linux/rwsem.h>
4d96f910 28#include <linux/cc_platform.h>
6aa8b732 29
8221c137 30#include <asm/apic.h>
1018faa6 31#include <asm/perf_event.h>
67ec6607 32#include <asm/tlbflush.h>
e495606d 33#include <asm/desc.h>
facb0139 34#include <asm/debugreg.h>
631bc487 35#include <asm/kvm_para.h>
411b44ba 36#include <asm/irq_remapping.h>
28a27752 37#include <asm/spec-ctrl.h>
ba5bade4 38#include <asm/cpu_device_id.h>
f1c6366e 39#include <asm/traps.h>
d69c1382 40#include <asm/fpu/api.h>
6aa8b732 41
63d1142f 42#include <asm/virtext.h>
229456fc 43#include "trace.h"
63d1142f 44
883b0a91 45#include "svm.h"
35a78319 46#include "svm_ops.h"
883b0a91 47
1e0c7d40
VP
48#include "kvm_onhyperv.h"
49#include "svm_onhyperv.h"
50
6aa8b732
AK
51MODULE_AUTHOR("Qumranet");
52MODULE_LICENSE("GPL");
53
575b255c 54#ifdef MODULE
ae759544 55static const struct x86_cpu_id svm_cpu_id[] = {
320debe5 56 X86_MATCH_FEATURE(X86_FEATURE_SVM, NULL),
ae759544
JT
57 {}
58};
59MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);
575b255c 60#endif
ae759544 61
6aa8b732
AK
62#define SEG_TYPE_LDT 2
63#define SEG_TYPE_BUSY_TSS16 3
64
6bc31bdc
AP
65#define SVM_FEATURE_LBRV (1 << 1)
66#define SVM_FEATURE_SVML (1 << 2)
ddce97aa
AP
67#define SVM_FEATURE_TSC_RATE (1 << 4)
68#define SVM_FEATURE_VMCB_CLEAN (1 << 5)
69#define SVM_FEATURE_FLUSH_ASID (1 << 6)
70#define SVM_FEATURE_DECODE_ASSIST (1 << 7)
6bc31bdc 71#define SVM_FEATURE_PAUSE_FILTER (1 << 10)
80b7706e 72
24e09cbf
JR
73#define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
74
fbc0db76 75#define TSC_RATIO_RSVD 0xffffff0000000000ULL
92a1f12d
JR
76#define TSC_RATIO_MIN 0x0000000000000001ULL
77#define TSC_RATIO_MAX 0x000000ffffffffffULL
fbc0db76 78
67ec6607
JR
79static bool erratum_383_found __read_mostly;
80
883b0a91 81u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
323c3d80 82
2b036c6b
BO
83/*
84 * Set osvw_len to higher value when updated Revision Guides
85 * are published and we know what the new status bits are
86 */
87static uint64_t osvw_len = 4, osvw_status;
88
fbc0db76
JR
89static DEFINE_PER_CPU(u64, current_tsc_ratio);
90#define TSC_RATIO_DEFAULT 0x0100000000ULL
91
09941fbb 92static const struct svm_direct_access_msrs {
ac72a9b7 93 u32 index; /* Index of the MSR */
376c6d28 94 bool always; /* True if intercept is initially cleared */
fd6fa73d 95} direct_access_msrs[MAX_DIRECT_ACCESS_MSRS] = {
8c06585d 96 { .index = MSR_STAR, .always = true },
ac72a9b7 97 { .index = MSR_IA32_SYSENTER_CS, .always = true },
adc2a237
ML
98 { .index = MSR_IA32_SYSENTER_EIP, .always = false },
99 { .index = MSR_IA32_SYSENTER_ESP, .always = false },
ac72a9b7
JR
100#ifdef CONFIG_X86_64
101 { .index = MSR_GS_BASE, .always = true },
102 { .index = MSR_FS_BASE, .always = true },
103 { .index = MSR_KERNEL_GS_BASE, .always = true },
104 { .index = MSR_LSTAR, .always = true },
105 { .index = MSR_CSTAR, .always = true },
106 { .index = MSR_SYSCALL_MASK, .always = true },
107#endif
b2ac58f9 108 { .index = MSR_IA32_SPEC_CTRL, .always = false },
15d45071 109 { .index = MSR_IA32_PRED_CMD, .always = false },
ac72a9b7
JR
110 { .index = MSR_IA32_LASTBRANCHFROMIP, .always = false },
111 { .index = MSR_IA32_LASTBRANCHTOIP, .always = false },
112 { .index = MSR_IA32_LASTINTFROMIP, .always = false },
113 { .index = MSR_IA32_LASTINTTOIP, .always = false },
376c6d28
TL
114 { .index = MSR_EFER, .always = false },
115 { .index = MSR_IA32_CR_PAT, .always = false },
116 { .index = MSR_AMD64_SEV_ES_GHCB, .always = true },
ac72a9b7 117 { .index = MSR_INVALID, .always = false },
6c8166a7
AK
118};
119
8566ac8b
BM
120/*
121 * These 2 parameters are used to config the controls for Pause-Loop Exiting:
122 * pause_filter_count: On processors that support Pause filtering(indicated
123 * by CPUID Fn8000_000A_EDX), the VMCB provides a 16 bit pause filter
124 * count value. On VMRUN this value is loaded into an internal counter.
125 * Each time a pause instruction is executed, this counter is decremented
126 * until it reaches zero at which time a #VMEXIT is generated if pause
127 * intercept is enabled. Refer to AMD APM Vol 2 Section 15.14.4 Pause
128 * Intercept Filtering for more details.
129 * This also indicate if ple logic enabled.
130 *
131 * pause_filter_thresh: In addition, some processor families support advanced
132 * pause filtering (indicated by CPUID Fn8000_000A_EDX) upper bound on
133 * the amount of time a guest is allowed to execute in a pause loop.
134 * In this mode, a 16-bit pause filter threshold field is added in the
135 * VMCB. The threshold value is a cycle count that is used to reset the
136 * pause counter. As with simple pause filtering, VMRUN loads the pause
137 * count value from VMCB into an internal counter. Then, on each pause
138 * instruction the hardware checks the elapsed number of cycles since
139 * the most recent pause instruction against the pause filter threshold.
140 * If the elapsed cycle count is greater than the pause filter threshold,
141 * then the internal pause count is reloaded from the VMCB and execution
142 * continues. If the elapsed cycle count is less than the pause filter
143 * threshold, then the internal pause count is decremented. If the count
144 * value is less than zero and PAUSE intercept is enabled, a #VMEXIT is
145 * triggered. If advanced pause filtering is supported and pause filter
146 * threshold field is set to zero, the filter will operate in the simpler,
147 * count only mode.
148 */
149
150static unsigned short pause_filter_thresh = KVM_DEFAULT_PLE_GAP;
151module_param(pause_filter_thresh, ushort, 0444);
152
153static unsigned short pause_filter_count = KVM_SVM_DEFAULT_PLE_WINDOW;
154module_param(pause_filter_count, ushort, 0444);
155
156/* Default doubles per-vcpu window every exit. */
157static unsigned short pause_filter_count_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
158module_param(pause_filter_count_grow, ushort, 0444);
159
160/* Default resets per-vcpu window every exit to pause_filter_count. */
161static unsigned short pause_filter_count_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
162module_param(pause_filter_count_shrink, ushort, 0444);
163
164/* Default is to compute the maximum so we can never overflow. */
165static unsigned short pause_filter_count_max = KVM_SVM_DEFAULT_PLE_WINDOW_MAX;
166module_param(pause_filter_count_max, ushort, 0444);
167
99840a75
SC
168/*
169 * Use nested page tables by default. Note, NPT may get forced off by
170 * svm_hardware_setup() if it's unsupported by hardware or the host kernel.
171 */
172bool npt_enabled = true;
173module_param_named(npt, npt_enabled, bool, 0444);
e3da3acd 174
e2358851
DB
175/* allow nested virtualization in KVM/SVM */
176static int nested = true;
236de055
AG
177module_param(nested, int, S_IRUGO);
178
d647eb63
PB
179/* enable/disable Next RIP Save */
180static int nrips = true;
181module_param(nrips, int, 0444);
182
89c8a498
JN
183/* enable/disable Virtual VMLOAD VMSAVE */
184static int vls = true;
185module_param(vls, int, 0444);
186
640bd6e5
JN
187/* enable/disable Virtual GIF */
188static int vgif = true;
189module_param(vgif, int, 0444);
5ea11f2b 190
4c84926e
ML
191/* enable/disable LBR virtualization */
192static int lbrv = true;
193module_param(lbrv, int, 0444);
194
f800650a
ML
195static int tsc_scaling = true;
196module_param(tsc_scaling, int, 0444);
197
fdf513e3
VK
198/*
199 * enable / disable AVIC. Because the defaults differ for APICv
200 * support between VMX and SVM we cannot use module_param_named.
201 */
202static bool avic;
203module_param(avic, bool, 0444);
204
291bd20d 205bool __read_mostly dump_invalid_vmcb;
6f2f8453
PB
206module_param(dump_invalid_vmcb, bool, 0644);
207
4b639a9f
ML
208
209bool intercept_smi = true;
210module_param(intercept_smi, bool, 0444);
211
212
2e215216 213static bool svm_gp_erratum_intercept = true;
82a11e9c 214
7607b717
BS
215static u8 rsm_ins_bytes[] = "\x0f\xaa";
216
4866d5e3 217static unsigned long iopm_base;
6aa8b732
AK
218
219struct kvm_ldttss_desc {
220 u16 limit0;
221 u16 base0;
e0231715
JR
222 unsigned base1:8, type:5, dpl:2, p:1;
223 unsigned limit1:4, zero0:3, g:1, base2:8;
6aa8b732
AK
224 u32 base3;
225 u32 zero1;
226} __attribute__((packed));
227
eaf78265 228DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);
6aa8b732 229
844d69c2
SC
230/*
231 * Only MSR_TSC_AUX is switched via the user return hook. EFER is switched via
232 * the VMCB, and the SYSCALL/SYSENTER MSRs are handled by VMLOAD/VMSAVE.
233 *
234 * RDTSCP and RDPID are not used in the kernel, specifically to allow KVM to
235 * defer the restoration of TSC_AUX until the CPU returns to userspace.
236 */
0caa0a77 237static int tsc_aux_uret_slot __read_mostly = -1;
844d69c2 238
09941fbb 239static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
6aa8b732 240
9d8f549d 241#define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges)
6aa8b732
AK
242#define MSRS_RANGE_SIZE 2048
243#define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2)
244
883b0a91 245u32 svm_msrpm_offset(u32 msr)
455716fa
JR
246{
247 u32 offset;
248 int i;
249
250 for (i = 0; i < NUM_MSR_MAPS; i++) {
251 if (msr < msrpm_ranges[i] ||
252 msr >= msrpm_ranges[i] + MSRS_IN_RANGE)
253 continue;
254
255 offset = (msr - msrpm_ranges[i]) / 4; /* 4 msrs per u8 */
256 offset += (i * MSRS_RANGE_SIZE); /* add range offset */
257
258 /* Now we have the u8 offset - but need the u32 offset */
259 return offset / 4;
260 }
261
262 /* MSR not in any range */
263 return MSR_INVALID;
264}
265
6aa8b732
AK
266#define MAX_INST_SIZE 15
267
d468d94b 268static int get_max_npt_level(void)
4b16184c
JR
269{
270#ifdef CONFIG_X86_64
43e540cc 271 return pgtable_l5_enabled() ? PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
4b16184c
JR
272#else
273 return PT32E_ROOT_LEVEL;
274#endif
275}
276
72f211ec 277int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
6aa8b732 278{
c513f484 279 struct vcpu_svm *svm = to_svm(vcpu);
2fcf4876 280 u64 old_efer = vcpu->arch.efer;
6dc696d4 281 vcpu->arch.efer = efer;
9167ab79
PB
282
283 if (!npt_enabled) {
284 /* Shadow paging assumes NX to be available. */
285 efer |= EFER_NX;
286
287 if (!(efer & EFER_LMA))
288 efer &= ~EFER_LME;
289 }
6aa8b732 290
2fcf4876
ML
291 if ((old_efer & EFER_SVME) != (efer & EFER_SVME)) {
292 if (!(efer & EFER_SVME)) {
293 svm_leave_nested(svm);
294 svm_set_gif(svm, true);
82a11e9c
BD
295 /* #GP intercept is still needed for vmware backdoor */
296 if (!enable_vmware_backdoor)
297 clr_exception_intercept(svm, GP_VECTOR);
2fcf4876
ML
298
299 /*
300 * Free the nested guest state, unless we are in SMM.
301 * In this case we will return to the nested guest
302 * as soon as we leave SMM.
303 */
63129754 304 if (!is_smm(vcpu))
2fcf4876
ML
305 svm_free_nested(svm);
306
307 } else {
308 int ret = svm_allocate_nested(svm);
309
310 if (ret) {
311 vcpu->arch.efer = old_efer;
312 return ret;
313 }
82a11e9c
BD
314
315 if (svm_gp_erratum_intercept)
316 set_exception_intercept(svm, GP_VECTOR);
2fcf4876 317 }
c513f484
PB
318 }
319
320 svm->vmcb->save.efer = efer | EFER_SVME;
06e7852c 321 vmcb_mark_dirty(svm->vmcb, VMCB_CR);
72f211ec 322 return 0;
6aa8b732
AK
323}
324
6aa8b732
AK
325static int is_external_interrupt(u32 info)
326{
327 info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID;
328 return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR);
329}
330
37ccdcbe 331static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu)
2809f5d2
GC
332{
333 struct vcpu_svm *svm = to_svm(vcpu);
334 u32 ret = 0;
335
336 if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)
37ccdcbe
PB
337 ret = KVM_X86_SHADOW_INT_STI | KVM_X86_SHADOW_INT_MOV_SS;
338 return ret;
2809f5d2
GC
339}
340
341static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
342{
343 struct vcpu_svm *svm = to_svm(vcpu);
344
345 if (mask == 0)
346 svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
347 else
348 svm->vmcb->control.int_state |= SVM_INTERRUPT_SHADOW_MASK;
349
350}
351
f8ea7c60 352static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
6aa8b732 353{
a2fa3e9f
GH
354 struct vcpu_svm *svm = to_svm(vcpu);
355
f1c6366e
TL
356 /*
357 * SEV-ES does not expose the next RIP. The RIP update is controlled by
358 * the type of exit and the #VC handler in the guest.
359 */
360 if (sev_es_guest(vcpu->kvm))
361 goto done;
362
d647eb63 363 if (nrips && svm->vmcb->control.next_rip != 0) {
d2922422 364 WARN_ON_ONCE(!static_cpu_has(X86_FEATURE_NRIPS));
6bc31bdc 365 svm->next_rip = svm->vmcb->control.next_rip;
f104765b 366 }
6bc31bdc 367
1957aa63
SC
368 if (!svm->next_rip) {
369 if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP))
370 return 0;
371 } else {
1957aa63
SC
372 kvm_rip_write(vcpu, svm->next_rip);
373 }
f1c6366e
TL
374
375done:
2809f5d2 376 svm_set_interrupt_shadow(vcpu, 0);
f8ea7c60 377
60fc3d02 378 return 1;
6aa8b732
AK
379}
380
cfcd20e5 381static void svm_queue_exception(struct kvm_vcpu *vcpu)
116a4752
JK
382{
383 struct vcpu_svm *svm = to_svm(vcpu);
cfcd20e5
WL
384 unsigned nr = vcpu->arch.exception.nr;
385 bool has_error_code = vcpu->arch.exception.has_error_code;
cfcd20e5 386 u32 error_code = vcpu->arch.exception.error_code;
116a4752 387
63129754 388 kvm_deliver_exception_payload(vcpu);
da998b46 389
d647eb63 390 if (nr == BP_VECTOR && !nrips) {
63129754 391 unsigned long rip, old_rip = kvm_rip_read(vcpu);
66b7138f
JK
392
393 /*
394 * For guest debugging where we have to reinject #BP if some
395 * INT3 is guest-owned:
396 * Emulate nRIP by moving RIP forward. Will fail if injection
397 * raises a fault that is not intercepted. Still better than
398 * failing in all cases.
399 */
63129754
PB
400 (void)skip_emulated_instruction(vcpu);
401 rip = kvm_rip_read(vcpu);
66b7138f
JK
402 svm->int3_rip = rip + svm->vmcb->save.cs.base;
403 svm->int3_injected = rip - old_rip;
404 }
405
116a4752
JK
406 svm->vmcb->control.event_inj = nr
407 | SVM_EVTINJ_VALID
408 | (has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
409 | SVM_EVTINJ_TYPE_EXEPT;
410 svm->vmcb->control.event_inj_err = error_code;
411}
412
67ec6607
JR
413static void svm_init_erratum_383(void)
414{
415 u32 low, high;
416 int err;
417 u64 val;
418
e6ee94d5 419 if (!static_cpu_has_bug(X86_BUG_AMD_TLB_MMATCH))
67ec6607
JR
420 return;
421
422 /* Use _safe variants to not break nested virtualization */
423 val = native_read_msr_safe(MSR_AMD64_DC_CFG, &err);
424 if (err)
425 return;
426
427 val |= (1ULL << 47);
428
429 low = lower_32_bits(val);
430 high = upper_32_bits(val);
431
432 native_write_msr_safe(MSR_AMD64_DC_CFG, low, high);
433
434 erratum_383_found = true;
435}
436
2b036c6b
BO
437static void svm_init_osvw(struct kvm_vcpu *vcpu)
438{
439 /*
440 * Guests should see errata 400 and 415 as fixed (assuming that
441 * HLT and IO instructions are intercepted).
442 */
443 vcpu->arch.osvw.length = (osvw_len >= 3) ? (osvw_len) : 3;
444 vcpu->arch.osvw.status = osvw_status & ~(6ULL);
445
446 /*
447 * By increasing VCPU's osvw.length to 3 we are telling the guest that
448 * all osvw.status bits inside that length, including bit 0 (which is
449 * reserved for erratum 298), are valid. However, if host processor's
450 * osvw_len is 0 then osvw_status[0] carries no information. We need to
451 * be conservative here and therefore we tell the guest that erratum 298
452 * is present (because we really don't know).
453 */
454 if (osvw_len == 0 && boot_cpu_data.x86 == 0x10)
455 vcpu->arch.osvw.status |= 1;
456}
457
6aa8b732
AK
458static int has_svm(void)
459{
63d1142f 460 const char *msg;
6aa8b732 461
63d1142f 462 if (!cpu_has_svm(&msg)) {
ff81ff10 463 printk(KERN_INFO "has_svm: %s\n", msg);
6aa8b732
AK
464 return 0;
465 }
466
4d96f910 467 if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) {
ccd85d90
SC
468 pr_info("KVM is unsupported when running as an SEV guest\n");
469 return 0;
470 }
471
6aa8b732
AK
472 return 1;
473}
474
13a34e06 475static void svm_hardware_disable(void)
6aa8b732 476{
fbc0db76 477 /* Make sure we clean up behind us */
f800650a 478 if (tsc_scaling)
fbc0db76
JR
479 wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT);
480
2c8dceeb 481 cpu_svm_disable();
1018faa6
JR
482
483 amd_pmu_disable_virt();
6aa8b732
AK
484}
485
13a34e06 486static int svm_hardware_enable(void)
6aa8b732
AK
487{
488
0fe1e009 489 struct svm_cpu_data *sd;
6aa8b732 490 uint64_t efer;
6aa8b732
AK
491 struct desc_struct *gdt;
492 int me = raw_smp_processor_id();
493
10474ae8
AG
494 rdmsrl(MSR_EFER, efer);
495 if (efer & EFER_SVME)
496 return -EBUSY;
497
6aa8b732 498 if (!has_svm()) {
1f5b77f5 499 pr_err("%s: err EOPNOTSUPP on %d\n", __func__, me);
10474ae8 500 return -EINVAL;
6aa8b732 501 }
0fe1e009 502 sd = per_cpu(svm_data, me);
0fe1e009 503 if (!sd) {
1f5b77f5 504 pr_err("%s: svm_data is NULL on %d\n", __func__, me);
10474ae8 505 return -EINVAL;
6aa8b732
AK
506 }
507
0fe1e009
TH
508 sd->asid_generation = 1;
509 sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
510 sd->next_asid = sd->max_asid + 1;
ed3cd233 511 sd->min_asid = max_sev_asid + 1;
6aa8b732 512
45fc8757 513 gdt = get_current_gdt_rw();
0fe1e009 514 sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
6aa8b732 515
9962d032 516 wrmsrl(MSR_EFER, efer | EFER_SVME);
6aa8b732 517
85ca8be9 518 wrmsrl(MSR_VM_HSAVE_PA, __sme_page_pa(sd->save_area));
10474ae8 519
fbc0db76 520 if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
f800650a
ML
521 /*
522 * Set the default value, even if we don't use TSC scaling
523 * to avoid having stale value in the msr
524 */
fbc0db76 525 wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT);
89cbc767 526 __this_cpu_write(current_tsc_ratio, TSC_RATIO_DEFAULT);
fbc0db76
JR
527 }
528
2b036c6b
BO
529
530 /*
531 * Get OSVW bits.
532 *
533 * Note that it is possible to have a system with mixed processor
534 * revisions and therefore different OSVW bits. If bits are not the same
535 * on different processors then choose the worst case (i.e. if erratum
536 * is present on one processor and not on another then assume that the
537 * erratum is present everywhere).
538 */
539 if (cpu_has(&boot_cpu_data, X86_FEATURE_OSVW)) {
540 uint64_t len, status = 0;
541 int err;
542
543 len = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &err);
544 if (!err)
545 status = native_read_msr_safe(MSR_AMD64_OSVW_STATUS,
546 &err);
547
548 if (err)
549 osvw_status = osvw_len = 0;
550 else {
551 if (len < osvw_len)
552 osvw_len = len;
553 osvw_status |= status;
554 osvw_status &= (1ULL << osvw_len) - 1;
555 }
556 } else
557 osvw_status = osvw_len = 0;
558
67ec6607
JR
559 svm_init_erratum_383();
560
1018faa6
JR
561 amd_pmu_enable_virt();
562
10474ae8 563 return 0;
6aa8b732
AK
564}
565
0da1db75
JR
566static void svm_cpu_uninit(int cpu)
567{
a2b2d4bf 568 struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
0da1db75 569
0fe1e009 570 if (!sd)
0da1db75
JR
571 return;
572
a2b2d4bf 573 per_cpu(svm_data, cpu) = NULL;
70cd94e6 574 kfree(sd->sev_vmcbs);
0fe1e009
TH
575 __free_page(sd->save_area);
576 kfree(sd);
0da1db75
JR
577}
578
6aa8b732
AK
579static int svm_cpu_init(int cpu)
580{
0fe1e009 581 struct svm_cpu_data *sd;
b95c221c 582 int ret = -ENOMEM;
6aa8b732 583
0fe1e009
TH
584 sd = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL);
585 if (!sd)
b95c221c 586 return ret;
0fe1e009 587 sd->cpu = cpu;
70cd94e6 588 sd->save_area = alloc_page(GFP_KERNEL);
0fe1e009 589 if (!sd->save_area)
d80b64ff 590 goto free_cpu_data;
b95c221c 591
85ca8be9 592 clear_page(page_address(sd->save_area));
6aa8b732 593
b95c221c
SC
594 ret = sev_cpu_init(sd);
595 if (ret)
596 goto free_save_area;
70cd94e6 597
0fe1e009 598 per_cpu(svm_data, cpu) = sd;
6aa8b732
AK
599
600 return 0;
601
d80b64ff
ML
602free_save_area:
603 __free_page(sd->save_area);
604free_cpu_data:
0fe1e009 605 kfree(sd);
b95c221c 606 return ret;
6aa8b732
AK
607
608}
609
fd6fa73d 610static int direct_access_msr_slot(u32 msr)
ac72a9b7 611{
fd6fa73d 612 u32 i;
ac72a9b7
JR
613
614 for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++)
fd6fa73d
AG
615 if (direct_access_msrs[i].index == msr)
616 return i;
ac72a9b7 617
fd6fa73d
AG
618 return -ENOENT;
619}
620
621static void set_shadow_msr_intercept(struct kvm_vcpu *vcpu, u32 msr, int read,
622 int write)
623{
624 struct vcpu_svm *svm = to_svm(vcpu);
625 int slot = direct_access_msr_slot(msr);
626
627 if (slot == -ENOENT)
628 return;
629
630 /* Set the shadow bitmaps to the desired intercept states */
631 if (read)
632 set_bit(slot, svm->shadow_msr_intercept.read);
633 else
634 clear_bit(slot, svm->shadow_msr_intercept.read);
635
636 if (write)
637 set_bit(slot, svm->shadow_msr_intercept.write);
638 else
639 clear_bit(slot, svm->shadow_msr_intercept.write);
ac72a9b7
JR
640}
641
fd6fa73d
AG
642static bool valid_msr_intercept(u32 index)
643{
644 return direct_access_msr_slot(index) != -ENOENT;
ac72a9b7
JR
645}
646
476c9bd8 647static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
b2ac58f9
KA
648{
649 u8 bit_write;
650 unsigned long tmp;
651 u32 offset;
652 u32 *msrpm;
653
654 msrpm = is_guest_mode(vcpu) ? to_svm(vcpu)->nested.msrpm:
655 to_svm(vcpu)->msrpm;
656
657 offset = svm_msrpm_offset(msr);
658 bit_write = 2 * (msr & 0x0f) + 1;
659 tmp = msrpm[offset];
660
661 BUG_ON(offset == MSR_INVALID);
662
663 return !!test_bit(bit_write, &tmp);
664}
665
fd6fa73d
AG
666static void set_msr_interception_bitmap(struct kvm_vcpu *vcpu, u32 *msrpm,
667 u32 msr, int read, int write)
6aa8b732 668{
455716fa
JR
669 u8 bit_read, bit_write;
670 unsigned long tmp;
671 u32 offset;
6aa8b732 672
ac72a9b7
JR
673 /*
674 * If this warning triggers extend the direct_access_msrs list at the
675 * beginning of the file
676 */
677 WARN_ON(!valid_msr_intercept(msr));
678
fd6fa73d
AG
679 /* Enforce non allowed MSRs to trap */
680 if (read && !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ))
681 read = 0;
682
683 if (write && !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE))
684 write = 0;
685
455716fa
JR
686 offset = svm_msrpm_offset(msr);
687 bit_read = 2 * (msr & 0x0f);
688 bit_write = 2 * (msr & 0x0f) + 1;
689 tmp = msrpm[offset];
690
691 BUG_ON(offset == MSR_INVALID);
692
693 read ? clear_bit(bit_read, &tmp) : set_bit(bit_read, &tmp);
694 write ? clear_bit(bit_write, &tmp) : set_bit(bit_write, &tmp);
695
696 msrpm[offset] = tmp;
c4327f15
VP
697
698 svm_hv_vmcb_dirty_nested_enlightenments(vcpu);
699
6aa8b732
AK
700}
701
376c6d28
TL
702void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr,
703 int read, int write)
6aa8b732 704{
fd6fa73d
AG
705 set_shadow_msr_intercept(vcpu, msr, read, write);
706 set_msr_interception_bitmap(vcpu, msrpm, msr, read, write);
707}
708
2fcf4876 709u32 *svm_vcpu_alloc_msrpm(void)
6aa8b732 710{
47903dc1
KS
711 unsigned int order = get_order(MSRPM_SIZE);
712 struct page *pages = alloc_pages(GFP_KERNEL_ACCOUNT, order);
476c9bd8 713 u32 *msrpm;
f4c847a9
ML
714
715 if (!pages)
716 return NULL;
6aa8b732 717
f4c847a9 718 msrpm = page_address(pages);
47903dc1 719 memset(msrpm, 0xff, PAGE_SIZE * (1 << order));
f65c229c 720
476c9bd8
AL
721 return msrpm;
722}
723
2fcf4876 724void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm)
476c9bd8
AL
725{
726 int i;
727
ac72a9b7
JR
728 for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
729 if (!direct_access_msrs[i].always)
730 continue;
476c9bd8 731 set_msr_interception(vcpu, msrpm, direct_access_msrs[i].index, 1, 1);
ac72a9b7 732 }
f4c847a9 733}
ac72a9b7 734
2fcf4876
ML
735
736void svm_vcpu_free_msrpm(u32 *msrpm)
f4c847a9 737{
47903dc1 738 __free_pages(virt_to_page(msrpm), get_order(MSRPM_SIZE));
f65c229c
JR
739}
740
fd6fa73d
AG
741static void svm_msr_filter_changed(struct kvm_vcpu *vcpu)
742{
743 struct vcpu_svm *svm = to_svm(vcpu);
744 u32 i;
745
746 /*
747 * Set intercept permissions for all direct access MSRs again. They
748 * will automatically get filtered through the MSR filter, so we are
749 * back in sync after this.
750 */
751 for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
752 u32 msr = direct_access_msrs[i].index;
753 u32 read = test_bit(i, svm->shadow_msr_intercept.read);
754 u32 write = test_bit(i, svm->shadow_msr_intercept.write);
755
756 set_msr_interception_bitmap(vcpu, svm->msrpm, msr, read, write);
ac72a9b7 757 }
f65c229c
JR
758}
759
323c3d80
JR
760static void add_msr_offset(u32 offset)
761{
762 int i;
763
764 for (i = 0; i < MSRPM_OFFSETS; ++i) {
765
766 /* Offset already in list? */
767 if (msrpm_offsets[i] == offset)
bfc733a7 768 return;
323c3d80
JR
769
770 /* Slot used by another offset? */
771 if (msrpm_offsets[i] != MSR_INVALID)
772 continue;
773
774 /* Add offset to list */
775 msrpm_offsets[i] = offset;
776
777 return;
6aa8b732 778 }
323c3d80
JR
779
780 /*
781 * If this BUG triggers the msrpm_offsets table has an overflow. Just
782 * increase MSRPM_OFFSETS in this case.
783 */
bfc733a7 784 BUG();
6aa8b732
AK
785}
786
323c3d80 787static void init_msrpm_offsets(void)
f65c229c 788{
323c3d80 789 int i;
f65c229c 790
323c3d80
JR
791 memset(msrpm_offsets, 0xff, sizeof(msrpm_offsets));
792
793 for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
794 u32 offset;
795
796 offset = svm_msrpm_offset(direct_access_msrs[i].index);
797 BUG_ON(offset == MSR_INVALID);
798
799 add_msr_offset(offset);
800 }
f65c229c
JR
801}
802
476c9bd8 803static void svm_enable_lbrv(struct kvm_vcpu *vcpu)
24e09cbf 804{
476c9bd8 805 struct vcpu_svm *svm = to_svm(vcpu);
24e09cbf 806
0dc92119 807 svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK;
476c9bd8
AL
808 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1);
809 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
810 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
811 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
24e09cbf
JR
812}
813
476c9bd8 814static void svm_disable_lbrv(struct kvm_vcpu *vcpu)
24e09cbf 815{
476c9bd8 816 struct vcpu_svm *svm = to_svm(vcpu);
24e09cbf 817
0dc92119 818 svm->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK;
476c9bd8
AL
819 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0);
820 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0);
821 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 0, 0);
822 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
24e09cbf
JR
823}
824
883b0a91 825void disable_nmi_singlestep(struct vcpu_svm *svm)
4aebd0e9
LP
826{
827 svm->nmi_singlestep = false;
640bd6e5 828
ab2f4d73
LP
829 if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) {
830 /* Clear our flags if they were not set by the guest */
831 if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
832 svm->vmcb->save.rflags &= ~X86_EFLAGS_TF;
833 if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF))
834 svm->vmcb->save.rflags &= ~X86_EFLAGS_RF;
835 }
4aebd0e9
LP
836}
837
8566ac8b
BM
838static void grow_ple_window(struct kvm_vcpu *vcpu)
839{
840 struct vcpu_svm *svm = to_svm(vcpu);
841 struct vmcb_control_area *control = &svm->vmcb->control;
842 int old = control->pause_filter_count;
843
844 control->pause_filter_count = __grow_ple_window(old,
845 pause_filter_count,
846 pause_filter_count_grow,
847 pause_filter_count_max);
848
4f75bcc3 849 if (control->pause_filter_count != old) {
06e7852c 850 vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
4f75bcc3
PX
851 trace_kvm_ple_window_update(vcpu->vcpu_id,
852 control->pause_filter_count, old);
853 }
8566ac8b
BM
854}
855
856static void shrink_ple_window(struct kvm_vcpu *vcpu)
857{
858 struct vcpu_svm *svm = to_svm(vcpu);
859 struct vmcb_control_area *control = &svm->vmcb->control;
860 int old = control->pause_filter_count;
861
862 control->pause_filter_count =
863 __shrink_ple_window(old,
864 pause_filter_count,
865 pause_filter_count_shrink,
866 pause_filter_count);
4f75bcc3 867 if (control->pause_filter_count != old) {
06e7852c 868 vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
4f75bcc3
PX
869 trace_kvm_ple_window_update(vcpu->vcpu_id,
870 control->pause_filter_count, old);
871 }
8566ac8b
BM
872}
873
52918ed5
TL
874/*
875 * The default MMIO mask is a single bit (excluding the present bit),
876 * which could conflict with the memory encryption bit. Check for
877 * memory encryption support and override the default MMIO mask if
878 * memory encryption is enabled.
879 */
880static __init void svm_adjust_mmio_mask(void)
881{
882 unsigned int enc_bit, mask_bit;
883 u64 msr, mask;
884
885 /* If there is no memory encryption support, use existing mask */
886 if (cpuid_eax(0x80000000) < 0x8000001f)
887 return;
888
889 /* If memory encryption is not enabled, use existing mask */
059e5c32
BS
890 rdmsrl(MSR_AMD64_SYSCFG, msr);
891 if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT))
52918ed5
TL
892 return;
893
894 enc_bit = cpuid_ebx(0x8000001f) & 0x3f;
895 mask_bit = boot_cpu_data.x86_phys_bits;
896
897 /* Increment the mask bit if it is the same as the encryption bit */
898 if (enc_bit == mask_bit)
899 mask_bit++;
900
901 /*
902 * If the mask bit location is below 52, then some bits above the
903 * physical addressing limit will always be reserved, so use the
904 * rsvd_bits() function to generate the mask. This mask, along with
905 * the present bit, will be used to generate a page fault with
906 * PFER.RSV = 1.
907 *
908 * If the mask bit location is 52 (or above), then clear the mask.
909 */
910 mask = (mask_bit < 52) ? rsvd_bits(mask_bit, 51) | PT_PRESENT_MASK : 0;
911
8120337a 912 kvm_mmu_set_mmio_spte_mask(mask, mask, PT_WRITABLE_MASK | PT_USER_MASK);
52918ed5
TL
913}
914
dd58f3c9
LR
915static void svm_hardware_teardown(void)
916{
917 int cpu;
918
4cafd0c5 919 sev_hardware_teardown();
dd58f3c9
LR
920
921 for_each_possible_cpu(cpu)
922 svm_cpu_uninit(cpu);
923
47903dc1
KS
924 __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT),
925 get_order(IOPM_SIZE));
dd58f3c9
LR
926 iopm_base = 0;
927}
928
9b58b985
SC
929static __init void svm_set_cpu_caps(void)
930{
931 kvm_set_cpu_caps();
932
408e9a31
PB
933 supported_xss = 0;
934
a50718cc
SC
935 /* CPUID 0x80000001 and 0x8000000A (SVM features) */
936 if (nested) {
9b58b985
SC
937 kvm_cpu_cap_set(X86_FEATURE_SVM);
938
4eb87460 939 if (nrips)
a50718cc
SC
940 kvm_cpu_cap_set(X86_FEATURE_NRIPS);
941
942 if (npt_enabled)
943 kvm_cpu_cap_set(X86_FEATURE_NPT);
14c2bf81 944
5228eb96
ML
945 if (tsc_scaling)
946 kvm_cpu_cap_set(X86_FEATURE_TSCRATEMSR);
947
14c2bf81
WH
948 /* Nested VM can receive #VMEXIT instead of triggering #GP */
949 kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK);
a50718cc
SC
950 }
951
93c380e7
SC
952 /* CPUID 0x80000008 */
953 if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) ||
954 boot_cpu_has(X86_FEATURE_AMD_SSBD))
955 kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD);
d9db0fd6
PB
956
957 /* CPUID 0x8000001F (SME/SEV features) */
958 sev_set_cpu_caps();
9b58b985
SC
959}
960
6aa8b732
AK
961static __init int svm_hardware_setup(void)
962{
963 int cpu;
964 struct page *iopm_pages;
f65c229c 965 void *iopm_va;
6aa8b732 966 int r;
47903dc1 967 unsigned int order = get_order(IOPM_SIZE);
6aa8b732 968
b26a71a1
SC
969 /*
970 * NX is required for shadow paging and for NPT if the NX huge pages
971 * mitigation is enabled.
972 */
973 if (!boot_cpu_has(X86_FEATURE_NX)) {
974 pr_err_ratelimited("NX (Execute Disable) not supported\n");
975 return -EOPNOTSUPP;
976 }
977 kvm_enable_efer_bits(EFER_NX);
978
47903dc1 979 iopm_pages = alloc_pages(GFP_KERNEL, order);
6aa8b732
AK
980
981 if (!iopm_pages)
982 return -ENOMEM;
c8681339
AL
983
984 iopm_va = page_address(iopm_pages);
47903dc1 985 memset(iopm_va, 0xff, PAGE_SIZE * (1 << order));
6aa8b732
AK
986 iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
987
323c3d80
JR
988 init_msrpm_offsets();
989
cfc48181
SC
990 supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR);
991
1b2fd70c
AG
992 if (boot_cpu_has(X86_FEATURE_FXSR_OPT))
993 kvm_enable_efer_bits(EFER_FFXSR);
994
f800650a
ML
995 if (tsc_scaling) {
996 if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
997 tsc_scaling = false;
998 } else {
999 pr_info("TSC scaling supported\n");
1000 kvm_has_tsc_control = true;
1001 kvm_max_tsc_scaling_ratio = TSC_RATIO_MAX;
1002 kvm_tsc_scaling_ratio_frac_bits = 32;
1003 }
92a1f12d
JR
1004 }
1005
e5fda4bb 1006 tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX);
844d69c2 1007
8566ac8b
BM
1008 /* Check for pause filtering support */
1009 if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
1010 pause_filter_count = 0;
1011 pause_filter_thresh = 0;
1012 } else if (!boot_cpu_has(X86_FEATURE_PFTHRESHOLD)) {
1013 pause_filter_thresh = 0;
1014 }
1015
236de055
AG
1016 if (nested) {
1017 printk(KERN_INFO "kvm: Nested Virtualization enabled\n");
eec4b140 1018 kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
236de055
AG
1019 }
1020
99840a75
SC
1021 /*
1022 * KVM's MMU doesn't support using 2-level paging for itself, and thus
1023 * NPT isn't supported if the host is using 2-level paging since host
1024 * CR4 is unchanged on VMRUN.
1025 */
1026 if (!IS_ENABLED(CONFIG_X86_64) && !IS_ENABLED(CONFIG_X86_PAE))
e3da3acd
JR
1027 npt_enabled = false;
1028
99840a75 1029 if (!boot_cpu_has(X86_FEATURE_NPT))
6c7dac72 1030 npt_enabled = false;
6c7dac72 1031
746700d2
WH
1032 /* Force VM NPT level equal to the host's max NPT level */
1033 kvm_configure_mmu(npt_enabled, get_max_npt_level(),
1034 get_max_npt_level(), PG_LEVEL_1G);
213e0e1f 1035 pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis");
e3da3acd 1036
e8126bda
SC
1037 /* Note, SEV setup consumes npt_enabled. */
1038 sev_hardware_setup();
fa13680f 1039
1e0c7d40
VP
1040 svm_hv_hardware_setup();
1041
fa13680f
SC
1042 svm_adjust_mmio_mask();
1043
1044 for_each_possible_cpu(cpu) {
1045 r = svm_cpu_init(cpu);
1046 if (r)
1047 goto err;
1048 }
1049
d647eb63
PB
1050 if (nrips) {
1051 if (!boot_cpu_has(X86_FEATURE_NRIPS))
1052 nrips = false;
1053 }
1054
fdf513e3 1055 enable_apicv = avic = avic && npt_enabled && boot_cpu_has(X86_FEATURE_AVIC);
5881f737 1056
fdf513e3
VK
1057 if (enable_apicv) {
1058 pr_info("AVIC enabled\n");
1059
1060 amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
5b8abf1f 1061 }
44a95dae 1062
89c8a498
JN
1063 if (vls) {
1064 if (!npt_enabled ||
5442c269 1065 !boot_cpu_has(X86_FEATURE_V_VMSAVE_VMLOAD) ||
89c8a498
JN
1066 !IS_ENABLED(CONFIG_X86_64)) {
1067 vls = false;
1068 } else {
1069 pr_info("Virtual VMLOAD VMSAVE supported\n");
1070 }
1071 }
1072
3b9c723e
WH
1073 if (boot_cpu_has(X86_FEATURE_SVME_ADDR_CHK))
1074 svm_gp_erratum_intercept = false;
1075
640bd6e5
JN
1076 if (vgif) {
1077 if (!boot_cpu_has(X86_FEATURE_VGIF))
1078 vgif = false;
1079 else
1080 pr_info("Virtual GIF supported\n");
1081 }
1082
4c84926e
ML
1083 if (lbrv) {
1084 if (!boot_cpu_has(X86_FEATURE_LBRV))
1085 lbrv = false;
1086 else
1087 pr_info("LBR virtualization supported\n");
1088 }
1089
9b58b985 1090 svm_set_cpu_caps();
66a6950f 1091
3edd6839
MG
1092 /*
1093 * It seems that on AMD processors PTE's accessed bit is
1094 * being set by the CPU hardware before the NPF vmexit.
1095 * This is not expected behaviour and our tests fail because
1096 * of it.
1097 * A workaround here is to disable support for
1098 * GUEST_MAXPHYADDR < HOST_MAXPHYADDR if NPT is enabled.
1099 * In this case userspace can know if there is support using
1100 * KVM_CAP_SMALLER_MAXPHYADDR extension and decide how to handle
1101 * it
1102 * If future AMD CPU models change the behaviour described above,
1103 * this variable can be changed accordingly
1104 */
1105 allow_smaller_maxphyaddr = !npt_enabled;
1106
6aa8b732
AK
1107 return 0;
1108
f65c229c 1109err:
dd58f3c9 1110 svm_hardware_teardown();
6aa8b732
AK
1111 return r;
1112}
1113
6aa8b732
AK
1114static void init_seg(struct vmcb_seg *seg)
1115{
1116 seg->selector = 0;
1117 seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK |
e0231715 1118 SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */
6aa8b732
AK
1119 seg->limit = 0xffff;
1120 seg->base = 0;
1121}
1122
1123static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
1124{
1125 seg->selector = 0;
1126 seg->attrib = SVM_SELECTOR_P_MASK | type;
1127 seg->limit = 0xffff;
1128 seg->base = 0;
1129}
1130
307a94c7
IS
1131static u64 svm_get_l2_tsc_offset(struct kvm_vcpu *vcpu)
1132{
1133 struct vcpu_svm *svm = to_svm(vcpu);
1134
1135 return svm->nested.ctl.tsc_offset;
1136}
1137
1138static u64 svm_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu)
1139{
5228eb96
ML
1140 struct vcpu_svm *svm = to_svm(vcpu);
1141
1142 return svm->tsc_ratio_msr;
307a94c7
IS
1143}
1144
edcfe540 1145static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
f4e1b3c8
ZA
1146{
1147 struct vcpu_svm *svm = to_svm(vcpu);
116a0a23 1148
edcfe540
IS
1149 svm->vmcb01.ptr->control.tsc_offset = vcpu->arch.l1_tsc_offset;
1150 svm->vmcb->control.tsc_offset = offset;
06e7852c 1151 vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
f4e1b3c8
ZA
1152}
1153
5228eb96 1154void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier)
1ab9287a
IS
1155{
1156 wrmsrl(MSR_AMD64_TSC_RATIO, multiplier);
1157}
1158
3b195ac9
SC
1159/* Evaluate instruction intercepts that depend on guest CPUID features. */
1160static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu,
1161 struct vcpu_svm *svm)
4407a797
BM
1162{
1163 /*
0a8ed2ea
SC
1164 * Intercept INVPCID if shadow paging is enabled to sync/free shadow
1165 * roots, or if INVPCID is disabled in the guest to inject #UD.
4407a797
BM
1166 */
1167 if (kvm_cpu_cap_has(X86_FEATURE_INVPCID)) {
0a8ed2ea
SC
1168 if (!npt_enabled ||
1169 !guest_cpuid_has(&svm->vcpu, X86_FEATURE_INVPCID))
4407a797
BM
1170 svm_set_intercept(svm, INTERCEPT_INVPCID);
1171 else
1172 svm_clr_intercept(svm, INTERCEPT_INVPCID);
1173 }
3b195ac9
SC
1174
1175 if (kvm_cpu_cap_has(X86_FEATURE_RDTSCP)) {
1176 if (guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
1177 svm_clr_intercept(svm, INTERCEPT_RDTSCP);
1178 else
1179 svm_set_intercept(svm, INTERCEPT_RDTSCP);
1180 }
4407a797
BM
1181}
1182
36e8194d
PB
1183static inline void init_vmcb_after_set_cpuid(struct kvm_vcpu *vcpu)
1184{
1185 struct vcpu_svm *svm = to_svm(vcpu);
1186
1187 if (guest_cpuid_is_intel(vcpu)) {
1188 /*
1189 * We must intercept SYSENTER_EIP and SYSENTER_ESP
1190 * accesses because the processor only stores 32 bits.
1191 * For the same reason we cannot use virtual VMLOAD/VMSAVE.
1192 */
1193 svm_set_intercept(svm, INTERCEPT_VMLOAD);
1194 svm_set_intercept(svm, INTERCEPT_VMSAVE);
1195 svm->vmcb->control.virt_ext &= ~VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
1196
1197 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 0, 0);
1198 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 0, 0);
1199 } else {
1200 /*
1201 * If hardware supports Virtual VMLOAD VMSAVE then enable it
1202 * in VMCB and clear intercepts to avoid #VMEXIT.
1203 */
1204 if (vls) {
1205 svm_clr_intercept(svm, INTERCEPT_VMLOAD);
1206 svm_clr_intercept(svm, INTERCEPT_VMSAVE);
1207 svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
1208 }
1209 /* No need to intercept these MSRs */
1210 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 1, 1);
1211 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 1, 1);
1212 }
1213}
1214
63129754 1215static void init_vmcb(struct kvm_vcpu *vcpu)
6aa8b732 1216{
63129754 1217 struct vcpu_svm *svm = to_svm(vcpu);
e6101a96
JR
1218 struct vmcb_control_area *control = &svm->vmcb->control;
1219 struct vmcb_save_area *save = &svm->vmcb->save;
6aa8b732 1220
830bd71f
BM
1221 svm_set_intercept(svm, INTERCEPT_CR0_READ);
1222 svm_set_intercept(svm, INTERCEPT_CR3_READ);
1223 svm_set_intercept(svm, INTERCEPT_CR4_READ);
1224 svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
1225 svm_set_intercept(svm, INTERCEPT_CR3_WRITE);
1226 svm_set_intercept(svm, INTERCEPT_CR4_WRITE);
63129754 1227 if (!kvm_vcpu_apicv_active(vcpu))
830bd71f 1228 svm_set_intercept(svm, INTERCEPT_CR8_WRITE);
6aa8b732 1229
5315c716 1230 set_dr_intercepts(svm);
6aa8b732 1231
18c918c5
JR
1232 set_exception_intercept(svm, PF_VECTOR);
1233 set_exception_intercept(svm, UD_VECTOR);
1234 set_exception_intercept(svm, MC_VECTOR);
54a20552 1235 set_exception_intercept(svm, AC_VECTOR);
cbdb967a 1236 set_exception_intercept(svm, DB_VECTOR);
9718420e
LA
1237 /*
1238 * Guest access to VMware backdoor ports could legitimately
1239 * trigger #GP because of TSS I/O permission bitmap.
1240 * We intercept those #GP and allow access to them anyway
1241 * as VMware does.
1242 */
1243 if (enable_vmware_backdoor)
1244 set_exception_intercept(svm, GP_VECTOR);
6aa8b732 1245
a284ba56
JR
1246 svm_set_intercept(svm, INTERCEPT_INTR);
1247 svm_set_intercept(svm, INTERCEPT_NMI);
4b639a9f
ML
1248
1249 if (intercept_smi)
1250 svm_set_intercept(svm, INTERCEPT_SMI);
1251
a284ba56
JR
1252 svm_set_intercept(svm, INTERCEPT_SELECTIVE_CR0);
1253 svm_set_intercept(svm, INTERCEPT_RDPMC);
1254 svm_set_intercept(svm, INTERCEPT_CPUID);
1255 svm_set_intercept(svm, INTERCEPT_INVD);
1256 svm_set_intercept(svm, INTERCEPT_INVLPG);
1257 svm_set_intercept(svm, INTERCEPT_INVLPGA);
1258 svm_set_intercept(svm, INTERCEPT_IOIO_PROT);
1259 svm_set_intercept(svm, INTERCEPT_MSR_PROT);
1260 svm_set_intercept(svm, INTERCEPT_TASK_SWITCH);
1261 svm_set_intercept(svm, INTERCEPT_SHUTDOWN);
1262 svm_set_intercept(svm, INTERCEPT_VMRUN);
1263 svm_set_intercept(svm, INTERCEPT_VMMCALL);
1264 svm_set_intercept(svm, INTERCEPT_VMLOAD);
1265 svm_set_intercept(svm, INTERCEPT_VMSAVE);
1266 svm_set_intercept(svm, INTERCEPT_STGI);
1267 svm_set_intercept(svm, INTERCEPT_CLGI);
1268 svm_set_intercept(svm, INTERCEPT_SKINIT);
1269 svm_set_intercept(svm, INTERCEPT_WBINVD);
1270 svm_set_intercept(svm, INTERCEPT_XSETBV);
1271 svm_set_intercept(svm, INTERCEPT_RDPRU);
1272 svm_set_intercept(svm, INTERCEPT_RSM);
6aa8b732 1273
63129754 1274 if (!kvm_mwait_in_guest(vcpu->kvm)) {
a284ba56
JR
1275 svm_set_intercept(svm, INTERCEPT_MONITOR);
1276 svm_set_intercept(svm, INTERCEPT_MWAIT);
668fffa3
MT
1277 }
1278
63129754 1279 if (!kvm_hlt_in_guest(vcpu->kvm))
a284ba56 1280 svm_set_intercept(svm, INTERCEPT_HLT);
caa057a2 1281
d0ec49d4
TL
1282 control->iopm_base_pa = __sme_set(iopm_base);
1283 control->msrpm_base_pa = __sme_set(__pa(svm->msrpm));
6aa8b732
AK
1284 control->int_ctl = V_INTR_MASKING_MASK;
1285
1286 init_seg(&save->es);
1287 init_seg(&save->ss);
1288 init_seg(&save->ds);
1289 init_seg(&save->fs);
1290 init_seg(&save->gs);
1291
1292 save->cs.selector = 0xf000;
04b66839 1293 save->cs.base = 0xffff0000;
6aa8b732
AK
1294 /* Executable/Readable Code Segment */
1295 save->cs.attrib = SVM_SELECTOR_READ_MASK | SVM_SELECTOR_P_MASK |
1296 SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK;
1297 save->cs.limit = 0xffff;
6aa8b732 1298
4f117ce4 1299 save->gdtr.base = 0;
6aa8b732 1300 save->gdtr.limit = 0xffff;
4f117ce4 1301 save->idtr.base = 0;
6aa8b732
AK
1302 save->idtr.limit = 0xffff;
1303
1304 init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
1305 init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
1306
709ddebf
JR
1307 if (npt_enabled) {
1308 /* Setup VMCB for Nested Paging */
cea3a19b 1309 control->nested_ctl |= SVM_NESTED_CTL_NP_ENABLE;
a284ba56 1310 svm_clr_intercept(svm, INTERCEPT_INVLPG);
18c918c5 1311 clr_exception_intercept(svm, PF_VECTOR);
830bd71f
BM
1312 svm_clr_intercept(svm, INTERCEPT_CR3_READ);
1313 svm_clr_intercept(svm, INTERCEPT_CR3_WRITE);
63129754 1314 save->g_pat = vcpu->arch.pat;
709ddebf 1315 save->cr3 = 0;
709ddebf 1316 }
193015ad 1317 svm->current_vmcb->asid_generation = 0;
7e8e6eed 1318 svm->asid = 0;
1371d904 1319
c74ad08f
ML
1320 svm->nested.vmcb12_gpa = INVALID_GPA;
1321 svm->nested.last_vmcb12_gpa = INVALID_GPA;
2af9194d 1322
63129754 1323 if (!kvm_pause_in_guest(vcpu->kvm)) {
8566ac8b
BM
1324 control->pause_filter_count = pause_filter_count;
1325 if (pause_filter_thresh)
1326 control->pause_filter_thresh = pause_filter_thresh;
a284ba56 1327 svm_set_intercept(svm, INTERCEPT_PAUSE);
8566ac8b 1328 } else {
a284ba56 1329 svm_clr_intercept(svm, INTERCEPT_PAUSE);
565d0998
ML
1330 }
1331
3b195ac9 1332 svm_recalc_instruction_intercepts(vcpu, svm);
4407a797 1333
89c8a498 1334 /*
d00b99c5
BM
1335 * If the host supports V_SPEC_CTRL then disable the interception
1336 * of MSR_IA32_SPEC_CTRL.
89c8a498 1337 */
d00b99c5
BM
1338 if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
1339 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
1340
63129754 1341 if (kvm_vcpu_apicv_active(vcpu))
44a95dae 1342 avic_init_vmcb(svm);
89c8a498 1343
640bd6e5 1344 if (vgif) {
a284ba56
JR
1345 svm_clr_intercept(svm, INTERCEPT_STGI);
1346 svm_clr_intercept(svm, INTERCEPT_CLGI);
640bd6e5
JN
1347 svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK;
1348 }
1349
63129754 1350 if (sev_guest(vcpu->kvm)) {
1654efcb 1351 svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ENABLE;
35c6f649 1352 clr_exception_intercept(svm, UD_VECTOR);
376c6d28 1353
63129754 1354 if (sev_es_guest(vcpu->kvm)) {
376c6d28
TL
1355 /* Perform SEV-ES specific VMCB updates */
1356 sev_es_init_vmcb(svm);
1357 }
35c6f649 1358 }
1654efcb 1359
1e0c7d40 1360 svm_hv_init_vmcb(svm->vmcb);
36e8194d 1361 init_vmcb_after_set_cpuid(vcpu);
1e0c7d40 1362
06e7852c 1363 vmcb_mark_all_dirty(svm->vmcb);
8d28fec4 1364
2af9194d 1365 enable_gif(svm);
44a95dae
SS
1366}
1367
9ebe530b
SC
1368static void __svm_vcpu_reset(struct kvm_vcpu *vcpu)
1369{
1370 struct vcpu_svm *svm = to_svm(vcpu);
44a95dae 1371
9ebe530b
SC
1372 svm_vcpu_init_msrpm(vcpu, svm->msrpm);
1373
1374 svm_init_osvw(vcpu);
1375 vcpu->arch.microcode_version = 0x01000065;
5228eb96 1376 svm->tsc_ratio_msr = kvm_default_tsc_scaling_ratio;
9ebe530b
SC
1377
1378 if (sev_es_guest(vcpu->kvm))
1379 sev_es_vcpu_reset(svm);
44a95dae
SS
1380}
1381
d28bc9dd 1382static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
04d2cc77
AK
1383{
1384 struct vcpu_svm *svm = to_svm(vcpu);
1385
b2ac58f9 1386 svm->spec_ctrl = 0;
ccbcd267 1387 svm->virt_spec_ctrl = 0;
b2ac58f9 1388
63129754 1389 init_vmcb(vcpu);
9ebe530b
SC
1390
1391 if (!init_event)
1392 __svm_vcpu_reset(vcpu);
04d2cc77
AK
1393}
1394
4995a368
CA
1395void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb)
1396{
1397 svm->current_vmcb = target_vmcb;
1398 svm->vmcb = target_vmcb->ptr;
4995a368
CA
1399}
1400
987b2594 1401static int svm_create_vcpu(struct kvm_vcpu *vcpu)
6aa8b732 1402{
a2fa3e9f 1403 struct vcpu_svm *svm;
4995a368 1404 struct page *vmcb01_page;
add5e2f0 1405 struct page *vmsa_page = NULL;
fb3f0f51 1406 int err;
6aa8b732 1407
a9dd6f09
SC
1408 BUILD_BUG_ON(offsetof(struct vcpu_svm, vcpu) != 0);
1409 svm = to_svm(vcpu);
fb3f0f51 1410
b7af4043 1411 err = -ENOMEM;
4995a368
CA
1412 vmcb01_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
1413 if (!vmcb01_page)
987b2594 1414 goto out;
6aa8b732 1415
63129754 1416 if (sev_es_guest(vcpu->kvm)) {
add5e2f0
TL
1417 /*
1418 * SEV-ES guests require a separate VMSA page used to contain
1419 * the encrypted register state of the guest.
1420 */
1421 vmsa_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
1422 if (!vmsa_page)
1423 goto error_free_vmcb_page;
ed02b213
TL
1424
1425 /*
1426 * SEV-ES guests maintain an encrypted version of their FPU
1427 * state which is restored and saved on VMRUN and VMEXIT.
d69c1382
TG
1428 * Mark vcpu->arch.guest_fpu->fpstate as scratch so it won't
1429 * do xsave/xrstor on it.
ed02b213 1430 */
d69c1382 1431 fpstate_set_confidential(&vcpu->arch.guest_fpu);
add5e2f0
TL
1432 }
1433
dfa20099
SS
1434 err = avic_init_vcpu(svm);
1435 if (err)
add5e2f0 1436 goto error_free_vmsa_page;
44a95dae 1437
8221c137
SS
1438 /* We initialize this flag to true to make sure that the is_running
1439 * bit would be set the first time the vcpu is loaded.
1440 */
6c3e4422
SS
1441 if (irqchip_in_kernel(vcpu->kvm) && kvm_apicv_activated(vcpu->kvm))
1442 svm->avic_is_running = true;
8221c137 1443
476c9bd8 1444 svm->msrpm = svm_vcpu_alloc_msrpm();
054409ab
CZ
1445 if (!svm->msrpm) {
1446 err = -ENOMEM;
add5e2f0 1447 goto error_free_vmsa_page;
054409ab 1448 }
b7af4043 1449
4995a368
CA
1450 svm->vmcb01.ptr = page_address(vmcb01_page);
1451 svm->vmcb01.pa = __sme_set(page_to_pfn(vmcb01_page) << PAGE_SHIFT);
9ebe530b 1452 svm_switch_vmcb(svm, &svm->vmcb01);
add5e2f0
TL
1453
1454 if (vmsa_page)
b67a4cc3 1455 svm->sev_es.vmsa = page_address(vmsa_page);
add5e2f0 1456
a7fc06dd 1457 svm->guest_state_loaded = false;
4995a368 1458
a9dd6f09 1459 return 0;
36241b8c 1460
add5e2f0
TL
1461error_free_vmsa_page:
1462 if (vmsa_page)
1463 __free_page(vmsa_page);
8d22b90e 1464error_free_vmcb_page:
4995a368 1465 __free_page(vmcb01_page);
987b2594 1466out:
a9dd6f09 1467 return err;
6aa8b732
AK
1468}
1469
fd65d314
JM
1470static void svm_clear_current_vmcb(struct vmcb *vmcb)
1471{
1472 int i;
1473
1474 for_each_online_cpu(i)
1475 cmpxchg(&per_cpu(svm_data, i)->current_vmcb, vmcb, NULL);
1476}
1477
6aa8b732
AK
1478static void svm_free_vcpu(struct kvm_vcpu *vcpu)
1479{
a2fa3e9f
GH
1480 struct vcpu_svm *svm = to_svm(vcpu);
1481
fd65d314
JM
1482 /*
1483 * The vmcb page can be recycled, causing a false negative in
1484 * svm_vcpu_load(). So, ensure that no logical CPU has this
1485 * vmcb page recorded as its current vmcb.
1486 */
1487 svm_clear_current_vmcb(svm->vmcb);
1488
2fcf4876
ML
1489 svm_free_nested(svm);
1490
add5e2f0
TL
1491 sev_free_vcpu(vcpu);
1492
4995a368 1493 __free_page(pfn_to_page(__sme_clr(svm->vmcb01.pa) >> PAGE_SHIFT));
47903dc1 1494 __free_pages(virt_to_page(svm->msrpm), get_order(MSRPM_SIZE));
6aa8b732
AK
1495}
1496
a7fc06dd 1497static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
6aa8b732 1498{
a2fa3e9f 1499 struct vcpu_svm *svm = to_svm(vcpu);
a7fc06dd 1500 struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);
0cc5064d 1501
ce7ea0cf
TL
1502 if (sev_es_guest(vcpu->kvm))
1503 sev_es_unmap_ghcb(svm);
1504
a7fc06dd
MR
1505 if (svm->guest_state_loaded)
1506 return;
1507
a7fc06dd
MR
1508 /*
1509 * Save additional host state that will be restored on VMEXIT (sev-es)
1510 * or subsequent vmload of host save area.
1511 */
63129754 1512 if (sev_es_guest(vcpu->kvm)) {
a7fc06dd 1513 sev_es_prepare_guest_switch(svm, vcpu->cpu);
86137773 1514 } else {
e79b91bb 1515 vmsave(__sme_page_pa(sd->save_area));
86137773 1516 }
fbc0db76 1517
f800650a 1518 if (tsc_scaling) {
ad721883
HZ
1519 u64 tsc_ratio = vcpu->arch.tsc_scaling_ratio;
1520 if (tsc_ratio != __this_cpu_read(current_tsc_ratio)) {
1521 __this_cpu_write(current_tsc_ratio, tsc_ratio);
1522 wrmsrl(MSR_AMD64_TSC_RATIO, tsc_ratio);
1523 }
fbc0db76 1524 }
a7fc06dd 1525
0caa0a77
SC
1526 if (likely(tsc_aux_uret_slot >= 0))
1527 kvm_set_user_return_msr(tsc_aux_uret_slot, svm->tsc_aux, -1ull);
8221c137 1528
a7fc06dd
MR
1529 svm->guest_state_loaded = true;
1530}
1531
1532static void svm_prepare_host_switch(struct kvm_vcpu *vcpu)
1533{
844d69c2 1534 to_svm(vcpu)->guest_state_loaded = false;
a7fc06dd
MR
1535}
1536
1537static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1538{
1539 struct vcpu_svm *svm = to_svm(vcpu);
1540 struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
1541
15d45071
AR
1542 if (sd->current_vmcb != svm->vmcb) {
1543 sd->current_vmcb = svm->vmcb;
1544 indirect_branch_prediction_barrier();
1545 }
bf5f6b9d
ML
1546 if (kvm_vcpu_apicv_active(vcpu))
1547 avic_vcpu_load(vcpu, cpu);
6aa8b732
AK
1548}
1549
1550static void svm_vcpu_put(struct kvm_vcpu *vcpu)
1551{
bf5f6b9d
ML
1552 if (kvm_vcpu_apicv_active(vcpu))
1553 avic_vcpu_put(vcpu);
1554
a7fc06dd 1555 svm_prepare_host_switch(vcpu);
8221c137 1556
e1beb1d3 1557 ++vcpu->stat.host_state_reload;
6aa8b732
AK
1558}
1559
6aa8b732
AK
1560static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
1561{
9b611747
LP
1562 struct vcpu_svm *svm = to_svm(vcpu);
1563 unsigned long rflags = svm->vmcb->save.rflags;
1564
1565 if (svm->nmi_singlestep) {
1566 /* Hide our flags if they were not set by the guest */
1567 if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
1568 rflags &= ~X86_EFLAGS_TF;
1569 if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF))
1570 rflags &= ~X86_EFLAGS_RF;
1571 }
1572 return rflags;
6aa8b732
AK
1573}
1574
1575static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
1576{
9b611747
LP
1577 if (to_svm(vcpu)->nmi_singlestep)
1578 rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
1579
ae9fedc7 1580 /*
bb3541f1 1581 * Any change of EFLAGS.VM is accompanied by a reload of SS
ae9fedc7
PB
1582 * (caused by either a task switch or an inter-privilege IRET),
1583 * so we do not need to update the CPL here.
1584 */
a2fa3e9f 1585 to_svm(vcpu)->vmcb->save.rflags = rflags;
6aa8b732
AK
1586}
1587
6de4f3ad
AK
1588static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
1589{
1590 switch (reg) {
1591 case VCPU_EXREG_PDPTR:
1592 BUG_ON(!npt_enabled);
9f8fe504 1593 load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
6de4f3ad
AK
1594 break;
1595 default:
67369273 1596 KVM_BUG_ON(1, vcpu->kvm);
6de4f3ad
AK
1597 }
1598}
1599
e14b7786 1600static void svm_set_vintr(struct vcpu_svm *svm)
64b5bd27
PB
1601{
1602 struct vmcb_control_area *control;
1603
f1577ab2
ML
1604 /*
1605 * The following fields are ignored when AVIC is enabled
1606 */
1607 WARN_ON(kvm_apicv_activated(svm->vcpu.kvm));
1608
a284ba56 1609 svm_set_intercept(svm, INTERCEPT_VINTR);
64b5bd27
PB
1610
1611 /*
1612 * This is just a dummy VINTR to actually cause a vmexit to happen.
1613 * Actual injection of virtual interrupts happens through EVENTINJ.
1614 */
1615 control = &svm->vmcb->control;
1616 control->int_vector = 0x0;
1617 control->int_ctl &= ~V_INTR_PRIO_MASK;
1618 control->int_ctl |= V_IRQ_MASK |
1619 ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
06e7852c 1620 vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
64b5bd27
PB
1621}
1622
f0b85051
AG
1623static void svm_clear_vintr(struct vcpu_svm *svm)
1624{
a284ba56 1625 svm_clr_intercept(svm, INTERCEPT_VINTR);
64b5bd27 1626
d8e4e58f 1627 /* Drop int_ctl fields related to VINTR injection. */
0f923e07 1628 svm->vmcb->control.int_ctl &= ~V_IRQ_INJECTION_BITS_MASK;
d8e4e58f 1629 if (is_guest_mode(&svm->vcpu)) {
0f923e07 1630 svm->vmcb01.ptr->control.int_ctl &= ~V_IRQ_INJECTION_BITS_MASK;
fb7333df 1631
d8e4e58f
PB
1632 WARN_ON((svm->vmcb->control.int_ctl & V_TPR_MASK) !=
1633 (svm->nested.ctl.int_ctl & V_TPR_MASK));
0f923e07
ML
1634
1635 svm->vmcb->control.int_ctl |= svm->nested.ctl.int_ctl &
1636 V_IRQ_INJECTION_BITS_MASK;
aee77e11
ML
1637
1638 svm->vmcb->control.int_vector = svm->nested.ctl.int_vector;
d8e4e58f
PB
1639 }
1640
06e7852c 1641 vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
f0b85051
AG
1642}
1643
6aa8b732
AK
1644static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
1645{
a2fa3e9f 1646 struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
cc3ed80a 1647 struct vmcb_save_area *save01 = &to_svm(vcpu)->vmcb01.ptr->save;
6aa8b732
AK
1648
1649 switch (seg) {
1650 case VCPU_SREG_CS: return &save->cs;
1651 case VCPU_SREG_DS: return &save->ds;
1652 case VCPU_SREG_ES: return &save->es;
cc3ed80a
ML
1653 case VCPU_SREG_FS: return &save01->fs;
1654 case VCPU_SREG_GS: return &save01->gs;
6aa8b732 1655 case VCPU_SREG_SS: return &save->ss;
cc3ed80a
ML
1656 case VCPU_SREG_TR: return &save01->tr;
1657 case VCPU_SREG_LDTR: return &save01->ldtr;
6aa8b732
AK
1658 }
1659 BUG();
8b6d44c7 1660 return NULL;
6aa8b732
AK
1661}
1662
1663static u64 svm_get_segment_base(struct kvm_vcpu *vcpu, int seg)
1664{
1665 struct vmcb_seg *s = svm_seg(vcpu, seg);
1666
1667 return s->base;
1668}
1669
1670static void svm_get_segment(struct kvm_vcpu *vcpu,
1671 struct kvm_segment *var, int seg)
1672{
1673 struct vmcb_seg *s = svm_seg(vcpu, seg);
1674
1675 var->base = s->base;
1676 var->limit = s->limit;
1677 var->selector = s->selector;
1678 var->type = s->attrib & SVM_SELECTOR_TYPE_MASK;
1679 var->s = (s->attrib >> SVM_SELECTOR_S_SHIFT) & 1;
1680 var->dpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3;
1681 var->present = (s->attrib >> SVM_SELECTOR_P_SHIFT) & 1;
1682 var->avl = (s->attrib >> SVM_SELECTOR_AVL_SHIFT) & 1;
1683 var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1;
1684 var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;
80112c89
JM
1685
1686 /*
1687 * AMD CPUs circa 2014 track the G bit for all segments except CS.
1688 * However, the SVM spec states that the G bit is not observed by the
1689 * CPU, and some VMware virtual CPUs drop the G bit for all segments.
1690 * So let's synthesize a legal G bit for all segments, this helps
1691 * running KVM nested. It also helps cross-vendor migration, because
1692 * Intel's vmentry has a check on the 'G' bit.
1693 */
1694 var->g = s->limit > 0xfffff;
25022acc 1695
e0231715
JR
1696 /*
1697 * AMD's VMCB does not have an explicit unusable field, so emulate it
19bca6ab
AP
1698 * for cross vendor migration purposes by "not present"
1699 */
8eae9570 1700 var->unusable = !var->present;
19bca6ab 1701
1fbdc7a5 1702 switch (seg) {
1fbdc7a5
AP
1703 case VCPU_SREG_TR:
1704 /*
1705 * Work around a bug where the busy flag in the tr selector
1706 * isn't exposed
1707 */
c0d09828 1708 var->type |= 0x2;
1fbdc7a5
AP
1709 break;
1710 case VCPU_SREG_DS:
1711 case VCPU_SREG_ES:
1712 case VCPU_SREG_FS:
1713 case VCPU_SREG_GS:
1714 /*
1715 * The accessed bit must always be set in the segment
1716 * descriptor cache, although it can be cleared in the
1717 * descriptor, the cached bit always remains at 1. Since
1718 * Intel has a check on this, set it here to support
1719 * cross-vendor migration.
1720 */
1721 if (!var->unusable)
1722 var->type |= 0x1;
1723 break;
b586eb02 1724 case VCPU_SREG_SS:
e0231715
JR
1725 /*
1726 * On AMD CPUs sometimes the DB bit in the segment
b586eb02
AP
1727 * descriptor is left as 1, although the whole segment has
1728 * been made unusable. Clear it here to pass an Intel VMX
1729 * entry check when cross vendor migrating.
1730 */
1731 if (var->unusable)
1732 var->db = 0;
d9c1b543 1733 /* This is symmetric with svm_set_segment() */
33b458d2 1734 var->dpl = to_svm(vcpu)->vmcb->save.cpl;
b586eb02 1735 break;
1fbdc7a5 1736 }
6aa8b732
AK
1737}
1738
2e4d2653
IE
1739static int svm_get_cpl(struct kvm_vcpu *vcpu)
1740{
1741 struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
1742
1743 return save->cpl;
1744}
1745
89a27f4d 1746static void svm_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
6aa8b732 1747{
a2fa3e9f
GH
1748 struct vcpu_svm *svm = to_svm(vcpu);
1749
89a27f4d
GN
1750 dt->size = svm->vmcb->save.idtr.limit;
1751 dt->address = svm->vmcb->save.idtr.base;
6aa8b732
AK
1752}
1753
89a27f4d 1754static void svm_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
6aa8b732 1755{
a2fa3e9f
GH
1756 struct vcpu_svm *svm = to_svm(vcpu);
1757
89a27f4d
GN
1758 svm->vmcb->save.idtr.limit = dt->size;
1759 svm->vmcb->save.idtr.base = dt->address ;
06e7852c 1760 vmcb_mark_dirty(svm->vmcb, VMCB_DT);
6aa8b732
AK
1761}
1762
89a27f4d 1763static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
6aa8b732 1764{
a2fa3e9f
GH
1765 struct vcpu_svm *svm = to_svm(vcpu);
1766
89a27f4d
GN
1767 dt->size = svm->vmcb->save.gdtr.limit;
1768 dt->address = svm->vmcb->save.gdtr.base;
6aa8b732
AK
1769}
1770
89a27f4d 1771static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
6aa8b732 1772{
a2fa3e9f
GH
1773 struct vcpu_svm *svm = to_svm(vcpu);
1774
89a27f4d
GN
1775 svm->vmcb->save.gdtr.limit = dt->size;
1776 svm->vmcb->save.gdtr.base = dt->address ;
06e7852c 1777 vmcb_mark_dirty(svm->vmcb, VMCB_DT);
6aa8b732
AK
1778}
1779
883b0a91 1780void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
6aa8b732 1781{
a2fa3e9f 1782 struct vcpu_svm *svm = to_svm(vcpu);
2a32a77c 1783 u64 hcr0 = cr0;
a2fa3e9f 1784
05b3e0c2 1785#ifdef CONFIG_X86_64
f1c6366e 1786 if (vcpu->arch.efer & EFER_LME && !vcpu->arch.guest_state_protected) {
707d92fa 1787 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
f6801dff 1788 vcpu->arch.efer |= EFER_LMA;
2b5203ee 1789 svm->vmcb->save.efer |= EFER_LMA | EFER_LME;
6aa8b732
AK
1790 }
1791
d77c26fc 1792 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) {
f6801dff 1793 vcpu->arch.efer &= ~EFER_LMA;
2b5203ee 1794 svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME);
6aa8b732
AK
1795 }
1796 }
1797#endif
ad312c7c 1798 vcpu->arch.cr0 = cr0;
888f9f3e
AK
1799
1800 if (!npt_enabled)
2a32a77c 1801 hcr0 |= X86_CR0_PG | X86_CR0_WP;
02daab21 1802
bcf166a9
PB
1803 /*
1804 * re-enable caching here because the QEMU bios
1805 * does not do it - this results in some delay at
1806 * reboot
1807 */
1808 if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
2a32a77c
PB
1809 hcr0 &= ~(X86_CR0_CD | X86_CR0_NW);
1810
1811 svm->vmcb->save.cr0 = hcr0;
06e7852c 1812 vmcb_mark_dirty(svm->vmcb, VMCB_CR);
2a32a77c
PB
1813
1814 /*
1815 * SEV-ES guests must always keep the CR intercepts cleared. CR
1816 * tracking is done using the CR write traps.
1817 */
63129754 1818 if (sev_es_guest(vcpu->kvm))
2a32a77c
PB
1819 return;
1820
1821 if (hcr0 == cr0) {
1822 /* Selective CR0 write remains on. */
1823 svm_clr_intercept(svm, INTERCEPT_CR0_READ);
1824 svm_clr_intercept(svm, INTERCEPT_CR0_WRITE);
1825 } else {
1826 svm_set_intercept(svm, INTERCEPT_CR0_READ);
1827 svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
1828 }
6aa8b732
AK
1829}
1830
c2fe3cd4
SC
1831static bool svm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1832{
1833 return true;
1834}
1835
1836void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
6aa8b732 1837{
1e02ce4c 1838 unsigned long host_cr4_mce = cr4_read_shadow() & X86_CR4_MCE;
dc924b06 1839 unsigned long old_cr4 = vcpu->arch.cr4;
e5eab0ce
JR
1840
1841 if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE))
f55ac304 1842 svm_flush_tlb(vcpu);
6394b649 1843
ec077263
JR
1844 vcpu->arch.cr4 = cr4;
1845 if (!npt_enabled)
1846 cr4 |= X86_CR4_PAE;
6394b649 1847 cr4 |= host_cr4_mce;
ec077263 1848 to_svm(vcpu)->vmcb->save.cr4 = cr4;
06e7852c 1849 vmcb_mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
2259c17f
JM
1850
1851 if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
1852 kvm_update_cpuid_runtime(vcpu);
6aa8b732
AK
1853}
1854
1855static void svm_set_segment(struct kvm_vcpu *vcpu,
1856 struct kvm_segment *var, int seg)
1857{
a2fa3e9f 1858 struct vcpu_svm *svm = to_svm(vcpu);
6aa8b732
AK
1859 struct vmcb_seg *s = svm_seg(vcpu, seg);
1860
1861 s->base = var->base;
1862 s->limit = var->limit;
1863 s->selector = var->selector;
d9c1b543
RP
1864 s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK);
1865 s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT;
1866 s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT;
1867 s->attrib |= ((var->present & 1) && !var->unusable) << SVM_SELECTOR_P_SHIFT;
1868 s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT;
1869 s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT;
1870 s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT;
1871 s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;
ae9fedc7
PB
1872
1873 /*
1874 * This is always accurate, except if SYSRET returned to a segment
1875 * with SS.DPL != 3. Intel does not have this quirk, and always
1876 * forces SS.DPL to 3 on sysret, so we ignore that case; fixing it
1877 * would entail passing the CPL to userspace and back.
1878 */
1879 if (seg == VCPU_SREG_SS)
d9c1b543
RP
1880 /* This is symmetric with svm_get_segment() */
1881 svm->vmcb->save.cpl = (var->dpl & 3);
6aa8b732 1882
06e7852c 1883 vmcb_mark_dirty(svm->vmcb, VMCB_SEG);
6aa8b732
AK
1884}
1885
b6a7cc35 1886static void svm_update_exception_bitmap(struct kvm_vcpu *vcpu)
6aa8b732 1887{
d0bfb940
JK
1888 struct vcpu_svm *svm = to_svm(vcpu);
1889
18c918c5 1890 clr_exception_intercept(svm, BP_VECTOR);
44c11430 1891
d0bfb940 1892 if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
d0bfb940 1893 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
18c918c5 1894 set_exception_intercept(svm, BP_VECTOR);
6986982f 1895 }
44c11430
GN
1896}
1897
0fe1e009 1898static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
6aa8b732 1899{
0fe1e009
TH
1900 if (sd->next_asid > sd->max_asid) {
1901 ++sd->asid_generation;
4faefff3 1902 sd->next_asid = sd->min_asid;
a2fa3e9f 1903 svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID;
7e8e6eed 1904 vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
6aa8b732
AK
1905 }
1906
193015ad 1907 svm->current_vmcb->asid_generation = sd->asid_generation;
7e8e6eed 1908 svm->asid = sd->next_asid++;
6aa8b732
AK
1909}
1910
d67668e9 1911static void svm_set_dr6(struct vcpu_svm *svm, unsigned long value)
73aaf249 1912{
d67668e9 1913 struct vmcb *vmcb = svm->vmcb;
73aaf249 1914
8d4846b9
TL
1915 if (svm->vcpu.arch.guest_state_protected)
1916 return;
1917
d67668e9
PB
1918 if (unlikely(value != vmcb->save.dr6)) {
1919 vmcb->save.dr6 = value;
06e7852c 1920 vmcb_mark_dirty(vmcb, VMCB_DR);
d67668e9 1921 }
73aaf249
JK
1922}
1923
facb0139
PB
1924static void svm_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
1925{
1926 struct vcpu_svm *svm = to_svm(vcpu);
1927
8d4846b9
TL
1928 if (vcpu->arch.guest_state_protected)
1929 return;
1930
facb0139
PB
1931 get_debugreg(vcpu->arch.db[0], 0);
1932 get_debugreg(vcpu->arch.db[1], 1);
1933 get_debugreg(vcpu->arch.db[2], 2);
1934 get_debugreg(vcpu->arch.db[3], 3);
d67668e9 1935 /*
9a3ecd5e 1936 * We cannot reset svm->vmcb->save.dr6 to DR6_ACTIVE_LOW here,
d67668e9
PB
1937 * because db_interception might need it. We can do it before vmentry.
1938 */
5679b803 1939 vcpu->arch.dr6 = svm->vmcb->save.dr6;
facb0139 1940 vcpu->arch.dr7 = svm->vmcb->save.dr7;
facb0139
PB
1941 vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
1942 set_dr_intercepts(svm);
1943}
1944
020df079 1945static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
6aa8b732 1946{
42dbaa5a 1947 struct vcpu_svm *svm = to_svm(vcpu);
42dbaa5a 1948
8d4846b9
TL
1949 if (vcpu->arch.guest_state_protected)
1950 return;
1951
020df079 1952 svm->vmcb->save.dr7 = value;
06e7852c 1953 vmcb_mark_dirty(svm->vmcb, VMCB_DR);
6aa8b732
AK
1954}
1955
63129754 1956static int pf_interception(struct kvm_vcpu *vcpu)
6aa8b732 1957{
63129754
PB
1958 struct vcpu_svm *svm = to_svm(vcpu);
1959
6d1b867d 1960 u64 fault_address = svm->vmcb->control.exit_info_2;
1261bfa3 1961 u64 error_code = svm->vmcb->control.exit_info_1;
6aa8b732 1962
63129754 1963 return kvm_handle_page_fault(vcpu, error_code, fault_address,
00b10fe1
BS
1964 static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
1965 svm->vmcb->control.insn_bytes : NULL,
d0006530
PB
1966 svm->vmcb->control.insn_len);
1967}
1968
63129754 1969static int npf_interception(struct kvm_vcpu *vcpu)
d0006530 1970{
63129754
PB
1971 struct vcpu_svm *svm = to_svm(vcpu);
1972
76ff371b 1973 u64 fault_address = svm->vmcb->control.exit_info_2;
d0006530
PB
1974 u64 error_code = svm->vmcb->control.exit_info_1;
1975
1976 trace_kvm_page_fault(fault_address, error_code);
63129754 1977 return kvm_mmu_page_fault(vcpu, fault_address, error_code,
00b10fe1
BS
1978 static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
1979 svm->vmcb->control.insn_bytes : NULL,
d0006530 1980 svm->vmcb->control.insn_len);
6aa8b732
AK
1981}
1982
63129754 1983static int db_interception(struct kvm_vcpu *vcpu)
d0bfb940 1984{
63129754
PB
1985 struct kvm_run *kvm_run = vcpu->run;
1986 struct vcpu_svm *svm = to_svm(vcpu);
851ba692 1987
63129754 1988 if (!(vcpu->guest_debug &
44c11430 1989 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) &&
6be7d306 1990 !svm->nmi_singlestep) {
9a3ecd5e 1991 u32 payload = svm->vmcb->save.dr6 ^ DR6_ACTIVE_LOW;
63129754 1992 kvm_queue_exception_p(vcpu, DB_VECTOR, payload);
d0bfb940
JK
1993 return 1;
1994 }
44c11430 1995
6be7d306 1996 if (svm->nmi_singlestep) {
4aebd0e9 1997 disable_nmi_singlestep(svm);
99c22179
VK
1998 /* Make sure we check for pending NMIs upon entry */
1999 kvm_make_request(KVM_REQ_EVENT, vcpu);
44c11430
GN
2000 }
2001
63129754 2002 if (vcpu->guest_debug &
e0231715 2003 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) {
44c11430 2004 kvm_run->exit_reason = KVM_EXIT_DEBUG;
dee919d1
PB
2005 kvm_run->debug.arch.dr6 = svm->vmcb->save.dr6;
2006 kvm_run->debug.arch.dr7 = svm->vmcb->save.dr7;
44c11430
GN
2007 kvm_run->debug.arch.pc =
2008 svm->vmcb->save.cs.base + svm->vmcb->save.rip;
2009 kvm_run->debug.arch.exception = DB_VECTOR;
2010 return 0;
2011 }
2012
2013 return 1;
d0bfb940
JK
2014}
2015
63129754 2016static int bp_interception(struct kvm_vcpu *vcpu)
d0bfb940 2017{
63129754
PB
2018 struct vcpu_svm *svm = to_svm(vcpu);
2019 struct kvm_run *kvm_run = vcpu->run;
851ba692 2020
d0bfb940
JK
2021 kvm_run->exit_reason = KVM_EXIT_DEBUG;
2022 kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip;
2023 kvm_run->debug.arch.exception = BP_VECTOR;
2024 return 0;
2025}
2026
63129754 2027static int ud_interception(struct kvm_vcpu *vcpu)
7aa81cc0 2028{
63129754 2029 return handle_ud(vcpu);
7aa81cc0
AL
2030}
2031
63129754 2032static int ac_interception(struct kvm_vcpu *vcpu)
54a20552 2033{
63129754 2034 kvm_queue_exception_e(vcpu, AC_VECTOR, 0);
54a20552
EN
2035 return 1;
2036}
2037
67ec6607
JR
2038static bool is_erratum_383(void)
2039{
2040 int err, i;
2041 u64 value;
2042
2043 if (!erratum_383_found)
2044 return false;
2045
2046 value = native_read_msr_safe(MSR_IA32_MC0_STATUS, &err);
2047 if (err)
2048 return false;
2049
2050 /* Bit 62 may or may not be set for this mce */
2051 value &= ~(1ULL << 62);
2052
2053 if (value != 0xb600000000010015ULL)
2054 return false;
2055
2056 /* Clear MCi_STATUS registers */
2057 for (i = 0; i < 6; ++i)
2058 native_write_msr_safe(MSR_IA32_MCx_STATUS(i), 0, 0);
2059
2060 value = native_read_msr_safe(MSR_IA32_MCG_STATUS, &err);
2061 if (!err) {
2062 u32 low, high;
2063
2064 value &= ~(1ULL << 2);
2065 low = lower_32_bits(value);
2066 high = upper_32_bits(value);
2067
2068 native_write_msr_safe(MSR_IA32_MCG_STATUS, low, high);
2069 }
2070
2071 /* Flush tlb to evict multi-match entries */
2072 __flush_tlb_all();
2073
2074 return true;
2075}
2076
63129754 2077static void svm_handle_mce(struct kvm_vcpu *vcpu)
53371b50 2078{
67ec6607
JR
2079 if (is_erratum_383()) {
2080 /*
2081 * Erratum 383 triggered. Guest state is corrupt so kill the
2082 * guest.
2083 */
2084 pr_err("KVM: Guest triggered AMD Erratum 383\n");
2085
63129754 2086 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
67ec6607
JR
2087
2088 return;
2089 }
2090
53371b50
JR
2091 /*
2092 * On an #MC intercept the MCE handler is not called automatically in
2093 * the host. So do it by hand here.
2094 */
1c164cb3 2095 kvm_machine_check();
fe5913e4
JR
2096}
2097
63129754 2098static int mc_interception(struct kvm_vcpu *vcpu)
fe5913e4 2099{
53371b50
JR
2100 return 1;
2101}
2102
63129754 2103static int shutdown_interception(struct kvm_vcpu *vcpu)
46fe4ddd 2104{
63129754
PB
2105 struct kvm_run *kvm_run = vcpu->run;
2106 struct vcpu_svm *svm = to_svm(vcpu);
851ba692 2107
8164a5ff
TL
2108 /*
2109 * The VM save area has already been encrypted so it
2110 * cannot be reinitialized - just terminate.
2111 */
63129754 2112 if (sev_es_guest(vcpu->kvm))
8164a5ff
TL
2113 return -EINVAL;
2114
46fe4ddd 2115 /*
265e4353
SC
2116 * VMCB is undefined after a SHUTDOWN intercept. INIT the vCPU to put
2117 * the VMCB in a known good state. Unfortuately, KVM doesn't have
2118 * KVM_MP_STATE_SHUTDOWN and can't add it without potentially breaking
2119 * userspace. At a platform view, INIT is acceptable behavior as
2120 * there exist bare metal platforms that automatically INIT the CPU
2121 * in response to shutdown.
46fe4ddd 2122 */
a2fa3e9f 2123 clear_page(svm->vmcb);
265e4353 2124 kvm_vcpu_reset(vcpu, true);
46fe4ddd
JR
2125
2126 kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
2127 return 0;
2128}
2129
63129754 2130static int io_interception(struct kvm_vcpu *vcpu)
6aa8b732 2131{
63129754 2132 struct vcpu_svm *svm = to_svm(vcpu);
d77c26fc 2133 u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
dca7f128 2134 int size, in, string;
039576c0 2135 unsigned port;
6aa8b732 2136
63129754 2137 ++vcpu->stat.io_exits;
e70669ab 2138 string = (io_info & SVM_IOIO_STR_MASK) != 0;
039576c0
AK
2139 in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
2140 port = io_info >> 16;
2141 size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
7ed9abfe
TL
2142
2143 if (string) {
2144 if (sev_es_guest(vcpu->kvm))
2145 return sev_es_string_io(svm, size, port, in);
2146 else
2147 return kvm_emulate_instruction(vcpu, 0);
2148 }
2149
cf8f70bf 2150 svm->next_rip = svm->vmcb->control.exit_info_2;
cf8f70bf 2151
63129754 2152 return kvm_fast_pio(vcpu, size, port, in);
c47f098d
JR
2153}
2154
63129754 2155static int nmi_interception(struct kvm_vcpu *vcpu)
a0698055 2156{
a0698055
JR
2157 return 1;
2158}
2159
991afbbe
ML
2160static int smi_interception(struct kvm_vcpu *vcpu)
2161{
2162 return 1;
2163}
2164
63129754 2165static int intr_interception(struct kvm_vcpu *vcpu)
6aa8b732 2166{
63129754 2167 ++vcpu->stat.irq_exits;
6aa8b732
AK
2168 return 1;
2169}
2170
2ac636a6 2171static int vmload_vmsave_interception(struct kvm_vcpu *vcpu, bool vmload)
6aa8b732 2172{
63129754 2173 struct vcpu_svm *svm = to_svm(vcpu);
9e8f0fbf 2174 struct vmcb *vmcb12;
8c5fbf1a 2175 struct kvm_host_map map;
b742c1e6 2176 int ret;
9966bf68 2177
63129754 2178 if (nested_svm_check_permissions(vcpu))
5542675b
AG
2179 return 1;
2180
63129754 2181 ret = kvm_vcpu_map(vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
8c5fbf1a
KA
2182 if (ret) {
2183 if (ret == -EINVAL)
63129754 2184 kvm_inject_gp(vcpu, 0);
9966bf68 2185 return 1;
8c5fbf1a
KA
2186 }
2187
9e8f0fbf 2188 vmcb12 = map.hva;
9966bf68 2189
63129754 2190 ret = kvm_skip_emulated_instruction(vcpu);
9966bf68 2191
adc2a237 2192 if (vmload) {
2bb16bea 2193 svm_copy_vmloadsave_state(svm->vmcb, vmcb12);
adc2a237
ML
2194 svm->sysenter_eip_hi = 0;
2195 svm->sysenter_esp_hi = 0;
9a9e7481 2196 } else {
2bb16bea 2197 svm_copy_vmloadsave_state(vmcb12, svm->vmcb);
9a9e7481 2198 }
e3e9ed3d 2199
63129754 2200 kvm_vcpu_unmap(vcpu, &map, true);
5542675b 2201
b742c1e6 2202 return ret;
5542675b
AG
2203}
2204
2ac636a6 2205static int vmload_interception(struct kvm_vcpu *vcpu)
5542675b 2206{
2ac636a6
SC
2207 return vmload_vmsave_interception(vcpu, true);
2208}
5542675b 2209
2ac636a6
SC
2210static int vmsave_interception(struct kvm_vcpu *vcpu)
2211{
2212 return vmload_vmsave_interception(vcpu, false);
5542675b
AG
2213}
2214
63129754 2215static int vmrun_interception(struct kvm_vcpu *vcpu)
3d6368ef 2216{
63129754 2217 if (nested_svm_check_permissions(vcpu))
3d6368ef
AG
2218 return 1;
2219
63129754 2220 return nested_svm_vmrun(vcpu);
3d6368ef
AG
2221}
2222
82a11e9c
BD
2223enum {
2224 NONE_SVM_INSTR,
2225 SVM_INSTR_VMRUN,
2226 SVM_INSTR_VMLOAD,
2227 SVM_INSTR_VMSAVE,
2228};
2229
2230/* Return NONE_SVM_INSTR if not SVM instrs, otherwise return decode result */
2231static int svm_instr_opcode(struct kvm_vcpu *vcpu)
2232{
2233 struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
2234
2235 if (ctxt->b != 0x1 || ctxt->opcode_len != 2)
2236 return NONE_SVM_INSTR;
2237
2238 switch (ctxt->modrm) {
2239 case 0xd8: /* VMRUN */
2240 return SVM_INSTR_VMRUN;
2241 case 0xda: /* VMLOAD */
2242 return SVM_INSTR_VMLOAD;
2243 case 0xdb: /* VMSAVE */
2244 return SVM_INSTR_VMSAVE;
2245 default:
2246 break;
2247 }
2248
2249 return NONE_SVM_INSTR;
2250}
2251
2252static int emulate_svm_instr(struct kvm_vcpu *vcpu, int opcode)
2253{
14c2bf81
WH
2254 const int guest_mode_exit_codes[] = {
2255 [SVM_INSTR_VMRUN] = SVM_EXIT_VMRUN,
2256 [SVM_INSTR_VMLOAD] = SVM_EXIT_VMLOAD,
2257 [SVM_INSTR_VMSAVE] = SVM_EXIT_VMSAVE,
2258 };
63129754 2259 int (*const svm_instr_handlers[])(struct kvm_vcpu *vcpu) = {
82a11e9c
BD
2260 [SVM_INSTR_VMRUN] = vmrun_interception,
2261 [SVM_INSTR_VMLOAD] = vmload_interception,
2262 [SVM_INSTR_VMSAVE] = vmsave_interception,
2263 };
2264 struct vcpu_svm *svm = to_svm(vcpu);
2df8d380 2265 int ret;
82a11e9c 2266
14c2bf81 2267 if (is_guest_mode(vcpu)) {
2df8d380 2268 /* Returns '1' or -errno on failure, '0' on success. */
3a87c7e0 2269 ret = nested_svm_simple_vmexit(svm, guest_mode_exit_codes[opcode]);
2df8d380
SC
2270 if (ret)
2271 return ret;
2272 return 1;
2273 }
63129754 2274 return svm_instr_handlers[opcode](vcpu);
82a11e9c
BD
2275}
2276
2277/*
2278 * #GP handling code. Note that #GP can be triggered under the following two
2279 * cases:
2280 * 1) SVM VM-related instructions (VMRUN/VMSAVE/VMLOAD) that trigger #GP on
2281 * some AMD CPUs when EAX of these instructions are in the reserved memory
2282 * regions (e.g. SMM memory on host).
2283 * 2) VMware backdoor
2284 */
63129754 2285static int gp_interception(struct kvm_vcpu *vcpu)
82a11e9c 2286{
63129754 2287 struct vcpu_svm *svm = to_svm(vcpu);
82a11e9c
BD
2288 u32 error_code = svm->vmcb->control.exit_info_1;
2289 int opcode;
2290
2291 /* Both #GP cases have zero error_code */
2292 if (error_code)
2293 goto reinject;
2294
d1cba6c9
ML
2295 /* All SVM instructions expect page aligned RAX */
2296 if (svm->vmcb->save.rax & ~PAGE_MASK)
2297 goto reinject;
2298
82a11e9c
BD
2299 /* Decode the instruction for usage later */
2300 if (x86_decode_emulated_instruction(vcpu, 0, NULL, 0) != EMULATION_OK)
2301 goto reinject;
2302
2303 opcode = svm_instr_opcode(vcpu);
2304
2305 if (opcode == NONE_SVM_INSTR) {
2306 if (!enable_vmware_backdoor)
2307 goto reinject;
2308
2309 /*
2310 * VMware backdoor emulation on #GP interception only handles
2311 * IN{S}, OUT{S}, and RDPMC.
2312 */
14c2bf81
WH
2313 if (!is_guest_mode(vcpu))
2314 return kvm_emulate_instruction(vcpu,
82a11e9c
BD
2315 EMULTYPE_VMWARE_GP | EMULTYPE_NO_DECODE);
2316 } else
2317 return emulate_svm_instr(vcpu, opcode);
2318
2319reinject:
2320 kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
2321 return 1;
2322}
2323
ffdf7f9e
PB
2324void svm_set_gif(struct vcpu_svm *svm, bool value)
2325{
2326 if (value) {
2327 /*
2328 * If VGIF is enabled, the STGI intercept is only added to
2329 * detect the opening of the SMI/NMI window; remove it now.
2330 * Likewise, clear the VINTR intercept, we will set it
2331 * again while processing KVM_REQ_EVENT if needed.
2332 */
2333 if (vgif_enabled(svm))
a284ba56
JR
2334 svm_clr_intercept(svm, INTERCEPT_STGI);
2335 if (svm_is_intercept(svm, INTERCEPT_VINTR))
ffdf7f9e
PB
2336 svm_clear_vintr(svm);
2337
2338 enable_gif(svm);
2339 if (svm->vcpu.arch.smi_pending ||
2340 svm->vcpu.arch.nmi_pending ||
2341 kvm_cpu_has_injectable_intr(&svm->vcpu))
2342 kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
2343 } else {
2344 disable_gif(svm);
2345
2346 /*
2347 * After a CLGI no interrupts should come. But if vGIF is
2348 * in use, we still rely on the VINTR intercept (rather than
2349 * STGI) to detect an open interrupt window.
2350 */
2351 if (!vgif_enabled(svm))
2352 svm_clear_vintr(svm);
2353 }
2354}
2355
63129754 2356static int stgi_interception(struct kvm_vcpu *vcpu)
1371d904 2357{
b742c1e6
LP
2358 int ret;
2359
63129754 2360 if (nested_svm_check_permissions(vcpu))
1371d904
AG
2361 return 1;
2362
63129754
PB
2363 ret = kvm_skip_emulated_instruction(vcpu);
2364 svm_set_gif(to_svm(vcpu), true);
b742c1e6 2365 return ret;
1371d904
AG
2366}
2367
63129754 2368static int clgi_interception(struct kvm_vcpu *vcpu)
1371d904 2369{
b742c1e6
LP
2370 int ret;
2371
63129754 2372 if (nested_svm_check_permissions(vcpu))
1371d904
AG
2373 return 1;
2374
63129754
PB
2375 ret = kvm_skip_emulated_instruction(vcpu);
2376 svm_set_gif(to_svm(vcpu), false);
b742c1e6 2377 return ret;
1371d904
AG
2378}
2379
63129754 2380static int invlpga_interception(struct kvm_vcpu *vcpu)
ff092385 2381{
bc9eff67
SC
2382 gva_t gva = kvm_rax_read(vcpu);
2383 u32 asid = kvm_rcx_read(vcpu);
ff092385 2384
bc9eff67
SC
2385 /* FIXME: Handle an address size prefix. */
2386 if (!is_long_mode(vcpu))
2387 gva = (u32)gva;
ff092385 2388
bc9eff67 2389 trace_kvm_invlpga(to_svm(vcpu)->vmcb->save.rip, asid, gva);
532a46b9 2390
ff092385 2391 /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */
bc9eff67 2392 kvm_mmu_invlpg(vcpu, gva);
532a46b9 2393
63129754 2394 return kvm_skip_emulated_instruction(vcpu);
dab429a7
DK
2395}
2396
63129754 2397static int skinit_interception(struct kvm_vcpu *vcpu)
81dd35d4 2398{
63129754 2399 trace_kvm_skinit(to_svm(vcpu)->vmcb->save.rip, kvm_rax_read(vcpu));
81dd35d4 2400
63129754 2401 kvm_queue_exception(vcpu, UD_VECTOR);
0cb8410b
JM
2402 return 1;
2403}
2404
63129754 2405static int task_switch_interception(struct kvm_vcpu *vcpu)
6aa8b732 2406{
63129754 2407 struct vcpu_svm *svm = to_svm(vcpu);
37817f29 2408 u16 tss_selector;
64a7ec06
GN
2409 int reason;
2410 int int_type = svm->vmcb->control.exit_int_info &
2411 SVM_EXITINTINFO_TYPE_MASK;
8317c298 2412 int int_vec = svm->vmcb->control.exit_int_info & SVM_EVTINJ_VEC_MASK;
fe8e7f83
GN
2413 uint32_t type =
2414 svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK;
2415 uint32_t idt_v =
2416 svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID;
e269fb21
JK
2417 bool has_error_code = false;
2418 u32 error_code = 0;
37817f29
IE
2419
2420 tss_selector = (u16)svm->vmcb->control.exit_info_1;
64a7ec06 2421
37817f29
IE
2422 if (svm->vmcb->control.exit_info_2 &
2423 (1ULL << SVM_EXITINFOSHIFT_TS_REASON_IRET))
64a7ec06
GN
2424 reason = TASK_SWITCH_IRET;
2425 else if (svm->vmcb->control.exit_info_2 &
2426 (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP))
2427 reason = TASK_SWITCH_JMP;
fe8e7f83 2428 else if (idt_v)
64a7ec06
GN
2429 reason = TASK_SWITCH_GATE;
2430 else
2431 reason = TASK_SWITCH_CALL;
2432
fe8e7f83
GN
2433 if (reason == TASK_SWITCH_GATE) {
2434 switch (type) {
2435 case SVM_EXITINTINFO_TYPE_NMI:
63129754 2436 vcpu->arch.nmi_injected = false;
fe8e7f83
GN
2437 break;
2438 case SVM_EXITINTINFO_TYPE_EXEPT:
e269fb21
JK
2439 if (svm->vmcb->control.exit_info_2 &
2440 (1ULL << SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE)) {
2441 has_error_code = true;
2442 error_code =
2443 (u32)svm->vmcb->control.exit_info_2;
2444 }
63129754 2445 kvm_clear_exception_queue(vcpu);
fe8e7f83
GN
2446 break;
2447 case SVM_EXITINTINFO_TYPE_INTR:
63129754 2448 kvm_clear_interrupt_queue(vcpu);
fe8e7f83
GN
2449 break;
2450 default:
2451 break;
2452 }
2453 }
64a7ec06 2454
8317c298
GN
2455 if (reason != TASK_SWITCH_GATE ||
2456 int_type == SVM_EXITINTINFO_TYPE_SOFT ||
2457 (int_type == SVM_EXITINTINFO_TYPE_EXEPT &&
f8ea7c60 2458 (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) {
63129754 2459 if (!skip_emulated_instruction(vcpu))
738fece4 2460 return 0;
f8ea7c60 2461 }
64a7ec06 2462
7f3d35fd
KW
2463 if (int_type != SVM_EXITINTINFO_TYPE_SOFT)
2464 int_vec = -1;
2465
63129754 2466 return kvm_task_switch(vcpu, tss_selector, int_vec, reason,
60fc3d02 2467 has_error_code, error_code);
6aa8b732
AK
2468}
2469
63129754 2470static int iret_interception(struct kvm_vcpu *vcpu)
6aa8b732 2471{
63129754 2472 struct vcpu_svm *svm = to_svm(vcpu);
6aa8b732 2473
63129754
PB
2474 ++vcpu->stat.nmi_window_exits;
2475 vcpu->arch.hflags |= HF_IRET_MASK;
2476 if (!sev_es_guest(vcpu->kvm)) {
4444dfe4 2477 svm_clr_intercept(svm, INTERCEPT_IRET);
63129754 2478 svm->nmi_iret_rip = kvm_rip_read(vcpu);
4444dfe4 2479 }
63129754 2480 kvm_make_request(KVM_REQ_EVENT, vcpu);
95ba8273
GN
2481 return 1;
2482}
2483
63129754 2484static int invlpg_interception(struct kvm_vcpu *vcpu)
a7052897 2485{
df4f3108 2486 if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
63129754 2487 return kvm_emulate_instruction(vcpu, 0);
df4f3108 2488
63129754
PB
2489 kvm_mmu_invlpg(vcpu, to_svm(vcpu)->vmcb->control.exit_info_1);
2490 return kvm_skip_emulated_instruction(vcpu);
a7052897
MT
2491}
2492
63129754 2493static int emulate_on_interception(struct kvm_vcpu *vcpu)
6aa8b732 2494{
63129754 2495 return kvm_emulate_instruction(vcpu, 0);
6aa8b732
AK
2496}
2497
63129754 2498static int rsm_interception(struct kvm_vcpu *vcpu)
7607b717 2499{
63129754 2500 return kvm_emulate_instruction_from_buffer(vcpu, rsm_ins_bytes, 2);
7607b717
BS
2501}
2502
63129754 2503static bool check_selective_cr0_intercepted(struct kvm_vcpu *vcpu,
52eb5a6d 2504 unsigned long val)
628afd2a 2505{
63129754
PB
2506 struct vcpu_svm *svm = to_svm(vcpu);
2507 unsigned long cr0 = vcpu->arch.cr0;
628afd2a 2508 bool ret = false;
628afd2a 2509
63129754 2510 if (!is_guest_mode(vcpu) ||
c62e2e94 2511 (!(vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_SELECTIVE_CR0))))
628afd2a
JR
2512 return false;
2513
2514 cr0 &= ~SVM_CR0_SELECTIVE_MASK;
2515 val &= ~SVM_CR0_SELECTIVE_MASK;
2516
2517 if (cr0 ^ val) {
2518 svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE;
2519 ret = (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE);
2520 }
2521
2522 return ret;
2523}
2524
7ff76d58
AP
2525#define CR_VALID (1ULL << 63)
2526
63129754 2527static int cr_interception(struct kvm_vcpu *vcpu)
7ff76d58 2528{
63129754 2529 struct vcpu_svm *svm = to_svm(vcpu);
7ff76d58
AP
2530 int reg, cr;
2531 unsigned long val;
2532 int err;
2533
2534 if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
63129754 2535 return emulate_on_interception(vcpu);
7ff76d58
AP
2536
2537 if (unlikely((svm->vmcb->control.exit_info_1 & CR_VALID) == 0))
63129754 2538 return emulate_on_interception(vcpu);
7ff76d58
AP
2539
2540 reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
5e57518d
DK
2541 if (svm->vmcb->control.exit_code == SVM_EXIT_CR0_SEL_WRITE)
2542 cr = SVM_EXIT_WRITE_CR0 - SVM_EXIT_READ_CR0;
2543 else
2544 cr = svm->vmcb->control.exit_code - SVM_EXIT_READ_CR0;
7ff76d58
AP
2545
2546 err = 0;
2547 if (cr >= 16) { /* mov to cr */
2548 cr -= 16;
27b4a9c4 2549 val = kvm_register_read(vcpu, reg);
95b28ac9 2550 trace_kvm_cr_write(cr, val);
7ff76d58
AP
2551 switch (cr) {
2552 case 0:
63129754
PB
2553 if (!check_selective_cr0_intercepted(vcpu, val))
2554 err = kvm_set_cr0(vcpu, val);
977b2d03
JR
2555 else
2556 return 1;
2557
7ff76d58
AP
2558 break;
2559 case 3:
63129754 2560 err = kvm_set_cr3(vcpu, val);
7ff76d58
AP
2561 break;
2562 case 4:
63129754 2563 err = kvm_set_cr4(vcpu, val);
7ff76d58
AP
2564 break;
2565 case 8:
63129754 2566 err = kvm_set_cr8(vcpu, val);
7ff76d58
AP
2567 break;
2568 default:
2569 WARN(1, "unhandled write to CR%d", cr);
63129754 2570 kvm_queue_exception(vcpu, UD_VECTOR);
7ff76d58
AP
2571 return 1;
2572 }
2573 } else { /* mov from cr */
2574 switch (cr) {
2575 case 0:
63129754 2576 val = kvm_read_cr0(vcpu);
7ff76d58
AP
2577 break;
2578 case 2:
63129754 2579 val = vcpu->arch.cr2;
7ff76d58
AP
2580 break;
2581 case 3:
63129754 2582 val = kvm_read_cr3(vcpu);
7ff76d58
AP
2583 break;
2584 case 4:
63129754 2585 val = kvm_read_cr4(vcpu);
7ff76d58
AP
2586 break;
2587 case 8:
63129754 2588 val = kvm_get_cr8(vcpu);
7ff76d58
AP
2589 break;
2590 default:
2591 WARN(1, "unhandled read from CR%d", cr);
63129754 2592 kvm_queue_exception(vcpu, UD_VECTOR);
7ff76d58
AP
2593 return 1;
2594 }
27b4a9c4 2595 kvm_register_write(vcpu, reg, val);
95b28ac9 2596 trace_kvm_cr_read(cr, val);
7ff76d58 2597 }
63129754 2598 return kvm_complete_insn_gp(vcpu, err);
7ff76d58
AP
2599}
2600
63129754 2601static int cr_trap(struct kvm_vcpu *vcpu)
f27ad38a 2602{
63129754 2603 struct vcpu_svm *svm = to_svm(vcpu);
f27ad38a
TL
2604 unsigned long old_value, new_value;
2605 unsigned int cr;
d1949b93 2606 int ret = 0;
f27ad38a
TL
2607
2608 new_value = (unsigned long)svm->vmcb->control.exit_info_1;
2609
2610 cr = svm->vmcb->control.exit_code - SVM_EXIT_CR0_WRITE_TRAP;
2611 switch (cr) {
2612 case 0:
2613 old_value = kvm_read_cr0(vcpu);
2614 svm_set_cr0(vcpu, new_value);
2615
2616 kvm_post_set_cr0(vcpu, old_value, new_value);
2617 break;
5b51cb13
TL
2618 case 4:
2619 old_value = kvm_read_cr4(vcpu);
2620 svm_set_cr4(vcpu, new_value);
2621
2622 kvm_post_set_cr4(vcpu, old_value, new_value);
2623 break;
d1949b93 2624 case 8:
63129754 2625 ret = kvm_set_cr8(vcpu, new_value);
d1949b93 2626 break;
f27ad38a
TL
2627 default:
2628 WARN(1, "unhandled CR%d write trap", cr);
2629 kvm_queue_exception(vcpu, UD_VECTOR);
2630 return 1;
2631 }
2632
d1949b93 2633 return kvm_complete_insn_gp(vcpu, ret);
f27ad38a
TL
2634}
2635
63129754 2636static int dr_interception(struct kvm_vcpu *vcpu)
cae3797a 2637{
63129754 2638 struct vcpu_svm *svm = to_svm(vcpu);
cae3797a
AP
2639 int reg, dr;
2640 unsigned long val;
996ff542 2641 int err = 0;
cae3797a 2642
63129754 2643 if (vcpu->guest_debug == 0) {
facb0139
PB
2644 /*
2645 * No more DR vmexits; force a reload of the debug registers
2646 * and reenter on this instruction. The next vmexit will
2647 * retrieve the full state of the debug registers.
2648 */
2649 clr_dr_intercepts(svm);
63129754 2650 vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
facb0139
PB
2651 return 1;
2652 }
2653
cae3797a 2654 if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS))
63129754 2655 return emulate_on_interception(vcpu);
cae3797a
AP
2656
2657 reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
2658 dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0;
996ff542
PB
2659 if (dr >= 16) { /* mov to DRn */
2660 dr -= 16;
27b4a9c4 2661 val = kvm_register_read(vcpu, reg);
63129754 2662 err = kvm_set_dr(vcpu, dr, val);
cae3797a 2663 } else {
63129754 2664 kvm_get_dr(vcpu, dr, &val);
27b4a9c4 2665 kvm_register_write(vcpu, reg, val);
cae3797a
AP
2666 }
2667
63129754 2668 return kvm_complete_insn_gp(vcpu, err);
cae3797a
AP
2669}
2670
63129754 2671static int cr8_write_interception(struct kvm_vcpu *vcpu)
1d075434 2672{
eea1cff9 2673 int r;
851ba692 2674
63129754 2675 u8 cr8_prev = kvm_get_cr8(vcpu);
0a5fff19 2676 /* instruction emulation calls kvm_set_cr8() */
63129754
PB
2677 r = cr_interception(vcpu);
2678 if (lapic_in_kernel(vcpu))
7ff76d58 2679 return r;
63129754 2680 if (cr8_prev <= kvm_get_cr8(vcpu))
7ff76d58 2681 return r;
63129754 2682 vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
1d075434
JR
2683 return 0;
2684}
2685
63129754 2686static int efer_trap(struct kvm_vcpu *vcpu)
2985afbc
TL
2687{
2688 struct msr_data msr_info;
2689 int ret;
2690
2691 /*
2692 * Clear the EFER_SVME bit from EFER. The SVM code always sets this
2693 * bit in svm_set_efer(), but __kvm_valid_efer() checks it against
2694 * whether the guest has X86_FEATURE_SVM - this avoids a failure if
2695 * the guest doesn't have X86_FEATURE_SVM.
2696 */
2697 msr_info.host_initiated = false;
2698 msr_info.index = MSR_EFER;
63129754
PB
2699 msr_info.data = to_svm(vcpu)->vmcb->control.exit_info_1 & ~EFER_SVME;
2700 ret = kvm_set_msr_common(vcpu, &msr_info);
2985afbc 2701
63129754 2702 return kvm_complete_insn_gp(vcpu, ret);
2985afbc
TL
2703}
2704
801e459a
TL
2705static int svm_get_msr_feature(struct kvm_msr_entry *msr)
2706{
d1d93fa9
TL
2707 msr->data = 0;
2708
2709 switch (msr->index) {
2710 case MSR_F10H_DECFG:
2711 if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC))
2712 msr->data |= MSR_F10H_DECFG_LFENCE_SERIALIZE;
2713 break;
d574c539
VK
2714 case MSR_IA32_PERF_CAPABILITIES:
2715 return 0;
d1d93fa9 2716 default:
12bc2132 2717 return KVM_MSR_RET_INVALID;
d1d93fa9
TL
2718 }
2719
2720 return 0;
801e459a
TL
2721}
2722
609e36d3 2723static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
6aa8b732 2724{
a2fa3e9f
GH
2725 struct vcpu_svm *svm = to_svm(vcpu);
2726
609e36d3 2727 switch (msr_info->index) {
5228eb96
ML
2728 case MSR_AMD64_TSC_RATIO:
2729 if (!msr_info->host_initiated && !svm->tsc_scaling_enabled)
2730 return 1;
2731 msr_info->data = svm->tsc_ratio_msr;
2732 break;
8c06585d 2733 case MSR_STAR:
cc3ed80a 2734 msr_info->data = svm->vmcb01.ptr->save.star;
6aa8b732 2735 break;
0e859cac 2736#ifdef CONFIG_X86_64
6aa8b732 2737 case MSR_LSTAR:
cc3ed80a 2738 msr_info->data = svm->vmcb01.ptr->save.lstar;
6aa8b732
AK
2739 break;
2740 case MSR_CSTAR:
cc3ed80a 2741 msr_info->data = svm->vmcb01.ptr->save.cstar;
6aa8b732
AK
2742 break;
2743 case MSR_KERNEL_GS_BASE:
cc3ed80a 2744 msr_info->data = svm->vmcb01.ptr->save.kernel_gs_base;
6aa8b732
AK
2745 break;
2746 case MSR_SYSCALL_MASK:
cc3ed80a 2747 msr_info->data = svm->vmcb01.ptr->save.sfmask;
6aa8b732
AK
2748 break;
2749#endif
2750 case MSR_IA32_SYSENTER_CS:
cc3ed80a 2751 msr_info->data = svm->vmcb01.ptr->save.sysenter_cs;
6aa8b732
AK
2752 break;
2753 case MSR_IA32_SYSENTER_EIP:
adc2a237
ML
2754 msr_info->data = (u32)svm->vmcb01.ptr->save.sysenter_eip;
2755 if (guest_cpuid_is_intel(vcpu))
2756 msr_info->data |= (u64)svm->sysenter_eip_hi << 32;
6aa8b732
AK
2757 break;
2758 case MSR_IA32_SYSENTER_ESP:
adc2a237
ML
2759 msr_info->data = svm->vmcb01.ptr->save.sysenter_esp;
2760 if (guest_cpuid_is_intel(vcpu))
2761 msr_info->data |= (u64)svm->sysenter_esp_hi << 32;
6aa8b732 2762 break;
46896c73 2763 case MSR_TSC_AUX:
46896c73
PB
2764 msr_info->data = svm->tsc_aux;
2765 break;
e0231715
JR
2766 /*
2767 * Nobody will change the following 5 values in the VMCB so we can
2768 * safely return them on rdmsr. They will always be 0 until LBRV is
2769 * implemented.
2770 */
a2938c80 2771 case MSR_IA32_DEBUGCTLMSR:
609e36d3 2772 msr_info->data = svm->vmcb->save.dbgctl;
a2938c80
JR
2773 break;
2774 case MSR_IA32_LASTBRANCHFROMIP:
609e36d3 2775 msr_info->data = svm->vmcb->save.br_from;
a2938c80
JR
2776 break;
2777 case MSR_IA32_LASTBRANCHTOIP:
609e36d3 2778 msr_info->data = svm->vmcb->save.br_to;
a2938c80
JR
2779 break;
2780 case MSR_IA32_LASTINTFROMIP:
609e36d3 2781 msr_info->data = svm->vmcb->save.last_excp_from;
a2938c80
JR
2782 break;
2783 case MSR_IA32_LASTINTTOIP:
609e36d3 2784 msr_info->data = svm->vmcb->save.last_excp_to;
a2938c80 2785 break;
b286d5d8 2786 case MSR_VM_HSAVE_PA:
609e36d3 2787 msr_info->data = svm->nested.hsave_msr;
b286d5d8 2788 break;
eb6f302e 2789 case MSR_VM_CR:
609e36d3 2790 msr_info->data = svm->nested.vm_cr_msr;
eb6f302e 2791 break;
b2ac58f9
KA
2792 case MSR_IA32_SPEC_CTRL:
2793 if (!msr_info->host_initiated &&
39485ed9 2794 !guest_has_spec_ctrl_msr(vcpu))
b2ac58f9
KA
2795 return 1;
2796
d00b99c5
BM
2797 if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
2798 msr_info->data = svm->vmcb->save.spec_ctrl;
2799 else
2800 msr_info->data = svm->spec_ctrl;
b2ac58f9 2801 break;
bc226f07
TL
2802 case MSR_AMD64_VIRT_SPEC_CTRL:
2803 if (!msr_info->host_initiated &&
2804 !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD))
2805 return 1;
2806
2807 msr_info->data = svm->virt_spec_ctrl;
2808 break;
ae8b7875
BP
2809 case MSR_F15H_IC_CFG: {
2810
2811 int family, model;
2812
2813 family = guest_cpuid_family(vcpu);
2814 model = guest_cpuid_model(vcpu);
2815
2816 if (family < 0 || model < 0)
2817 return kvm_get_msr_common(vcpu, msr_info);
2818
2819 msr_info->data = 0;
2820
2821 if (family == 0x15 &&
2822 (model >= 0x2 && model < 0x20))
2823 msr_info->data = 0x1E;
2824 }
2825 break;
d1d93fa9
TL
2826 case MSR_F10H_DECFG:
2827 msr_info->data = svm->msr_decfg;
2828 break;
6aa8b732 2829 default:
609e36d3 2830 return kvm_get_msr_common(vcpu, msr_info);
6aa8b732
AK
2831 }
2832 return 0;
2833}
2834
f1c6366e
TL
2835static int svm_complete_emulated_msr(struct kvm_vcpu *vcpu, int err)
2836{
2837 struct vcpu_svm *svm = to_svm(vcpu);
b67a4cc3 2838 if (!err || !sev_es_guest(vcpu->kvm) || WARN_ON_ONCE(!svm->sev_es.ghcb))
63129754 2839 return kvm_complete_insn_gp(vcpu, err);
f1c6366e 2840
b67a4cc3
PG
2841 ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 1);
2842 ghcb_set_sw_exit_info_2(svm->sev_es.ghcb,
f1c6366e
TL
2843 X86_TRAP_GP |
2844 SVM_EVTINJ_TYPE_EXEPT |
2845 SVM_EVTINJ_VALID);
2846 return 1;
2847}
2848
4a810181
JR
2849static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data)
2850{
2851 struct vcpu_svm *svm = to_svm(vcpu);
2852 int svm_dis, chg_mask;
2853
2854 if (data & ~SVM_VM_CR_VALID_MASK)
2855 return 1;
2856
2857 chg_mask = SVM_VM_CR_VALID_MASK;
2858
2859 if (svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK)
2860 chg_mask &= ~(SVM_VM_CR_SVM_LOCK_MASK | SVM_VM_CR_SVM_DIS_MASK);
2861
2862 svm->nested.vm_cr_msr &= ~chg_mask;
2863 svm->nested.vm_cr_msr |= (data & chg_mask);
2864
2865 svm_dis = svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK;
2866
2867 /* check for svm_disable while efer.svme is set */
2868 if (svm_dis && (vcpu->arch.efer & EFER_SVME))
2869 return 1;
2870
2871 return 0;
2872}
2873
8fe8ab46 2874static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
6aa8b732 2875{
a2fa3e9f 2876 struct vcpu_svm *svm = to_svm(vcpu);
844d69c2 2877 int r;
a2fa3e9f 2878
8fe8ab46
WA
2879 u32 ecx = msr->index;
2880 u64 data = msr->data;
6aa8b732 2881 switch (ecx) {
5228eb96
ML
2882 case MSR_AMD64_TSC_RATIO:
2883 if (!msr->host_initiated && !svm->tsc_scaling_enabled)
2884 return 1;
2885
2886 if (data & TSC_RATIO_RSVD)
2887 return 1;
2888
2889 svm->tsc_ratio_msr = data;
2890
2891 if (svm->tsc_scaling_enabled && is_guest_mode(vcpu))
2892 nested_svm_update_tsc_ratio_msr(vcpu);
2893
2894 break;
15038e14
PB
2895 case MSR_IA32_CR_PAT:
2896 if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
2897 return 1;
2898 vcpu->arch.pat = data;
4995a368
CA
2899 svm->vmcb01.ptr->save.g_pat = data;
2900 if (is_guest_mode(vcpu))
2901 nested_vmcb02_compute_g_pat(svm);
06e7852c 2902 vmcb_mark_dirty(svm->vmcb, VMCB_NPT);
15038e14 2903 break;
b2ac58f9
KA
2904 case MSR_IA32_SPEC_CTRL:
2905 if (!msr->host_initiated &&
39485ed9 2906 !guest_has_spec_ctrl_msr(vcpu))
b2ac58f9
KA
2907 return 1;
2908
841c2be0 2909 if (kvm_spec_ctrl_test_value(data))
b2ac58f9
KA
2910 return 1;
2911
d00b99c5
BM
2912 if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
2913 svm->vmcb->save.spec_ctrl = data;
2914 else
2915 svm->spec_ctrl = data;
b2ac58f9
KA
2916 if (!data)
2917 break;
2918
2919 /*
2920 * For non-nested:
2921 * When it's written (to non-zero) for the first time, pass
2922 * it through.
2923 *
2924 * For nested:
2925 * The handling of the MSR bitmap for L2 guests is done in
2926 * nested_svm_vmrun_msrpm.
2927 * We update the L1 MSR bit as well since it will end up
2928 * touching the MSR anyway now.
2929 */
476c9bd8 2930 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
b2ac58f9 2931 break;
15d45071
AR
2932 case MSR_IA32_PRED_CMD:
2933 if (!msr->host_initiated &&
39485ed9 2934 !guest_has_pred_cmd_msr(vcpu))
15d45071
AR
2935 return 1;
2936
2937 if (data & ~PRED_CMD_IBPB)
2938 return 1;
39485ed9 2939 if (!boot_cpu_has(X86_FEATURE_IBPB))
6441fa61 2940 return 1;
15d45071
AR
2941 if (!data)
2942 break;
2943
2944 wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
476c9bd8 2945 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_PRED_CMD, 0, 1);
15d45071 2946 break;
bc226f07
TL
2947 case MSR_AMD64_VIRT_SPEC_CTRL:
2948 if (!msr->host_initiated &&
2949 !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD))
2950 return 1;
2951
2952 if (data & ~SPEC_CTRL_SSBD)
2953 return 1;
2954
2955 svm->virt_spec_ctrl = data;
2956 break;
8c06585d 2957 case MSR_STAR:
cc3ed80a 2958 svm->vmcb01.ptr->save.star = data;
6aa8b732 2959 break;
49b14f24 2960#ifdef CONFIG_X86_64
6aa8b732 2961 case MSR_LSTAR:
cc3ed80a 2962 svm->vmcb01.ptr->save.lstar = data;
6aa8b732
AK
2963 break;
2964 case MSR_CSTAR:
cc3ed80a 2965 svm->vmcb01.ptr->save.cstar = data;
6aa8b732
AK
2966 break;
2967 case MSR_KERNEL_GS_BASE:
cc3ed80a 2968 svm->vmcb01.ptr->save.kernel_gs_base = data;
6aa8b732
AK
2969 break;
2970 case MSR_SYSCALL_MASK:
cc3ed80a 2971 svm->vmcb01.ptr->save.sfmask = data;
6aa8b732
AK
2972 break;
2973#endif
2974 case MSR_IA32_SYSENTER_CS:
cc3ed80a 2975 svm->vmcb01.ptr->save.sysenter_cs = data;
6aa8b732
AK
2976 break;
2977 case MSR_IA32_SYSENTER_EIP:
adc2a237
ML
2978 svm->vmcb01.ptr->save.sysenter_eip = (u32)data;
2979 /*
2980 * We only intercept the MSR_IA32_SYSENTER_{EIP|ESP} msrs
2981 * when we spoof an Intel vendor ID (for cross vendor migration).
2982 * In this case we use this intercept to track the high
2983 * 32 bit part of these msrs to support Intel's
2984 * implementation of SYSENTER/SYSEXIT.
2985 */
2986 svm->sysenter_eip_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0;
6aa8b732
AK
2987 break;
2988 case MSR_IA32_SYSENTER_ESP:
adc2a237
ML
2989 svm->vmcb01.ptr->save.sysenter_esp = (u32)data;
2990 svm->sysenter_esp_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0;
6aa8b732 2991 break;
46896c73 2992 case MSR_TSC_AUX:
46896c73 2993 /*
844d69c2
SC
2994 * TSC_AUX is usually changed only during boot and never read
2995 * directly. Intercept TSC_AUX instead of exposing it to the
2996 * guest via direct_access_msrs, and switch it via user return.
46896c73 2997 */
844d69c2 2998 preempt_disable();
0caa0a77 2999 r = kvm_set_user_return_msr(tsc_aux_uret_slot, data, -1ull);
844d69c2
SC
3000 preempt_enable();
3001 if (r)
3002 return 1;
3003
46896c73 3004 svm->tsc_aux = data;
46896c73 3005 break;
a2938c80 3006 case MSR_IA32_DEBUGCTLMSR:
4c84926e 3007 if (!lbrv) {
a737f256
CD
3008 vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n",
3009 __func__, data);
24e09cbf
JR
3010 break;
3011 }
3012 if (data & DEBUGCTL_RESERVED_BITS)
3013 return 1;
3014
3015 svm->vmcb->save.dbgctl = data;
06e7852c 3016 vmcb_mark_dirty(svm->vmcb, VMCB_LBR);
24e09cbf 3017 if (data & (1ULL<<0))
476c9bd8 3018 svm_enable_lbrv(vcpu);
24e09cbf 3019 else
476c9bd8 3020 svm_disable_lbrv(vcpu);
a2938c80 3021 break;
b286d5d8 3022 case MSR_VM_HSAVE_PA:
fce7e152
VK
3023 /*
3024 * Old kernels did not validate the value written to
3025 * MSR_VM_HSAVE_PA. Allow KVM_SET_MSR to set an invalid
3026 * value to allow live migrating buggy or malicious guests
3027 * originating from those kernels.
3028 */
3029 if (!msr->host_initiated && !page_address_valid(vcpu, data))
3030 return 1;
3031
3032 svm->nested.hsave_msr = data & PAGE_MASK;
62b9abaa 3033 break;
3c5d0a44 3034 case MSR_VM_CR:
4a810181 3035 return svm_set_vm_cr(vcpu, data);
3c5d0a44 3036 case MSR_VM_IGNNE:
a737f256 3037 vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
3c5d0a44 3038 break;
d1d93fa9
TL
3039 case MSR_F10H_DECFG: {
3040 struct kvm_msr_entry msr_entry;
3041
3042 msr_entry.index = msr->index;
3043 if (svm_get_msr_feature(&msr_entry))
3044 return 1;
3045
3046 /* Check the supported bits */
3047 if (data & ~msr_entry.data)
3048 return 1;
3049
3050 /* Don't allow the guest to change a bit, #GP */
3051 if (!msr->host_initiated && (data ^ msr_entry.data))
3052 return 1;
3053
3054 svm->msr_decfg = data;
3055 break;
3056 }
6aa8b732 3057 default:
8fe8ab46 3058 return kvm_set_msr_common(vcpu, msr);
6aa8b732
AK
3059 }
3060 return 0;
3061}
3062
63129754 3063static int msr_interception(struct kvm_vcpu *vcpu)
6aa8b732 3064{
63129754 3065 if (to_svm(vcpu)->vmcb->control.exit_info_1)
5ff3a351 3066 return kvm_emulate_wrmsr(vcpu);
6aa8b732 3067 else
5ff3a351 3068 return kvm_emulate_rdmsr(vcpu);
6aa8b732
AK
3069}
3070
63129754 3071static int interrupt_window_interception(struct kvm_vcpu *vcpu)
c1150d8c 3072{
63129754
PB
3073 kvm_make_request(KVM_REQ_EVENT, vcpu);
3074 svm_clear_vintr(to_svm(vcpu));
f3515dc3
SS
3075
3076 /*
3077 * For AVIC, the only reason to end up here is ExtINTs.
3078 * In this case AVIC was temporarily disabled for
3079 * requesting the IRQ window and we have to re-enable it.
3080 */
30eed56a 3081 kvm_request_apicv_update(vcpu->kvm, true, APICV_INHIBIT_REASON_IRQWIN);
f3515dc3 3082
63129754 3083 ++vcpu->stat.irq_window_exits;
c1150d8c
DL
3084 return 1;
3085}
3086
63129754 3087static int pause_interception(struct kvm_vcpu *vcpu)
565d0998 3088{
f1c6366e
TL
3089 bool in_kernel;
3090
3091 /*
3092 * CPL is not made available for an SEV-ES guest, therefore
3093 * vcpu->arch.preempted_in_kernel can never be true. Just
3094 * set in_kernel to false as well.
3095 */
63129754 3096 in_kernel = !sev_es_guest(vcpu->kvm) && svm_get_cpl(vcpu) == 0;
de63ad4c 3097
830f01b0 3098 if (!kvm_pause_in_guest(vcpu->kvm))
8566ac8b
BM
3099 grow_ple_window(vcpu);
3100
de63ad4c 3101 kvm_vcpu_on_spin(vcpu, in_kernel);
c8781fea 3102 return kvm_skip_emulated_instruction(vcpu);
87c00572
GS
3103}
3104
63129754 3105static int invpcid_interception(struct kvm_vcpu *vcpu)
87c00572 3106{
63129754 3107 struct vcpu_svm *svm = to_svm(vcpu);
4407a797
BM
3108 unsigned long type;
3109 gva_t gva;
3110
3111 if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) {
3112 kvm_queue_exception(vcpu, UD_VECTOR);
3113 return 1;
3114 }
3115
3116 /*
3117 * For an INVPCID intercept:
3118 * EXITINFO1 provides the linear address of the memory operand.
3119 * EXITINFO2 provides the contents of the register operand.
3120 */
3121 type = svm->vmcb->control.exit_info_2;
3122 gva = svm->vmcb->control.exit_info_1;
3123
4407a797
BM
3124 return kvm_handle_invpcid(vcpu, type, gva);
3125}
3126
63129754 3127static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = {
7ff76d58
AP
3128 [SVM_EXIT_READ_CR0] = cr_interception,
3129 [SVM_EXIT_READ_CR3] = cr_interception,
3130 [SVM_EXIT_READ_CR4] = cr_interception,
3131 [SVM_EXIT_READ_CR8] = cr_interception,
5e57518d 3132 [SVM_EXIT_CR0_SEL_WRITE] = cr_interception,
628afd2a 3133 [SVM_EXIT_WRITE_CR0] = cr_interception,
7ff76d58
AP
3134 [SVM_EXIT_WRITE_CR3] = cr_interception,
3135 [SVM_EXIT_WRITE_CR4] = cr_interception,
e0231715 3136 [SVM_EXIT_WRITE_CR8] = cr8_write_interception,
cae3797a
AP
3137 [SVM_EXIT_READ_DR0] = dr_interception,
3138 [SVM_EXIT_READ_DR1] = dr_interception,
3139 [SVM_EXIT_READ_DR2] = dr_interception,
3140 [SVM_EXIT_READ_DR3] = dr_interception,
3141 [SVM_EXIT_READ_DR4] = dr_interception,
3142 [SVM_EXIT_READ_DR5] = dr_interception,
3143 [SVM_EXIT_READ_DR6] = dr_interception,
3144 [SVM_EXIT_READ_DR7] = dr_interception,
3145 [SVM_EXIT_WRITE_DR0] = dr_interception,
3146 [SVM_EXIT_WRITE_DR1] = dr_interception,
3147 [SVM_EXIT_WRITE_DR2] = dr_interception,
3148 [SVM_EXIT_WRITE_DR3] = dr_interception,
3149 [SVM_EXIT_WRITE_DR4] = dr_interception,
3150 [SVM_EXIT_WRITE_DR5] = dr_interception,
3151 [SVM_EXIT_WRITE_DR6] = dr_interception,
3152 [SVM_EXIT_WRITE_DR7] = dr_interception,
d0bfb940
JK
3153 [SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception,
3154 [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception,
7aa81cc0 3155 [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception,
e0231715 3156 [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception,
e0231715 3157 [SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception,
54a20552 3158 [SVM_EXIT_EXCP_BASE + AC_VECTOR] = ac_interception,
9718420e 3159 [SVM_EXIT_EXCP_BASE + GP_VECTOR] = gp_interception,
e0231715 3160 [SVM_EXIT_INTR] = intr_interception,
c47f098d 3161 [SVM_EXIT_NMI] = nmi_interception,
991afbbe 3162 [SVM_EXIT_SMI] = smi_interception,
c1150d8c 3163 [SVM_EXIT_VINTR] = interrupt_window_interception,
32c23c7d 3164 [SVM_EXIT_RDPMC] = kvm_emulate_rdpmc,
5ff3a351 3165 [SVM_EXIT_CPUID] = kvm_emulate_cpuid,
95ba8273 3166 [SVM_EXIT_IRET] = iret_interception,
5ff3a351 3167 [SVM_EXIT_INVD] = kvm_emulate_invd,
565d0998 3168 [SVM_EXIT_PAUSE] = pause_interception,
5ff3a351 3169 [SVM_EXIT_HLT] = kvm_emulate_halt,
a7052897 3170 [SVM_EXIT_INVLPG] = invlpg_interception,
ff092385 3171 [SVM_EXIT_INVLPGA] = invlpga_interception,
e0231715 3172 [SVM_EXIT_IOIO] = io_interception,
6aa8b732
AK
3173 [SVM_EXIT_MSR] = msr_interception,
3174 [SVM_EXIT_TASK_SWITCH] = task_switch_interception,
46fe4ddd 3175 [SVM_EXIT_SHUTDOWN] = shutdown_interception,
3d6368ef 3176 [SVM_EXIT_VMRUN] = vmrun_interception,
5ff3a351 3177 [SVM_EXIT_VMMCALL] = kvm_emulate_hypercall,
5542675b
AG
3178 [SVM_EXIT_VMLOAD] = vmload_interception,
3179 [SVM_EXIT_VMSAVE] = vmsave_interception,
1371d904
AG
3180 [SVM_EXIT_STGI] = stgi_interception,
3181 [SVM_EXIT_CLGI] = clgi_interception,
532a46b9 3182 [SVM_EXIT_SKINIT] = skinit_interception,
3b195ac9 3183 [SVM_EXIT_RDTSCP] = kvm_handle_invalid_op,
5ff3a351
SC
3184 [SVM_EXIT_WBINVD] = kvm_emulate_wbinvd,
3185 [SVM_EXIT_MONITOR] = kvm_emulate_monitor,
3186 [SVM_EXIT_MWAIT] = kvm_emulate_mwait,
92f9895c 3187 [SVM_EXIT_XSETBV] = kvm_emulate_xsetbv,
5ff3a351 3188 [SVM_EXIT_RDPRU] = kvm_handle_invalid_op,
2985afbc 3189 [SVM_EXIT_EFER_WRITE_TRAP] = efer_trap,
f27ad38a 3190 [SVM_EXIT_CR0_WRITE_TRAP] = cr_trap,
5b51cb13 3191 [SVM_EXIT_CR4_WRITE_TRAP] = cr_trap,
d1949b93 3192 [SVM_EXIT_CR8_WRITE_TRAP] = cr_trap,
4407a797 3193 [SVM_EXIT_INVPCID] = invpcid_interception,
d0006530 3194 [SVM_EXIT_NPF] = npf_interception,
7607b717 3195 [SVM_EXIT_RSM] = rsm_interception,
18f40c53
SS
3196 [SVM_EXIT_AVIC_INCOMPLETE_IPI] = avic_incomplete_ipi_interception,
3197 [SVM_EXIT_AVIC_UNACCELERATED_ACCESS] = avic_unaccelerated_access_interception,
291bd20d 3198 [SVM_EXIT_VMGEXIT] = sev_handle_vmgexit,
6aa8b732
AK
3199};
3200
ae8cc059 3201static void dump_vmcb(struct kvm_vcpu *vcpu)
3f10c846
JR
3202{
3203 struct vcpu_svm *svm = to_svm(vcpu);
3204 struct vmcb_control_area *control = &svm->vmcb->control;
3205 struct vmcb_save_area *save = &svm->vmcb->save;
cc3ed80a 3206 struct vmcb_save_area *save01 = &svm->vmcb01.ptr->save;
3f10c846 3207
6f2f8453
PB
3208 if (!dump_invalid_vmcb) {
3209 pr_warn_ratelimited("set kvm_amd.dump_invalid_vmcb=1 to dump internal KVM state.\n");
3210 return;
3211 }
3212
18f63b15
JM
3213 pr_err("VMCB %p, last attempted VMRUN on CPU %d\n",
3214 svm->current_vmcb->ptr, vcpu->arch.last_vmentry_cpu);
3f10c846 3215 pr_err("VMCB Control Area:\n");
03bfeeb9
BM
3216 pr_err("%-20s%04x\n", "cr_read:", control->intercepts[INTERCEPT_CR] & 0xffff);
3217 pr_err("%-20s%04x\n", "cr_write:", control->intercepts[INTERCEPT_CR] >> 16);
30abaa88
BM
3218 pr_err("%-20s%04x\n", "dr_read:", control->intercepts[INTERCEPT_DR] & 0xffff);
3219 pr_err("%-20s%04x\n", "dr_write:", control->intercepts[INTERCEPT_DR] >> 16);
9780d51d 3220 pr_err("%-20s%08x\n", "exceptions:", control->intercepts[INTERCEPT_EXCEPTION]);
c62e2e94
BM
3221 pr_err("%-20s%08x %08x\n", "intercepts:",
3222 control->intercepts[INTERCEPT_WORD3],
3223 control->intercepts[INTERCEPT_WORD4]);
ae8cc059 3224 pr_err("%-20s%d\n", "pause filter count:", control->pause_filter_count);
1d8fb44a
BM
3225 pr_err("%-20s%d\n", "pause filter threshold:",
3226 control->pause_filter_thresh);
ae8cc059
JP
3227 pr_err("%-20s%016llx\n", "iopm_base_pa:", control->iopm_base_pa);
3228 pr_err("%-20s%016llx\n", "msrpm_base_pa:", control->msrpm_base_pa);
3229 pr_err("%-20s%016llx\n", "tsc_offset:", control->tsc_offset);
3230 pr_err("%-20s%d\n", "asid:", control->asid);
3231 pr_err("%-20s%d\n", "tlb_ctl:", control->tlb_ctl);
3232 pr_err("%-20s%08x\n", "int_ctl:", control->int_ctl);
3233 pr_err("%-20s%08x\n", "int_vector:", control->int_vector);
3234 pr_err("%-20s%08x\n", "int_state:", control->int_state);
3235 pr_err("%-20s%08x\n", "exit_code:", control->exit_code);
3236 pr_err("%-20s%016llx\n", "exit_info1:", control->exit_info_1);
3237 pr_err("%-20s%016llx\n", "exit_info2:", control->exit_info_2);
3238 pr_err("%-20s%08x\n", "exit_int_info:", control->exit_int_info);
3239 pr_err("%-20s%08x\n", "exit_int_info_err:", control->exit_int_info_err);
3240 pr_err("%-20s%lld\n", "nested_ctl:", control->nested_ctl);
3241 pr_err("%-20s%016llx\n", "nested_cr3:", control->nested_cr3);
44a95dae 3242 pr_err("%-20s%016llx\n", "avic_vapic_bar:", control->avic_vapic_bar);
291bd20d 3243 pr_err("%-20s%016llx\n", "ghcb:", control->ghcb_gpa);
ae8cc059
JP
3244 pr_err("%-20s%08x\n", "event_inj:", control->event_inj);
3245 pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err);
0dc92119 3246 pr_err("%-20s%lld\n", "virt_ext:", control->virt_ext);
ae8cc059 3247 pr_err("%-20s%016llx\n", "next_rip:", control->next_rip);
44a95dae
SS
3248 pr_err("%-20s%016llx\n", "avic_backing_page:", control->avic_backing_page);
3249 pr_err("%-20s%016llx\n", "avic_logical_id:", control->avic_logical_id);
3250 pr_err("%-20s%016llx\n", "avic_physical_id:", control->avic_physical_id);
376c6d28 3251 pr_err("%-20s%016llx\n", "vmsa_pa:", control->vmsa_pa);
3f10c846 3252 pr_err("VMCB State Save Area:\n");
ae8cc059
JP
3253 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3254 "es:",
3255 save->es.selector, save->es.attrib,
3256 save->es.limit, save->es.base);
3257 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3258 "cs:",
3259 save->cs.selector, save->cs.attrib,
3260 save->cs.limit, save->cs.base);
3261 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3262 "ss:",
3263 save->ss.selector, save->ss.attrib,
3264 save->ss.limit, save->ss.base);
3265 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3266 "ds:",
3267 save->ds.selector, save->ds.attrib,
3268 save->ds.limit, save->ds.base);
3269 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3270 "fs:",
cc3ed80a
ML
3271 save01->fs.selector, save01->fs.attrib,
3272 save01->fs.limit, save01->fs.base);
ae8cc059
JP
3273 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3274 "gs:",
cc3ed80a
ML
3275 save01->gs.selector, save01->gs.attrib,
3276 save01->gs.limit, save01->gs.base);
ae8cc059
JP
3277 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3278 "gdtr:",
3279 save->gdtr.selector, save->gdtr.attrib,
3280 save->gdtr.limit, save->gdtr.base);
3281 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3282 "ldtr:",
cc3ed80a
ML
3283 save01->ldtr.selector, save01->ldtr.attrib,
3284 save01->ldtr.limit, save01->ldtr.base);
ae8cc059
JP
3285 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3286 "idtr:",
3287 save->idtr.selector, save->idtr.attrib,
3288 save->idtr.limit, save->idtr.base);
3289 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3290 "tr:",
cc3ed80a
ML
3291 save01->tr.selector, save01->tr.attrib,
3292 save01->tr.limit, save01->tr.base);
3f10c846
JR
3293 pr_err("cpl: %d efer: %016llx\n",
3294 save->cpl, save->efer);
ae8cc059
JP
3295 pr_err("%-15s %016llx %-13s %016llx\n",
3296 "cr0:", save->cr0, "cr2:", save->cr2);
3297 pr_err("%-15s %016llx %-13s %016llx\n",
3298 "cr3:", save->cr3, "cr4:", save->cr4);
3299 pr_err("%-15s %016llx %-13s %016llx\n",
3300 "dr6:", save->dr6, "dr7:", save->dr7);
3301 pr_err("%-15s %016llx %-13s %016llx\n",
3302 "rip:", save->rip, "rflags:", save->rflags);
3303 pr_err("%-15s %016llx %-13s %016llx\n",
3304 "rsp:", save->rsp, "rax:", save->rax);
3305 pr_err("%-15s %016llx %-13s %016llx\n",
cc3ed80a 3306 "star:", save01->star, "lstar:", save01->lstar);
ae8cc059 3307 pr_err("%-15s %016llx %-13s %016llx\n",
cc3ed80a 3308 "cstar:", save01->cstar, "sfmask:", save01->sfmask);
ae8cc059 3309 pr_err("%-15s %016llx %-13s %016llx\n",
cc3ed80a
ML
3310 "kernel_gs_base:", save01->kernel_gs_base,
3311 "sysenter_cs:", save01->sysenter_cs);
ae8cc059 3312 pr_err("%-15s %016llx %-13s %016llx\n",
cc3ed80a
ML
3313 "sysenter_esp:", save01->sysenter_esp,
3314 "sysenter_eip:", save01->sysenter_eip);
ae8cc059
JP
3315 pr_err("%-15s %016llx %-13s %016llx\n",
3316 "gpat:", save->g_pat, "dbgctl:", save->dbgctl);
3317 pr_err("%-15s %016llx %-13s %016llx\n",
3318 "br_from:", save->br_from, "br_to:", save->br_to);
3319 pr_err("%-15s %016llx %-13s %016llx\n",
3320 "excp_from:", save->last_excp_from,
3321 "excp_to:", save->last_excp_to);
3f10c846
JR
3322}
3323
7a4bca85 3324static bool svm_check_exit_valid(struct kvm_vcpu *vcpu, u64 exit_code)
e9093fd4 3325{
7a4bca85
ML
3326 return (exit_code < ARRAY_SIZE(svm_exit_handlers) &&
3327 svm_exit_handlers[exit_code]);
3328}
e9093fd4 3329
7a4bca85
ML
3330static int svm_handle_invalid_exit(struct kvm_vcpu *vcpu, u64 exit_code)
3331{
e9093fd4
TL
3332 vcpu_unimpl(vcpu, "svm: unexpected exit reason 0x%llx\n", exit_code);
3333 dump_vmcb(vcpu);
3334 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
3335 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
3336 vcpu->run->internal.ndata = 2;
3337 vcpu->run->internal.data[0] = exit_code;
3338 vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
7a4bca85 3339 return 0;
e9093fd4
TL
3340}
3341
63129754 3342int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code)
e9093fd4 3343{
7a4bca85
ML
3344 if (!svm_check_exit_valid(vcpu, exit_code))
3345 return svm_handle_invalid_exit(vcpu, exit_code);
e9093fd4
TL
3346
3347#ifdef CONFIG_RETPOLINE
3348 if (exit_code == SVM_EXIT_MSR)
63129754 3349 return msr_interception(vcpu);
e9093fd4 3350 else if (exit_code == SVM_EXIT_VINTR)
63129754 3351 return interrupt_window_interception(vcpu);
e9093fd4 3352 else if (exit_code == SVM_EXIT_INTR)
63129754 3353 return intr_interception(vcpu);
e9093fd4 3354 else if (exit_code == SVM_EXIT_HLT)
5ff3a351 3355 return kvm_emulate_halt(vcpu);
e9093fd4 3356 else if (exit_code == SVM_EXIT_NPF)
63129754 3357 return npf_interception(vcpu);
e9093fd4 3358#endif
63129754 3359 return svm_exit_handlers[exit_code](vcpu);
e9093fd4
TL
3360}
3361
0a62a031
DE
3362static void svm_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
3363 u64 *info1, u64 *info2,
235ba74f 3364 u32 *intr_info, u32 *error_code)
586f9607
AK
3365{
3366 struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control;
3367
0a62a031 3368 *reason = control->exit_code;
586f9607
AK
3369 *info1 = control->exit_info_1;
3370 *info2 = control->exit_info_2;
235ba74f
SC
3371 *intr_info = control->exit_int_info;
3372 if ((*intr_info & SVM_EXITINTINFO_VALID) &&
3373 (*intr_info & SVM_EXITINTINFO_VALID_ERR))
3374 *error_code = control->exit_int_info_err;
3375 else
3376 *error_code = 0;
586f9607
AK
3377}
3378
404d5d7b 3379static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
6aa8b732 3380{
04d2cc77 3381 struct vcpu_svm *svm = to_svm(vcpu);
851ba692 3382 struct kvm_run *kvm_run = vcpu->run;
a2fa3e9f 3383 u32 exit_code = svm->vmcb->control.exit_code;
6aa8b732 3384
0a62a031 3385 trace_kvm_exit(vcpu, KVM_ISA_SVM);
8b89fe1f 3386
f1c6366e
TL
3387 /* SEV-ES guests must use the CR write traps to track CR registers. */
3388 if (!sev_es_guest(vcpu->kvm)) {
3389 if (!svm_is_intercept(svm, INTERCEPT_CR0_WRITE))
3390 vcpu->arch.cr0 = svm->vmcb->save.cr0;
3391 if (npt_enabled)
3392 vcpu->arch.cr3 = svm->vmcb->save.cr3;
3393 }
af9ca2d7 3394
2030753d 3395 if (is_guest_mode(vcpu)) {
410e4d57
JR
3396 int vmexit;
3397
0a62a031 3398 trace_kvm_nested_vmexit(vcpu, KVM_ISA_SVM);
d8cabddf 3399
410e4d57
JR
3400 vmexit = nested_svm_exit_special(svm);
3401
3402 if (vmexit == NESTED_EXIT_CONTINUE)
3403 vmexit = nested_svm_exit_handled(svm);
3404
3405 if (vmexit == NESTED_EXIT_DONE)
cf74a78b 3406 return 1;
cf74a78b
AG
3407 }
3408
04d2cc77
AK
3409 if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
3410 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
3411 kvm_run->fail_entry.hardware_entry_failure_reason
3412 = svm->vmcb->control.exit_code;
8a14fe4f 3413 kvm_run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
3f10c846 3414 dump_vmcb(vcpu);
04d2cc77
AK
3415 return 0;
3416 }
3417
a2fa3e9f 3418 if (is_external_interrupt(svm->vmcb->control.exit_int_info) &&
709ddebf 3419 exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR &&
55c5e464
JR
3420 exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH &&
3421 exit_code != SVM_EXIT_INTR && exit_code != SVM_EXIT_NMI)
6614c7d0 3422 printk(KERN_ERR "%s: unexpected exit_int_info 0x%x "
6aa8b732 3423 "exit_code 0x%x\n",
b8688d51 3424 __func__, svm->vmcb->control.exit_int_info,
6aa8b732
AK
3425 exit_code);
3426
404d5d7b 3427 if (exit_fastpath != EXIT_FASTPATH_NONE)
1e9e2622 3428 return 1;
404d5d7b 3429
63129754 3430 return svm_invoke_exit_handler(vcpu, exit_code);
6aa8b732
AK
3431}
3432
3433static void reload_tss(struct kvm_vcpu *vcpu)
3434{
73cd6e5f 3435 struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);
6aa8b732 3436
0fe1e009 3437 sd->tss_desc->type = 9; /* available 32/64-bit TSS */
6aa8b732
AK
3438 load_TR_desc();
3439}
3440
63129754 3441static void pre_svm_run(struct kvm_vcpu *vcpu)
6aa8b732 3442{
63129754
PB
3443 struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);
3444 struct vcpu_svm *svm = to_svm(vcpu);
6aa8b732 3445
af18fa77 3446 /*
44f1b558
SC
3447 * If the previous vmrun of the vmcb occurred on a different physical
3448 * cpu, then mark the vmcb dirty and assign a new asid. Hardware's
3449 * vmcb clean bits are per logical CPU, as are KVM's asid assignments.
3450 */
63129754 3451 if (unlikely(svm->current_vmcb->cpu != vcpu->cpu)) {
193015ad 3452 svm->current_vmcb->asid_generation = 0;
af18fa77 3453 vmcb_mark_all_dirty(svm->vmcb);
63129754 3454 svm->current_vmcb->cpu = vcpu->cpu;
af18fa77
CA
3455 }
3456
63129754
PB
3457 if (sev_guest(vcpu->kvm))
3458 return pre_sev_run(svm, vcpu->cpu);
70cd94e6 3459
4b656b12 3460 /* FIXME: handle wraparound of asid_generation */
193015ad 3461 if (svm->current_vmcb->asid_generation != sd->asid_generation)
0fe1e009 3462 new_asid(svm, sd);
6aa8b732
AK
3463}
3464
95ba8273
GN
3465static void svm_inject_nmi(struct kvm_vcpu *vcpu)
3466{
3467 struct vcpu_svm *svm = to_svm(vcpu);
3468
3469 svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
3470 vcpu->arch.hflags |= HF_NMI_MASK;
63129754 3471 if (!sev_es_guest(vcpu->kvm))
4444dfe4 3472 svm_set_intercept(svm, INTERCEPT_IRET);
95ba8273
GN
3473 ++vcpu->stat.nmi_injections;
3474}
6aa8b732 3475
66fd3f7f 3476static void svm_set_irq(struct kvm_vcpu *vcpu)
2a8067f1
ED
3477{
3478 struct vcpu_svm *svm = to_svm(vcpu);
3479
2af9194d 3480 BUG_ON(!(gif_set(svm)));
cf74a78b 3481
9fb2d2b4
GN
3482 trace_kvm_inj_virq(vcpu->arch.interrupt.nr);
3483 ++vcpu->stat.irq_injections;
3484
219b65dc
AG
3485 svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr |
3486 SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR;
2a8067f1
ED
3487}
3488
b6a7cc35 3489static void svm_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
aaacfc9a
JR
3490{
3491 struct vcpu_svm *svm = to_svm(vcpu);
aaacfc9a 3492
f1c6366e
TL
3493 /*
3494 * SEV-ES guests must always keep the CR intercepts cleared. CR
3495 * tracking is done using the CR write traps.
3496 */
3497 if (sev_es_guest(vcpu->kvm))
3498 return;
3499
01c3b2b5 3500 if (nested_svm_virtualize_tpr(vcpu))
88ab24ad
JR
3501 return;
3502
830bd71f 3503 svm_clr_intercept(svm, INTERCEPT_CR8_WRITE);
596f3142 3504
95ba8273 3505 if (irr == -1)
aaacfc9a
JR
3506 return;
3507
95ba8273 3508 if (tpr >= irr)
830bd71f 3509 svm_set_intercept(svm, INTERCEPT_CR8_WRITE);
95ba8273 3510}
aaacfc9a 3511
cae96af1 3512bool svm_nmi_blocked(struct kvm_vcpu *vcpu)
95ba8273
GN
3513{
3514 struct vcpu_svm *svm = to_svm(vcpu);
3515 struct vmcb *vmcb = svm->vmcb;
88c604b6 3516 bool ret;
9c3d370a 3517
cae96af1 3518 if (!gif_set(svm))
bbdad0b5
PB
3519 return true;
3520
cae96af1
PB
3521 if (is_guest_mode(vcpu) && nested_exit_on_nmi(svm))
3522 return false;
3523
3524 ret = (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) ||
63129754 3525 (vcpu->arch.hflags & HF_NMI_MASK);
924584cc
JR
3526
3527 return ret;
aaacfc9a
JR
3528}
3529
c9d40913 3530static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
cae96af1
PB
3531{
3532 struct vcpu_svm *svm = to_svm(vcpu);
3533 if (svm->nested.nested_run_pending)
c9d40913 3534 return -EBUSY;
cae96af1 3535
c300ab9f
PB
3536 /* An NMI must not be injected into L2 if it's supposed to VM-Exit. */
3537 if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(svm))
c9d40913 3538 return -EBUSY;
c300ab9f
PB
3539
3540 return !svm_nmi_blocked(vcpu);
cae96af1
PB
3541}
3542
3cfc3092
JK
3543static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
3544{
63129754 3545 return !!(vcpu->arch.hflags & HF_NMI_MASK);
3cfc3092
JK
3546}
3547
3548static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
3549{
3550 struct vcpu_svm *svm = to_svm(vcpu);
3551
3552 if (masked) {
63129754
PB
3553 vcpu->arch.hflags |= HF_NMI_MASK;
3554 if (!sev_es_guest(vcpu->kvm))
4444dfe4 3555 svm_set_intercept(svm, INTERCEPT_IRET);
3cfc3092 3556 } else {
63129754
PB
3557 vcpu->arch.hflags &= ~HF_NMI_MASK;
3558 if (!sev_es_guest(vcpu->kvm))
4444dfe4 3559 svm_clr_intercept(svm, INTERCEPT_IRET);
3cfc3092
JK
3560 }
3561}
3562
cae96af1 3563bool svm_interrupt_blocked(struct kvm_vcpu *vcpu)
78646121
GN
3564{
3565 struct vcpu_svm *svm = to_svm(vcpu);
3566 struct vmcb *vmcb = svm->vmcb;
7fcdb510 3567
fc6f7c03 3568 if (!gif_set(svm))
cae96af1 3569 return true;
7fcdb510 3570
63129754 3571 if (sev_es_guest(vcpu->kvm)) {
f1c6366e
TL
3572 /*
3573 * SEV-ES guests to not expose RFLAGS. Use the VMCB interrupt mask
3574 * bit to determine the state of the IF flag.
3575 */
3576 if (!(vmcb->control.int_state & SVM_GUEST_INTERRUPT_MASK))
3577 return true;
3578 } else if (is_guest_mode(vcpu)) {
fc6f7c03 3579 /* As long as interrupts are being delivered... */
e9fd761a 3580 if ((svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK)
4995a368 3581 ? !(svm->vmcb01.ptr->save.rflags & X86_EFLAGS_IF)
fc6f7c03
PB
3582 : !(kvm_get_rflags(vcpu) & X86_EFLAGS_IF))
3583 return true;
3584
3585 /* ... vmexits aren't blocked by the interrupt shadow */
3586 if (nested_exit_on_intr(svm))
3587 return false;
3588 } else {
3589 if (!(kvm_get_rflags(vcpu) & X86_EFLAGS_IF))
3590 return true;
3591 }
3592
3593 return (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK);
cae96af1
PB
3594}
3595
c9d40913 3596static int svm_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
cae96af1
PB
3597{
3598 struct vcpu_svm *svm = to_svm(vcpu);
3599 if (svm->nested.nested_run_pending)
c9d40913 3600 return -EBUSY;
cae96af1 3601
c300ab9f
PB
3602 /*
3603 * An IRQ must not be injected into L2 if it's supposed to VM-Exit,
3604 * e.g. if the IRQ arrived asynchronously after checking nested events.
3605 */
3606 if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(svm))
c9d40913 3607 return -EBUSY;
c300ab9f
PB
3608
3609 return !svm_interrupt_blocked(vcpu);
78646121
GN
3610}
3611
b6a7cc35 3612static void svm_enable_irq_window(struct kvm_vcpu *vcpu)
6aa8b732 3613{
219b65dc 3614 struct vcpu_svm *svm = to_svm(vcpu);
219b65dc 3615
e0231715
JR
3616 /*
3617 * In case GIF=0 we can't rely on the CPU to tell us when GIF becomes
3618 * 1, because that's a separate STGI/VMRUN intercept. The next time we
3619 * get that intercept, this function will be called again though and
640bd6e5
JN
3620 * we'll get the vintr intercept. However, if the vGIF feature is
3621 * enabled, the STGI interception will not occur. Enable the irq
3622 * window under the assumption that the hardware will set the GIF.
e0231715 3623 */
b518ba9f 3624 if (vgif_enabled(svm) || gif_set(svm)) {
f3515dc3
SS
3625 /*
3626 * IRQ window is not needed when AVIC is enabled,
3627 * unless we have pending ExtINT since it cannot be injected
3628 * via AVIC. In such case, we need to temporarily disable AVIC,
3629 * and fallback to injecting IRQ via V_IRQ.
3630 */
30eed56a 3631 kvm_request_apicv_update(vcpu->kvm, false, APICV_INHIBIT_REASON_IRQWIN);
219b65dc 3632 svm_set_vintr(svm);
219b65dc 3633 }
85f455f7
ED
3634}
3635
b6a7cc35 3636static void svm_enable_nmi_window(struct kvm_vcpu *vcpu)
c1150d8c 3637{
04d2cc77 3638 struct vcpu_svm *svm = to_svm(vcpu);
c1150d8c 3639
63129754 3640 if ((vcpu->arch.hflags & (HF_NMI_MASK | HF_IRET_MASK)) == HF_NMI_MASK)
c9a7953f 3641 return; /* IRET will cause a vm exit */
44c11430 3642
640bd6e5
JN
3643 if (!gif_set(svm)) {
3644 if (vgif_enabled(svm))
a284ba56 3645 svm_set_intercept(svm, INTERCEPT_STGI);
1a5e1852 3646 return; /* STGI will cause a vm exit */
640bd6e5 3647 }
1a5e1852 3648
e0231715
JR
3649 /*
3650 * Something prevents NMI from been injected. Single step over possible
3651 * problem (IRET or exception injection or interrupt shadow)
3652 */
ab2f4d73 3653 svm->nmi_singlestep_guest_rflags = svm_get_rflags(vcpu);
6be7d306 3654 svm->nmi_singlestep = true;
44c11430 3655 svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
c1150d8c
DL
3656}
3657
cbc94022
IE
3658static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
3659{
3660 return 0;
3661}
3662
2ac52ab8
SC
3663static int svm_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
3664{
3665 return 0;
3666}
3667
f55ac304 3668void svm_flush_tlb(struct kvm_vcpu *vcpu)
d9e368d6 3669{
38e5e92f
JR
3670 struct vcpu_svm *svm = to_svm(vcpu);
3671
4a41e43c
SC
3672 /*
3673 * Flush only the current ASID even if the TLB flush was invoked via
3674 * kvm_flush_remote_tlbs(). Although flushing remote TLBs requires all
3675 * ASIDs to be flushed, KVM uses a single ASID for L1 and L2, and
3676 * unconditionally does a TLB flush on both nested VM-Enter and nested
3677 * VM-Exit (via kvm_mmu_reset_context()).
3678 */
38e5e92f
JR
3679 if (static_cpu_has(X86_FEATURE_FLUSHBYASID))
3680 svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
3681 else
193015ad 3682 svm->current_vmcb->asid_generation--;
d9e368d6
AK
3683}
3684
faff8758
JS
3685static void svm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t gva)
3686{
3687 struct vcpu_svm *svm = to_svm(vcpu);
3688
3689 invlpga(gva, svm->vmcb->control.asid);
3690}
3691
d7bf8221
JR
3692static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
3693{
3694 struct vcpu_svm *svm = to_svm(vcpu);
3695
01c3b2b5 3696 if (nested_svm_virtualize_tpr(vcpu))
88ab24ad
JR
3697 return;
3698
830bd71f 3699 if (!svm_is_intercept(svm, INTERCEPT_CR8_WRITE)) {
d7bf8221 3700 int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK;
615d5193 3701 kvm_set_cr8(vcpu, cr8);
d7bf8221
JR
3702 }
3703}
3704
649d6864
JR
3705static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
3706{
3707 struct vcpu_svm *svm = to_svm(vcpu);
3708 u64 cr8;
3709
01c3b2b5 3710 if (nested_svm_virtualize_tpr(vcpu) ||
3bbf3565 3711 kvm_vcpu_apicv_active(vcpu))
88ab24ad
JR
3712 return;
3713
649d6864
JR
3714 cr8 = kvm_get_cr8(vcpu);
3715 svm->vmcb->control.int_ctl &= ~V_TPR_MASK;
3716 svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
3717}
3718
63129754 3719static void svm_complete_interrupts(struct kvm_vcpu *vcpu)
9222be18 3720{
63129754 3721 struct vcpu_svm *svm = to_svm(vcpu);
9222be18
GN
3722 u8 vector;
3723 int type;
3724 u32 exitintinfo = svm->vmcb->control.exit_int_info;
66b7138f
JK
3725 unsigned int3_injected = svm->int3_injected;
3726
3727 svm->int3_injected = 0;
9222be18 3728
bd3d1ec3
AK
3729 /*
3730 * If we've made progress since setting HF_IRET_MASK, we've
3731 * executed an IRET and can allow NMI injection.
3732 */
63129754
PB
3733 if ((vcpu->arch.hflags & HF_IRET_MASK) &&
3734 (sev_es_guest(vcpu->kvm) ||
3735 kvm_rip_read(vcpu) != svm->nmi_iret_rip)) {
3736 vcpu->arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
3737 kvm_make_request(KVM_REQ_EVENT, vcpu);
3842d135 3738 }
44c11430 3739
63129754
PB
3740 vcpu->arch.nmi_injected = false;
3741 kvm_clear_exception_queue(vcpu);
3742 kvm_clear_interrupt_queue(vcpu);
9222be18
GN
3743
3744 if (!(exitintinfo & SVM_EXITINTINFO_VALID))
3745 return;
3746
63129754 3747 kvm_make_request(KVM_REQ_EVENT, vcpu);
3842d135 3748
9222be18
GN
3749 vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK;
3750 type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK;
3751
3752 switch (type) {
3753 case SVM_EXITINTINFO_TYPE_NMI:
63129754 3754 vcpu->arch.nmi_injected = true;
9222be18
GN
3755 break;
3756 case SVM_EXITINTINFO_TYPE_EXEPT:
f1c6366e
TL
3757 /*
3758 * Never re-inject a #VC exception.
3759 */
3760 if (vector == X86_TRAP_VC)
3761 break;
3762
66b7138f
JK
3763 /*
3764 * In case of software exceptions, do not reinject the vector,
3765 * but re-execute the instruction instead. Rewind RIP first
3766 * if we emulated INT3 before.
3767 */
3768 if (kvm_exception_is_soft(vector)) {
3769 if (vector == BP_VECTOR && int3_injected &&
63129754
PB
3770 kvm_is_linear_rip(vcpu, svm->int3_rip))
3771 kvm_rip_write(vcpu,
3772 kvm_rip_read(vcpu) - int3_injected);
9222be18 3773 break;
66b7138f 3774 }
9222be18
GN
3775 if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
3776 u32 err = svm->vmcb->control.exit_int_info_err;
63129754 3777 kvm_requeue_exception_e(vcpu, vector, err);
9222be18
GN
3778
3779 } else
63129754 3780 kvm_requeue_exception(vcpu, vector);
9222be18
GN
3781 break;
3782 case SVM_EXITINTINFO_TYPE_INTR:
63129754 3783 kvm_queue_interrupt(vcpu, vector, false);
9222be18
GN
3784 break;
3785 default:
3786 break;
3787 }
3788}
3789
b463a6f7
AK
3790static void svm_cancel_injection(struct kvm_vcpu *vcpu)
3791{
3792 struct vcpu_svm *svm = to_svm(vcpu);
3793 struct vmcb_control_area *control = &svm->vmcb->control;
3794
3795 control->exit_int_info = control->event_inj;
3796 control->exit_int_info_err = control->event_inj_err;
3797 control->event_inj = 0;
63129754 3798 svm_complete_interrupts(vcpu);
b463a6f7
AK
3799}
3800
404d5d7b 3801static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
a9ab13ff 3802{
4e810adb 3803 if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR &&
a9ab13ff
WL
3804 to_svm(vcpu)->vmcb->control.exit_info_1)
3805 return handle_fastpath_set_msr_irqoff(vcpu);
3806
3807 return EXIT_FASTPATH_NONE;
3808}
3809
63129754 3810static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu)
135961e0 3811{
63129754 3812 struct vcpu_svm *svm = to_svm(vcpu);
d1788191 3813 unsigned long vmcb_pa = svm->current_vmcb->pa;
63129754 3814
bc908e09 3815 kvm_guest_enter_irqoff();
135961e0 3816
63129754 3817 if (sev_es_guest(vcpu->kvm)) {
d1788191 3818 __svm_sev_es_vcpu_run(vmcb_pa);
16809ecd 3819 } else {
e79b91bb
MR
3820 struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);
3821
d1788191
SC
3822 /*
3823 * Use a single vmcb (vmcb01 because it's always valid) for
3824 * context switching guest state via VMLOAD/VMSAVE, that way
3825 * the state doesn't need to be copied between vmcb01 and
3826 * vmcb02 when switching vmcbs for nested virtualization.
3827 */
cc3ed80a 3828 vmload(svm->vmcb01.pa);
d1788191 3829 __svm_vcpu_run(vmcb_pa, (unsigned long *)&vcpu->arch.regs);
cc3ed80a 3830 vmsave(svm->vmcb01.pa);
135961e0 3831
e79b91bb 3832 vmload(__sme_page_pa(sd->save_area));
16809ecd 3833 }
135961e0 3834
bc908e09 3835 kvm_guest_exit_irqoff();
135961e0
TG
3836}
3837
b95273f1 3838static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
6aa8b732 3839{
a2fa3e9f 3840 struct vcpu_svm *svm = to_svm(vcpu);
d9e368d6 3841
d95df951
LB
3842 trace_kvm_entry(vcpu);
3843
2041a06a
JR
3844 svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
3845 svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
3846 svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
3847
a12713c2
LP
3848 /*
3849 * Disable singlestep if we're injecting an interrupt/exception.
3850 * We don't want our modified rflags to be pushed on the stack where
3851 * we might not be able to easily reset them if we disabled NMI
3852 * singlestep later.
3853 */
3854 if (svm->nmi_singlestep && svm->vmcb->control.event_inj) {
3855 /*
3856 * Event injection happens before external interrupts cause a
3857 * vmexit and interrupts are disabled here, so smp_send_reschedule
3858 * is enough to force an immediate vmexit.
3859 */
3860 disable_nmi_singlestep(svm);
3861 smp_send_reschedule(vcpu->cpu);
3862 }
3863
63129754 3864 pre_svm_run(vcpu);
6aa8b732 3865
649d6864
JR
3866 sync_lapic_to_cr8(vcpu);
3867
7e8e6eed
CA
3868 if (unlikely(svm->asid != svm->vmcb->control.asid)) {
3869 svm->vmcb->control.asid = svm->asid;
3870 vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
3871 }
cda0ffdd 3872 svm->vmcb->save.cr2 = vcpu->arch.cr2;
6aa8b732 3873
1183646a
VP
3874 svm_hv_update_vp_id(svm->vmcb, vcpu);
3875
d67668e9
PB
3876 /*
3877 * Run with all-zero DR6 unless needed, so that we can get the exact cause
3878 * of a #DB.
3879 */
63129754 3880 if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
d67668e9
PB
3881 svm_set_dr6(svm, vcpu->arch.dr6);
3882 else
9a3ecd5e 3883 svm_set_dr6(svm, DR6_ACTIVE_LOW);
d67668e9 3884
04d2cc77 3885 clgi();
139a12cf 3886 kvm_load_guest_xsave_state(vcpu);
04d2cc77 3887
010fd37f 3888 kvm_wait_lapic_expire(vcpu);
b6c4bc65 3889
b2ac58f9
KA
3890 /*
3891 * If this vCPU has touched SPEC_CTRL, restore the guest's value if
3892 * it's non-zero. Since vmentry is serialising on affected CPUs, there
3893 * is no need to worry about the conditional branch over the wrmsr
3894 * being speculatively taken.
3895 */
d00b99c5
BM
3896 if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL))
3897 x86_spec_ctrl_set_guest(svm->spec_ctrl, svm->virt_spec_ctrl);
b2ac58f9 3898
63129754 3899 svm_vcpu_enter_exit(vcpu);
15e6c22f 3900
b2ac58f9
KA
3901 /*
3902 * We do not use IBRS in the kernel. If this vCPU has used the
3903 * SPEC_CTRL MSR it may have left it on; save the value and
3904 * turn it off. This is much more efficient than blindly adding
3905 * it to the atomic save/restore list. Especially as the former
3906 * (Saving guest MSRs on vmexit) doesn't even exist in KVM.
3907 *
3908 * For non-nested case:
3909 * If the L01 MSR bitmap does not intercept the MSR, then we need to
3910 * save it.
3911 *
3912 * For nested case:
3913 * If the L02 MSR bitmap does not intercept the MSR, then we need to
3914 * save it.
3915 */
d00b99c5
BM
3916 if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL) &&
3917 unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
ecb586bd 3918 svm->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);
b2ac58f9 3919
63129754 3920 if (!sev_es_guest(vcpu->kvm))
16809ecd 3921 reload_tss(vcpu);
6aa8b732 3922
d00b99c5
BM
3923 if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL))
3924 x86_spec_ctrl_restore_host(svm->spec_ctrl, svm->virt_spec_ctrl);
024d83ca 3925
63129754 3926 if (!sev_es_guest(vcpu->kvm)) {
16809ecd
TL
3927 vcpu->arch.cr2 = svm->vmcb->save.cr2;
3928 vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
3929 vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
3930 vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
3931 }
13c34e07 3932
3781c01c 3933 if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
63129754 3934 kvm_before_interrupt(vcpu);
3781c01c 3935
139a12cf 3936 kvm_load_host_xsave_state(vcpu);
3781c01c
JR
3937 stgi();
3938
3939 /* Any pending NMI will happen here */
3940
3941 if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
63129754 3942 kvm_after_interrupt(vcpu);
3781c01c 3943
d7bf8221
JR
3944 sync_cr8_to_lapic(vcpu);
3945
a2fa3e9f 3946 svm->next_rip = 0;
63129754 3947 if (is_guest_mode(vcpu)) {
9e8f0fbf 3948 nested_sync_control_from_vmcb02(svm);
b93af02c
KS
3949
3950 /* Track VMRUNs that have made past consistency checking */
3951 if (svm->nested.nested_run_pending &&
3952 svm->vmcb->control.exit_code != SVM_EXIT_ERR)
3953 ++vcpu->stat.nested_run;
3954
2d8a42be
PB
3955 svm->nested.nested_run_pending = 0;
3956 }
9222be18 3957
38e5e92f 3958 svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
e42c6828 3959 vmcb_mark_all_clean(svm->vmcb);
38e5e92f 3960
631bc487
GN
3961 /* if exit due to PF check for async PF */
3962 if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR)
63129754 3963 vcpu->arch.apf.host_apf_flags =
68fd66f1 3964 kvm_read_and_reset_apf_flags();
631bc487 3965
329675dd
ML
3966 if (npt_enabled)
3967 kvm_register_clear_available(vcpu, VCPU_EXREG_PDPTR);
fe5913e4
JR
3968
3969 /*
3970 * We need to handle MC intercepts here before the vcpu has a chance to
3971 * change the physical cpu
3972 */
3973 if (unlikely(svm->vmcb->control.exit_code ==
3974 SVM_EXIT_EXCP_BASE + MC_VECTOR))
63129754 3975 svm_handle_mce(vcpu);
8d28fec4 3976
63129754 3977 svm_complete_interrupts(vcpu);
4e810adb
WL
3978
3979 if (is_guest_mode(vcpu))
3980 return EXIT_FASTPATH_NONE;
3981
3982 return svm_exit_handlers_fastpath(vcpu);
6aa8b732
AK
3983}
3984
e83bc09c 3985static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
2a40b900 3986 int root_level)
6aa8b732 3987{
a2fa3e9f 3988 struct vcpu_svm *svm = to_svm(vcpu);
689f3bf2 3989 unsigned long cr3;
a2fa3e9f 3990
689f3bf2 3991 if (npt_enabled) {
4a98623d 3992 svm->vmcb->control.nested_cr3 = __sme_set(root_hpa);
06e7852c 3993 vmcb_mark_dirty(svm->vmcb, VMCB_NPT);
1c97f0a0 3994
1e0c7d40
VP
3995 hv_track_root_tdp(vcpu, root_hpa);
3996
689f3bf2 3997 /* Loading L2's CR3 is handled by enter_svm_guest_mode. */
978ce583
PB
3998 if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
3999 return;
4000 cr3 = vcpu->arch.cr3;
e83bc09c 4001 } else if (vcpu->arch.mmu->shadow_root_level >= PT64_ROOT_4LEVEL) {
4a98623d 4002 cr3 = __sme_set(root_hpa) | kvm_get_active_pcid(vcpu);
e83bc09c
SC
4003 } else {
4004 /* PCID in the guest should be impossible with a 32-bit MMU. */
4005 WARN_ON_ONCE(kvm_get_active_pcid(vcpu));
4006 cr3 = root_hpa;
689f3bf2 4007 }
1c97f0a0 4008
978ce583 4009 svm->vmcb->save.cr3 = cr3;
06e7852c 4010 vmcb_mark_dirty(svm->vmcb, VMCB_CR);
1c97f0a0
JR
4011}
4012
6aa8b732
AK
4013static int is_disabled(void)
4014{
6031a61c
JR
4015 u64 vm_cr;
4016
4017 rdmsrl(MSR_VM_CR, vm_cr);
4018 if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE))
4019 return 1;
4020
6aa8b732
AK
4021 return 0;
4022}
4023
102d8325
IM
4024static void
4025svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
4026{
4027 /*
4028 * Patch in the VMMCALL instruction:
4029 */
4030 hypercall[0] = 0x0f;
4031 hypercall[1] = 0x01;
4032 hypercall[2] = 0xd9;
102d8325
IM
4033}
4034
f257d6dc 4035static int __init svm_check_processor_compat(void)
002c7f7c 4036{
f257d6dc 4037 return 0;
002c7f7c
YS
4038}
4039
774ead3a
AK
4040static bool svm_cpu_has_accelerated_tpr(void)
4041{
4042 return false;
4043}
4044
5719455f
TL
4045/*
4046 * The kvm parameter can be NULL (module initialization, or invocation before
4047 * VM creation). Be sure to check the kvm parameter before using it.
4048 */
4049static bool svm_has_emulated_msr(struct kvm *kvm, u32 index)
6d396b55 4050{
e87555e5
VK
4051 switch (index) {
4052 case MSR_IA32_MCG_EXT_CTL:
95c5c7c7 4053 case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
e87555e5 4054 return false;
5719455f
TL
4055 case MSR_IA32_SMBASE:
4056 /* SEV-ES guests do not support SMM, so report false */
4057 if (kvm && sev_es_guest(kvm))
4058 return false;
4059 break;
e87555e5
VK
4060 default:
4061 break;
4062 }
4063
6d396b55
PB
4064 return true;
4065}
4066
fc07e76a
PB
4067static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
4068{
4069 return 0;
4070}
4071
7c1b761b 4072static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
0e851880 4073{
6092d3d3 4074 struct vcpu_svm *svm = to_svm(vcpu);
96308b06 4075 struct kvm_cpuid_entry2 *best;
6092d3d3 4076
7204160e 4077 vcpu->arch.xsaves_enabled = guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
96be4e06 4078 boot_cpu_has(X86_FEATURE_XSAVE) &&
7204160e
AL
4079 boot_cpu_has(X86_FEATURE_XSAVES);
4080
6092d3d3 4081 /* Update nrips enabled cache */
4eb87460 4082 svm->nrips_enabled = kvm_cpu_cap_has(X86_FEATURE_NRIPS) &&
63129754 4083 guest_cpuid_has(vcpu, X86_FEATURE_NRIPS);
46781eae 4084
5228eb96
ML
4085 svm->tsc_scaling_enabled = tsc_scaling && guest_cpuid_has(vcpu, X86_FEATURE_TSCRATEMSR);
4086
3b195ac9 4087 svm_recalc_instruction_intercepts(vcpu, svm);
4407a797 4088
96308b06
BM
4089 /* For sev guests, the memory encryption bit is not reserved in CR3. */
4090 if (sev_guest(vcpu->kvm)) {
4091 best = kvm_find_cpuid_entry(vcpu, 0x8000001F, 0);
4092 if (best)
ca29e145 4093 vcpu->arch.reserved_gpa_bits &= ~(1UL << (best->ebx & 0x3f));
96308b06
BM
4094 }
4095
adc2a237
ML
4096 if (kvm_vcpu_apicv_active(vcpu)) {
4097 /*
4098 * AVIC does not work with an x2APIC mode guest. If the X2APIC feature
4099 * is exposed to the guest, disable AVIC.
4100 */
4101 if (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC))
4102 kvm_request_apicv_update(vcpu->kvm, false,
4103 APICV_INHIBIT_REASON_X2APIC);
46781eae 4104
adc2a237
ML
4105 /*
4106 * Currently, AVIC does not work with nested virtualization.
4107 * So, we disable AVIC when cpuid for SVM is set in the L1 guest.
4108 */
4109 if (nested && guest_cpuid_has(vcpu, X86_FEATURE_SVM))
4110 kvm_request_apicv_update(vcpu->kvm, false,
4111 APICV_INHIBIT_REASON_NESTED);
4112 }
36e8194d 4113 init_vmcb_after_set_cpuid(vcpu);
0e851880
SY
4114}
4115
f5f48ee1
SY
4116static bool svm_has_wbinvd_exit(void)
4117{
4118 return true;
4119}
4120
8061252e 4121#define PRE_EX(exit) { .exit_code = (exit), \
40e19b51 4122 .stage = X86_ICPT_PRE_EXCEPT, }
cfec82cb 4123#define POST_EX(exit) { .exit_code = (exit), \
40e19b51 4124 .stage = X86_ICPT_POST_EXCEPT, }
d7eb8203 4125#define POST_MEM(exit) { .exit_code = (exit), \
40e19b51 4126 .stage = X86_ICPT_POST_MEMACCESS, }
cfec82cb 4127
09941fbb 4128static const struct __x86_intercept {
cfec82cb
JR
4129 u32 exit_code;
4130 enum x86_intercept_stage stage;
cfec82cb
JR
4131} x86_intercept_map[] = {
4132 [x86_intercept_cr_read] = POST_EX(SVM_EXIT_READ_CR0),
4133 [x86_intercept_cr_write] = POST_EX(SVM_EXIT_WRITE_CR0),
4134 [x86_intercept_clts] = POST_EX(SVM_EXIT_WRITE_CR0),
4135 [x86_intercept_lmsw] = POST_EX(SVM_EXIT_WRITE_CR0),
4136 [x86_intercept_smsw] = POST_EX(SVM_EXIT_READ_CR0),
3b88e41a
JR
4137 [x86_intercept_dr_read] = POST_EX(SVM_EXIT_READ_DR0),
4138 [x86_intercept_dr_write] = POST_EX(SVM_EXIT_WRITE_DR0),
dee6bb70
JR
4139 [x86_intercept_sldt] = POST_EX(SVM_EXIT_LDTR_READ),
4140 [x86_intercept_str] = POST_EX(SVM_EXIT_TR_READ),
4141 [x86_intercept_lldt] = POST_EX(SVM_EXIT_LDTR_WRITE),
4142 [x86_intercept_ltr] = POST_EX(SVM_EXIT_TR_WRITE),
4143 [x86_intercept_sgdt] = POST_EX(SVM_EXIT_GDTR_READ),
4144 [x86_intercept_sidt] = POST_EX(SVM_EXIT_IDTR_READ),
4145 [x86_intercept_lgdt] = POST_EX(SVM_EXIT_GDTR_WRITE),
4146 [x86_intercept_lidt] = POST_EX(SVM_EXIT_IDTR_WRITE),
01de8b09
JR
4147 [x86_intercept_vmrun] = POST_EX(SVM_EXIT_VMRUN),
4148 [x86_intercept_vmmcall] = POST_EX(SVM_EXIT_VMMCALL),
4149 [x86_intercept_vmload] = POST_EX(SVM_EXIT_VMLOAD),
4150 [x86_intercept_vmsave] = POST_EX(SVM_EXIT_VMSAVE),
4151 [x86_intercept_stgi] = POST_EX(SVM_EXIT_STGI),
4152 [x86_intercept_clgi] = POST_EX(SVM_EXIT_CLGI),
4153 [x86_intercept_skinit] = POST_EX(SVM_EXIT_SKINIT),
4154 [x86_intercept_invlpga] = POST_EX(SVM_EXIT_INVLPGA),
d7eb8203
JR
4155 [x86_intercept_rdtscp] = POST_EX(SVM_EXIT_RDTSCP),
4156 [x86_intercept_monitor] = POST_MEM(SVM_EXIT_MONITOR),
4157 [x86_intercept_mwait] = POST_EX(SVM_EXIT_MWAIT),
8061252e
JR
4158 [x86_intercept_invlpg] = POST_EX(SVM_EXIT_INVLPG),
4159 [x86_intercept_invd] = POST_EX(SVM_EXIT_INVD),
4160 [x86_intercept_wbinvd] = POST_EX(SVM_EXIT_WBINVD),
4161 [x86_intercept_wrmsr] = POST_EX(SVM_EXIT_MSR),
4162 [x86_intercept_rdtsc] = POST_EX(SVM_EXIT_RDTSC),
4163 [x86_intercept_rdmsr] = POST_EX(SVM_EXIT_MSR),
4164 [x86_intercept_rdpmc] = POST_EX(SVM_EXIT_RDPMC),
4165 [x86_intercept_cpuid] = PRE_EX(SVM_EXIT_CPUID),
4166 [x86_intercept_rsm] = PRE_EX(SVM_EXIT_RSM),
bf608f88
JR
4167 [x86_intercept_pause] = PRE_EX(SVM_EXIT_PAUSE),
4168 [x86_intercept_pushf] = PRE_EX(SVM_EXIT_PUSHF),
4169 [x86_intercept_popf] = PRE_EX(SVM_EXIT_POPF),
4170 [x86_intercept_intn] = PRE_EX(SVM_EXIT_SWINT),
4171 [x86_intercept_iret] = PRE_EX(SVM_EXIT_IRET),
4172 [x86_intercept_icebp] = PRE_EX(SVM_EXIT_ICEBP),
4173 [x86_intercept_hlt] = POST_EX(SVM_EXIT_HLT),
f6511935
JR
4174 [x86_intercept_in] = POST_EX(SVM_EXIT_IOIO),
4175 [x86_intercept_ins] = POST_EX(SVM_EXIT_IOIO),
4176 [x86_intercept_out] = POST_EX(SVM_EXIT_IOIO),
4177 [x86_intercept_outs] = POST_EX(SVM_EXIT_IOIO),
02d4160f 4178 [x86_intercept_xsetbv] = PRE_EX(SVM_EXIT_XSETBV),
cfec82cb
JR
4179};
4180
8061252e 4181#undef PRE_EX
cfec82cb 4182#undef POST_EX
d7eb8203 4183#undef POST_MEM
cfec82cb 4184
8a76d7f2
JR
4185static int svm_check_intercept(struct kvm_vcpu *vcpu,
4186 struct x86_instruction_info *info,
21f1b8f2
SC
4187 enum x86_intercept_stage stage,
4188 struct x86_exception *exception)
8a76d7f2 4189{
cfec82cb
JR
4190 struct vcpu_svm *svm = to_svm(vcpu);
4191 int vmexit, ret = X86EMUL_CONTINUE;
4192 struct __x86_intercept icpt_info;
4193 struct vmcb *vmcb = svm->vmcb;
4194
4195 if (info->intercept >= ARRAY_SIZE(x86_intercept_map))
4196 goto out;
4197
4198 icpt_info = x86_intercept_map[info->intercept];
4199
40e19b51 4200 if (stage != icpt_info.stage)
cfec82cb
JR
4201 goto out;
4202
4203 switch (icpt_info.exit_code) {
4204 case SVM_EXIT_READ_CR0:
4205 if (info->intercept == x86_intercept_cr_read)
4206 icpt_info.exit_code += info->modrm_reg;
4207 break;
4208 case SVM_EXIT_WRITE_CR0: {
4209 unsigned long cr0, val;
cfec82cb
JR
4210
4211 if (info->intercept == x86_intercept_cr_write)
4212 icpt_info.exit_code += info->modrm_reg;
4213
62baf44c
JK
4214 if (icpt_info.exit_code != SVM_EXIT_WRITE_CR0 ||
4215 info->intercept == x86_intercept_clts)
cfec82cb
JR
4216 break;
4217
c62e2e94
BM
4218 if (!(vmcb_is_intercept(&svm->nested.ctl,
4219 INTERCEPT_SELECTIVE_CR0)))
cfec82cb
JR
4220 break;
4221
4222 cr0 = vcpu->arch.cr0 & ~SVM_CR0_SELECTIVE_MASK;
4223 val = info->src_val & ~SVM_CR0_SELECTIVE_MASK;
4224
4225 if (info->intercept == x86_intercept_lmsw) {
4226 cr0 &= 0xfUL;
4227 val &= 0xfUL;
4228 /* lmsw can't clear PE - catch this here */
4229 if (cr0 & X86_CR0_PE)
4230 val |= X86_CR0_PE;
4231 }
4232
4233 if (cr0 ^ val)
4234 icpt_info.exit_code = SVM_EXIT_CR0_SEL_WRITE;
4235
4236 break;
4237 }
3b88e41a
JR
4238 case SVM_EXIT_READ_DR0:
4239 case SVM_EXIT_WRITE_DR0:
4240 icpt_info.exit_code += info->modrm_reg;
4241 break;
8061252e
JR
4242 case SVM_EXIT_MSR:
4243 if (info->intercept == x86_intercept_wrmsr)
4244 vmcb->control.exit_info_1 = 1;
4245 else
4246 vmcb->control.exit_info_1 = 0;
4247 break;
bf608f88
JR
4248 case SVM_EXIT_PAUSE:
4249 /*
4250 * We get this for NOP only, but pause
4251 * is rep not, check this here
4252 */
4253 if (info->rep_prefix != REPE_PREFIX)
4254 goto out;
49a8afca 4255 break;
f6511935
JR
4256 case SVM_EXIT_IOIO: {
4257 u64 exit_info;
4258 u32 bytes;
4259
f6511935
JR
4260 if (info->intercept == x86_intercept_in ||
4261 info->intercept == x86_intercept_ins) {
6cbc5f5a
JK
4262 exit_info = ((info->src_val & 0xffff) << 16) |
4263 SVM_IOIO_TYPE_MASK;
f6511935 4264 bytes = info->dst_bytes;
6493f157 4265 } else {
6cbc5f5a 4266 exit_info = (info->dst_val & 0xffff) << 16;
6493f157 4267 bytes = info->src_bytes;
f6511935
JR
4268 }
4269
4270 if (info->intercept == x86_intercept_outs ||
4271 info->intercept == x86_intercept_ins)
4272 exit_info |= SVM_IOIO_STR_MASK;
4273
4274 if (info->rep_prefix)
4275 exit_info |= SVM_IOIO_REP_MASK;
4276
4277 bytes = min(bytes, 4u);
4278
4279 exit_info |= bytes << SVM_IOIO_SIZE_SHIFT;
4280
4281 exit_info |= (u32)info->ad_bytes << (SVM_IOIO_ASIZE_SHIFT - 1);
4282
4283 vmcb->control.exit_info_1 = exit_info;
4284 vmcb->control.exit_info_2 = info->next_rip;
4285
4286 break;
4287 }
cfec82cb
JR
4288 default:
4289 break;
4290 }
4291
f104765b
BD
4292 /* TODO: Advertise NRIPS to guest hypervisor unconditionally */
4293 if (static_cpu_has(X86_FEATURE_NRIPS))
4294 vmcb->control.next_rip = info->next_rip;
cfec82cb
JR
4295 vmcb->control.exit_code = icpt_info.exit_code;
4296 vmexit = nested_svm_exit_handled(svm);
4297
4298 ret = (vmexit == NESTED_EXIT_DONE) ? X86EMUL_INTERCEPTED
4299 : X86EMUL_CONTINUE;
4300
4301out:
4302 return ret;
8a76d7f2
JR
4303}
4304
a9ab13ff 4305static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu)
a547c6db 4306{
a547c6db
YZ
4307}
4308
ae97a3b8
RK
4309static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu)
4310{
830f01b0 4311 if (!kvm_pause_in_guest(vcpu->kvm))
8566ac8b 4312 shrink_ple_window(vcpu);
ae97a3b8
RK
4313}
4314
74f16909
BP
4315static void svm_setup_mce(struct kvm_vcpu *vcpu)
4316{
4317 /* [63:9] are reserved. */
4318 vcpu->arch.mcg_cap &= 0x1ff;
4319}
4320
cae96af1 4321bool svm_smi_blocked(struct kvm_vcpu *vcpu)
72d7b374 4322{
05cade71
LP
4323 struct vcpu_svm *svm = to_svm(vcpu);
4324
4325 /* Per APM Vol.2 15.22.2 "Response to SMI" */
4326 if (!gif_set(svm))
cae96af1
PB
4327 return true;
4328
4329 return is_smm(vcpu);
4330}
4331
c9d40913 4332static int svm_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
cae96af1
PB
4333{
4334 struct vcpu_svm *svm = to_svm(vcpu);
4335 if (svm->nested.nested_run_pending)
c9d40913 4336 return -EBUSY;
05cade71 4337
c300ab9f
PB
4338 /* An SMI must not be injected into L2 if it's supposed to VM-Exit. */
4339 if (for_injection && is_guest_mode(vcpu) && nested_exit_on_smi(svm))
c9d40913 4340 return -EBUSY;
c300ab9f 4341
cae96af1 4342 return !svm_smi_blocked(vcpu);
72d7b374
LP
4343}
4344
ecc513e5 4345static int svm_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
0234bf88 4346{
05cade71 4347 struct vcpu_svm *svm = to_svm(vcpu);
37be407b 4348 struct kvm_host_map map_save;
05cade71
LP
4349 int ret;
4350
136a55c0
ML
4351 if (!is_guest_mode(vcpu))
4352 return 0;
05cade71 4353
136a55c0
ML
4354 /* FED8h - SVM Guest */
4355 put_smstate(u64, smstate, 0x7ed8, 1);
4356 /* FEE0h - SVM Guest VMCB Physical Address */
4357 put_smstate(u64, smstate, 0x7ee0, svm->nested.vmcb12_gpa);
05cade71 4358
136a55c0
ML
4359 svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
4360 svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
4361 svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
37be407b 4362
136a55c0
ML
4363 ret = nested_svm_vmexit(svm);
4364 if (ret)
4365 return ret;
4366
4367 /*
4368 * KVM uses VMCB01 to store L1 host state while L2 runs but
4369 * VMCB01 is going to be used during SMM and thus the state will
4370 * be lost. Temporary save non-VMLOAD/VMSAVE state to the host save
4371 * area pointed to by MSR_VM_HSAVE_PA. APM guarantees that the
4372 * format of the area is identical to guest save area offsetted
4373 * by 0x400 (matches the offset of 'struct vmcb_save_area'
4374 * within 'struct vmcb'). Note: HSAVE area may also be used by
4375 * L1 hypervisor to save additional host context (e.g. KVM does
4376 * that, see svm_prepare_guest_switch()) which must be
4377 * preserved.
4378 */
4379 if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr),
4380 &map_save) == -EINVAL)
4381 return 1;
37be407b 4382
136a55c0 4383 BUILD_BUG_ON(offsetof(struct vmcb, save) != 0x400);
37be407b 4384
136a55c0
ML
4385 svm_copy_vmrun_state(map_save.hva + 0x400,
4386 &svm->vmcb01.ptr->save);
37be407b 4387
136a55c0 4388 kvm_vcpu_unmap(vcpu, &map_save, true);
0234bf88
LP
4389 return 0;
4390}
4391
ecc513e5 4392static int svm_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
0234bf88 4393{
05cade71 4394 struct vcpu_svm *svm = to_svm(vcpu);
37be407b 4395 struct kvm_host_map map, map_save;
136a55c0
ML
4396 u64 saved_efer, vmcb12_gpa;
4397 struct vmcb *vmcb12;
4398 int ret;
05cade71 4399
136a55c0
ML
4400 if (!guest_cpuid_has(vcpu, X86_FEATURE_LM))
4401 return 0;
05cade71 4402
136a55c0
ML
4403 /* Non-zero if SMI arrived while vCPU was in guest mode. */
4404 if (!GET_SMSTATE(u64, smstate, 0x7ed8))
4405 return 0;
3ebb5d26 4406
136a55c0
ML
4407 if (!guest_cpuid_has(vcpu, X86_FEATURE_SVM))
4408 return 1;
3ebb5d26 4409
136a55c0
ML
4410 saved_efer = GET_SMSTATE(u64, smstate, 0x7ed0);
4411 if (!(saved_efer & EFER_SVME))
4412 return 1;
3ebb5d26 4413
136a55c0
ML
4414 vmcb12_gpa = GET_SMSTATE(u64, smstate, 0x7ee0);
4415 if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcb12_gpa), &map) == -EINVAL)
4416 return 1;
3ebb5d26 4417
136a55c0
ML
4418 ret = 1;
4419 if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), &map_save) == -EINVAL)
4420 goto unmap_map;
37be407b 4421
136a55c0
ML
4422 if (svm_allocate_nested(svm))
4423 goto unmap_save;
37be407b 4424
136a55c0
ML
4425 /*
4426 * Restore L1 host state from L1 HSAVE area as VMCB01 was
4427 * used during SMM (see svm_enter_smm())
4428 */
37be407b 4429
136a55c0 4430 svm_copy_vmrun_state(&svm->vmcb01.ptr->save, map_save.hva + 0x400);
e2e6e449 4431
136a55c0
ML
4432 /*
4433 * Enter the nested guest now
4434 */
59cd9bc5 4435
136a55c0 4436 vmcb12 = map.hva;
7907160d 4437 nested_copy_vmcb_control_to_cache(svm, &vmcb12->control);
f2740a8d 4438 nested_copy_vmcb_save_to_cache(svm, &vmcb12->save);
136a55c0
ML
4439 ret = enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12, false);
4440
4441unmap_save:
4442 kvm_vcpu_unmap(vcpu, &map_save, true);
4443unmap_map:
4444 kvm_vcpu_unmap(vcpu, &map, true);
59cd9bc5 4445 return ret;
0234bf88
LP
4446}
4447
b6a7cc35 4448static void svm_enable_smi_window(struct kvm_vcpu *vcpu)
cc3d967f
LP
4449{
4450 struct vcpu_svm *svm = to_svm(vcpu);
4451
4452 if (!gif_set(svm)) {
4453 if (vgif_enabled(svm))
a284ba56 4454 svm_set_intercept(svm, INTERCEPT_STGI);
cc3d967f 4455 /* STGI will cause a vm exit */
c9d40913
PB
4456 } else {
4457 /* We must be in SMM; RSM will cause a vmexit anyway. */
cc3d967f 4458 }
cc3d967f
LP
4459}
4460
09e3e2a1 4461static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, void *insn, int insn_len)
05d5a486 4462{
09e3e2a1
SC
4463 bool smep, smap, is_user;
4464 unsigned long cr4;
e72436bc 4465
bc624d9f
TL
4466 /*
4467 * When the guest is an SEV-ES guest, emulation is not possible.
4468 */
4469 if (sev_es_guest(vcpu->kvm))
4470 return false;
4471
05d5a486 4472 /*
118154bd
LA
4473 * Detect and workaround Errata 1096 Fam_17h_00_0Fh.
4474 *
4475 * Errata:
4476 * When CPU raise #NPF on guest data access and vCPU CR4.SMAP=1, it is
4477 * possible that CPU microcode implementing DecodeAssist will fail
4478 * to read bytes of instruction which caused #NPF. In this case,
4479 * GuestIntrBytes field of the VMCB on a VMEXIT will incorrectly
4480 * return 0 instead of the correct guest instruction bytes.
4481 *
4482 * This happens because CPU microcode reading instruction bytes
4483 * uses a special opcode which attempts to read data using CPL=0
d9f6e12f 4484 * privileges. The microcode reads CS:RIP and if it hits a SMAP
118154bd
LA
4485 * fault, it gives up and returns no instruction bytes.
4486 *
4487 * Detection:
4488 * We reach here in case CPU supports DecodeAssist, raised #NPF and
4489 * returned 0 in GuestIntrBytes field of the VMCB.
4490 * First, errata can only be triggered in case vCPU CR4.SMAP=1.
4491 * Second, if vCPU CR4.SMEP=1, errata could only be triggered
4492 * in case vCPU CPL==3 (Because otherwise guest would have triggered
4493 * a SMEP fault instead of #NPF).
4494 * Otherwise, vCPU CR4.SMEP=0, errata could be triggered by any vCPU CPL.
4495 * As most guests enable SMAP if they have also enabled SMEP, use above
4496 * logic in order to attempt minimize false-positive of detecting errata
4497 * while still preserving all cases semantic correctness.
4498 *
4499 * Workaround:
4500 * To determine what instruction the guest was executing, the hypervisor
4501 * will have to decode the instruction at the instruction pointer.
05d5a486
SB
4502 *
4503 * In non SEV guest, hypervisor will be able to read the guest
4504 * memory to decode the instruction pointer when insn_len is zero
4505 * so we return true to indicate that decoding is possible.
4506 *
4507 * But in the SEV guest, the guest memory is encrypted with the
4508 * guest specific key and hypervisor will not be able to decode the
4509 * instruction pointer so we will not able to workaround it. Lets
4510 * print the error and request to kill the guest.
4511 */
09e3e2a1
SC
4512 if (likely(!insn || insn_len))
4513 return true;
4514
4515 /*
4516 * If RIP is invalid, go ahead with emulation which will cause an
4517 * internal error exit.
4518 */
4519 if (!kvm_vcpu_gfn_to_memslot(vcpu, kvm_rip_read(vcpu) >> PAGE_SHIFT))
4520 return true;
4521
4522 cr4 = kvm_read_cr4(vcpu);
4523 smep = cr4 & X86_CR4_SMEP;
4524 smap = cr4 & X86_CR4_SMAP;
4525 is_user = svm_get_cpl(vcpu) == 3;
118154bd 4526 if (smap && (!smep || is_user)) {
05d5a486
SB
4527 if (!sev_guest(vcpu->kvm))
4528 return true;
4529
118154bd 4530 pr_err_ratelimited("KVM: SEV Guest triggered AMD Erratum 1096\n");
05d5a486
SB
4531 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
4532 }
4533
4534 return false;
4535}
4536
4b9852f4
LA
4537static bool svm_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
4538{
4539 struct vcpu_svm *svm = to_svm(vcpu);
4540
4541 /*
4542 * TODO: Last condition latch INIT signals on vCPU when
4543 * vCPU is in guest-mode and vmcb12 defines intercept on INIT.
33b22172
PB
4544 * To properly emulate the INIT intercept,
4545 * svm_check_nested_events() should call nested_svm_vmexit()
4546 * if an INIT signal is pending.
4b9852f4
LA
4547 */
4548 return !gif_set(svm) ||
c62e2e94 4549 (vmcb_is_intercept(&svm->vmcb->control, INTERCEPT_INIT));
4b9852f4
LA
4550}
4551
647daca2
TL
4552static void svm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
4553{
4554 if (!sev_es_guest(vcpu->kvm))
4555 return kvm_vcpu_deliver_sipi_vector(vcpu, vector);
4556
4557 sev_vcpu_deliver_sipi_vector(vcpu, vector);
4558}
4559
eaf78265
JR
4560static void svm_vm_destroy(struct kvm *kvm)
4561{
4562 avic_vm_destroy(kvm);
4563 sev_vm_destroy(kvm);
4564}
4565
4566static int svm_vm_init(struct kvm *kvm)
4567{
830f01b0
WL
4568 if (!pause_filter_count || !pause_filter_thresh)
4569 kvm->arch.pause_in_guest = true;
4570
fdf513e3 4571 if (enable_apicv) {
eaf78265
JR
4572 int ret = avic_vm_init(kvm);
4573 if (ret)
4574 return ret;
4575 }
4576
eaf78265
JR
4577 return 0;
4578}
4579
9c14ee21 4580static struct kvm_x86_ops svm_x86_ops __initdata = {
9dadfc4a
SC
4581 .name = "kvm_amd",
4582
dd58f3c9 4583 .hardware_unsetup = svm_hardware_teardown,
6aa8b732
AK
4584 .hardware_enable = svm_hardware_enable,
4585 .hardware_disable = svm_hardware_disable,
774ead3a 4586 .cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr,
bc226f07 4587 .has_emulated_msr = svm_has_emulated_msr,
6aa8b732
AK
4588
4589 .vcpu_create = svm_create_vcpu,
4590 .vcpu_free = svm_free_vcpu,
04d2cc77 4591 .vcpu_reset = svm_vcpu_reset,
6aa8b732 4592
562b6b08 4593 .vm_size = sizeof(struct kvm_svm),
4e19c36f 4594 .vm_init = svm_vm_init,
1654efcb 4595 .vm_destroy = svm_vm_destroy,
44a95dae 4596
04d2cc77 4597 .prepare_guest_switch = svm_prepare_guest_switch,
6aa8b732
AK
4598 .vcpu_load = svm_vcpu_load,
4599 .vcpu_put = svm_vcpu_put,
8221c137
SS
4600 .vcpu_blocking = svm_vcpu_blocking,
4601 .vcpu_unblocking = svm_vcpu_unblocking,
6aa8b732 4602
b6a7cc35 4603 .update_exception_bitmap = svm_update_exception_bitmap,
801e459a 4604 .get_msr_feature = svm_get_msr_feature,
6aa8b732
AK
4605 .get_msr = svm_get_msr,
4606 .set_msr = svm_set_msr,
4607 .get_segment_base = svm_get_segment_base,
4608 .get_segment = svm_get_segment,
4609 .set_segment = svm_set_segment,
2e4d2653 4610 .get_cpl = svm_get_cpl,
1747fb71 4611 .get_cs_db_l_bits = kvm_get_cs_db_l_bits,
6aa8b732 4612 .set_cr0 = svm_set_cr0,
c2fe3cd4 4613 .is_valid_cr4 = svm_is_valid_cr4,
6aa8b732
AK
4614 .set_cr4 = svm_set_cr4,
4615 .set_efer = svm_set_efer,
4616 .get_idt = svm_get_idt,
4617 .set_idt = svm_set_idt,
4618 .get_gdt = svm_get_gdt,
4619 .set_gdt = svm_set_gdt,
020df079 4620 .set_dr7 = svm_set_dr7,
facb0139 4621 .sync_dirty_debug_regs = svm_sync_dirty_debug_regs,
6de4f3ad 4622 .cache_reg = svm_cache_reg,
6aa8b732
AK
4623 .get_rflags = svm_get_rflags,
4624 .set_rflags = svm_set_rflags,
be94f6b7 4625
7780938c 4626 .tlb_flush_all = svm_flush_tlb,
eeeb4f67 4627 .tlb_flush_current = svm_flush_tlb,
faff8758 4628 .tlb_flush_gva = svm_flush_tlb_gva,
72b38320 4629 .tlb_flush_guest = svm_flush_tlb,
6aa8b732 4630
6aa8b732 4631 .run = svm_vcpu_run,
04d2cc77 4632 .handle_exit = handle_exit,
6aa8b732 4633 .skip_emulated_instruction = skip_emulated_instruction,
5ef8acbd 4634 .update_emulated_instruction = NULL,
2809f5d2
GC
4635 .set_interrupt_shadow = svm_set_interrupt_shadow,
4636 .get_interrupt_shadow = svm_get_interrupt_shadow,
102d8325 4637 .patch_hypercall = svm_patch_hypercall,
2a8067f1 4638 .set_irq = svm_set_irq,
95ba8273 4639 .set_nmi = svm_inject_nmi,
298101da 4640 .queue_exception = svm_queue_exception,
b463a6f7 4641 .cancel_injection = svm_cancel_injection,
78646121 4642 .interrupt_allowed = svm_interrupt_allowed,
95ba8273 4643 .nmi_allowed = svm_nmi_allowed,
3cfc3092
JK
4644 .get_nmi_mask = svm_get_nmi_mask,
4645 .set_nmi_mask = svm_set_nmi_mask,
b6a7cc35
JB
4646 .enable_nmi_window = svm_enable_nmi_window,
4647 .enable_irq_window = svm_enable_irq_window,
4648 .update_cr8_intercept = svm_update_cr8_intercept,
8d860bbe 4649 .set_virtual_apic_mode = svm_set_virtual_apic_mode,
d62caabb 4650 .refresh_apicv_exec_ctrl = svm_refresh_apicv_exec_ctrl,
ef8efd7a 4651 .check_apicv_inhibit_reasons = svm_check_apicv_inhibit_reasons,
c7c9c56c 4652 .load_eoi_exitmap = svm_load_eoi_exitmap,
44a95dae
SS
4653 .hwapic_irr_update = svm_hwapic_irr_update,
4654 .hwapic_isr_update = svm_hwapic_isr_update,
be8ca170 4655 .apicv_post_state_restore = avic_post_state_restore,
cbc94022
IE
4656
4657 .set_tss_addr = svm_set_tss_addr,
2ac52ab8 4658 .set_identity_map_addr = svm_set_identity_map_addr,
4b12f0de 4659 .get_mt_mask = svm_get_mt_mask,
229456fc 4660
586f9607 4661 .get_exit_info = svm_get_exit_info,
586f9607 4662
7c1b761b 4663 .vcpu_after_set_cpuid = svm_vcpu_after_set_cpuid,
4e47c7a6 4664
f5f48ee1 4665 .has_wbinvd_exit = svm_has_wbinvd_exit,
99e3e30a 4666
307a94c7
IS
4667 .get_l2_tsc_offset = svm_get_l2_tsc_offset,
4668 .get_l2_tsc_multiplier = svm_get_l2_tsc_multiplier,
edcfe540 4669 .write_tsc_offset = svm_write_tsc_offset,
1ab9287a 4670 .write_tsc_multiplier = svm_write_tsc_multiplier,
1c97f0a0 4671
727a7e27 4672 .load_mmu_pgd = svm_load_mmu_pgd,
8a76d7f2
JR
4673
4674 .check_intercept = svm_check_intercept,
95b5a48c 4675 .handle_exit_irqoff = svm_handle_exit_irqoff,
ae97a3b8 4676
d264ee0c
SC
4677 .request_immediate_exit = __kvm_request_immediate_exit,
4678
ae97a3b8 4679 .sched_in = svm_sched_in,
25462f7f
WH
4680
4681 .pmu_ops = &amd_pmu_ops,
33b22172
PB
4682 .nested_ops = &svm_nested_ops,
4683
340d3bc3 4684 .deliver_posted_interrupt = svm_deliver_avic_intr,
17e433b5 4685 .dy_apicv_has_pending_interrupt = svm_dy_apicv_has_pending_interrupt,
411b44ba 4686 .update_pi_irte = svm_update_pi_irte,
74f16909 4687 .setup_mce = svm_setup_mce,
0234bf88 4688
72d7b374 4689 .smi_allowed = svm_smi_allowed,
ecc513e5
SC
4690 .enter_smm = svm_enter_smm,
4691 .leave_smm = svm_leave_smm,
b6a7cc35 4692 .enable_smi_window = svm_enable_smi_window,
1654efcb
BS
4693
4694 .mem_enc_op = svm_mem_enc_op,
1e80fdc0
BS
4695 .mem_enc_reg_region = svm_register_enc_region,
4696 .mem_enc_unreg_region = svm_unregister_enc_region,
57b119da 4697
54526d1f 4698 .vm_copy_enc_context_from = svm_vm_copy_asid_from,
b5663931 4699 .vm_move_enc_context_from = svm_vm_migrate_from,
54526d1f 4700
09e3e2a1 4701 .can_emulate_instruction = svm_can_emulate_instruction,
4b9852f4
LA
4702
4703 .apic_init_signal_blocked = svm_apic_init_signal_blocked,
fd6fa73d
AG
4704
4705 .msr_filter_changed = svm_msr_filter_changed,
f1c6366e 4706 .complete_emulated_msr = svm_complete_emulated_msr,
647daca2
TL
4707
4708 .vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector,
6aa8b732
AK
4709};
4710
d008dfdb
SC
4711static struct kvm_x86_init_ops svm_init_ops __initdata = {
4712 .cpu_has_kvm_support = has_svm,
4713 .disabled_by_bios = is_disabled,
4714 .hardware_setup = svm_hardware_setup,
4715 .check_processor_compatibility = svm_check_processor_compat,
4716
4717 .runtime_ops = &svm_x86_ops,
6aa8b732
AK
4718};
4719
4720static int __init svm_init(void)
4721{
d07f46f9
TL
4722 __unused_size_checks();
4723
d008dfdb 4724 return kvm_init(&svm_init_ops, sizeof(struct vcpu_svm),
0ee75bea 4725 __alignof__(struct vcpu_svm), THIS_MODULE);
6aa8b732
AK
4726}
4727
4728static void __exit svm_exit(void)
4729{
cb498ea2 4730 kvm_exit();
6aa8b732
AK
4731}
4732
4733module_init(svm_init)
4734module_exit(svm_exit)