KVM: SVM: Stuff next_rip on emulated INT3 injection if NRIPS is supported
[linux-2.6-block.git] / arch / x86 / kvm / svm / svm.c
CommitLineData
44a95dae
SS
1#define pr_fmt(fmt) "SVM: " fmt
2
edf88417
AK
3#include <linux/kvm_host.h>
4
85f455f7 5#include "irq.h"
1d737c8a 6#include "mmu.h"
5fdbf976 7#include "kvm_cache_regs.h"
fe4c7b19 8#include "x86.h"
66f7b72e 9#include "cpuid.h"
25462f7f 10#include "pmu.h"
e495606d 11
6aa8b732 12#include <linux/module.h>
ae759544 13#include <linux/mod_devicetable.h>
9d8f549d 14#include <linux/kernel.h>
6aa8b732
AK
15#include <linux/vmalloc.h>
16#include <linux/highmem.h>
ef0f6496 17#include <linux/amd-iommu.h>
e8edc6e0 18#include <linux/sched.h>
af658dca 19#include <linux/trace_events.h>
5a0e3ad6 20#include <linux/slab.h>
5881f737 21#include <linux/hashtable.h>
00089c04 22#include <linux/objtool.h>
e9df0942 23#include <linux/psp-sev.h>
1654efcb 24#include <linux/file.h>
89c50580
BS
25#include <linux/pagemap.h>
26#include <linux/swap.h>
33af3a7e 27#include <linux/rwsem.h>
4d96f910 28#include <linux/cc_platform.h>
6aa8b732 29
8221c137 30#include <asm/apic.h>
1018faa6 31#include <asm/perf_event.h>
67ec6607 32#include <asm/tlbflush.h>
e495606d 33#include <asm/desc.h>
facb0139 34#include <asm/debugreg.h>
631bc487 35#include <asm/kvm_para.h>
411b44ba 36#include <asm/irq_remapping.h>
28a27752 37#include <asm/spec-ctrl.h>
ba5bade4 38#include <asm/cpu_device_id.h>
f1c6366e 39#include <asm/traps.h>
d69c1382 40#include <asm/fpu/api.h>
6aa8b732 41
63d1142f 42#include <asm/virtext.h>
229456fc 43#include "trace.h"
63d1142f 44
883b0a91 45#include "svm.h"
35a78319 46#include "svm_ops.h"
883b0a91 47
1e0c7d40
VP
48#include "kvm_onhyperv.h"
49#include "svm_onhyperv.h"
50
6aa8b732
AK
51MODULE_AUTHOR("Qumranet");
52MODULE_LICENSE("GPL");
53
575b255c 54#ifdef MODULE
ae759544 55static const struct x86_cpu_id svm_cpu_id[] = {
320debe5 56 X86_MATCH_FEATURE(X86_FEATURE_SVM, NULL),
ae759544
JT
57 {}
58};
59MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);
575b255c 60#endif
ae759544 61
6aa8b732
AK
62#define SEG_TYPE_LDT 2
63#define SEG_TYPE_BUSY_TSS16 3
64
67ec6607
JR
65static bool erratum_383_found __read_mostly;
66
883b0a91 67u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
323c3d80 68
2b036c6b
BO
69/*
70 * Set osvw_len to higher value when updated Revision Guides
71 * are published and we know what the new status bits are
72 */
73static uint64_t osvw_len = 4, osvw_status;
74
fbc0db76 75static DEFINE_PER_CPU(u64, current_tsc_ratio);
fbc0db76 76
09941fbb 77static const struct svm_direct_access_msrs {
ac72a9b7 78 u32 index; /* Index of the MSR */
376c6d28 79 bool always; /* True if intercept is initially cleared */
fd6fa73d 80} direct_access_msrs[MAX_DIRECT_ACCESS_MSRS] = {
8c06585d 81 { .index = MSR_STAR, .always = true },
ac72a9b7 82 { .index = MSR_IA32_SYSENTER_CS, .always = true },
adc2a237
ML
83 { .index = MSR_IA32_SYSENTER_EIP, .always = false },
84 { .index = MSR_IA32_SYSENTER_ESP, .always = false },
ac72a9b7
JR
85#ifdef CONFIG_X86_64
86 { .index = MSR_GS_BASE, .always = true },
87 { .index = MSR_FS_BASE, .always = true },
88 { .index = MSR_KERNEL_GS_BASE, .always = true },
89 { .index = MSR_LSTAR, .always = true },
90 { .index = MSR_CSTAR, .always = true },
91 { .index = MSR_SYSCALL_MASK, .always = true },
92#endif
b2ac58f9 93 { .index = MSR_IA32_SPEC_CTRL, .always = false },
15d45071 94 { .index = MSR_IA32_PRED_CMD, .always = false },
ac72a9b7
JR
95 { .index = MSR_IA32_LASTBRANCHFROMIP, .always = false },
96 { .index = MSR_IA32_LASTBRANCHTOIP, .always = false },
97 { .index = MSR_IA32_LASTINTFROMIP, .always = false },
98 { .index = MSR_IA32_LASTINTTOIP, .always = false },
376c6d28
TL
99 { .index = MSR_EFER, .always = false },
100 { .index = MSR_IA32_CR_PAT, .always = false },
101 { .index = MSR_AMD64_SEV_ES_GHCB, .always = true },
296d5a17 102 { .index = MSR_TSC_AUX, .always = false },
ac72a9b7 103 { .index = MSR_INVALID, .always = false },
6c8166a7
AK
104};
105
8566ac8b
BM
106/*
107 * These 2 parameters are used to config the controls for Pause-Loop Exiting:
108 * pause_filter_count: On processors that support Pause filtering(indicated
109 * by CPUID Fn8000_000A_EDX), the VMCB provides a 16 bit pause filter
110 * count value. On VMRUN this value is loaded into an internal counter.
111 * Each time a pause instruction is executed, this counter is decremented
112 * until it reaches zero at which time a #VMEXIT is generated if pause
113 * intercept is enabled. Refer to AMD APM Vol 2 Section 15.14.4 Pause
114 * Intercept Filtering for more details.
115 * This also indicate if ple logic enabled.
116 *
117 * pause_filter_thresh: In addition, some processor families support advanced
118 * pause filtering (indicated by CPUID Fn8000_000A_EDX) upper bound on
119 * the amount of time a guest is allowed to execute in a pause loop.
120 * In this mode, a 16-bit pause filter threshold field is added in the
121 * VMCB. The threshold value is a cycle count that is used to reset the
122 * pause counter. As with simple pause filtering, VMRUN loads the pause
123 * count value from VMCB into an internal counter. Then, on each pause
124 * instruction the hardware checks the elapsed number of cycles since
125 * the most recent pause instruction against the pause filter threshold.
126 * If the elapsed cycle count is greater than the pause filter threshold,
127 * then the internal pause count is reloaded from the VMCB and execution
128 * continues. If the elapsed cycle count is less than the pause filter
129 * threshold, then the internal pause count is decremented. If the count
130 * value is less than zero and PAUSE intercept is enabled, a #VMEXIT is
131 * triggered. If advanced pause filtering is supported and pause filter
132 * threshold field is set to zero, the filter will operate in the simpler,
133 * count only mode.
134 */
135
136static unsigned short pause_filter_thresh = KVM_DEFAULT_PLE_GAP;
137module_param(pause_filter_thresh, ushort, 0444);
138
139static unsigned short pause_filter_count = KVM_SVM_DEFAULT_PLE_WINDOW;
140module_param(pause_filter_count, ushort, 0444);
141
142/* Default doubles per-vcpu window every exit. */
143static unsigned short pause_filter_count_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
144module_param(pause_filter_count_grow, ushort, 0444);
145
146/* Default resets per-vcpu window every exit to pause_filter_count. */
147static unsigned short pause_filter_count_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
148module_param(pause_filter_count_shrink, ushort, 0444);
149
150/* Default is to compute the maximum so we can never overflow. */
151static unsigned short pause_filter_count_max = KVM_SVM_DEFAULT_PLE_WINDOW_MAX;
152module_param(pause_filter_count_max, ushort, 0444);
153
99840a75
SC
154/*
155 * Use nested page tables by default. Note, NPT may get forced off by
156 * svm_hardware_setup() if it's unsupported by hardware or the host kernel.
157 */
158bool npt_enabled = true;
159module_param_named(npt, npt_enabled, bool, 0444);
e3da3acd 160
e2358851
DB
161/* allow nested virtualization in KVM/SVM */
162static int nested = true;
236de055
AG
163module_param(nested, int, S_IRUGO);
164
d647eb63
PB
165/* enable/disable Next RIP Save */
166static int nrips = true;
167module_param(nrips, int, 0444);
168
89c8a498
JN
169/* enable/disable Virtual VMLOAD VMSAVE */
170static int vls = true;
171module_param(vls, int, 0444);
172
640bd6e5 173/* enable/disable Virtual GIF */
ea91559b 174int vgif = true;
640bd6e5 175module_param(vgif, int, 0444);
5ea11f2b 176
4c84926e
ML
177/* enable/disable LBR virtualization */
178static int lbrv = true;
179module_param(lbrv, int, 0444);
180
f800650a
ML
181static int tsc_scaling = true;
182module_param(tsc_scaling, int, 0444);
183
fdf513e3
VK
184/*
185 * enable / disable AVIC. Because the defaults differ for APICv
186 * support between VMX and SVM we cannot use module_param_named.
187 */
188static bool avic;
189module_param(avic, bool, 0444);
190
edf72123
ML
191static bool force_avic;
192module_param_unsafe(force_avic, bool, 0444);
193
291bd20d 194bool __read_mostly dump_invalid_vmcb;
6f2f8453
PB
195module_param(dump_invalid_vmcb, bool, 0644);
196
4b639a9f
ML
197
198bool intercept_smi = true;
199module_param(intercept_smi, bool, 0444);
200
201
2e215216 202static bool svm_gp_erratum_intercept = true;
82a11e9c 203
7607b717
BS
204static u8 rsm_ins_bytes[] = "\x0f\xaa";
205
4866d5e3 206static unsigned long iopm_base;
6aa8b732
AK
207
208struct kvm_ldttss_desc {
209 u16 limit0;
210 u16 base0;
e0231715
JR
211 unsigned base1:8, type:5, dpl:2, p:1;
212 unsigned limit1:4, zero0:3, g:1, base2:8;
6aa8b732
AK
213 u32 base3;
214 u32 zero1;
215} __attribute__((packed));
216
eaf78265 217DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);
6aa8b732 218
844d69c2
SC
219/*
220 * Only MSR_TSC_AUX is switched via the user return hook. EFER is switched via
221 * the VMCB, and the SYSCALL/SYSENTER MSRs are handled by VMLOAD/VMSAVE.
222 *
223 * RDTSCP and RDPID are not used in the kernel, specifically to allow KVM to
224 * defer the restoration of TSC_AUX until the CPU returns to userspace.
225 */
0caa0a77 226static int tsc_aux_uret_slot __read_mostly = -1;
844d69c2 227
09941fbb 228static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
6aa8b732 229
9d8f549d 230#define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges)
6aa8b732
AK
231#define MSRS_RANGE_SIZE 2048
232#define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2)
233
883b0a91 234u32 svm_msrpm_offset(u32 msr)
455716fa
JR
235{
236 u32 offset;
237 int i;
238
239 for (i = 0; i < NUM_MSR_MAPS; i++) {
240 if (msr < msrpm_ranges[i] ||
241 msr >= msrpm_ranges[i] + MSRS_IN_RANGE)
242 continue;
243
244 offset = (msr - msrpm_ranges[i]) / 4; /* 4 msrs per u8 */
245 offset += (i * MSRS_RANGE_SIZE); /* add range offset */
246
247 /* Now we have the u8 offset - but need the u32 offset */
248 return offset / 4;
249 }
250
251 /* MSR not in any range */
252 return MSR_INVALID;
253}
254
4d9c83f5
SC
255static void svm_flush_tlb_current(struct kvm_vcpu *vcpu);
256
1af4a119 257static int get_npt_level(void)
4b16184c
JR
258{
259#ifdef CONFIG_X86_64
43e540cc 260 return pgtable_l5_enabled() ? PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
4b16184c
JR
261#else
262 return PT32E_ROOT_LEVEL;
263#endif
264}
265
72f211ec 266int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
6aa8b732 267{
c513f484 268 struct vcpu_svm *svm = to_svm(vcpu);
2fcf4876 269 u64 old_efer = vcpu->arch.efer;
6dc696d4 270 vcpu->arch.efer = efer;
9167ab79
PB
271
272 if (!npt_enabled) {
273 /* Shadow paging assumes NX to be available. */
274 efer |= EFER_NX;
275
276 if (!(efer & EFER_LMA))
277 efer &= ~EFER_LME;
278 }
6aa8b732 279
2fcf4876
ML
280 if ((old_efer & EFER_SVME) != (efer & EFER_SVME)) {
281 if (!(efer & EFER_SVME)) {
f7e57078 282 svm_leave_nested(vcpu);
2fcf4876 283 svm_set_gif(svm, true);
82a11e9c
BD
284 /* #GP intercept is still needed for vmware backdoor */
285 if (!enable_vmware_backdoor)
286 clr_exception_intercept(svm, GP_VECTOR);
2fcf4876
ML
287
288 /*
289 * Free the nested guest state, unless we are in SMM.
290 * In this case we will return to the nested guest
291 * as soon as we leave SMM.
292 */
63129754 293 if (!is_smm(vcpu))
2fcf4876
ML
294 svm_free_nested(svm);
295
296 } else {
297 int ret = svm_allocate_nested(svm);
298
299 if (ret) {
300 vcpu->arch.efer = old_efer;
301 return ret;
302 }
82a11e9c 303
0b0be065
SC
304 /*
305 * Never intercept #GP for SEV guests, KVM can't
306 * decrypt guest memory to workaround the erratum.
307 */
308 if (svm_gp_erratum_intercept && !sev_guest(vcpu->kvm))
82a11e9c 309 set_exception_intercept(svm, GP_VECTOR);
2fcf4876 310 }
c513f484
PB
311 }
312
313 svm->vmcb->save.efer = efer | EFER_SVME;
06e7852c 314 vmcb_mark_dirty(svm->vmcb, VMCB_CR);
72f211ec 315 return 0;
6aa8b732
AK
316}
317
6aa8b732
AK
318static int is_external_interrupt(u32 info)
319{
320 info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID;
321 return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR);
322}
323
37ccdcbe 324static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu)
2809f5d2
GC
325{
326 struct vcpu_svm *svm = to_svm(vcpu);
327 u32 ret = 0;
328
329 if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)
37ccdcbe
PB
330 ret = KVM_X86_SHADOW_INT_STI | KVM_X86_SHADOW_INT_MOV_SS;
331 return ret;
2809f5d2
GC
332}
333
334static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
335{
336 struct vcpu_svm *svm = to_svm(vcpu);
337
338 if (mask == 0)
339 svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
340 else
341 svm->vmcb->control.int_state |= SVM_INTERRUPT_SHADOW_MASK;
342
343}
344
23e5092b 345static int svm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
6aa8b732 346{
a2fa3e9f
GH
347 struct vcpu_svm *svm = to_svm(vcpu);
348
f1c6366e
TL
349 /*
350 * SEV-ES does not expose the next RIP. The RIP update is controlled by
351 * the type of exit and the #VC handler in the guest.
352 */
353 if (sev_es_guest(vcpu->kvm))
354 goto done;
355
d647eb63 356 if (nrips && svm->vmcb->control.next_rip != 0) {
d2922422 357 WARN_ON_ONCE(!static_cpu_has(X86_FEATURE_NRIPS));
6bc31bdc 358 svm->next_rip = svm->vmcb->control.next_rip;
f104765b 359 }
6bc31bdc 360
1957aa63
SC
361 if (!svm->next_rip) {
362 if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP))
363 return 0;
364 } else {
1957aa63
SC
365 kvm_rip_write(vcpu, svm->next_rip);
366 }
f1c6366e
TL
367
368done:
2809f5d2 369 svm_set_interrupt_shadow(vcpu, 0);
f8ea7c60 370
60fc3d02 371 return 1;
6aa8b732
AK
372}
373
cfcd20e5 374static void svm_queue_exception(struct kvm_vcpu *vcpu)
116a4752
JK
375{
376 struct vcpu_svm *svm = to_svm(vcpu);
cfcd20e5
WL
377 unsigned nr = vcpu->arch.exception.nr;
378 bool has_error_code = vcpu->arch.exception.has_error_code;
cfcd20e5 379 u32 error_code = vcpu->arch.exception.error_code;
116a4752 380
63129754 381 kvm_deliver_exception_payload(vcpu);
da998b46 382
d647eb63 383 if (nr == BP_VECTOR && !nrips) {
63129754 384 unsigned long rip, old_rip = kvm_rip_read(vcpu);
66b7138f
JK
385
386 /*
387 * For guest debugging where we have to reinject #BP if some
388 * INT3 is guest-owned:
389 * Emulate nRIP by moving RIP forward. Will fail if injection
390 * raises a fault that is not intercepted. Still better than
391 * failing in all cases.
392 */
23e5092b 393 (void)svm_skip_emulated_instruction(vcpu);
63129754 394 rip = kvm_rip_read(vcpu);
3741aec4
SC
395
396 if (boot_cpu_has(X86_FEATURE_NRIPS))
397 svm->vmcb->control.next_rip = rip;
398
66b7138f
JK
399 svm->int3_rip = rip + svm->vmcb->save.cs.base;
400 svm->int3_injected = rip - old_rip;
401 }
402
116a4752
JK
403 svm->vmcb->control.event_inj = nr
404 | SVM_EVTINJ_VALID
405 | (has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
406 | SVM_EVTINJ_TYPE_EXEPT;
407 svm->vmcb->control.event_inj_err = error_code;
408}
409
67ec6607
JR
410static void svm_init_erratum_383(void)
411{
412 u32 low, high;
413 int err;
414 u64 val;
415
e6ee94d5 416 if (!static_cpu_has_bug(X86_BUG_AMD_TLB_MMATCH))
67ec6607
JR
417 return;
418
419 /* Use _safe variants to not break nested virtualization */
420 val = native_read_msr_safe(MSR_AMD64_DC_CFG, &err);
421 if (err)
422 return;
423
424 val |= (1ULL << 47);
425
426 low = lower_32_bits(val);
427 high = upper_32_bits(val);
428
429 native_write_msr_safe(MSR_AMD64_DC_CFG, low, high);
430
431 erratum_383_found = true;
432}
433
2b036c6b
BO
434static void svm_init_osvw(struct kvm_vcpu *vcpu)
435{
436 /*
437 * Guests should see errata 400 and 415 as fixed (assuming that
438 * HLT and IO instructions are intercepted).
439 */
440 vcpu->arch.osvw.length = (osvw_len >= 3) ? (osvw_len) : 3;
441 vcpu->arch.osvw.status = osvw_status & ~(6ULL);
442
443 /*
444 * By increasing VCPU's osvw.length to 3 we are telling the guest that
445 * all osvw.status bits inside that length, including bit 0 (which is
446 * reserved for erratum 298), are valid. However, if host processor's
447 * osvw_len is 0 then osvw_status[0] carries no information. We need to
448 * be conservative here and therefore we tell the guest that erratum 298
449 * is present (because we really don't know).
450 */
451 if (osvw_len == 0 && boot_cpu_data.x86 == 0x10)
452 vcpu->arch.osvw.status |= 1;
453}
454
6aa8b732
AK
455static int has_svm(void)
456{
63d1142f 457 const char *msg;
6aa8b732 458
63d1142f 459 if (!cpu_has_svm(&msg)) {
ff81ff10 460 printk(KERN_INFO "has_svm: %s\n", msg);
6aa8b732
AK
461 return 0;
462 }
463
4d96f910 464 if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) {
ccd85d90
SC
465 pr_info("KVM is unsupported when running as an SEV guest\n");
466 return 0;
467 }
468
6aa8b732
AK
469 return 1;
470}
471
11d39e8c
ML
472void __svm_write_tsc_multiplier(u64 multiplier)
473{
474 preempt_disable();
475
476 if (multiplier == __this_cpu_read(current_tsc_ratio))
477 goto out;
478
479 wrmsrl(MSR_AMD64_TSC_RATIO, multiplier);
480 __this_cpu_write(current_tsc_ratio, multiplier);
481out:
482 preempt_enable();
483}
484
13a34e06 485static void svm_hardware_disable(void)
6aa8b732 486{
fbc0db76 487 /* Make sure we clean up behind us */
f800650a 488 if (tsc_scaling)
11d39e8c 489 __svm_write_tsc_multiplier(SVM_TSC_RATIO_DEFAULT);
fbc0db76 490
2c8dceeb 491 cpu_svm_disable();
1018faa6
JR
492
493 amd_pmu_disable_virt();
6aa8b732
AK
494}
495
13a34e06 496static int svm_hardware_enable(void)
6aa8b732
AK
497{
498
0fe1e009 499 struct svm_cpu_data *sd;
6aa8b732 500 uint64_t efer;
6aa8b732
AK
501 struct desc_struct *gdt;
502 int me = raw_smp_processor_id();
503
10474ae8
AG
504 rdmsrl(MSR_EFER, efer);
505 if (efer & EFER_SVME)
506 return -EBUSY;
507
6aa8b732 508 if (!has_svm()) {
1f5b77f5 509 pr_err("%s: err EOPNOTSUPP on %d\n", __func__, me);
10474ae8 510 return -EINVAL;
6aa8b732 511 }
0fe1e009 512 sd = per_cpu(svm_data, me);
0fe1e009 513 if (!sd) {
1f5b77f5 514 pr_err("%s: svm_data is NULL on %d\n", __func__, me);
10474ae8 515 return -EINVAL;
6aa8b732
AK
516 }
517
0fe1e009
TH
518 sd->asid_generation = 1;
519 sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
520 sd->next_asid = sd->max_asid + 1;
ed3cd233 521 sd->min_asid = max_sev_asid + 1;
6aa8b732 522
45fc8757 523 gdt = get_current_gdt_rw();
0fe1e009 524 sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
6aa8b732 525
9962d032 526 wrmsrl(MSR_EFER, efer | EFER_SVME);
6aa8b732 527
85ca8be9 528 wrmsrl(MSR_VM_HSAVE_PA, __sme_page_pa(sd->save_area));
10474ae8 529
fbc0db76 530 if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
f800650a
ML
531 /*
532 * Set the default value, even if we don't use TSC scaling
533 * to avoid having stale value in the msr
534 */
11d39e8c 535 __svm_write_tsc_multiplier(SVM_TSC_RATIO_DEFAULT);
fbc0db76
JR
536 }
537
2b036c6b
BO
538
539 /*
540 * Get OSVW bits.
541 *
542 * Note that it is possible to have a system with mixed processor
543 * revisions and therefore different OSVW bits. If bits are not the same
544 * on different processors then choose the worst case (i.e. if erratum
545 * is present on one processor and not on another then assume that the
546 * erratum is present everywhere).
547 */
548 if (cpu_has(&boot_cpu_data, X86_FEATURE_OSVW)) {
549 uint64_t len, status = 0;
550 int err;
551
552 len = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &err);
553 if (!err)
554 status = native_read_msr_safe(MSR_AMD64_OSVW_STATUS,
555 &err);
556
557 if (err)
558 osvw_status = osvw_len = 0;
559 else {
560 if (len < osvw_len)
561 osvw_len = len;
562 osvw_status |= status;
563 osvw_status &= (1ULL << osvw_len) - 1;
564 }
565 } else
566 osvw_status = osvw_len = 0;
567
67ec6607
JR
568 svm_init_erratum_383();
569
1018faa6
JR
570 amd_pmu_enable_virt();
571
10474ae8 572 return 0;
6aa8b732
AK
573}
574
0da1db75
JR
575static void svm_cpu_uninit(int cpu)
576{
a2b2d4bf 577 struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
0da1db75 578
0fe1e009 579 if (!sd)
0da1db75
JR
580 return;
581
a2b2d4bf 582 per_cpu(svm_data, cpu) = NULL;
70cd94e6 583 kfree(sd->sev_vmcbs);
0fe1e009
TH
584 __free_page(sd->save_area);
585 kfree(sd);
0da1db75
JR
586}
587
6aa8b732
AK
588static int svm_cpu_init(int cpu)
589{
0fe1e009 590 struct svm_cpu_data *sd;
b95c221c 591 int ret = -ENOMEM;
6aa8b732 592
0fe1e009
TH
593 sd = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL);
594 if (!sd)
b95c221c 595 return ret;
0fe1e009 596 sd->cpu = cpu;
58356767 597 sd->save_area = alloc_page(GFP_KERNEL | __GFP_ZERO);
0fe1e009 598 if (!sd->save_area)
d80b64ff 599 goto free_cpu_data;
b95c221c 600
b95c221c
SC
601 ret = sev_cpu_init(sd);
602 if (ret)
603 goto free_save_area;
70cd94e6 604
0fe1e009 605 per_cpu(svm_data, cpu) = sd;
6aa8b732
AK
606
607 return 0;
608
d80b64ff
ML
609free_save_area:
610 __free_page(sd->save_area);
611free_cpu_data:
0fe1e009 612 kfree(sd);
b95c221c 613 return ret;
6aa8b732
AK
614
615}
616
fd6fa73d 617static int direct_access_msr_slot(u32 msr)
ac72a9b7 618{
fd6fa73d 619 u32 i;
ac72a9b7
JR
620
621 for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++)
fd6fa73d
AG
622 if (direct_access_msrs[i].index == msr)
623 return i;
ac72a9b7 624
fd6fa73d
AG
625 return -ENOENT;
626}
627
628static void set_shadow_msr_intercept(struct kvm_vcpu *vcpu, u32 msr, int read,
629 int write)
630{
631 struct vcpu_svm *svm = to_svm(vcpu);
632 int slot = direct_access_msr_slot(msr);
633
634 if (slot == -ENOENT)
635 return;
636
637 /* Set the shadow bitmaps to the desired intercept states */
638 if (read)
639 set_bit(slot, svm->shadow_msr_intercept.read);
640 else
641 clear_bit(slot, svm->shadow_msr_intercept.read);
642
643 if (write)
644 set_bit(slot, svm->shadow_msr_intercept.write);
645 else
646 clear_bit(slot, svm->shadow_msr_intercept.write);
ac72a9b7
JR
647}
648
fd6fa73d
AG
649static bool valid_msr_intercept(u32 index)
650{
651 return direct_access_msr_slot(index) != -ENOENT;
ac72a9b7
JR
652}
653
476c9bd8 654static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
b2ac58f9
KA
655{
656 u8 bit_write;
657 unsigned long tmp;
658 u32 offset;
659 u32 *msrpm;
660
661 msrpm = is_guest_mode(vcpu) ? to_svm(vcpu)->nested.msrpm:
662 to_svm(vcpu)->msrpm;
663
664 offset = svm_msrpm_offset(msr);
665 bit_write = 2 * (msr & 0x0f) + 1;
666 tmp = msrpm[offset];
667
668 BUG_ON(offset == MSR_INVALID);
669
670 return !!test_bit(bit_write, &tmp);
671}
672
fd6fa73d
AG
673static void set_msr_interception_bitmap(struct kvm_vcpu *vcpu, u32 *msrpm,
674 u32 msr, int read, int write)
6aa8b732 675{
73c25546 676 struct vcpu_svm *svm = to_svm(vcpu);
455716fa
JR
677 u8 bit_read, bit_write;
678 unsigned long tmp;
679 u32 offset;
6aa8b732 680
ac72a9b7
JR
681 /*
682 * If this warning triggers extend the direct_access_msrs list at the
683 * beginning of the file
684 */
685 WARN_ON(!valid_msr_intercept(msr));
686
fd6fa73d
AG
687 /* Enforce non allowed MSRs to trap */
688 if (read && !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ))
689 read = 0;
690
691 if (write && !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE))
692 write = 0;
693
455716fa
JR
694 offset = svm_msrpm_offset(msr);
695 bit_read = 2 * (msr & 0x0f);
696 bit_write = 2 * (msr & 0x0f) + 1;
697 tmp = msrpm[offset];
698
699 BUG_ON(offset == MSR_INVALID);
700
701 read ? clear_bit(bit_read, &tmp) : set_bit(bit_read, &tmp);
702 write ? clear_bit(bit_write, &tmp) : set_bit(bit_write, &tmp);
703
704 msrpm[offset] = tmp;
c4327f15
VP
705
706 svm_hv_vmcb_dirty_nested_enlightenments(vcpu);
73c25546 707 svm->nested.force_msr_bitmap_recalc = true;
6aa8b732
AK
708}
709
376c6d28
TL
710void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr,
711 int read, int write)
6aa8b732 712{
fd6fa73d
AG
713 set_shadow_msr_intercept(vcpu, msr, read, write);
714 set_msr_interception_bitmap(vcpu, msrpm, msr, read, write);
715}
716
2fcf4876 717u32 *svm_vcpu_alloc_msrpm(void)
6aa8b732 718{
47903dc1
KS
719 unsigned int order = get_order(MSRPM_SIZE);
720 struct page *pages = alloc_pages(GFP_KERNEL_ACCOUNT, order);
476c9bd8 721 u32 *msrpm;
f4c847a9
ML
722
723 if (!pages)
724 return NULL;
6aa8b732 725
f4c847a9 726 msrpm = page_address(pages);
47903dc1 727 memset(msrpm, 0xff, PAGE_SIZE * (1 << order));
f65c229c 728
476c9bd8
AL
729 return msrpm;
730}
731
2fcf4876 732void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm)
476c9bd8
AL
733{
734 int i;
735
ac72a9b7
JR
736 for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
737 if (!direct_access_msrs[i].always)
738 continue;
476c9bd8 739 set_msr_interception(vcpu, msrpm, direct_access_msrs[i].index, 1, 1);
ac72a9b7 740 }
f4c847a9 741}
ac72a9b7 742
2fcf4876
ML
743
744void svm_vcpu_free_msrpm(u32 *msrpm)
f4c847a9 745{
47903dc1 746 __free_pages(virt_to_page(msrpm), get_order(MSRPM_SIZE));
f65c229c
JR
747}
748
fd6fa73d
AG
749static void svm_msr_filter_changed(struct kvm_vcpu *vcpu)
750{
751 struct vcpu_svm *svm = to_svm(vcpu);
752 u32 i;
753
754 /*
755 * Set intercept permissions for all direct access MSRs again. They
756 * will automatically get filtered through the MSR filter, so we are
757 * back in sync after this.
758 */
759 for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
760 u32 msr = direct_access_msrs[i].index;
761 u32 read = test_bit(i, svm->shadow_msr_intercept.read);
762 u32 write = test_bit(i, svm->shadow_msr_intercept.write);
763
764 set_msr_interception_bitmap(vcpu, svm->msrpm, msr, read, write);
ac72a9b7 765 }
f65c229c
JR
766}
767
323c3d80
JR
768static void add_msr_offset(u32 offset)
769{
770 int i;
771
772 for (i = 0; i < MSRPM_OFFSETS; ++i) {
773
774 /* Offset already in list? */
775 if (msrpm_offsets[i] == offset)
bfc733a7 776 return;
323c3d80
JR
777
778 /* Slot used by another offset? */
779 if (msrpm_offsets[i] != MSR_INVALID)
780 continue;
781
782 /* Add offset to list */
783 msrpm_offsets[i] = offset;
784
785 return;
6aa8b732 786 }
323c3d80
JR
787
788 /*
789 * If this BUG triggers the msrpm_offsets table has an overflow. Just
790 * increase MSRPM_OFFSETS in this case.
791 */
bfc733a7 792 BUG();
6aa8b732
AK
793}
794
323c3d80 795static void init_msrpm_offsets(void)
f65c229c 796{
323c3d80 797 int i;
f65c229c 798
323c3d80
JR
799 memset(msrpm_offsets, 0xff, sizeof(msrpm_offsets));
800
801 for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
802 u32 offset;
803
804 offset = svm_msrpm_offset(direct_access_msrs[i].index);
805 BUG_ON(offset == MSR_INVALID);
806
807 add_msr_offset(offset);
808 }
f65c229c
JR
809}
810
1d5a1b58
ML
811void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb)
812{
813 to_vmcb->save.dbgctl = from_vmcb->save.dbgctl;
814 to_vmcb->save.br_from = from_vmcb->save.br_from;
815 to_vmcb->save.br_to = from_vmcb->save.br_to;
816 to_vmcb->save.last_excp_from = from_vmcb->save.last_excp_from;
817 to_vmcb->save.last_excp_to = from_vmcb->save.last_excp_to;
818
819 vmcb_mark_dirty(to_vmcb, VMCB_LBR);
820}
821
476c9bd8 822static void svm_enable_lbrv(struct kvm_vcpu *vcpu)
24e09cbf 823{
476c9bd8 824 struct vcpu_svm *svm = to_svm(vcpu);
24e09cbf 825
0dc92119 826 svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK;
476c9bd8
AL
827 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1);
828 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
829 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
830 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
1d5a1b58
ML
831
832 /* Move the LBR msrs to the vmcb02 so that the guest can see them. */
833 if (is_guest_mode(vcpu))
834 svm_copy_lbrs(svm->vmcb, svm->vmcb01.ptr);
24e09cbf
JR
835}
836
476c9bd8 837static void svm_disable_lbrv(struct kvm_vcpu *vcpu)
24e09cbf 838{
476c9bd8 839 struct vcpu_svm *svm = to_svm(vcpu);
24e09cbf 840
0dc92119 841 svm->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK;
476c9bd8
AL
842 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0);
843 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0);
844 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 0, 0);
845 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
1d5a1b58
ML
846
847 /*
848 * Move the LBR msrs back to the vmcb01 to avoid copying them
849 * on nested guest entries.
850 */
851 if (is_guest_mode(vcpu))
852 svm_copy_lbrs(svm->vmcb01.ptr, svm->vmcb);
853}
854
855static int svm_get_lbr_msr(struct vcpu_svm *svm, u32 index)
856{
857 /*
858 * If the LBR virtualization is disabled, the LBR msrs are always
859 * kept in the vmcb01 to avoid copying them on nested guest entries.
860 *
861 * If nested, and the LBR virtualization is enabled/disabled, the msrs
862 * are moved between the vmcb01 and vmcb02 as needed.
863 */
864 struct vmcb *vmcb =
865 (svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) ?
866 svm->vmcb : svm->vmcb01.ptr;
867
868 switch (index) {
869 case MSR_IA32_DEBUGCTLMSR:
870 return vmcb->save.dbgctl;
871 case MSR_IA32_LASTBRANCHFROMIP:
872 return vmcb->save.br_from;
873 case MSR_IA32_LASTBRANCHTOIP:
874 return vmcb->save.br_to;
875 case MSR_IA32_LASTINTFROMIP:
876 return vmcb->save.last_excp_from;
877 case MSR_IA32_LASTINTTOIP:
878 return vmcb->save.last_excp_to;
879 default:
880 KVM_BUG(false, svm->vcpu.kvm,
881 "%s: Unknown MSR 0x%x", __func__, index);
882 return 0;
883 }
884}
885
886void svm_update_lbrv(struct kvm_vcpu *vcpu)
887{
888 struct vcpu_svm *svm = to_svm(vcpu);
889
890 bool enable_lbrv = svm_get_lbr_msr(svm, MSR_IA32_DEBUGCTLMSR) &
891 DEBUGCTLMSR_LBR;
892
893 bool current_enable_lbrv = !!(svm->vmcb->control.virt_ext &
894 LBR_CTL_ENABLE_MASK);
895
d20c796c
ML
896 if (unlikely(is_guest_mode(vcpu) && svm->lbrv_enabled))
897 if (unlikely(svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))
898 enable_lbrv = true;
899
1d5a1b58
ML
900 if (enable_lbrv == current_enable_lbrv)
901 return;
902
903 if (enable_lbrv)
904 svm_enable_lbrv(vcpu);
905 else
906 svm_disable_lbrv(vcpu);
24e09cbf
JR
907}
908
883b0a91 909void disable_nmi_singlestep(struct vcpu_svm *svm)
4aebd0e9
LP
910{
911 svm->nmi_singlestep = false;
640bd6e5 912
ab2f4d73
LP
913 if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) {
914 /* Clear our flags if they were not set by the guest */
915 if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
916 svm->vmcb->save.rflags &= ~X86_EFLAGS_TF;
917 if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF))
918 svm->vmcb->save.rflags &= ~X86_EFLAGS_RF;
919 }
4aebd0e9
LP
920}
921
8566ac8b
BM
922static void grow_ple_window(struct kvm_vcpu *vcpu)
923{
924 struct vcpu_svm *svm = to_svm(vcpu);
925 struct vmcb_control_area *control = &svm->vmcb->control;
926 int old = control->pause_filter_count;
927
74fd41ed
ML
928 if (kvm_pause_in_guest(vcpu->kvm) || !old)
929 return;
930
8566ac8b
BM
931 control->pause_filter_count = __grow_ple_window(old,
932 pause_filter_count,
933 pause_filter_count_grow,
934 pause_filter_count_max);
935
4f75bcc3 936 if (control->pause_filter_count != old) {
06e7852c 937 vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
4f75bcc3
PX
938 trace_kvm_ple_window_update(vcpu->vcpu_id,
939 control->pause_filter_count, old);
940 }
8566ac8b
BM
941}
942
943static void shrink_ple_window(struct kvm_vcpu *vcpu)
944{
945 struct vcpu_svm *svm = to_svm(vcpu);
946 struct vmcb_control_area *control = &svm->vmcb->control;
947 int old = control->pause_filter_count;
948
74fd41ed
ML
949 if (kvm_pause_in_guest(vcpu->kvm) || !old)
950 return;
951
8566ac8b
BM
952 control->pause_filter_count =
953 __shrink_ple_window(old,
954 pause_filter_count,
955 pause_filter_count_shrink,
956 pause_filter_count);
4f75bcc3 957 if (control->pause_filter_count != old) {
06e7852c 958 vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
4f75bcc3
PX
959 trace_kvm_ple_window_update(vcpu->vcpu_id,
960 control->pause_filter_count, old);
961 }
8566ac8b
BM
962}
963
23e5092b 964static void svm_hardware_unsetup(void)
dd58f3c9
LR
965{
966 int cpu;
967
23e5092b 968 sev_hardware_unsetup();
dd58f3c9
LR
969
970 for_each_possible_cpu(cpu)
971 svm_cpu_uninit(cpu);
972
47903dc1
KS
973 __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT),
974 get_order(IOPM_SIZE));
dd58f3c9
LR
975 iopm_base = 0;
976}
977
6aa8b732
AK
978static void init_seg(struct vmcb_seg *seg)
979{
980 seg->selector = 0;
981 seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK |
e0231715 982 SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */
6aa8b732
AK
983 seg->limit = 0xffff;
984 seg->base = 0;
985}
986
987static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
988{
989 seg->selector = 0;
990 seg->attrib = SVM_SELECTOR_P_MASK | type;
991 seg->limit = 0xffff;
992 seg->base = 0;
993}
994
307a94c7
IS
995static u64 svm_get_l2_tsc_offset(struct kvm_vcpu *vcpu)
996{
997 struct vcpu_svm *svm = to_svm(vcpu);
998
999 return svm->nested.ctl.tsc_offset;
1000}
1001
1002static u64 svm_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu)
1003{
5228eb96
ML
1004 struct vcpu_svm *svm = to_svm(vcpu);
1005
1006 return svm->tsc_ratio_msr;
307a94c7
IS
1007}
1008
edcfe540 1009static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
f4e1b3c8
ZA
1010{
1011 struct vcpu_svm *svm = to_svm(vcpu);
116a0a23 1012
edcfe540
IS
1013 svm->vmcb01.ptr->control.tsc_offset = vcpu->arch.l1_tsc_offset;
1014 svm->vmcb->control.tsc_offset = offset;
06e7852c 1015 vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
f4e1b3c8
ZA
1016}
1017
11d39e8c 1018static void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier)
1ab9287a 1019{
11d39e8c 1020 __svm_write_tsc_multiplier(multiplier);
1ab9287a
IS
1021}
1022
11d39e8c 1023
3b195ac9
SC
1024/* Evaluate instruction intercepts that depend on guest CPUID features. */
1025static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu,
1026 struct vcpu_svm *svm)
4407a797
BM
1027{
1028 /*
0a8ed2ea
SC
1029 * Intercept INVPCID if shadow paging is enabled to sync/free shadow
1030 * roots, or if INVPCID is disabled in the guest to inject #UD.
4407a797
BM
1031 */
1032 if (kvm_cpu_cap_has(X86_FEATURE_INVPCID)) {
0a8ed2ea
SC
1033 if (!npt_enabled ||
1034 !guest_cpuid_has(&svm->vcpu, X86_FEATURE_INVPCID))
4407a797
BM
1035 svm_set_intercept(svm, INTERCEPT_INVPCID);
1036 else
1037 svm_clr_intercept(svm, INTERCEPT_INVPCID);
1038 }
3b195ac9
SC
1039
1040 if (kvm_cpu_cap_has(X86_FEATURE_RDTSCP)) {
1041 if (guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
1042 svm_clr_intercept(svm, INTERCEPT_RDTSCP);
1043 else
1044 svm_set_intercept(svm, INTERCEPT_RDTSCP);
1045 }
4407a797
BM
1046}
1047
36e8194d
PB
1048static inline void init_vmcb_after_set_cpuid(struct kvm_vcpu *vcpu)
1049{
1050 struct vcpu_svm *svm = to_svm(vcpu);
1051
1052 if (guest_cpuid_is_intel(vcpu)) {
1053 /*
1054 * We must intercept SYSENTER_EIP and SYSENTER_ESP
1055 * accesses because the processor only stores 32 bits.
1056 * For the same reason we cannot use virtual VMLOAD/VMSAVE.
1057 */
1058 svm_set_intercept(svm, INTERCEPT_VMLOAD);
1059 svm_set_intercept(svm, INTERCEPT_VMSAVE);
1060 svm->vmcb->control.virt_ext &= ~VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
1061
1062 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 0, 0);
1063 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 0, 0);
b9f3973a
ML
1064
1065 svm->v_vmload_vmsave_enabled = false;
36e8194d
PB
1066 } else {
1067 /*
1068 * If hardware supports Virtual VMLOAD VMSAVE then enable it
1069 * in VMCB and clear intercepts to avoid #VMEXIT.
1070 */
1071 if (vls) {
1072 svm_clr_intercept(svm, INTERCEPT_VMLOAD);
1073 svm_clr_intercept(svm, INTERCEPT_VMSAVE);
1074 svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
1075 }
1076 /* No need to intercept these MSRs */
1077 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 1, 1);
1078 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 1, 1);
1079 }
1080}
1081
63129754 1082static void init_vmcb(struct kvm_vcpu *vcpu)
6aa8b732 1083{
63129754 1084 struct vcpu_svm *svm = to_svm(vcpu);
1ee73a33
ML
1085 struct vmcb *vmcb = svm->vmcb01.ptr;
1086 struct vmcb_control_area *control = &vmcb->control;
1087 struct vmcb_save_area *save = &vmcb->save;
6aa8b732 1088
830bd71f
BM
1089 svm_set_intercept(svm, INTERCEPT_CR0_READ);
1090 svm_set_intercept(svm, INTERCEPT_CR3_READ);
1091 svm_set_intercept(svm, INTERCEPT_CR4_READ);
1092 svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
1093 svm_set_intercept(svm, INTERCEPT_CR3_WRITE);
1094 svm_set_intercept(svm, INTERCEPT_CR4_WRITE);
63129754 1095 if (!kvm_vcpu_apicv_active(vcpu))
830bd71f 1096 svm_set_intercept(svm, INTERCEPT_CR8_WRITE);
6aa8b732 1097
5315c716 1098 set_dr_intercepts(svm);
6aa8b732 1099
18c918c5
JR
1100 set_exception_intercept(svm, PF_VECTOR);
1101 set_exception_intercept(svm, UD_VECTOR);
1102 set_exception_intercept(svm, MC_VECTOR);
54a20552 1103 set_exception_intercept(svm, AC_VECTOR);
cbdb967a 1104 set_exception_intercept(svm, DB_VECTOR);
9718420e
LA
1105 /*
1106 * Guest access to VMware backdoor ports could legitimately
1107 * trigger #GP because of TSS I/O permission bitmap.
1108 * We intercept those #GP and allow access to them anyway
0b0be065
SC
1109 * as VMware does. Don't intercept #GP for SEV guests as KVM can't
1110 * decrypt guest memory to decode the faulting instruction.
9718420e 1111 */
0b0be065 1112 if (enable_vmware_backdoor && !sev_guest(vcpu->kvm))
9718420e 1113 set_exception_intercept(svm, GP_VECTOR);
6aa8b732 1114
a284ba56
JR
1115 svm_set_intercept(svm, INTERCEPT_INTR);
1116 svm_set_intercept(svm, INTERCEPT_NMI);
4b639a9f
ML
1117
1118 if (intercept_smi)
1119 svm_set_intercept(svm, INTERCEPT_SMI);
1120
a284ba56
JR
1121 svm_set_intercept(svm, INTERCEPT_SELECTIVE_CR0);
1122 svm_set_intercept(svm, INTERCEPT_RDPMC);
1123 svm_set_intercept(svm, INTERCEPT_CPUID);
1124 svm_set_intercept(svm, INTERCEPT_INVD);
1125 svm_set_intercept(svm, INTERCEPT_INVLPG);
1126 svm_set_intercept(svm, INTERCEPT_INVLPGA);
1127 svm_set_intercept(svm, INTERCEPT_IOIO_PROT);
1128 svm_set_intercept(svm, INTERCEPT_MSR_PROT);
1129 svm_set_intercept(svm, INTERCEPT_TASK_SWITCH);
1130 svm_set_intercept(svm, INTERCEPT_SHUTDOWN);
1131 svm_set_intercept(svm, INTERCEPT_VMRUN);
1132 svm_set_intercept(svm, INTERCEPT_VMMCALL);
1133 svm_set_intercept(svm, INTERCEPT_VMLOAD);
1134 svm_set_intercept(svm, INTERCEPT_VMSAVE);
1135 svm_set_intercept(svm, INTERCEPT_STGI);
1136 svm_set_intercept(svm, INTERCEPT_CLGI);
1137 svm_set_intercept(svm, INTERCEPT_SKINIT);
1138 svm_set_intercept(svm, INTERCEPT_WBINVD);
1139 svm_set_intercept(svm, INTERCEPT_XSETBV);
1140 svm_set_intercept(svm, INTERCEPT_RDPRU);
1141 svm_set_intercept(svm, INTERCEPT_RSM);
6aa8b732 1142
63129754 1143 if (!kvm_mwait_in_guest(vcpu->kvm)) {
a284ba56
JR
1144 svm_set_intercept(svm, INTERCEPT_MONITOR);
1145 svm_set_intercept(svm, INTERCEPT_MWAIT);
668fffa3
MT
1146 }
1147
63129754 1148 if (!kvm_hlt_in_guest(vcpu->kvm))
a284ba56 1149 svm_set_intercept(svm, INTERCEPT_HLT);
caa057a2 1150
d0ec49d4
TL
1151 control->iopm_base_pa = __sme_set(iopm_base);
1152 control->msrpm_base_pa = __sme_set(__pa(svm->msrpm));
6aa8b732
AK
1153 control->int_ctl = V_INTR_MASKING_MASK;
1154
1155 init_seg(&save->es);
1156 init_seg(&save->ss);
1157 init_seg(&save->ds);
1158 init_seg(&save->fs);
1159 init_seg(&save->gs);
1160
1161 save->cs.selector = 0xf000;
04b66839 1162 save->cs.base = 0xffff0000;
6aa8b732
AK
1163 /* Executable/Readable Code Segment */
1164 save->cs.attrib = SVM_SELECTOR_READ_MASK | SVM_SELECTOR_P_MASK |
1165 SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK;
1166 save->cs.limit = 0xffff;
6aa8b732 1167
4f117ce4 1168 save->gdtr.base = 0;
6aa8b732 1169 save->gdtr.limit = 0xffff;
4f117ce4 1170 save->idtr.base = 0;
6aa8b732
AK
1171 save->idtr.limit = 0xffff;
1172
1173 init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
1174 init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
1175
709ddebf
JR
1176 if (npt_enabled) {
1177 /* Setup VMCB for Nested Paging */
cea3a19b 1178 control->nested_ctl |= SVM_NESTED_CTL_NP_ENABLE;
a284ba56 1179 svm_clr_intercept(svm, INTERCEPT_INVLPG);
18c918c5 1180 clr_exception_intercept(svm, PF_VECTOR);
830bd71f
BM
1181 svm_clr_intercept(svm, INTERCEPT_CR3_READ);
1182 svm_clr_intercept(svm, INTERCEPT_CR3_WRITE);
63129754 1183 save->g_pat = vcpu->arch.pat;
709ddebf 1184 save->cr3 = 0;
709ddebf 1185 }
193015ad 1186 svm->current_vmcb->asid_generation = 0;
7e8e6eed 1187 svm->asid = 0;
1371d904 1188
c74ad08f
ML
1189 svm->nested.vmcb12_gpa = INVALID_GPA;
1190 svm->nested.last_vmcb12_gpa = INVALID_GPA;
2af9194d 1191
63129754 1192 if (!kvm_pause_in_guest(vcpu->kvm)) {
8566ac8b
BM
1193 control->pause_filter_count = pause_filter_count;
1194 if (pause_filter_thresh)
1195 control->pause_filter_thresh = pause_filter_thresh;
a284ba56 1196 svm_set_intercept(svm, INTERCEPT_PAUSE);
8566ac8b 1197 } else {
a284ba56 1198 svm_clr_intercept(svm, INTERCEPT_PAUSE);
565d0998
ML
1199 }
1200
3b195ac9 1201 svm_recalc_instruction_intercepts(vcpu, svm);
4407a797 1202
89c8a498 1203 /*
d00b99c5
BM
1204 * If the host supports V_SPEC_CTRL then disable the interception
1205 * of MSR_IA32_SPEC_CTRL.
89c8a498 1206 */
d00b99c5
BM
1207 if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
1208 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
1209
63129754 1210 if (kvm_vcpu_apicv_active(vcpu))
1ee73a33 1211 avic_init_vmcb(svm, vmcb);
89c8a498 1212
640bd6e5 1213 if (vgif) {
a284ba56
JR
1214 svm_clr_intercept(svm, INTERCEPT_STGI);
1215 svm_clr_intercept(svm, INTERCEPT_CLGI);
640bd6e5
JN
1216 svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK;
1217 }
1218
63129754 1219 if (sev_guest(vcpu->kvm)) {
1654efcb 1220 svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ENABLE;
35c6f649 1221 clr_exception_intercept(svm, UD_VECTOR);
376c6d28 1222
63129754 1223 if (sev_es_guest(vcpu->kvm)) {
376c6d28
TL
1224 /* Perform SEV-ES specific VMCB updates */
1225 sev_es_init_vmcb(svm);
1226 }
35c6f649 1227 }
1654efcb 1228
1ee73a33 1229 svm_hv_init_vmcb(vmcb);
36e8194d 1230 init_vmcb_after_set_cpuid(vcpu);
1e0c7d40 1231
1ee73a33 1232 vmcb_mark_all_dirty(vmcb);
8d28fec4 1233
2af9194d 1234 enable_gif(svm);
44a95dae
SS
1235}
1236
9ebe530b
SC
1237static void __svm_vcpu_reset(struct kvm_vcpu *vcpu)
1238{
1239 struct vcpu_svm *svm = to_svm(vcpu);
44a95dae 1240
9ebe530b
SC
1241 svm_vcpu_init_msrpm(vcpu, svm->msrpm);
1242
1243 svm_init_osvw(vcpu);
1244 vcpu->arch.microcode_version = 0x01000065;
5228eb96 1245 svm->tsc_ratio_msr = kvm_default_tsc_scaling_ratio;
9ebe530b
SC
1246
1247 if (sev_es_guest(vcpu->kvm))
1248 sev_es_vcpu_reset(svm);
44a95dae
SS
1249}
1250
d28bc9dd 1251static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
04d2cc77
AK
1252{
1253 struct vcpu_svm *svm = to_svm(vcpu);
1254
b2ac58f9 1255 svm->spec_ctrl = 0;
ccbcd267 1256 svm->virt_spec_ctrl = 0;
b2ac58f9 1257
63129754 1258 init_vmcb(vcpu);
9ebe530b
SC
1259
1260 if (!init_event)
1261 __svm_vcpu_reset(vcpu);
04d2cc77
AK
1262}
1263
4995a368
CA
1264void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb)
1265{
1266 svm->current_vmcb = target_vmcb;
1267 svm->vmcb = target_vmcb->ptr;
4995a368
CA
1268}
1269
23e5092b 1270static int svm_vcpu_create(struct kvm_vcpu *vcpu)
6aa8b732 1271{
a2fa3e9f 1272 struct vcpu_svm *svm;
4995a368 1273 struct page *vmcb01_page;
add5e2f0 1274 struct page *vmsa_page = NULL;
fb3f0f51 1275 int err;
6aa8b732 1276
a9dd6f09
SC
1277 BUILD_BUG_ON(offsetof(struct vcpu_svm, vcpu) != 0);
1278 svm = to_svm(vcpu);
fb3f0f51 1279
b7af4043 1280 err = -ENOMEM;
4995a368
CA
1281 vmcb01_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
1282 if (!vmcb01_page)
987b2594 1283 goto out;
6aa8b732 1284
63129754 1285 if (sev_es_guest(vcpu->kvm)) {
add5e2f0
TL
1286 /*
1287 * SEV-ES guests require a separate VMSA page used to contain
1288 * the encrypted register state of the guest.
1289 */
1290 vmsa_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
1291 if (!vmsa_page)
1292 goto error_free_vmcb_page;
ed02b213
TL
1293
1294 /*
1295 * SEV-ES guests maintain an encrypted version of their FPU
1296 * state which is restored and saved on VMRUN and VMEXIT.
d69c1382
TG
1297 * Mark vcpu->arch.guest_fpu->fpstate as scratch so it won't
1298 * do xsave/xrstor on it.
ed02b213 1299 */
d69c1382 1300 fpstate_set_confidential(&vcpu->arch.guest_fpu);
add5e2f0
TL
1301 }
1302
dfa20099
SS
1303 err = avic_init_vcpu(svm);
1304 if (err)
add5e2f0 1305 goto error_free_vmsa_page;
44a95dae 1306
476c9bd8 1307 svm->msrpm = svm_vcpu_alloc_msrpm();
054409ab
CZ
1308 if (!svm->msrpm) {
1309 err = -ENOMEM;
add5e2f0 1310 goto error_free_vmsa_page;
054409ab 1311 }
b7af4043 1312
4995a368
CA
1313 svm->vmcb01.ptr = page_address(vmcb01_page);
1314 svm->vmcb01.pa = __sme_set(page_to_pfn(vmcb01_page) << PAGE_SHIFT);
9ebe530b 1315 svm_switch_vmcb(svm, &svm->vmcb01);
add5e2f0
TL
1316
1317 if (vmsa_page)
b67a4cc3 1318 svm->sev_es.vmsa = page_address(vmsa_page);
add5e2f0 1319
a7fc06dd 1320 svm->guest_state_loaded = false;
4995a368 1321
a9dd6f09 1322 return 0;
36241b8c 1323
add5e2f0
TL
1324error_free_vmsa_page:
1325 if (vmsa_page)
1326 __free_page(vmsa_page);
8d22b90e 1327error_free_vmcb_page:
4995a368 1328 __free_page(vmcb01_page);
987b2594 1329out:
a9dd6f09 1330 return err;
6aa8b732
AK
1331}
1332
fd65d314
JM
1333static void svm_clear_current_vmcb(struct vmcb *vmcb)
1334{
1335 int i;
1336
1337 for_each_online_cpu(i)
1338 cmpxchg(&per_cpu(svm_data, i)->current_vmcb, vmcb, NULL);
1339}
1340
23e5092b 1341static void svm_vcpu_free(struct kvm_vcpu *vcpu)
6aa8b732 1342{
a2fa3e9f
GH
1343 struct vcpu_svm *svm = to_svm(vcpu);
1344
fd65d314
JM
1345 /*
1346 * The vmcb page can be recycled, causing a false negative in
1347 * svm_vcpu_load(). So, ensure that no logical CPU has this
1348 * vmcb page recorded as its current vmcb.
1349 */
1350 svm_clear_current_vmcb(svm->vmcb);
1351
2fcf4876
ML
1352 svm_free_nested(svm);
1353
add5e2f0
TL
1354 sev_free_vcpu(vcpu);
1355
4995a368 1356 __free_page(pfn_to_page(__sme_clr(svm->vmcb01.pa) >> PAGE_SHIFT));
47903dc1 1357 __free_pages(virt_to_page(svm->msrpm), get_order(MSRPM_SIZE));
6aa8b732
AK
1358}
1359
23e5092b 1360static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
6aa8b732 1361{
a2fa3e9f 1362 struct vcpu_svm *svm = to_svm(vcpu);
a7fc06dd 1363 struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);
0cc5064d 1364
ce7ea0cf
TL
1365 if (sev_es_guest(vcpu->kvm))
1366 sev_es_unmap_ghcb(svm);
1367
a7fc06dd
MR
1368 if (svm->guest_state_loaded)
1369 return;
1370
a7fc06dd
MR
1371 /*
1372 * Save additional host state that will be restored on VMEXIT (sev-es)
1373 * or subsequent vmload of host save area.
1374 */
068f7ea6 1375 vmsave(__sme_page_pa(sd->save_area));
63129754 1376 if (sev_es_guest(vcpu->kvm)) {
3dd2775b
TL
1377 struct sev_es_save_area *hostsa;
1378 hostsa = (struct sev_es_save_area *)(page_address(sd->save_area) + 0x400);
068f7ea6 1379
23e5092b 1380 sev_es_prepare_switch_to_guest(hostsa);
86137773 1381 }
fbc0db76 1382
11d39e8c
ML
1383 if (tsc_scaling)
1384 __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio);
a7fc06dd 1385
0caa0a77
SC
1386 if (likely(tsc_aux_uret_slot >= 0))
1387 kvm_set_user_return_msr(tsc_aux_uret_slot, svm->tsc_aux, -1ull);
8221c137 1388
a7fc06dd
MR
1389 svm->guest_state_loaded = true;
1390}
1391
1392static void svm_prepare_host_switch(struct kvm_vcpu *vcpu)
1393{
844d69c2 1394 to_svm(vcpu)->guest_state_loaded = false;
a7fc06dd
MR
1395}
1396
1397static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1398{
1399 struct vcpu_svm *svm = to_svm(vcpu);
1400 struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
1401
15d45071
AR
1402 if (sd->current_vmcb != svm->vmcb) {
1403 sd->current_vmcb = svm->vmcb;
1404 indirect_branch_prediction_barrier();
1405 }
bf5f6b9d 1406 if (kvm_vcpu_apicv_active(vcpu))
b652de1e 1407 __avic_vcpu_load(vcpu, cpu);
6aa8b732
AK
1408}
1409
1410static void svm_vcpu_put(struct kvm_vcpu *vcpu)
1411{
bf5f6b9d 1412 if (kvm_vcpu_apicv_active(vcpu))
b652de1e 1413 __avic_vcpu_put(vcpu);
bf5f6b9d 1414
a7fc06dd 1415 svm_prepare_host_switch(vcpu);
8221c137 1416
e1beb1d3 1417 ++vcpu->stat.host_state_reload;
6aa8b732
AK
1418}
1419
6aa8b732
AK
1420static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
1421{
9b611747
LP
1422 struct vcpu_svm *svm = to_svm(vcpu);
1423 unsigned long rflags = svm->vmcb->save.rflags;
1424
1425 if (svm->nmi_singlestep) {
1426 /* Hide our flags if they were not set by the guest */
1427 if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
1428 rflags &= ~X86_EFLAGS_TF;
1429 if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF))
1430 rflags &= ~X86_EFLAGS_RF;
1431 }
1432 return rflags;
6aa8b732
AK
1433}
1434
1435static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
1436{
9b611747
LP
1437 if (to_svm(vcpu)->nmi_singlestep)
1438 rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
1439
ae9fedc7 1440 /*
bb3541f1 1441 * Any change of EFLAGS.VM is accompanied by a reload of SS
ae9fedc7
PB
1442 * (caused by either a task switch or an inter-privilege IRET),
1443 * so we do not need to update the CPL here.
1444 */
a2fa3e9f 1445 to_svm(vcpu)->vmcb->save.rflags = rflags;
6aa8b732
AK
1446}
1447
c5063551
MO
1448static bool svm_get_if_flag(struct kvm_vcpu *vcpu)
1449{
1450 struct vmcb *vmcb = to_svm(vcpu)->vmcb;
1451
1452 return sev_es_guest(vcpu->kvm)
1453 ? vmcb->control.int_state & SVM_GUEST_INTERRUPT_MASK
1454 : kvm_get_rflags(vcpu) & X86_EFLAGS_IF;
1455}
1456
6de4f3ad
AK
1457static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
1458{
40e49c4f
LJ
1459 kvm_register_mark_available(vcpu, reg);
1460
6de4f3ad
AK
1461 switch (reg) {
1462 case VCPU_EXREG_PDPTR:
40e49c4f
LJ
1463 /*
1464 * When !npt_enabled, mmu->pdptrs[] is already available since
1465 * it is always updated per SDM when moving to CRs.
1466 */
1467 if (npt_enabled)
2df4a5eb 1468 load_pdptrs(vcpu, kvm_read_cr3(vcpu));
6de4f3ad
AK
1469 break;
1470 default:
67369273 1471 KVM_BUG_ON(1, vcpu->kvm);
6de4f3ad
AK
1472 }
1473}
1474
e14b7786 1475static void svm_set_vintr(struct vcpu_svm *svm)
64b5bd27
PB
1476{
1477 struct vmcb_control_area *control;
1478
f1577ab2
ML
1479 /*
1480 * The following fields are ignored when AVIC is enabled
1481 */
f44509f8 1482 WARN_ON(kvm_vcpu_apicv_activated(&svm->vcpu));
f1577ab2 1483
a284ba56 1484 svm_set_intercept(svm, INTERCEPT_VINTR);
64b5bd27
PB
1485
1486 /*
1487 * This is just a dummy VINTR to actually cause a vmexit to happen.
1488 * Actual injection of virtual interrupts happens through EVENTINJ.
1489 */
1490 control = &svm->vmcb->control;
1491 control->int_vector = 0x0;
1492 control->int_ctl &= ~V_INTR_PRIO_MASK;
1493 control->int_ctl |= V_IRQ_MASK |
1494 ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
06e7852c 1495 vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
64b5bd27
PB
1496}
1497
f0b85051
AG
1498static void svm_clear_vintr(struct vcpu_svm *svm)
1499{
a284ba56 1500 svm_clr_intercept(svm, INTERCEPT_VINTR);
64b5bd27 1501
d8e4e58f 1502 /* Drop int_ctl fields related to VINTR injection. */
0f923e07 1503 svm->vmcb->control.int_ctl &= ~V_IRQ_INJECTION_BITS_MASK;
d8e4e58f 1504 if (is_guest_mode(&svm->vcpu)) {
0f923e07 1505 svm->vmcb01.ptr->control.int_ctl &= ~V_IRQ_INJECTION_BITS_MASK;
fb7333df 1506
d8e4e58f
PB
1507 WARN_ON((svm->vmcb->control.int_ctl & V_TPR_MASK) !=
1508 (svm->nested.ctl.int_ctl & V_TPR_MASK));
0f923e07
ML
1509
1510 svm->vmcb->control.int_ctl |= svm->nested.ctl.int_ctl &
1511 V_IRQ_INJECTION_BITS_MASK;
aee77e11
ML
1512
1513 svm->vmcb->control.int_vector = svm->nested.ctl.int_vector;
d8e4e58f
PB
1514 }
1515
06e7852c 1516 vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
f0b85051
AG
1517}
1518
6aa8b732
AK
1519static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
1520{
a2fa3e9f 1521 struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
cc3ed80a 1522 struct vmcb_save_area *save01 = &to_svm(vcpu)->vmcb01.ptr->save;
6aa8b732
AK
1523
1524 switch (seg) {
1525 case VCPU_SREG_CS: return &save->cs;
1526 case VCPU_SREG_DS: return &save->ds;
1527 case VCPU_SREG_ES: return &save->es;
cc3ed80a
ML
1528 case VCPU_SREG_FS: return &save01->fs;
1529 case VCPU_SREG_GS: return &save01->gs;
6aa8b732 1530 case VCPU_SREG_SS: return &save->ss;
cc3ed80a
ML
1531 case VCPU_SREG_TR: return &save01->tr;
1532 case VCPU_SREG_LDTR: return &save01->ldtr;
6aa8b732
AK
1533 }
1534 BUG();
8b6d44c7 1535 return NULL;
6aa8b732
AK
1536}
1537
1538static u64 svm_get_segment_base(struct kvm_vcpu *vcpu, int seg)
1539{
1540 struct vmcb_seg *s = svm_seg(vcpu, seg);
1541
1542 return s->base;
1543}
1544
1545static void svm_get_segment(struct kvm_vcpu *vcpu,
1546 struct kvm_segment *var, int seg)
1547{
1548 struct vmcb_seg *s = svm_seg(vcpu, seg);
1549
1550 var->base = s->base;
1551 var->limit = s->limit;
1552 var->selector = s->selector;
1553 var->type = s->attrib & SVM_SELECTOR_TYPE_MASK;
1554 var->s = (s->attrib >> SVM_SELECTOR_S_SHIFT) & 1;
1555 var->dpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3;
1556 var->present = (s->attrib >> SVM_SELECTOR_P_SHIFT) & 1;
1557 var->avl = (s->attrib >> SVM_SELECTOR_AVL_SHIFT) & 1;
1558 var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1;
1559 var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;
80112c89
JM
1560
1561 /*
1562 * AMD CPUs circa 2014 track the G bit for all segments except CS.
1563 * However, the SVM spec states that the G bit is not observed by the
1564 * CPU, and some VMware virtual CPUs drop the G bit for all segments.
1565 * So let's synthesize a legal G bit for all segments, this helps
1566 * running KVM nested. It also helps cross-vendor migration, because
1567 * Intel's vmentry has a check on the 'G' bit.
1568 */
1569 var->g = s->limit > 0xfffff;
25022acc 1570
e0231715
JR
1571 /*
1572 * AMD's VMCB does not have an explicit unusable field, so emulate it
19bca6ab
AP
1573 * for cross vendor migration purposes by "not present"
1574 */
8eae9570 1575 var->unusable = !var->present;
19bca6ab 1576
1fbdc7a5 1577 switch (seg) {
1fbdc7a5
AP
1578 case VCPU_SREG_TR:
1579 /*
1580 * Work around a bug where the busy flag in the tr selector
1581 * isn't exposed
1582 */
c0d09828 1583 var->type |= 0x2;
1fbdc7a5
AP
1584 break;
1585 case VCPU_SREG_DS:
1586 case VCPU_SREG_ES:
1587 case VCPU_SREG_FS:
1588 case VCPU_SREG_GS:
1589 /*
1590 * The accessed bit must always be set in the segment
1591 * descriptor cache, although it can be cleared in the
1592 * descriptor, the cached bit always remains at 1. Since
1593 * Intel has a check on this, set it here to support
1594 * cross-vendor migration.
1595 */
1596 if (!var->unusable)
1597 var->type |= 0x1;
1598 break;
b586eb02 1599 case VCPU_SREG_SS:
e0231715
JR
1600 /*
1601 * On AMD CPUs sometimes the DB bit in the segment
b586eb02
AP
1602 * descriptor is left as 1, although the whole segment has
1603 * been made unusable. Clear it here to pass an Intel VMX
1604 * entry check when cross vendor migrating.
1605 */
1606 if (var->unusable)
1607 var->db = 0;
d9c1b543 1608 /* This is symmetric with svm_set_segment() */
33b458d2 1609 var->dpl = to_svm(vcpu)->vmcb->save.cpl;
b586eb02 1610 break;
1fbdc7a5 1611 }
6aa8b732
AK
1612}
1613
2e4d2653
IE
1614static int svm_get_cpl(struct kvm_vcpu *vcpu)
1615{
1616 struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
1617
1618 return save->cpl;
1619}
1620
872e0c53
SC
1621static void svm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
1622{
1623 struct kvm_segment cs;
1624
1625 svm_get_segment(vcpu, &cs, VCPU_SREG_CS);
1626 *db = cs.db;
1627 *l = cs.l;
1628}
1629
89a27f4d 1630static void svm_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
6aa8b732 1631{
a2fa3e9f
GH
1632 struct vcpu_svm *svm = to_svm(vcpu);
1633
89a27f4d
GN
1634 dt->size = svm->vmcb->save.idtr.limit;
1635 dt->address = svm->vmcb->save.idtr.base;
6aa8b732
AK
1636}
1637
89a27f4d 1638static void svm_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
6aa8b732 1639{
a2fa3e9f
GH
1640 struct vcpu_svm *svm = to_svm(vcpu);
1641
89a27f4d
GN
1642 svm->vmcb->save.idtr.limit = dt->size;
1643 svm->vmcb->save.idtr.base = dt->address ;
06e7852c 1644 vmcb_mark_dirty(svm->vmcb, VMCB_DT);
6aa8b732
AK
1645}
1646
89a27f4d 1647static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
6aa8b732 1648{
a2fa3e9f
GH
1649 struct vcpu_svm *svm = to_svm(vcpu);
1650
89a27f4d
GN
1651 dt->size = svm->vmcb->save.gdtr.limit;
1652 dt->address = svm->vmcb->save.gdtr.base;
6aa8b732
AK
1653}
1654
89a27f4d 1655static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
6aa8b732 1656{
a2fa3e9f
GH
1657 struct vcpu_svm *svm = to_svm(vcpu);
1658
89a27f4d
GN
1659 svm->vmcb->save.gdtr.limit = dt->size;
1660 svm->vmcb->save.gdtr.base = dt->address ;
06e7852c 1661 vmcb_mark_dirty(svm->vmcb, VMCB_DT);
6aa8b732
AK
1662}
1663
559c7c75 1664static void sev_post_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
405329fc
MR
1665{
1666 struct vcpu_svm *svm = to_svm(vcpu);
1667
1668 /*
1669 * For guests that don't set guest_state_protected, the cr3 update is
1670 * handled via kvm_mmu_load() while entering the guest. For guests
1671 * that do (SEV-ES/SEV-SNP), the cr3 update needs to be written to
1672 * VMCB save area now, since the save area will become the initial
1673 * contents of the VMSA, and future VMCB save area updates won't be
1674 * seen.
1675 */
1676 if (sev_es_guest(vcpu->kvm)) {
1677 svm->vmcb->save.cr3 = cr3;
1678 vmcb_mark_dirty(svm->vmcb, VMCB_CR);
1679 }
1680}
1681
883b0a91 1682void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
6aa8b732 1683{
a2fa3e9f 1684 struct vcpu_svm *svm = to_svm(vcpu);
2a32a77c 1685 u64 hcr0 = cr0;
c53bbe21 1686 bool old_paging = is_paging(vcpu);
a2fa3e9f 1687
05b3e0c2 1688#ifdef CONFIG_X86_64
f1c6366e 1689 if (vcpu->arch.efer & EFER_LME && !vcpu->arch.guest_state_protected) {
707d92fa 1690 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
f6801dff 1691 vcpu->arch.efer |= EFER_LMA;
2b5203ee 1692 svm->vmcb->save.efer |= EFER_LMA | EFER_LME;
6aa8b732
AK
1693 }
1694
d77c26fc 1695 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) {
f6801dff 1696 vcpu->arch.efer &= ~EFER_LMA;
2b5203ee 1697 svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME);
6aa8b732
AK
1698 }
1699 }
1700#endif
ad312c7c 1701 vcpu->arch.cr0 = cr0;
888f9f3e 1702
c53bbe21 1703 if (!npt_enabled) {
2a32a77c 1704 hcr0 |= X86_CR0_PG | X86_CR0_WP;
c53bbe21
ML
1705 if (old_paging != is_paging(vcpu))
1706 svm_set_cr4(vcpu, kvm_read_cr4(vcpu));
1707 }
02daab21 1708
bcf166a9
PB
1709 /*
1710 * re-enable caching here because the QEMU bios
1711 * does not do it - this results in some delay at
1712 * reboot
1713 */
1714 if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
2a32a77c
PB
1715 hcr0 &= ~(X86_CR0_CD | X86_CR0_NW);
1716
1717 svm->vmcb->save.cr0 = hcr0;
06e7852c 1718 vmcb_mark_dirty(svm->vmcb, VMCB_CR);
2a32a77c
PB
1719
1720 /*
1721 * SEV-ES guests must always keep the CR intercepts cleared. CR
1722 * tracking is done using the CR write traps.
1723 */
63129754 1724 if (sev_es_guest(vcpu->kvm))
2a32a77c
PB
1725 return;
1726
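	/*
	 * If no CR0 bits had to be shadowed (hcr0 == cr0), CR0 reads/writes
	 * can go straight to hardware and only the selective CR0 write
	 * intercept is kept; otherwise intercept all CR0 accesses so the
	 * shadowed bits stay coherent with the guest's view.
	 */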
1727 if (hcr0 == cr0) {
1728 /* Selective CR0 write remains on. */
1729 svm_clr_intercept(svm, INTERCEPT_CR0_READ);
1730 svm_clr_intercept(svm, INTERCEPT_CR0_WRITE);
1731 } else {
1732 svm_set_intercept(svm, INTERCEPT_CR0_READ);
1733 svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
1734 }
6aa8b732
AK
1735}
1736
c2fe3cd4
SC
1737static bool svm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1738{
1739 return true;
1740}
1741
1742void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
6aa8b732 1743{
1e02ce4c 1744 unsigned long host_cr4_mce = cr4_read_shadow() & X86_CR4_MCE;
dc924b06 1745 unsigned long old_cr4 = vcpu->arch.cr4;
e5eab0ce
JR
1746
1747 if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE))
4d9c83f5 1748 svm_flush_tlb_current(vcpu);
6394b649 1749
ec077263 1750 vcpu->arch.cr4 = cr4;
c53bbe21 1751 if (!npt_enabled) {
ec077263 1752 cr4 |= X86_CR4_PAE;
c53bbe21
ML
1753
1754 if (!is_paging(vcpu))
1755 cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);
1756 }
6394b649 1757 cr4 |= host_cr4_mce;
ec077263 1758 to_svm(vcpu)->vmcb->save.cr4 = cr4;
06e7852c 1759 vmcb_mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
2259c17f
JM
1760
1761 if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
1762 kvm_update_cpuid_runtime(vcpu);
6aa8b732
AK
1763}
1764
1765static void svm_set_segment(struct kvm_vcpu *vcpu,
1766 struct kvm_segment *var, int seg)
1767{
a2fa3e9f 1768 struct vcpu_svm *svm = to_svm(vcpu);
6aa8b732
AK
1769 struct vmcb_seg *s = svm_seg(vcpu, seg);
1770
1771 s->base = var->base;
1772 s->limit = var->limit;
1773 s->selector = var->selector;
d9c1b543
RP
1774 s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK);
1775 s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT;
1776 s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT;
1777 s->attrib |= ((var->present & 1) && !var->unusable) << SVM_SELECTOR_P_SHIFT;
1778 s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT;
1779 s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT;
1780 s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT;
1781 s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;
ae9fedc7
PB
1782
1783 /*
1784 * This is always accurate, except if SYSRET returned to a segment
1785 * with SS.DPL != 3. Intel does not have this quirk, and always
1786 * forces SS.DPL to 3 on sysret, so we ignore that case; fixing it
1787 * would entail passing the CPL to userspace and back.
1788 */
1789 if (seg == VCPU_SREG_SS)
d9c1b543
RP
1790 /* This is symmetric with svm_get_segment() */
1791 svm->vmcb->save.cpl = (var->dpl & 3);
6aa8b732 1792
06e7852c 1793 vmcb_mark_dirty(svm->vmcb, VMCB_SEG);
6aa8b732
AK
1794}
1795
b6a7cc35 1796static void svm_update_exception_bitmap(struct kvm_vcpu *vcpu)
6aa8b732 1797{
d0bfb940
JK
1798 struct vcpu_svm *svm = to_svm(vcpu);
1799
18c918c5 1800 clr_exception_intercept(svm, BP_VECTOR);
44c11430 1801
d0bfb940 1802 if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
d0bfb940 1803 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
18c918c5 1804 set_exception_intercept(svm, BP_VECTOR);
6986982f 1805 }
44c11430
GN
1806}
1807
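/*
 * Allocate a fresh ASID from this CPU's pool. When the pool is exhausted,
 * request a flush of all ASIDs on the next VMRUN and bump the per-CPU
 * generation so that any VMCB which last ran with an older generation
 * picks up a new ASID before it is run again.
 */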
0fe1e009 1808static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
6aa8b732 1809{
0fe1e009
TH
1810 if (sd->next_asid > sd->max_asid) {
1811 ++sd->asid_generation;
4faefff3 1812 sd->next_asid = sd->min_asid;
a2fa3e9f 1813 svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID;
7e8e6eed 1814 vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
6aa8b732
AK
1815 }
1816
193015ad 1817 svm->current_vmcb->asid_generation = sd->asid_generation;
7e8e6eed 1818 svm->asid = sd->next_asid++;
6aa8b732
AK
1819}
1820
d67668e9 1821static void svm_set_dr6(struct vcpu_svm *svm, unsigned long value)
73aaf249 1822{
d67668e9 1823 struct vmcb *vmcb = svm->vmcb;
73aaf249 1824
8d4846b9
TL
1825 if (svm->vcpu.arch.guest_state_protected)
1826 return;
1827
d67668e9
PB
1828 if (unlikely(value != vmcb->save.dr6)) {
1829 vmcb->save.dr6 = value;
06e7852c 1830 vmcb_mark_dirty(vmcb, VMCB_DR);
d67668e9 1831 }
73aaf249
JK
1832}
1833
facb0139
PB
1834static void svm_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
1835{
1836 struct vcpu_svm *svm = to_svm(vcpu);
1837
8d4846b9
TL
1838 if (vcpu->arch.guest_state_protected)
1839 return;
1840
facb0139
PB
1841 get_debugreg(vcpu->arch.db[0], 0);
1842 get_debugreg(vcpu->arch.db[1], 1);
1843 get_debugreg(vcpu->arch.db[2], 2);
1844 get_debugreg(vcpu->arch.db[3], 3);
d67668e9 1845 /*
9a3ecd5e 1846 * We cannot reset svm->vmcb->save.dr6 to DR6_ACTIVE_LOW here,
d67668e9
PB
1847 * because db_interception might need it. We can do it before vmentry.
1848 */
5679b803 1849 vcpu->arch.dr6 = svm->vmcb->save.dr6;
facb0139 1850 vcpu->arch.dr7 = svm->vmcb->save.dr7;
facb0139
PB
1851 vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
1852 set_dr_intercepts(svm);
1853}
1854
020df079 1855static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
6aa8b732 1856{
42dbaa5a 1857 struct vcpu_svm *svm = to_svm(vcpu);
42dbaa5a 1858
8d4846b9
TL
1859 if (vcpu->arch.guest_state_protected)
1860 return;
1861
020df079 1862 svm->vmcb->save.dr7 = value;
06e7852c 1863 vmcb_mark_dirty(svm->vmcb, VMCB_DR);
6aa8b732
AK
1864}
1865
63129754 1866static int pf_interception(struct kvm_vcpu *vcpu)
6aa8b732 1867{
63129754
PB
1868 struct vcpu_svm *svm = to_svm(vcpu);
1869
6d1b867d 1870 u64 fault_address = svm->vmcb->control.exit_info_2;
1261bfa3 1871 u64 error_code = svm->vmcb->control.exit_info_1;
6aa8b732 1872
63129754 1873 return kvm_handle_page_fault(vcpu, error_code, fault_address,
00b10fe1
BS
1874 static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
1875 svm->vmcb->control.insn_bytes : NULL,
d0006530
PB
1876 svm->vmcb->control.insn_len);
1877}
1878
63129754 1879static int npf_interception(struct kvm_vcpu *vcpu)
d0006530 1880{
63129754
PB
1881 struct vcpu_svm *svm = to_svm(vcpu);
1882
76ff371b 1883 u64 fault_address = svm->vmcb->control.exit_info_2;
d0006530
PB
1884 u64 error_code = svm->vmcb->control.exit_info_1;
1885
1886 trace_kvm_page_fault(fault_address, error_code);
63129754 1887 return kvm_mmu_page_fault(vcpu, fault_address, error_code,
00b10fe1
BS
1888 static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
1889 svm->vmcb->control.insn_bytes : NULL,
d0006530 1890 svm->vmcb->control.insn_len);
6aa8b732
AK
1891}
1892
63129754 1893static int db_interception(struct kvm_vcpu *vcpu)
d0bfb940 1894{
63129754
PB
1895 struct kvm_run *kvm_run = vcpu->run;
1896 struct vcpu_svm *svm = to_svm(vcpu);
851ba692 1897
63129754 1898 if (!(vcpu->guest_debug &
44c11430 1899 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) &&
6be7d306 1900 !svm->nmi_singlestep) {
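		/*
		 * DR6 bits are architecturally "active low"; XOR-ing the raw
		 * value with DR6_ACTIVE_LOW yields the positive-logic payload
		 * expected by the generic #DB queueing code.
		 */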
9a3ecd5e 1901 u32 payload = svm->vmcb->save.dr6 ^ DR6_ACTIVE_LOW;
63129754 1902 kvm_queue_exception_p(vcpu, DB_VECTOR, payload);
d0bfb940
JK
1903 return 1;
1904 }
44c11430 1905
6be7d306 1906 if (svm->nmi_singlestep) {
4aebd0e9 1907 disable_nmi_singlestep(svm);
99c22179
VK
1908 /* Make sure we check for pending NMIs upon entry */
1909 kvm_make_request(KVM_REQ_EVENT, vcpu);
44c11430
GN
1910 }
1911
63129754 1912 if (vcpu->guest_debug &
e0231715 1913 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) {
44c11430 1914 kvm_run->exit_reason = KVM_EXIT_DEBUG;
dee919d1
PB
1915 kvm_run->debug.arch.dr6 = svm->vmcb->save.dr6;
1916 kvm_run->debug.arch.dr7 = svm->vmcb->save.dr7;
44c11430
GN
1917 kvm_run->debug.arch.pc =
1918 svm->vmcb->save.cs.base + svm->vmcb->save.rip;
1919 kvm_run->debug.arch.exception = DB_VECTOR;
1920 return 0;
1921 }
1922
1923 return 1;
d0bfb940
JK
1924}
1925
63129754 1926static int bp_interception(struct kvm_vcpu *vcpu)
d0bfb940 1927{
63129754
PB
1928 struct vcpu_svm *svm = to_svm(vcpu);
1929 struct kvm_run *kvm_run = vcpu->run;
851ba692 1930
d0bfb940
JK
1931 kvm_run->exit_reason = KVM_EXIT_DEBUG;
1932 kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip;
1933 kvm_run->debug.arch.exception = BP_VECTOR;
1934 return 0;
1935}
1936
63129754 1937static int ud_interception(struct kvm_vcpu *vcpu)
7aa81cc0 1938{
63129754 1939 return handle_ud(vcpu);
7aa81cc0
AL
1940}
1941
63129754 1942static int ac_interception(struct kvm_vcpu *vcpu)
54a20552 1943{
63129754 1944 kvm_queue_exception_e(vcpu, AC_VECTOR, 0);
54a20552
EN
1945 return 1;
1946}
1947
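/*
 * AMD erratum 383 can generate a spurious #MC with a characteristic
 * MC0_STATUS signature. If that signature is present, scrub the MCi_STATUS
 * banks and MCG_STATUS and flush the TLB to evict the offending multi-match
 * entries; svm_handle_mce() then kills the guest rather than forwarding a
 * bogus machine check to the host handler.
 */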
67ec6607
JR
1948static bool is_erratum_383(void)
1949{
1950 int err, i;
1951 u64 value;
1952
1953 if (!erratum_383_found)
1954 return false;
1955
1956 value = native_read_msr_safe(MSR_IA32_MC0_STATUS, &err);
1957 if (err)
1958 return false;
1959
1960 /* Bit 62 may or may not be set for this mce */
1961 value &= ~(1ULL << 62);
1962
1963 if (value != 0xb600000000010015ULL)
1964 return false;
1965
1966 /* Clear MCi_STATUS registers */
1967 for (i = 0; i < 6; ++i)
1968 native_write_msr_safe(MSR_IA32_MCx_STATUS(i), 0, 0);
1969
1970 value = native_read_msr_safe(MSR_IA32_MCG_STATUS, &err);
1971 if (!err) {
1972 u32 low, high;
1973
1974 value &= ~(1ULL << 2);
1975 low = lower_32_bits(value);
1976 high = upper_32_bits(value);
1977
1978 native_write_msr_safe(MSR_IA32_MCG_STATUS, low, high);
1979 }
1980
1981 /* Flush tlb to evict multi-match entries */
1982 __flush_tlb_all();
1983
1984 return true;
1985}
1986
63129754 1987static void svm_handle_mce(struct kvm_vcpu *vcpu)
53371b50 1988{
67ec6607
JR
1989 if (is_erratum_383()) {
1990 /*
1991 * Erratum 383 triggered. Guest state is corrupt so kill the
1992 * guest.
1993 */
1994 pr_err("KVM: Guest triggered AMD Erratum 383\n");
1995
63129754 1996 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
67ec6607
JR
1997
1998 return;
1999 }
2000
53371b50
JR
2001 /*
2002 * On an #MC intercept the MCE handler is not called automatically in
2003 * the host. So do it by hand here.
2004 */
1c164cb3 2005 kvm_machine_check();
fe5913e4
JR
2006}
2007
63129754 2008static int mc_interception(struct kvm_vcpu *vcpu)
fe5913e4 2009{
53371b50
JR
2010 return 1;
2011}
2012
63129754 2013static int shutdown_interception(struct kvm_vcpu *vcpu)
46fe4ddd 2014{
63129754
PB
2015 struct kvm_run *kvm_run = vcpu->run;
2016 struct vcpu_svm *svm = to_svm(vcpu);
851ba692 2017
8164a5ff
TL
2018 /*
2019 * The VM save area has already been encrypted so it
2020 * cannot be reinitialized - just terminate.
2021 */
63129754 2022 if (sev_es_guest(vcpu->kvm))
8164a5ff
TL
2023 return -EINVAL;
2024
46fe4ddd 2025 /*
265e4353
SC
2026 * VMCB is undefined after a SHUTDOWN intercept. INIT the vCPU to put
 2027 * the VMCB in a known good state. Unfortunately, KVM doesn't have
 2028 * KVM_MP_STATE_SHUTDOWN and can't add it without potentially breaking
 2029 * userspace. From a platform perspective, INIT is acceptable behavior as
2030 * there exist bare metal platforms that automatically INIT the CPU
2031 * in response to shutdown.
46fe4ddd 2032 */
a2fa3e9f 2033 clear_page(svm->vmcb);
265e4353 2034 kvm_vcpu_reset(vcpu, true);
46fe4ddd
JR
2035
2036 kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
2037 return 0;
2038}
2039
63129754 2040static int io_interception(struct kvm_vcpu *vcpu)
6aa8b732 2041{
63129754 2042 struct vcpu_svm *svm = to_svm(vcpu);
d77c26fc 2043 u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
dca7f128 2044 int size, in, string;
039576c0 2045 unsigned port;
6aa8b732 2046
63129754 2047 ++vcpu->stat.io_exits;
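 	/*
 	 * IOIO EXITINFO1 layout: bit 0 = direction (1 = IN), bit 2 = string
 	 * instruction, bits 6:4 = operand size (the shifted value is the size
 	 * in bytes), bits 31:16 = port number; the fields below extract this.
 	 */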
e70669ab 2048 string = (io_info & SVM_IOIO_STR_MASK) != 0;
039576c0
AK
2049 in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
2050 port = io_info >> 16;
2051 size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
7ed9abfe
TL
2052
2053 if (string) {
2054 if (sev_es_guest(vcpu->kvm))
2055 return sev_es_string_io(svm, size, port, in);
2056 else
2057 return kvm_emulate_instruction(vcpu, 0);
2058 }
2059
cf8f70bf 2060 svm->next_rip = svm->vmcb->control.exit_info_2;
cf8f70bf 2061
63129754 2062 return kvm_fast_pio(vcpu, size, port, in);
c47f098d
JR
2063}
2064
63129754 2065static int nmi_interception(struct kvm_vcpu *vcpu)
a0698055 2066{
a0698055
JR
2067 return 1;
2068}
2069
991afbbe
ML
2070static int smi_interception(struct kvm_vcpu *vcpu)
2071{
2072 return 1;
2073}
2074
63129754 2075static int intr_interception(struct kvm_vcpu *vcpu)
6aa8b732 2076{
63129754 2077 ++vcpu->stat.irq_exits;
6aa8b732
AK
2078 return 1;
2079}
2080
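/*
 * Emulate VMLOAD/VMSAVE for L1: copy the "extra" guest state (FS, GS, TR,
 * LDTR, KernelGsBase, STAR/LSTAR/CSTAR/SFMASK and the SYSENTER MSRs)
 * between the VMCB addressed by RAX and the current VMCB.
 */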
2ac636a6 2081static int vmload_vmsave_interception(struct kvm_vcpu *vcpu, bool vmload)
6aa8b732 2082{
63129754 2083 struct vcpu_svm *svm = to_svm(vcpu);
9e8f0fbf 2084 struct vmcb *vmcb12;
8c5fbf1a 2085 struct kvm_host_map map;
b742c1e6 2086 int ret;
9966bf68 2087
63129754 2088 if (nested_svm_check_permissions(vcpu))
5542675b
AG
2089 return 1;
2090
63129754 2091 ret = kvm_vcpu_map(vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
8c5fbf1a
KA
2092 if (ret) {
2093 if (ret == -EINVAL)
63129754 2094 kvm_inject_gp(vcpu, 0);
9966bf68 2095 return 1;
8c5fbf1a
KA
2096 }
2097
9e8f0fbf 2098 vmcb12 = map.hva;
9966bf68 2099
63129754 2100 ret = kvm_skip_emulated_instruction(vcpu);
9966bf68 2101
adc2a237 2102 if (vmload) {
2bb16bea 2103 svm_copy_vmloadsave_state(svm->vmcb, vmcb12);
adc2a237
ML
2104 svm->sysenter_eip_hi = 0;
2105 svm->sysenter_esp_hi = 0;
9a9e7481 2106 } else {
2bb16bea 2107 svm_copy_vmloadsave_state(vmcb12, svm->vmcb);
9a9e7481 2108 }
e3e9ed3d 2109
63129754 2110 kvm_vcpu_unmap(vcpu, &map, true);
5542675b 2111
b742c1e6 2112 return ret;
5542675b
AG
2113}
2114
2ac636a6 2115static int vmload_interception(struct kvm_vcpu *vcpu)
5542675b 2116{
2ac636a6
SC
2117 return vmload_vmsave_interception(vcpu, true);
2118}
5542675b 2119
2ac636a6
SC
2120static int vmsave_interception(struct kvm_vcpu *vcpu)
2121{
2122 return vmload_vmsave_interception(vcpu, false);
5542675b
AG
2123}
2124
63129754 2125static int vmrun_interception(struct kvm_vcpu *vcpu)
3d6368ef 2126{
63129754 2127 if (nested_svm_check_permissions(vcpu))
3d6368ef
AG
2128 return 1;
2129
63129754 2130 return nested_svm_vmrun(vcpu);
3d6368ef
AG
2131}
2132
82a11e9c
BD
2133enum {
2134 NONE_SVM_INSTR,
2135 SVM_INSTR_VMRUN,
2136 SVM_INSTR_VMLOAD,
2137 SVM_INSTR_VMSAVE,
2138};
2139
2140/* Return NONE_SVM_INSTR if not SVM instrs, otherwise return decode result */
2141static int svm_instr_opcode(struct kvm_vcpu *vcpu)
2142{
2143 struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
2144
2145 if (ctxt->b != 0x1 || ctxt->opcode_len != 2)
2146 return NONE_SVM_INSTR;
2147
2148 switch (ctxt->modrm) {
2149 case 0xd8: /* VMRUN */
2150 return SVM_INSTR_VMRUN;
2151 case 0xda: /* VMLOAD */
2152 return SVM_INSTR_VMLOAD;
2153 case 0xdb: /* VMSAVE */
2154 return SVM_INSTR_VMSAVE;
2155 default:
2156 break;
2157 }
2158
2159 return NONE_SVM_INSTR;
2160}
2161
2162static int emulate_svm_instr(struct kvm_vcpu *vcpu, int opcode)
2163{
14c2bf81
WH
2164 const int guest_mode_exit_codes[] = {
2165 [SVM_INSTR_VMRUN] = SVM_EXIT_VMRUN,
2166 [SVM_INSTR_VMLOAD] = SVM_EXIT_VMLOAD,
2167 [SVM_INSTR_VMSAVE] = SVM_EXIT_VMSAVE,
2168 };
63129754 2169 int (*const svm_instr_handlers[])(struct kvm_vcpu *vcpu) = {
82a11e9c
BD
2170 [SVM_INSTR_VMRUN] = vmrun_interception,
2171 [SVM_INSTR_VMLOAD] = vmload_interception,
2172 [SVM_INSTR_VMSAVE] = vmsave_interception,
2173 };
2174 struct vcpu_svm *svm = to_svm(vcpu);
2df8d380 2175 int ret;
82a11e9c 2176
14c2bf81 2177 if (is_guest_mode(vcpu)) {
2df8d380 2178 /* Returns '1' or -errno on failure, '0' on success. */
3a87c7e0 2179 ret = nested_svm_simple_vmexit(svm, guest_mode_exit_codes[opcode]);
2df8d380
SC
2180 if (ret)
2181 return ret;
2182 return 1;
2183 }
63129754 2184 return svm_instr_handlers[opcode](vcpu);
82a11e9c
BD
2185}
2186
2187/*
2188 * #GP handling code. Note that #GP can be triggered under the following two
2189 * cases:
2190 * 1) SVM VM-related instructions (VMRUN/VMSAVE/VMLOAD) that trigger #GP on
2191 * some AMD CPUs when EAX of these instructions are in the reserved memory
2192 * regions (e.g. SMM memory on host).
2193 * 2) VMware backdoor
2194 */
63129754 2195static int gp_interception(struct kvm_vcpu *vcpu)
82a11e9c 2196{
63129754 2197 struct vcpu_svm *svm = to_svm(vcpu);
82a11e9c
BD
2198 u32 error_code = svm->vmcb->control.exit_info_1;
2199 int opcode;
2200
2201 /* Both #GP cases have zero error_code */
2202 if (error_code)
2203 goto reinject;
2204
2205 /* Decode the instruction for usage later */
2206 if (x86_decode_emulated_instruction(vcpu, 0, NULL, 0) != EMULATION_OK)
2207 goto reinject;
2208
2209 opcode = svm_instr_opcode(vcpu);
2210
2211 if (opcode == NONE_SVM_INSTR) {
2212 if (!enable_vmware_backdoor)
2213 goto reinject;
2214
2215 /*
2216 * VMware backdoor emulation on #GP interception only handles
2217 * IN{S}, OUT{S}, and RDPMC.
2218 */
14c2bf81
WH
2219 if (!is_guest_mode(vcpu))
2220 return kvm_emulate_instruction(vcpu,
82a11e9c 2221 EMULTYPE_VMWARE_GP | EMULTYPE_NO_DECODE);
47c28d43
DV
2222 } else {
2223 /* All SVM instructions expect page aligned RAX */
2224 if (svm->vmcb->save.rax & ~PAGE_MASK)
2225 goto reinject;
2226
82a11e9c 2227 return emulate_svm_instr(vcpu, opcode);
47c28d43 2228 }
82a11e9c
BD
2229
2230reinject:
2231 kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
2232 return 1;
2233}
2234
ffdf7f9e
PB
2235void svm_set_gif(struct vcpu_svm *svm, bool value)
2236{
2237 if (value) {
2238 /*
2239 * If VGIF is enabled, the STGI intercept is only added to
2240 * detect the opening of the SMI/NMI window; remove it now.
 2241 * Likewise, clear the VINTR intercept; we will set it
 2242 * again while processing KVM_REQ_EVENT if needed.
2243 */
ea91559b 2244 if (vgif)
a284ba56
JR
2245 svm_clr_intercept(svm, INTERCEPT_STGI);
2246 if (svm_is_intercept(svm, INTERCEPT_VINTR))
ffdf7f9e
PB
2247 svm_clear_vintr(svm);
2248
2249 enable_gif(svm);
2250 if (svm->vcpu.arch.smi_pending ||
2251 svm->vcpu.arch.nmi_pending ||
2252 kvm_cpu_has_injectable_intr(&svm->vcpu))
2253 kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
2254 } else {
2255 disable_gif(svm);
2256
2257 /*
2258 * After a CLGI no interrupts should come. But if vGIF is
2259 * in use, we still rely on the VINTR intercept (rather than
2260 * STGI) to detect an open interrupt window.
2261 */
ea91559b 2262 if (!vgif)
ffdf7f9e
PB
2263 svm_clear_vintr(svm);
2264 }
2265}
2266
63129754 2267static int stgi_interception(struct kvm_vcpu *vcpu)
1371d904 2268{
b742c1e6
LP
2269 int ret;
2270
63129754 2271 if (nested_svm_check_permissions(vcpu))
1371d904
AG
2272 return 1;
2273
63129754
PB
2274 ret = kvm_skip_emulated_instruction(vcpu);
2275 svm_set_gif(to_svm(vcpu), true);
b742c1e6 2276 return ret;
1371d904
AG
2277}
2278
63129754 2279static int clgi_interception(struct kvm_vcpu *vcpu)
1371d904 2280{
b742c1e6
LP
2281 int ret;
2282
63129754 2283 if (nested_svm_check_permissions(vcpu))
1371d904
AG
2284 return 1;
2285
63129754
PB
2286 ret = kvm_skip_emulated_instruction(vcpu);
2287 svm_set_gif(to_svm(vcpu), false);
b742c1e6 2288 return ret;
1371d904
AG
2289}
2290
63129754 2291static int invlpga_interception(struct kvm_vcpu *vcpu)
ff092385 2292{
bc9eff67
SC
2293 gva_t gva = kvm_rax_read(vcpu);
2294 u32 asid = kvm_rcx_read(vcpu);
ff092385 2295
bc9eff67
SC
2296 /* FIXME: Handle an address size prefix. */
2297 if (!is_long_mode(vcpu))
2298 gva = (u32)gva;
ff092385 2299
bc9eff67 2300 trace_kvm_invlpga(to_svm(vcpu)->vmcb->save.rip, asid, gva);
532a46b9 2301
ff092385 2302 /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */
bc9eff67 2303 kvm_mmu_invlpg(vcpu, gva);
532a46b9 2304
63129754 2305 return kvm_skip_emulated_instruction(vcpu);
dab429a7
DK
2306}
2307
63129754 2308static int skinit_interception(struct kvm_vcpu *vcpu)
81dd35d4 2309{
63129754 2310 trace_kvm_skinit(to_svm(vcpu)->vmcb->save.rip, kvm_rax_read(vcpu));
81dd35d4 2311
63129754 2312 kvm_queue_exception(vcpu, UD_VECTOR);
0cb8410b
JM
2313 return 1;
2314}
2315
63129754 2316static int task_switch_interception(struct kvm_vcpu *vcpu)
6aa8b732 2317{
63129754 2318 struct vcpu_svm *svm = to_svm(vcpu);
37817f29 2319 u16 tss_selector;
64a7ec06
GN
2320 int reason;
2321 int int_type = svm->vmcb->control.exit_int_info &
2322 SVM_EXITINTINFO_TYPE_MASK;
8317c298 2323 int int_vec = svm->vmcb->control.exit_int_info & SVM_EVTINJ_VEC_MASK;
fe8e7f83
GN
2324 uint32_t type =
2325 svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK;
2326 uint32_t idt_v =
2327 svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID;
e269fb21
JK
2328 bool has_error_code = false;
2329 u32 error_code = 0;
37817f29
IE
2330
2331 tss_selector = (u16)svm->vmcb->control.exit_info_1;
64a7ec06 2332
37817f29
IE
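 	/*
 	 * Work out what initiated the task switch: EXITINFO2 flags identify
 	 * IRET- and far-JMP-initiated switches, a valid pending IDT vector
 	 * means a task gate, and everything else is treated as a CALL/INT.
 	 */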
2333 if (svm->vmcb->control.exit_info_2 &
2334 (1ULL << SVM_EXITINFOSHIFT_TS_REASON_IRET))
64a7ec06
GN
2335 reason = TASK_SWITCH_IRET;
2336 else if (svm->vmcb->control.exit_info_2 &
2337 (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP))
2338 reason = TASK_SWITCH_JMP;
fe8e7f83 2339 else if (idt_v)
64a7ec06
GN
2340 reason = TASK_SWITCH_GATE;
2341 else
2342 reason = TASK_SWITCH_CALL;
2343
fe8e7f83
GN
2344 if (reason == TASK_SWITCH_GATE) {
2345 switch (type) {
2346 case SVM_EXITINTINFO_TYPE_NMI:
63129754 2347 vcpu->arch.nmi_injected = false;
fe8e7f83
GN
2348 break;
2349 case SVM_EXITINTINFO_TYPE_EXEPT:
e269fb21
JK
2350 if (svm->vmcb->control.exit_info_2 &
2351 (1ULL << SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE)) {
2352 has_error_code = true;
2353 error_code =
2354 (u32)svm->vmcb->control.exit_info_2;
2355 }
63129754 2356 kvm_clear_exception_queue(vcpu);
fe8e7f83
GN
2357 break;
2358 case SVM_EXITINTINFO_TYPE_INTR:
63129754 2359 kvm_clear_interrupt_queue(vcpu);
fe8e7f83
GN
2360 break;
2361 default:
2362 break;
2363 }
2364 }
64a7ec06 2365
8317c298
GN
2366 if (reason != TASK_SWITCH_GATE ||
2367 int_type == SVM_EXITINTINFO_TYPE_SOFT ||
2368 (int_type == SVM_EXITINTINFO_TYPE_EXEPT &&
f8ea7c60 2369 (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) {
23e5092b 2370 if (!svm_skip_emulated_instruction(vcpu))
738fece4 2371 return 0;
f8ea7c60 2372 }
64a7ec06 2373
7f3d35fd
KW
2374 if (int_type != SVM_EXITINTINFO_TYPE_SOFT)
2375 int_vec = -1;
2376
63129754 2377 return kvm_task_switch(vcpu, tss_selector, int_vec, reason,
60fc3d02 2378 has_error_code, error_code);
6aa8b732
AK
2379}
2380
63129754 2381static int iret_interception(struct kvm_vcpu *vcpu)
6aa8b732 2382{
63129754 2383 struct vcpu_svm *svm = to_svm(vcpu);
6aa8b732 2384
63129754
PB
2385 ++vcpu->stat.nmi_window_exits;
2386 vcpu->arch.hflags |= HF_IRET_MASK;
2387 if (!sev_es_guest(vcpu->kvm)) {
4444dfe4 2388 svm_clr_intercept(svm, INTERCEPT_IRET);
63129754 2389 svm->nmi_iret_rip = kvm_rip_read(vcpu);
4444dfe4 2390 }
63129754 2391 kvm_make_request(KVM_REQ_EVENT, vcpu);
95ba8273
GN
2392 return 1;
2393}
2394
63129754 2395static int invlpg_interception(struct kvm_vcpu *vcpu)
a7052897 2396{
df4f3108 2397 if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
63129754 2398 return kvm_emulate_instruction(vcpu, 0);
df4f3108 2399
63129754
PB
2400 kvm_mmu_invlpg(vcpu, to_svm(vcpu)->vmcb->control.exit_info_1);
2401 return kvm_skip_emulated_instruction(vcpu);
a7052897
MT
2402}
2403
63129754 2404static int emulate_on_interception(struct kvm_vcpu *vcpu)
6aa8b732 2405{
63129754 2406 return kvm_emulate_instruction(vcpu, 0);
6aa8b732
AK
2407}
2408
63129754 2409static int rsm_interception(struct kvm_vcpu *vcpu)
7607b717 2410{
63129754 2411 return kvm_emulate_instruction_from_buffer(vcpu, rsm_ins_bytes, 2);
7607b717
BS
2412}
2413
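/*
 * L1 may ask only for the "selective" CR0 write intercept. In that case a
 * CR0 write by L2 is forwarded to L1 as SVM_EXIT_CR0_SEL_WRITE only when it
 * changes bits outside SVM_CR0_SELECTIVE_MASK (TS and MP); writes that
 * merely toggle those bits are handled by L0 below.
 */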
63129754 2414static bool check_selective_cr0_intercepted(struct kvm_vcpu *vcpu,
52eb5a6d 2415 unsigned long val)
628afd2a 2416{
63129754
PB
2417 struct vcpu_svm *svm = to_svm(vcpu);
2418 unsigned long cr0 = vcpu->arch.cr0;
628afd2a 2419 bool ret = false;
628afd2a 2420
63129754 2421 if (!is_guest_mode(vcpu) ||
8fc78909 2422 (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_SELECTIVE_CR0))))
628afd2a
JR
2423 return false;
2424
2425 cr0 &= ~SVM_CR0_SELECTIVE_MASK;
2426 val &= ~SVM_CR0_SELECTIVE_MASK;
2427
2428 if (cr0 ^ val) {
2429 svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE;
2430 ret = (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE);
2431 }
2432
2433 return ret;
2434}
2435
7ff76d58
AP
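/* EXITINFO1 bit 63: set when decode assist provided valid MOV-CR information. */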
2436#define CR_VALID (1ULL << 63)
2437
63129754 2438static int cr_interception(struct kvm_vcpu *vcpu)
7ff76d58 2439{
63129754 2440 struct vcpu_svm *svm = to_svm(vcpu);
7ff76d58
AP
2441 int reg, cr;
2442 unsigned long val;
2443 int err;
2444
2445 if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
63129754 2446 return emulate_on_interception(vcpu);
7ff76d58
AP
2447
2448 if (unlikely((svm->vmcb->control.exit_info_1 & CR_VALID) == 0))
63129754 2449 return emulate_on_interception(vcpu);
7ff76d58
AP
2450
2451 reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
5e57518d
DK
2452 if (svm->vmcb->control.exit_code == SVM_EXIT_CR0_SEL_WRITE)
2453 cr = SVM_EXIT_WRITE_CR0 - SVM_EXIT_READ_CR0;
2454 else
2455 cr = svm->vmcb->control.exit_code - SVM_EXIT_READ_CR0;
7ff76d58
AP
2456
2457 err = 0;
2458 if (cr >= 16) { /* mov to cr */
2459 cr -= 16;
27b4a9c4 2460 val = kvm_register_read(vcpu, reg);
95b28ac9 2461 trace_kvm_cr_write(cr, val);
7ff76d58
AP
2462 switch (cr) {
2463 case 0:
63129754
PB
2464 if (!check_selective_cr0_intercepted(vcpu, val))
2465 err = kvm_set_cr0(vcpu, val);
977b2d03
JR
2466 else
2467 return 1;
2468
7ff76d58
AP
2469 break;
2470 case 3:
63129754 2471 err = kvm_set_cr3(vcpu, val);
7ff76d58
AP
2472 break;
2473 case 4:
63129754 2474 err = kvm_set_cr4(vcpu, val);
7ff76d58
AP
2475 break;
2476 case 8:
63129754 2477 err = kvm_set_cr8(vcpu, val);
7ff76d58
AP
2478 break;
2479 default:
2480 WARN(1, "unhandled write to CR%d", cr);
63129754 2481 kvm_queue_exception(vcpu, UD_VECTOR);
7ff76d58
AP
2482 return 1;
2483 }
2484 } else { /* mov from cr */
2485 switch (cr) {
2486 case 0:
63129754 2487 val = kvm_read_cr0(vcpu);
7ff76d58
AP
2488 break;
2489 case 2:
63129754 2490 val = vcpu->arch.cr2;
7ff76d58
AP
2491 break;
2492 case 3:
63129754 2493 val = kvm_read_cr3(vcpu);
7ff76d58
AP
2494 break;
2495 case 4:
63129754 2496 val = kvm_read_cr4(vcpu);
7ff76d58
AP
2497 break;
2498 case 8:
63129754 2499 val = kvm_get_cr8(vcpu);
7ff76d58
AP
2500 break;
2501 default:
2502 WARN(1, "unhandled read from CR%d", cr);
63129754 2503 kvm_queue_exception(vcpu, UD_VECTOR);
7ff76d58
AP
2504 return 1;
2505 }
27b4a9c4 2506 kvm_register_write(vcpu, reg, val);
95b28ac9 2507 trace_kvm_cr_read(cr, val);
7ff76d58 2508 }
63129754 2509 return kvm_complete_insn_gp(vcpu, err);
7ff76d58
AP
2510}
2511
63129754 2512static int cr_trap(struct kvm_vcpu *vcpu)
f27ad38a 2513{
63129754 2514 struct vcpu_svm *svm = to_svm(vcpu);
f27ad38a
TL
2515 unsigned long old_value, new_value;
2516 unsigned int cr;
d1949b93 2517 int ret = 0;
f27ad38a
TL
2518
2519 new_value = (unsigned long)svm->vmcb->control.exit_info_1;
2520
2521 cr = svm->vmcb->control.exit_code - SVM_EXIT_CR0_WRITE_TRAP;
2522 switch (cr) {
2523 case 0:
2524 old_value = kvm_read_cr0(vcpu);
2525 svm_set_cr0(vcpu, new_value);
2526
2527 kvm_post_set_cr0(vcpu, old_value, new_value);
2528 break;
5b51cb13
TL
2529 case 4:
2530 old_value = kvm_read_cr4(vcpu);
2531 svm_set_cr4(vcpu, new_value);
2532
2533 kvm_post_set_cr4(vcpu, old_value, new_value);
2534 break;
d1949b93 2535 case 8:
63129754 2536 ret = kvm_set_cr8(vcpu, new_value);
d1949b93 2537 break;
f27ad38a
TL
2538 default:
2539 WARN(1, "unhandled CR%d write trap", cr);
2540 kvm_queue_exception(vcpu, UD_VECTOR);
2541 return 1;
2542 }
2543
d1949b93 2544 return kvm_complete_insn_gp(vcpu, ret);
f27ad38a
TL
2545}
2546
63129754 2547static int dr_interception(struct kvm_vcpu *vcpu)
cae3797a 2548{
63129754 2549 struct vcpu_svm *svm = to_svm(vcpu);
cae3797a
AP
2550 int reg, dr;
2551 unsigned long val;
996ff542 2552 int err = 0;
cae3797a 2553
63129754 2554 if (vcpu->guest_debug == 0) {
facb0139
PB
2555 /*
2556 * No more DR vmexits; force a reload of the debug registers
2557 * and reenter on this instruction. The next vmexit will
2558 * retrieve the full state of the debug registers.
2559 */
2560 clr_dr_intercepts(svm);
63129754 2561 vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
facb0139
PB
2562 return 1;
2563 }
2564
cae3797a 2565 if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS))
63129754 2566 return emulate_on_interception(vcpu);
cae3797a
AP
2567
2568 reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
2569 dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0;
996ff542
PB
2570 if (dr >= 16) { /* mov to DRn */
2571 dr -= 16;
27b4a9c4 2572 val = kvm_register_read(vcpu, reg);
63129754 2573 err = kvm_set_dr(vcpu, dr, val);
cae3797a 2574 } else {
63129754 2575 kvm_get_dr(vcpu, dr, &val);
27b4a9c4 2576 kvm_register_write(vcpu, reg, val);
cae3797a
AP
2577 }
2578
63129754 2579 return kvm_complete_insn_gp(vcpu, err);
cae3797a
AP
2580}
2581
63129754 2582static int cr8_write_interception(struct kvm_vcpu *vcpu)
1d075434 2583{
eea1cff9 2584 int r;
851ba692 2585
63129754 2586 u8 cr8_prev = kvm_get_cr8(vcpu);
0a5fff19 2587 /* instruction emulation calls kvm_set_cr8() */
63129754
PB
2588 r = cr_interception(vcpu);
2589 if (lapic_in_kernel(vcpu))
7ff76d58 2590 return r;
63129754 2591 if (cr8_prev <= kvm_get_cr8(vcpu))
7ff76d58 2592 return r;
63129754 2593 vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
1d075434
JR
2594 return 0;
2595}
2596
63129754 2597static int efer_trap(struct kvm_vcpu *vcpu)
2985afbc
TL
2598{
2599 struct msr_data msr_info;
2600 int ret;
2601
2602 /*
2603 * Clear the EFER_SVME bit from EFER. The SVM code always sets this
2604 * bit in svm_set_efer(), but __kvm_valid_efer() checks it against
2605 * whether the guest has X86_FEATURE_SVM - this avoids a failure if
2606 * the guest doesn't have X86_FEATURE_SVM.
2607 */
2608 msr_info.host_initiated = false;
2609 msr_info.index = MSR_EFER;
63129754
PB
2610 msr_info.data = to_svm(vcpu)->vmcb->control.exit_info_1 & ~EFER_SVME;
2611 ret = kvm_set_msr_common(vcpu, &msr_info);
2985afbc 2612
63129754 2613 return kvm_complete_insn_gp(vcpu, ret);
2985afbc
TL
2614}
2615
801e459a
TL
2616static int svm_get_msr_feature(struct kvm_msr_entry *msr)
2617{
d1d93fa9
TL
2618 msr->data = 0;
2619
2620 switch (msr->index) {
2621 case MSR_F10H_DECFG:
2622 if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC))
2623 msr->data |= MSR_F10H_DECFG_LFENCE_SERIALIZE;
2624 break;
d574c539
VK
2625 case MSR_IA32_PERF_CAPABILITIES:
2626 return 0;
d1d93fa9 2627 default:
12bc2132 2628 return KVM_MSR_RET_INVALID;
d1d93fa9
TL
2629 }
2630
2631 return 0;
801e459a
TL
2632}
2633
609e36d3 2634static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
6aa8b732 2635{
a2fa3e9f
GH
2636 struct vcpu_svm *svm = to_svm(vcpu);
2637
609e36d3 2638 switch (msr_info->index) {
5228eb96
ML
2639 case MSR_AMD64_TSC_RATIO:
2640 if (!msr_info->host_initiated && !svm->tsc_scaling_enabled)
2641 return 1;
2642 msr_info->data = svm->tsc_ratio_msr;
2643 break;
8c06585d 2644 case MSR_STAR:
cc3ed80a 2645 msr_info->data = svm->vmcb01.ptr->save.star;
6aa8b732 2646 break;
0e859cac 2647#ifdef CONFIG_X86_64
6aa8b732 2648 case MSR_LSTAR:
cc3ed80a 2649 msr_info->data = svm->vmcb01.ptr->save.lstar;
6aa8b732
AK
2650 break;
2651 case MSR_CSTAR:
cc3ed80a 2652 msr_info->data = svm->vmcb01.ptr->save.cstar;
6aa8b732
AK
2653 break;
2654 case MSR_KERNEL_GS_BASE:
cc3ed80a 2655 msr_info->data = svm->vmcb01.ptr->save.kernel_gs_base;
6aa8b732
AK
2656 break;
2657 case MSR_SYSCALL_MASK:
cc3ed80a 2658 msr_info->data = svm->vmcb01.ptr->save.sfmask;
6aa8b732
AK
2659 break;
2660#endif
2661 case MSR_IA32_SYSENTER_CS:
cc3ed80a 2662 msr_info->data = svm->vmcb01.ptr->save.sysenter_cs;
6aa8b732
AK
2663 break;
2664 case MSR_IA32_SYSENTER_EIP:
adc2a237
ML
2665 msr_info->data = (u32)svm->vmcb01.ptr->save.sysenter_eip;
2666 if (guest_cpuid_is_intel(vcpu))
2667 msr_info->data |= (u64)svm->sysenter_eip_hi << 32;
6aa8b732
AK
2668 break;
2669 case MSR_IA32_SYSENTER_ESP:
adc2a237
ML
2670 msr_info->data = svm->vmcb01.ptr->save.sysenter_esp;
2671 if (guest_cpuid_is_intel(vcpu))
2672 msr_info->data |= (u64)svm->sysenter_esp_hi << 32;
6aa8b732 2673 break;
46896c73 2674 case MSR_TSC_AUX:
46896c73
PB
2675 msr_info->data = svm->tsc_aux;
2676 break;
a2938c80 2677 case MSR_IA32_DEBUGCTLMSR:
a2938c80 2678 case MSR_IA32_LASTBRANCHFROMIP:
a2938c80 2679 case MSR_IA32_LASTBRANCHTOIP:
a2938c80 2680 case MSR_IA32_LASTINTFROMIP:
a2938c80 2681 case MSR_IA32_LASTINTTOIP:
1d5a1b58 2682 msr_info->data = svm_get_lbr_msr(svm, msr_info->index);
a2938c80 2683 break;
b286d5d8 2684 case MSR_VM_HSAVE_PA:
609e36d3 2685 msr_info->data = svm->nested.hsave_msr;
b286d5d8 2686 break;
eb6f302e 2687 case MSR_VM_CR:
609e36d3 2688 msr_info->data = svm->nested.vm_cr_msr;
eb6f302e 2689 break;
b2ac58f9
KA
2690 case MSR_IA32_SPEC_CTRL:
2691 if (!msr_info->host_initiated &&
39485ed9 2692 !guest_has_spec_ctrl_msr(vcpu))
b2ac58f9
KA
2693 return 1;
2694
d00b99c5
BM
2695 if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
2696 msr_info->data = svm->vmcb->save.spec_ctrl;
2697 else
2698 msr_info->data = svm->spec_ctrl;
b2ac58f9 2699 break;
bc226f07
TL
2700 case MSR_AMD64_VIRT_SPEC_CTRL:
2701 if (!msr_info->host_initiated &&
2702 !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD))
2703 return 1;
2704
2705 msr_info->data = svm->virt_spec_ctrl;
2706 break;
ae8b7875
BP
2707 case MSR_F15H_IC_CFG: {
2708
2709 int family, model;
2710
2711 family = guest_cpuid_family(vcpu);
2712 model = guest_cpuid_model(vcpu);
2713
2714 if (family < 0 || model < 0)
2715 return kvm_get_msr_common(vcpu, msr_info);
2716
2717 msr_info->data = 0;
2718
2719 if (family == 0x15 &&
2720 (model >= 0x2 && model < 0x20))
2721 msr_info->data = 0x1E;
2722 }
2723 break;
d1d93fa9
TL
2724 case MSR_F10H_DECFG:
2725 msr_info->data = svm->msr_decfg;
2726 break;
6aa8b732 2727 default:
609e36d3 2728 return kvm_get_msr_common(vcpu, msr_info);
6aa8b732
AK
2729 }
2730 return 0;
2731}
2732
f1c6366e
TL
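/*
 * For an SEV-ES guest a failed MSR access cannot be completed by injecting
 * #GP directly; the error is reported back through the GHCB sw_exit_info
 * fields as defined by the GHCB protocol.
 */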
2733static int svm_complete_emulated_msr(struct kvm_vcpu *vcpu, int err)
2734{
2735 struct vcpu_svm *svm = to_svm(vcpu);
b67a4cc3 2736 if (!err || !sev_es_guest(vcpu->kvm) || WARN_ON_ONCE(!svm->sev_es.ghcb))
63129754 2737 return kvm_complete_insn_gp(vcpu, err);
f1c6366e 2738
b67a4cc3
PG
2739 ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 1);
2740 ghcb_set_sw_exit_info_2(svm->sev_es.ghcb,
f1c6366e
TL
2741 X86_TRAP_GP |
2742 SVM_EVTINJ_TYPE_EXEPT |
2743 SVM_EVTINJ_VALID);
2744 return 1;
2745}
2746
4a810181
JR
2747static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data)
2748{
2749 struct vcpu_svm *svm = to_svm(vcpu);
2750 int svm_dis, chg_mask;
2751
2752 if (data & ~SVM_VM_CR_VALID_MASK)
2753 return 1;
2754
2755 chg_mask = SVM_VM_CR_VALID_MASK;
2756
2757 if (svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK)
2758 chg_mask &= ~(SVM_VM_CR_SVM_LOCK_MASK | SVM_VM_CR_SVM_DIS_MASK);
2759
2760 svm->nested.vm_cr_msr &= ~chg_mask;
2761 svm->nested.vm_cr_msr |= (data & chg_mask);
2762
2763 svm_dis = svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK;
2764
2765 /* check for svm_disable while efer.svme is set */
2766 if (svm_dis && (vcpu->arch.efer & EFER_SVME))
2767 return 1;
2768
2769 return 0;
2770}
2771
8fe8ab46 2772static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
6aa8b732 2773{
a2fa3e9f 2774 struct vcpu_svm *svm = to_svm(vcpu);
844d69c2 2775 int r;
a2fa3e9f 2776
8fe8ab46
WA
2777 u32 ecx = msr->index;
2778 u64 data = msr->data;
6aa8b732 2779 switch (ecx) {
5228eb96 2780 case MSR_AMD64_TSC_RATIO:
e910a53f
ML
2781
2782 if (!svm->tsc_scaling_enabled) {
2783
2784 if (!msr->host_initiated)
2785 return 1;
2786 /*
2787 * In case TSC scaling is not enabled, always
2788 * leave this MSR at the default value.
2789 *
2790 * Due to bug in qemu 6.2.0, it would try to set
2791 * this msr to 0 if tsc scaling is not enabled.
2792 * Ignore this value as well.
2793 */
2794 if (data != 0 && data != svm->tsc_ratio_msr)
2795 return 1;
2796 break;
2797 }
5228eb96 2798
bb2aa78e 2799 if (data & SVM_TSC_RATIO_RSVD)
5228eb96
ML
2800 return 1;
2801
2802 svm->tsc_ratio_msr = data;
2803
2804 if (svm->tsc_scaling_enabled && is_guest_mode(vcpu))
2805 nested_svm_update_tsc_ratio_msr(vcpu);
2806
2807 break;
15038e14
PB
2808 case MSR_IA32_CR_PAT:
2809 if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
2810 return 1;
2811 vcpu->arch.pat = data;
4995a368
CA
2812 svm->vmcb01.ptr->save.g_pat = data;
2813 if (is_guest_mode(vcpu))
2814 nested_vmcb02_compute_g_pat(svm);
06e7852c 2815 vmcb_mark_dirty(svm->vmcb, VMCB_NPT);
15038e14 2816 break;
b2ac58f9
KA
2817 case MSR_IA32_SPEC_CTRL:
2818 if (!msr->host_initiated &&
39485ed9 2819 !guest_has_spec_ctrl_msr(vcpu))
b2ac58f9
KA
2820 return 1;
2821
841c2be0 2822 if (kvm_spec_ctrl_test_value(data))
b2ac58f9
KA
2823 return 1;
2824
d00b99c5
BM
2825 if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
2826 svm->vmcb->save.spec_ctrl = data;
2827 else
2828 svm->spec_ctrl = data;
b2ac58f9
KA
2829 if (!data)
2830 break;
2831
2832 /*
2833 * For non-nested:
2834 * When it's written (to non-zero) for the first time, pass
2835 * it through.
2836 *
2837 * For nested:
2838 * The handling of the MSR bitmap for L2 guests is done in
2839 * nested_svm_vmrun_msrpm.
2840 * We update the L1 MSR bit as well since it will end up
2841 * touching the MSR anyway now.
2842 */
476c9bd8 2843 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
b2ac58f9 2844 break;
15d45071
AR
2845 case MSR_IA32_PRED_CMD:
2846 if (!msr->host_initiated &&
39485ed9 2847 !guest_has_pred_cmd_msr(vcpu))
15d45071
AR
2848 return 1;
2849
2850 if (data & ~PRED_CMD_IBPB)
2851 return 1;
39485ed9 2852 if (!boot_cpu_has(X86_FEATURE_IBPB))
6441fa61 2853 return 1;
15d45071
AR
2854 if (!data)
2855 break;
2856
2857 wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
476c9bd8 2858 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_PRED_CMD, 0, 1);
15d45071 2859 break;
bc226f07
TL
2860 case MSR_AMD64_VIRT_SPEC_CTRL:
2861 if (!msr->host_initiated &&
2862 !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD))
2863 return 1;
2864
2865 if (data & ~SPEC_CTRL_SSBD)
2866 return 1;
2867
2868 svm->virt_spec_ctrl = data;
2869 break;
8c06585d 2870 case MSR_STAR:
cc3ed80a 2871 svm->vmcb01.ptr->save.star = data;
6aa8b732 2872 break;
49b14f24 2873#ifdef CONFIG_X86_64
6aa8b732 2874 case MSR_LSTAR:
cc3ed80a 2875 svm->vmcb01.ptr->save.lstar = data;
6aa8b732
AK
2876 break;
2877 case MSR_CSTAR:
cc3ed80a 2878 svm->vmcb01.ptr->save.cstar = data;
6aa8b732
AK
2879 break;
2880 case MSR_KERNEL_GS_BASE:
cc3ed80a 2881 svm->vmcb01.ptr->save.kernel_gs_base = data;
6aa8b732
AK
2882 break;
2883 case MSR_SYSCALL_MASK:
cc3ed80a 2884 svm->vmcb01.ptr->save.sfmask = data;
6aa8b732
AK
2885 break;
2886#endif
2887 case MSR_IA32_SYSENTER_CS:
cc3ed80a 2888 svm->vmcb01.ptr->save.sysenter_cs = data;
6aa8b732
AK
2889 break;
2890 case MSR_IA32_SYSENTER_EIP:
adc2a237
ML
2891 svm->vmcb01.ptr->save.sysenter_eip = (u32)data;
2892 /*
2893 * We only intercept the MSR_IA32_SYSENTER_{EIP|ESP} msrs
2894 * when we spoof an Intel vendor ID (for cross vendor migration).
2895 * In this case we use this intercept to track the high
2896 * 32 bit part of these msrs to support Intel's
2897 * implementation of SYSENTER/SYSEXIT.
2898 */
2899 svm->sysenter_eip_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0;
6aa8b732
AK
2900 break;
2901 case MSR_IA32_SYSENTER_ESP:
adc2a237
ML
2902 svm->vmcb01.ptr->save.sysenter_esp = (u32)data;
2903 svm->sysenter_esp_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0;
6aa8b732 2904 break;
46896c73 2905 case MSR_TSC_AUX:
46896c73 2906 /*
844d69c2
SC
2907 * TSC_AUX is usually changed only during boot and never read
2908 * directly. Intercept TSC_AUX instead of exposing it to the
2909 * guest via direct_access_msrs, and switch it via user return.
46896c73 2910 */
844d69c2 2911 preempt_disable();
0caa0a77 2912 r = kvm_set_user_return_msr(tsc_aux_uret_slot, data, -1ull);
844d69c2
SC
2913 preempt_enable();
2914 if (r)
2915 return 1;
2916
46896c73 2917 svm->tsc_aux = data;
46896c73 2918 break;
a2938c80 2919 case MSR_IA32_DEBUGCTLMSR:
4c84926e 2920 if (!lbrv) {
a737f256
CD
2921 vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n",
2922 __func__, data);
24e09cbf
JR
2923 break;
2924 }
2925 if (data & DEBUGCTL_RESERVED_BITS)
2926 return 1;
2927
1d5a1b58
ML
2928 if (svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK)
2929 svm->vmcb->save.dbgctl = data;
24e09cbf 2930 else
1d5a1b58
ML
2931 svm->vmcb01.ptr->save.dbgctl = data;
2932
2933 svm_update_lbrv(vcpu);
2934
a2938c80 2935 break;
b286d5d8 2936 case MSR_VM_HSAVE_PA:
fce7e152
VK
2937 /*
2938 * Old kernels did not validate the value written to
2939 * MSR_VM_HSAVE_PA. Allow KVM_SET_MSR to set an invalid
2940 * value to allow live migrating buggy or malicious guests
2941 * originating from those kernels.
2942 */
2943 if (!msr->host_initiated && !page_address_valid(vcpu, data))
2944 return 1;
2945
2946 svm->nested.hsave_msr = data & PAGE_MASK;
62b9abaa 2947 break;
3c5d0a44 2948 case MSR_VM_CR:
4a810181 2949 return svm_set_vm_cr(vcpu, data);
3c5d0a44 2950 case MSR_VM_IGNNE:
a737f256 2951 vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
3c5d0a44 2952 break;
d1d93fa9
TL
2953 case MSR_F10H_DECFG: {
2954 struct kvm_msr_entry msr_entry;
2955
2956 msr_entry.index = msr->index;
2957 if (svm_get_msr_feature(&msr_entry))
2958 return 1;
2959
2960 /* Check the supported bits */
2961 if (data & ~msr_entry.data)
2962 return 1;
2963
2964 /* Don't allow the guest to change a bit, #GP */
2965 if (!msr->host_initiated && (data ^ msr_entry.data))
2966 return 1;
2967
2968 svm->msr_decfg = data;
2969 break;
2970 }
6aa8b732 2971 default:
8fe8ab46 2972 return kvm_set_msr_common(vcpu, msr);
6aa8b732
AK
2973 }
2974 return 0;
2975}
2976
63129754 2977static int msr_interception(struct kvm_vcpu *vcpu)
6aa8b732 2978{
63129754 2979 if (to_svm(vcpu)->vmcb->control.exit_info_1)
5ff3a351 2980 return kvm_emulate_wrmsr(vcpu);
6aa8b732 2981 else
5ff3a351 2982 return kvm_emulate_rdmsr(vcpu);
6aa8b732
AK
2983}
2984
63129754 2985static int interrupt_window_interception(struct kvm_vcpu *vcpu)
c1150d8c 2986{
63129754
PB
2987 kvm_make_request(KVM_REQ_EVENT, vcpu);
2988 svm_clear_vintr(to_svm(vcpu));
f3515dc3
SS
2989
2990 /*
f44509f8 2991 * If not running nested, for AVIC, the only reason to end up here is ExtINTs.
f3515dc3
SS
2992 * In this case AVIC was temporarily disabled for
2993 * requesting the IRQ window and we have to re-enable it.
f44509f8
ML
2994 *
2995 * If running nested, still remove the VM wide AVIC inhibit to
 2996 * support the case in which the interrupt window was requested when
 2997 * the vCPU was not running nested.
 2998 *
 2999 * Any vCPU that is still running nested keeps its AVIC inhibited
 3000 * anyway, due to the per-vCPU AVIC inhibition.
f3515dc3 3001 */
320af55a 3002 kvm_clear_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN);
f3515dc3 3003
63129754 3004 ++vcpu->stat.irq_window_exits;
c1150d8c
DL
3005 return 1;
3006}
3007
63129754 3008static int pause_interception(struct kvm_vcpu *vcpu)
565d0998 3009{
f1c6366e 3010 bool in_kernel;
f1c6366e
TL
3011 /*
3012 * CPL is not made available for an SEV-ES guest, therefore
3013 * vcpu->arch.preempted_in_kernel can never be true. Just
3014 * set in_kernel to false as well.
3015 */
63129754 3016 in_kernel = !sev_es_guest(vcpu->kvm) && svm_get_cpl(vcpu) == 0;
de63ad4c 3017
74fd41ed 3018 grow_ple_window(vcpu);
8566ac8b 3019
de63ad4c 3020 kvm_vcpu_on_spin(vcpu, in_kernel);
c8781fea 3021 return kvm_skip_emulated_instruction(vcpu);
87c00572
GS
3022}
3023
63129754 3024static int invpcid_interception(struct kvm_vcpu *vcpu)
87c00572 3025{
63129754 3026 struct vcpu_svm *svm = to_svm(vcpu);
4407a797
BM
3027 unsigned long type;
3028 gva_t gva;
3029
3030 if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) {
3031 kvm_queue_exception(vcpu, UD_VECTOR);
3032 return 1;
3033 }
3034
3035 /*
3036 * For an INVPCID intercept:
3037 * EXITINFO1 provides the linear address of the memory operand.
3038 * EXITINFO2 provides the contents of the register operand.
3039 */
3040 type = svm->vmcb->control.exit_info_2;
3041 gva = svm->vmcb->control.exit_info_1;
3042
4407a797
BM
3043 return kvm_handle_invpcid(vcpu, type, gva);
3044}
3045
63129754 3046static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = {
7ff76d58
AP
3047 [SVM_EXIT_READ_CR0] = cr_interception,
3048 [SVM_EXIT_READ_CR3] = cr_interception,
3049 [SVM_EXIT_READ_CR4] = cr_interception,
3050 [SVM_EXIT_READ_CR8] = cr_interception,
5e57518d 3051 [SVM_EXIT_CR0_SEL_WRITE] = cr_interception,
628afd2a 3052 [SVM_EXIT_WRITE_CR0] = cr_interception,
7ff76d58
AP
3053 [SVM_EXIT_WRITE_CR3] = cr_interception,
3054 [SVM_EXIT_WRITE_CR4] = cr_interception,
e0231715 3055 [SVM_EXIT_WRITE_CR8] = cr8_write_interception,
cae3797a
AP
3056 [SVM_EXIT_READ_DR0] = dr_interception,
3057 [SVM_EXIT_READ_DR1] = dr_interception,
3058 [SVM_EXIT_READ_DR2] = dr_interception,
3059 [SVM_EXIT_READ_DR3] = dr_interception,
3060 [SVM_EXIT_READ_DR4] = dr_interception,
3061 [SVM_EXIT_READ_DR5] = dr_interception,
3062 [SVM_EXIT_READ_DR6] = dr_interception,
3063 [SVM_EXIT_READ_DR7] = dr_interception,
3064 [SVM_EXIT_WRITE_DR0] = dr_interception,
3065 [SVM_EXIT_WRITE_DR1] = dr_interception,
3066 [SVM_EXIT_WRITE_DR2] = dr_interception,
3067 [SVM_EXIT_WRITE_DR3] = dr_interception,
3068 [SVM_EXIT_WRITE_DR4] = dr_interception,
3069 [SVM_EXIT_WRITE_DR5] = dr_interception,
3070 [SVM_EXIT_WRITE_DR6] = dr_interception,
3071 [SVM_EXIT_WRITE_DR7] = dr_interception,
d0bfb940
JK
3072 [SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception,
3073 [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception,
7aa81cc0 3074 [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception,
e0231715 3075 [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception,
e0231715 3076 [SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception,
54a20552 3077 [SVM_EXIT_EXCP_BASE + AC_VECTOR] = ac_interception,
9718420e 3078 [SVM_EXIT_EXCP_BASE + GP_VECTOR] = gp_interception,
e0231715 3079 [SVM_EXIT_INTR] = intr_interception,
c47f098d 3080 [SVM_EXIT_NMI] = nmi_interception,
991afbbe 3081 [SVM_EXIT_SMI] = smi_interception,
c1150d8c 3082 [SVM_EXIT_VINTR] = interrupt_window_interception,
32c23c7d 3083 [SVM_EXIT_RDPMC] = kvm_emulate_rdpmc,
5ff3a351 3084 [SVM_EXIT_CPUID] = kvm_emulate_cpuid,
95ba8273 3085 [SVM_EXIT_IRET] = iret_interception,
5ff3a351 3086 [SVM_EXIT_INVD] = kvm_emulate_invd,
565d0998 3087 [SVM_EXIT_PAUSE] = pause_interception,
5ff3a351 3088 [SVM_EXIT_HLT] = kvm_emulate_halt,
a7052897 3089 [SVM_EXIT_INVLPG] = invlpg_interception,
ff092385 3090 [SVM_EXIT_INVLPGA] = invlpga_interception,
e0231715 3091 [SVM_EXIT_IOIO] = io_interception,
6aa8b732
AK
3092 [SVM_EXIT_MSR] = msr_interception,
3093 [SVM_EXIT_TASK_SWITCH] = task_switch_interception,
46fe4ddd 3094 [SVM_EXIT_SHUTDOWN] = shutdown_interception,
3d6368ef 3095 [SVM_EXIT_VMRUN] = vmrun_interception,
5ff3a351 3096 [SVM_EXIT_VMMCALL] = kvm_emulate_hypercall,
5542675b
AG
3097 [SVM_EXIT_VMLOAD] = vmload_interception,
3098 [SVM_EXIT_VMSAVE] = vmsave_interception,
1371d904
AG
3099 [SVM_EXIT_STGI] = stgi_interception,
3100 [SVM_EXIT_CLGI] = clgi_interception,
532a46b9 3101 [SVM_EXIT_SKINIT] = skinit_interception,
3b195ac9 3102 [SVM_EXIT_RDTSCP] = kvm_handle_invalid_op,
5ff3a351
SC
3103 [SVM_EXIT_WBINVD] = kvm_emulate_wbinvd,
3104 [SVM_EXIT_MONITOR] = kvm_emulate_monitor,
3105 [SVM_EXIT_MWAIT] = kvm_emulate_mwait,
92f9895c 3106 [SVM_EXIT_XSETBV] = kvm_emulate_xsetbv,
5ff3a351 3107 [SVM_EXIT_RDPRU] = kvm_handle_invalid_op,
2985afbc 3108 [SVM_EXIT_EFER_WRITE_TRAP] = efer_trap,
f27ad38a 3109 [SVM_EXIT_CR0_WRITE_TRAP] = cr_trap,
5b51cb13 3110 [SVM_EXIT_CR4_WRITE_TRAP] = cr_trap,
d1949b93 3111 [SVM_EXIT_CR8_WRITE_TRAP] = cr_trap,
4407a797 3112 [SVM_EXIT_INVPCID] = invpcid_interception,
d0006530 3113 [SVM_EXIT_NPF] = npf_interception,
7607b717 3114 [SVM_EXIT_RSM] = rsm_interception,
18f40c53
SS
3115 [SVM_EXIT_AVIC_INCOMPLETE_IPI] = avic_incomplete_ipi_interception,
3116 [SVM_EXIT_AVIC_UNACCELERATED_ACCESS] = avic_unaccelerated_access_interception,
291bd20d 3117 [SVM_EXIT_VMGEXIT] = sev_handle_vmgexit,
6aa8b732
AK
3118};
3119
ae8cc059 3120static void dump_vmcb(struct kvm_vcpu *vcpu)
3f10c846
JR
3121{
3122 struct vcpu_svm *svm = to_svm(vcpu);
3123 struct vmcb_control_area *control = &svm->vmcb->control;
3124 struct vmcb_save_area *save = &svm->vmcb->save;
cc3ed80a 3125 struct vmcb_save_area *save01 = &svm->vmcb01.ptr->save;
3f10c846 3126
6f2f8453
PB
3127 if (!dump_invalid_vmcb) {
3128 pr_warn_ratelimited("set kvm_amd.dump_invalid_vmcb=1 to dump internal KVM state.\n");
3129 return;
3130 }
3131
18f63b15
JM
3132 pr_err("VMCB %p, last attempted VMRUN on CPU %d\n",
3133 svm->current_vmcb->ptr, vcpu->arch.last_vmentry_cpu);
3f10c846 3134 pr_err("VMCB Control Area:\n");
03bfeeb9
BM
3135 pr_err("%-20s%04x\n", "cr_read:", control->intercepts[INTERCEPT_CR] & 0xffff);
3136 pr_err("%-20s%04x\n", "cr_write:", control->intercepts[INTERCEPT_CR] >> 16);
30abaa88
BM
3137 pr_err("%-20s%04x\n", "dr_read:", control->intercepts[INTERCEPT_DR] & 0xffff);
3138 pr_err("%-20s%04x\n", "dr_write:", control->intercepts[INTERCEPT_DR] >> 16);
9780d51d 3139 pr_err("%-20s%08x\n", "exceptions:", control->intercepts[INTERCEPT_EXCEPTION]);
c62e2e94
BM
3140 pr_err("%-20s%08x %08x\n", "intercepts:",
3141 control->intercepts[INTERCEPT_WORD3],
3142 control->intercepts[INTERCEPT_WORD4]);
ae8cc059 3143 pr_err("%-20s%d\n", "pause filter count:", control->pause_filter_count);
1d8fb44a
BM
3144 pr_err("%-20s%d\n", "pause filter threshold:",
3145 control->pause_filter_thresh);
ae8cc059
JP
3146 pr_err("%-20s%016llx\n", "iopm_base_pa:", control->iopm_base_pa);
3147 pr_err("%-20s%016llx\n", "msrpm_base_pa:", control->msrpm_base_pa);
3148 pr_err("%-20s%016llx\n", "tsc_offset:", control->tsc_offset);
3149 pr_err("%-20s%d\n", "asid:", control->asid);
3150 pr_err("%-20s%d\n", "tlb_ctl:", control->tlb_ctl);
3151 pr_err("%-20s%08x\n", "int_ctl:", control->int_ctl);
3152 pr_err("%-20s%08x\n", "int_vector:", control->int_vector);
3153 pr_err("%-20s%08x\n", "int_state:", control->int_state);
3154 pr_err("%-20s%08x\n", "exit_code:", control->exit_code);
3155 pr_err("%-20s%016llx\n", "exit_info1:", control->exit_info_1);
3156 pr_err("%-20s%016llx\n", "exit_info2:", control->exit_info_2);
3157 pr_err("%-20s%08x\n", "exit_int_info:", control->exit_int_info);
3158 pr_err("%-20s%08x\n", "exit_int_info_err:", control->exit_int_info_err);
3159 pr_err("%-20s%lld\n", "nested_ctl:", control->nested_ctl);
3160 pr_err("%-20s%016llx\n", "nested_cr3:", control->nested_cr3);
44a95dae 3161 pr_err("%-20s%016llx\n", "avic_vapic_bar:", control->avic_vapic_bar);
291bd20d 3162 pr_err("%-20s%016llx\n", "ghcb:", control->ghcb_gpa);
ae8cc059
JP
3163 pr_err("%-20s%08x\n", "event_inj:", control->event_inj);
3164 pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err);
0dc92119 3165 pr_err("%-20s%lld\n", "virt_ext:", control->virt_ext);
ae8cc059 3166 pr_err("%-20s%016llx\n", "next_rip:", control->next_rip);
44a95dae
SS
3167 pr_err("%-20s%016llx\n", "avic_backing_page:", control->avic_backing_page);
3168 pr_err("%-20s%016llx\n", "avic_logical_id:", control->avic_logical_id);
3169 pr_err("%-20s%016llx\n", "avic_physical_id:", control->avic_physical_id);
376c6d28 3170 pr_err("%-20s%016llx\n", "vmsa_pa:", control->vmsa_pa);
3f10c846 3171 pr_err("VMCB State Save Area:\n");
ae8cc059
JP
3172 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3173 "es:",
3174 save->es.selector, save->es.attrib,
3175 save->es.limit, save->es.base);
3176 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3177 "cs:",
3178 save->cs.selector, save->cs.attrib,
3179 save->cs.limit, save->cs.base);
3180 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3181 "ss:",
3182 save->ss.selector, save->ss.attrib,
3183 save->ss.limit, save->ss.base);
3184 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3185 "ds:",
3186 save->ds.selector, save->ds.attrib,
3187 save->ds.limit, save->ds.base);
3188 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3189 "fs:",
cc3ed80a
ML
3190 save01->fs.selector, save01->fs.attrib,
3191 save01->fs.limit, save01->fs.base);
ae8cc059
JP
3192 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3193 "gs:",
cc3ed80a
ML
3194 save01->gs.selector, save01->gs.attrib,
3195 save01->gs.limit, save01->gs.base);
ae8cc059
JP
3196 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3197 "gdtr:",
3198 save->gdtr.selector, save->gdtr.attrib,
3199 save->gdtr.limit, save->gdtr.base);
3200 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3201 "ldtr:",
cc3ed80a
ML
3202 save01->ldtr.selector, save01->ldtr.attrib,
3203 save01->ldtr.limit, save01->ldtr.base);
ae8cc059
JP
3204 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3205 "idtr:",
3206 save->idtr.selector, save->idtr.attrib,
3207 save->idtr.limit, save->idtr.base);
3208 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3209 "tr:",
cc3ed80a
ML
3210 save01->tr.selector, save01->tr.attrib,
3211 save01->tr.limit, save01->tr.base);
046f773b
BS
3212 pr_err("vmpl: %d cpl: %d efer: %016llx\n",
3213 save->vmpl, save->cpl, save->efer);
ae8cc059
JP
3214 pr_err("%-15s %016llx %-13s %016llx\n",
3215 "cr0:", save->cr0, "cr2:", save->cr2);
3216 pr_err("%-15s %016llx %-13s %016llx\n",
3217 "cr3:", save->cr3, "cr4:", save->cr4);
3218 pr_err("%-15s %016llx %-13s %016llx\n",
3219 "dr6:", save->dr6, "dr7:", save->dr7);
3220 pr_err("%-15s %016llx %-13s %016llx\n",
3221 "rip:", save->rip, "rflags:", save->rflags);
3222 pr_err("%-15s %016llx %-13s %016llx\n",
3223 "rsp:", save->rsp, "rax:", save->rax);
3224 pr_err("%-15s %016llx %-13s %016llx\n",
cc3ed80a 3225 "star:", save01->star, "lstar:", save01->lstar);
ae8cc059 3226 pr_err("%-15s %016llx %-13s %016llx\n",
cc3ed80a 3227 "cstar:", save01->cstar, "sfmask:", save01->sfmask);
ae8cc059 3228 pr_err("%-15s %016llx %-13s %016llx\n",
cc3ed80a
ML
3229 "kernel_gs_base:", save01->kernel_gs_base,
3230 "sysenter_cs:", save01->sysenter_cs);
ae8cc059 3231 pr_err("%-15s %016llx %-13s %016llx\n",
cc3ed80a
ML
3232 "sysenter_esp:", save01->sysenter_esp,
3233 "sysenter_eip:", save01->sysenter_eip);
ae8cc059
JP
3234 pr_err("%-15s %016llx %-13s %016llx\n",
3235 "gpat:", save->g_pat, "dbgctl:", save->dbgctl);
3236 pr_err("%-15s %016llx %-13s %016llx\n",
3237 "br_from:", save->br_from, "br_to:", save->br_to);
3238 pr_err("%-15s %016llx %-13s %016llx\n",
3239 "excp_from:", save->last_excp_from,
3240 "excp_to:", save->last_excp_to);
3f10c846
JR
3241}
3242
98242dca 3243static bool svm_check_exit_valid(u64 exit_code)
e9093fd4 3244{
7a4bca85
ML
3245 return (exit_code < ARRAY_SIZE(svm_exit_handlers) &&
3246 svm_exit_handlers[exit_code]);
3247}
e9093fd4 3248
7a4bca85
ML
3249static int svm_handle_invalid_exit(struct kvm_vcpu *vcpu, u64 exit_code)
3250{
e9093fd4
TL
3251 vcpu_unimpl(vcpu, "svm: unexpected exit reason 0x%llx\n", exit_code);
3252 dump_vmcb(vcpu);
3253 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
3254 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
3255 vcpu->run->internal.ndata = 2;
3256 vcpu->run->internal.data[0] = exit_code;
3257 vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
7a4bca85 3258 return 0;
e9093fd4
TL
3259}
3260
63129754 3261int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code)
e9093fd4 3262{
98242dca 3263 if (!svm_check_exit_valid(exit_code))
7a4bca85 3264 return svm_handle_invalid_exit(vcpu, exit_code);
e9093fd4
TL
3265
3266#ifdef CONFIG_RETPOLINE
3267 if (exit_code == SVM_EXIT_MSR)
63129754 3268 return msr_interception(vcpu);
e9093fd4 3269 else if (exit_code == SVM_EXIT_VINTR)
63129754 3270 return interrupt_window_interception(vcpu);
e9093fd4 3271 else if (exit_code == SVM_EXIT_INTR)
63129754 3272 return intr_interception(vcpu);
e9093fd4 3273 else if (exit_code == SVM_EXIT_HLT)
5ff3a351 3274 return kvm_emulate_halt(vcpu);
e9093fd4 3275 else if (exit_code == SVM_EXIT_NPF)
63129754 3276 return npf_interception(vcpu);
e9093fd4 3277#endif
63129754 3278 return svm_exit_handlers[exit_code](vcpu);
e9093fd4
TL
3279}
3280
0a62a031
DE
3281static void svm_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
3282 u64 *info1, u64 *info2,
235ba74f 3283 u32 *intr_info, u32 *error_code)
586f9607
AK
3284{
3285 struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control;
3286
0a62a031 3287 *reason = control->exit_code;
586f9607
AK
3288 *info1 = control->exit_info_1;
3289 *info2 = control->exit_info_2;
235ba74f
SC
3290 *intr_info = control->exit_int_info;
3291 if ((*intr_info & SVM_EXITINTINFO_VALID) &&
3292 (*intr_info & SVM_EXITINTINFO_VALID_ERR))
3293 *error_code = control->exit_int_info_err;
3294 else
3295 *error_code = 0;
586f9607
AK
3296}
3297
23e5092b 3298static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
6aa8b732 3299{
04d2cc77 3300 struct vcpu_svm *svm = to_svm(vcpu);
851ba692 3301 struct kvm_run *kvm_run = vcpu->run;
a2fa3e9f 3302 u32 exit_code = svm->vmcb->control.exit_code;
6aa8b732 3303
0a62a031 3304 trace_kvm_exit(vcpu, KVM_ISA_SVM);
8b89fe1f 3305
f1c6366e
TL
3306 /* SEV-ES guests must use the CR write traps to track CR registers. */
3307 if (!sev_es_guest(vcpu->kvm)) {
3308 if (!svm_is_intercept(svm, INTERCEPT_CR0_WRITE))
3309 vcpu->arch.cr0 = svm->vmcb->save.cr0;
3310 if (npt_enabled)
3311 vcpu->arch.cr3 = svm->vmcb->save.cr3;
3312 }
af9ca2d7 3313
2030753d 3314 if (is_guest_mode(vcpu)) {
410e4d57
JR
3315 int vmexit;
3316
0a62a031 3317 trace_kvm_nested_vmexit(vcpu, KVM_ISA_SVM);
d8cabddf 3318
410e4d57
JR
3319 vmexit = nested_svm_exit_special(svm);
3320
3321 if (vmexit == NESTED_EXIT_CONTINUE)
3322 vmexit = nested_svm_exit_handled(svm);
3323
3324 if (vmexit == NESTED_EXIT_DONE)
cf74a78b 3325 return 1;
cf74a78b
AG
3326 }
3327
04d2cc77
AK
3328 if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
3329 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
3330 kvm_run->fail_entry.hardware_entry_failure_reason
3331 = svm->vmcb->control.exit_code;
8a14fe4f 3332 kvm_run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
3f10c846 3333 dump_vmcb(vcpu);
04d2cc77
AK
3334 return 0;
3335 }
3336
a2fa3e9f 3337 if (is_external_interrupt(svm->vmcb->control.exit_int_info) &&
709ddebf 3338 exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR &&
55c5e464
JR
3339 exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH &&
3340 exit_code != SVM_EXIT_INTR && exit_code != SVM_EXIT_NMI)
6614c7d0 3341 printk(KERN_ERR "%s: unexpected exit_int_info 0x%x "
6aa8b732 3342 "exit_code 0x%x\n",
b8688d51 3343 __func__, svm->vmcb->control.exit_int_info,
6aa8b732
AK
3344 exit_code);
3345
404d5d7b 3346 if (exit_fastpath != EXIT_FASTPATH_NONE)
1e9e2622 3347 return 1;
404d5d7b 3348
63129754 3349 return svm_invoke_exit_handler(vcpu, exit_code);
6aa8b732
AK
3350}
3351
3352static void reload_tss(struct kvm_vcpu *vcpu)
3353{
73cd6e5f 3354 struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);
6aa8b732 3355
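 /*
  * LTR faults with #GP if the referenced TSS descriptor is marked busy,
  * so restore the "available" type before reloading TR below.
  */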
0fe1e009 3356 sd->tss_desc->type = 9; /* available 32/64-bit TSS */
6aa8b732
AK
3357 load_TR_desc();
3358}
3359
63129754 3360static void pre_svm_run(struct kvm_vcpu *vcpu)
6aa8b732 3361{
63129754
PB
3362 struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);
3363 struct vcpu_svm *svm = to_svm(vcpu);
6aa8b732 3364
af18fa77 3365 /*
44f1b558
SC
3366 * If the previous vmrun of the vmcb occurred on a different physical
3367 * cpu, then mark the vmcb dirty and assign a new asid. Hardware's
3368 * vmcb clean bits are per logical CPU, as are KVM's asid assignments.
3369 */
63129754 3370 if (unlikely(svm->current_vmcb->cpu != vcpu->cpu)) {
193015ad 3371 svm->current_vmcb->asid_generation = 0;
af18fa77 3372 vmcb_mark_all_dirty(svm->vmcb);
63129754 3373 svm->current_vmcb->cpu = vcpu->cpu;
af18fa77
CA
3374 }
3375
63129754
PB
3376 if (sev_guest(vcpu->kvm))
3377 return pre_sev_run(svm, vcpu->cpu);
70cd94e6 3378
4b656b12 3379 /* FIXME: handle wraparound of asid_generation */
193015ad 3380 if (svm->current_vmcb->asid_generation != sd->asid_generation)
0fe1e009 3381 new_asid(svm, sd);
6aa8b732
AK
3382}
3383
95ba8273
GN
3384static void svm_inject_nmi(struct kvm_vcpu *vcpu)
3385{
3386 struct vcpu_svm *svm = to_svm(vcpu);
3387
3388 svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
3389 vcpu->arch.hflags |= HF_NMI_MASK;
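 /*
  * Intercept IRET so KVM can detect when the guest's NMI handler has
  * returned and NMI blocking can be lifted.
  */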
63129754 3390 if (!sev_es_guest(vcpu->kvm))
4444dfe4 3391 svm_set_intercept(svm, INTERCEPT_IRET);
95ba8273
GN
3392 ++vcpu->stat.nmi_injections;
3393}
6aa8b732 3394
23e5092b 3395static void svm_inject_irq(struct kvm_vcpu *vcpu)
2a8067f1
ED
3396{
3397 struct vcpu_svm *svm = to_svm(vcpu);
3398
9fb2d2b4
GN
3399 trace_kvm_inj_virq(vcpu->arch.interrupt.nr);
3400 ++vcpu->stat.irq_injections;
3401
219b65dc
AG
3402 svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr |
3403 SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR;
2a8067f1
ED
3404}
3405
66fa226c
ML
3406void svm_complete_interrupt_delivery(struct kvm_vcpu *vcpu, int delivery_mode,
3407 int trig_mode, int vector)
57dfd7b5 3408{
66fa226c
ML
3409 /*
3410 * vcpu->arch.apicv_active must be read after vcpu->mode.
3411 * Pairs with smp_store_release in vcpu_enter_guest.
3412 */
3413 bool in_guest_mode = (smp_load_acquire(&vcpu->mode) == IN_GUEST_MODE);
57dfd7b5 3414
66fa226c
ML
3415 if (!READ_ONCE(vcpu->arch.apicv_active)) {
3416 /* Process the interrupt via inject_pending_event */
57dfd7b5
SC
3417 kvm_make_request(KVM_REQ_EVENT, vcpu);
3418 kvm_vcpu_kick(vcpu);
66fa226c
ML
3419 return;
3420 }
3421
3422 trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, trig_mode, vector);
3423 if (in_guest_mode) {
3424 /*
3425 * Signal the doorbell to tell hardware to inject the IRQ. If
3426 * the vCPU exits the guest before the doorbell chimes, hardware
3427 * will automatically process AVIC interrupts at the next VMRUN.
3428 */
3429 avic_ring_doorbell(vcpu);
57dfd7b5 3430 } else {
66fa226c
ML
3431 /*
3432 * Wake the vCPU if it was blocking. KVM will then detect the
3433 * pending IRQ when checking if the vCPU has a wake event.
3434 */
3435 kvm_vcpu_wake_up(vcpu);
57dfd7b5
SC
3436 }
3437}
3438
66fa226c
ML
3439static void svm_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
3440 int trig_mode, int vector)
3441{
3442 kvm_lapic_set_irr(vector, apic);
3443
3444 /*
3445 * Pairs with the smp_mb_*() after setting vcpu->guest_mode in
3446 * vcpu_enter_guest() to ensure the write to the vIRR is ordered before
3447 * the read of guest_mode. This guarantees that either VMRUN will see
3448 * and process the new vIRR entry, or that svm_complete_interrupt_delivery
3449 * will signal the doorbell if the CPU has already entered the guest.
3450 */
3451 smp_mb__after_atomic();
3452 svm_complete_interrupt_delivery(apic->vcpu, delivery_mode, trig_mode, vector);
3453}
3454
b6a7cc35 3455static void svm_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
aaacfc9a
JR
3456{
3457 struct vcpu_svm *svm = to_svm(vcpu);
aaacfc9a 3458
f1c6366e
TL
3459 /*
3460 * SEV-ES guests must always keep the CR intercepts cleared. CR
3461 * tracking is done using the CR write traps.
3462 */
3463 if (sev_es_guest(vcpu->kvm))
3464 return;
3465
01c3b2b5 3466 if (nested_svm_virtualize_tpr(vcpu))
88ab24ad
JR
3467 return;
3468
830bd71f 3469 svm_clr_intercept(svm, INTERCEPT_CR8_WRITE);
596f3142 3470
95ba8273 3471 if (irr == -1)
aaacfc9a
JR
3472 return;
3473
95ba8273 3474 if (tpr >= irr)
830bd71f 3475 svm_set_intercept(svm, INTERCEPT_CR8_WRITE);
95ba8273 3476}
aaacfc9a 3477
cae96af1 3478bool svm_nmi_blocked(struct kvm_vcpu *vcpu)
95ba8273
GN
3479{
3480 struct vcpu_svm *svm = to_svm(vcpu);
3481 struct vmcb *vmcb = svm->vmcb;
88c604b6 3482 bool ret;
9c3d370a 3483
cae96af1 3484 if (!gif_set(svm))
bbdad0b5
PB
3485 return true;
3486
cae96af1
PB
3487 if (is_guest_mode(vcpu) && nested_exit_on_nmi(svm))
3488 return false;
3489
3490 ret = (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) ||
63129754 3491 (vcpu->arch.hflags & HF_NMI_MASK);
924584cc
JR
3492
3493 return ret;
aaacfc9a
JR
3494}
3495
c9d40913 3496static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
cae96af1
PB
3497{
3498 struct vcpu_svm *svm = to_svm(vcpu);
3499 if (svm->nested.nested_run_pending)
c9d40913 3500 return -EBUSY;
cae96af1 3501
2b0ecccb
ML
3502 if (svm_nmi_blocked(vcpu))
3503 return 0;
3504
c300ab9f
PB
3505 /* An NMI must not be injected into L2 if it's supposed to VM-Exit. */
3506 if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(svm))
c9d40913 3507 return -EBUSY;
2b0ecccb 3508 return 1;
cae96af1
PB
3509}
3510
3cfc3092
JK
3511static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
3512{
63129754 3513 return !!(vcpu->arch.hflags & HF_NMI_MASK);
3cfc3092
JK
3514}
3515
3516static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
3517{
3518 struct vcpu_svm *svm = to_svm(vcpu);
3519
3520 if (masked) {
63129754
PB
3521 vcpu->arch.hflags |= HF_NMI_MASK;
3522 if (!sev_es_guest(vcpu->kvm))
4444dfe4 3523 svm_set_intercept(svm, INTERCEPT_IRET);
3cfc3092 3524 } else {
63129754
PB
3525 vcpu->arch.hflags &= ~HF_NMI_MASK;
3526 if (!sev_es_guest(vcpu->kvm))
4444dfe4 3527 svm_clr_intercept(svm, INTERCEPT_IRET);
3cfc3092
JK
3528 }
3529}
3530
cae96af1 3531bool svm_interrupt_blocked(struct kvm_vcpu *vcpu)
78646121
GN
3532{
3533 struct vcpu_svm *svm = to_svm(vcpu);
3534 struct vmcb *vmcb = svm->vmcb;
7fcdb510 3535
fc6f7c03 3536 if (!gif_set(svm))
cae96af1 3537 return true;
7fcdb510 3538
c5063551 3539 if (is_guest_mode(vcpu)) {
fc6f7c03 3540 /* As long as interrupts are being delivered... */
e9fd761a 3541 if ((svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK)
4995a368 3542 ? !(svm->vmcb01.ptr->save.rflags & X86_EFLAGS_IF)
fc6f7c03
PB
3543 : !(kvm_get_rflags(vcpu) & X86_EFLAGS_IF))
3544 return true;
3545
3546 /* ... vmexits aren't blocked by the interrupt shadow */
3547 if (nested_exit_on_intr(svm))
3548 return false;
3549 } else {
c5063551 3550 if (!svm_get_if_flag(vcpu))
fc6f7c03
PB
3551 return true;
3552 }
3553
3554 return (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK);
cae96af1
PB
3555}
3556
c9d40913 3557static int svm_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
cae96af1
PB
3558{
3559 struct vcpu_svm *svm = to_svm(vcpu);
2b0ecccb 3560
cae96af1 3561 if (svm->nested.nested_run_pending)
c9d40913 3562 return -EBUSY;
cae96af1 3563
2b0ecccb
ML
3564 if (svm_interrupt_blocked(vcpu))
3565 return 0;
3566
c300ab9f
PB
3567 /*
3568 * An IRQ must not be injected into L2 if it's supposed to VM-Exit,
3569 * e.g. if the IRQ arrived asynchronously after checking nested events.
3570 */
3571 if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(svm))
c9d40913 3572 return -EBUSY;
c300ab9f 3573
2b0ecccb 3574 return 1;
78646121
GN
3575}
3576
b6a7cc35 3577static void svm_enable_irq_window(struct kvm_vcpu *vcpu)
6aa8b732 3578{
219b65dc 3579 struct vcpu_svm *svm = to_svm(vcpu);
219b65dc 3580
e0231715
JR
3581 /*
3582 * In case GIF=0 we can't rely on the CPU to tell us when GIF becomes
3583 * 1, because that's a separate STGI/VMRUN intercept. The next time we
3584 * get that intercept, this function will be called again though and
640bd6e5
JN
3585 * we'll get the vintr intercept. However, if the vGIF feature is
3586 * enabled, the STGI interception will not occur. Enable the irq
3587 * window under the assumption that the hardware will set the GIF.
e0231715 3588 */
ea91559b 3589 if (vgif || gif_set(svm)) {
f3515dc3
SS
3590 /*
3591 * IRQ window is not needed when AVIC is enabled,
3592 * unless we have pending ExtINT since it cannot be injected
f44509f8 3593 * via AVIC. In that case, KVM needs to temporarily disable AVIC
f3515dc3 3594 * and fall back to injecting the IRQ via V_IRQ.
f44509f8
ML
3595 *
3596 * If running nested, AVIC is already locally inhibited
3597 * on this vCPU, therefore there is no need to request
3598 * the VM wide AVIC inhibition.
f3515dc3 3599 */
f44509f8
ML
3600 if (!is_guest_mode(vcpu))
3601 kvm_set_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN);
3602
219b65dc 3603 svm_set_vintr(svm);
219b65dc 3604 }
85f455f7
ED
3605}
3606
b6a7cc35 3607static void svm_enable_nmi_window(struct kvm_vcpu *vcpu)
c1150d8c 3608{
04d2cc77 3609 struct vcpu_svm *svm = to_svm(vcpu);
c1150d8c 3610
63129754 3611 if ((vcpu->arch.hflags & (HF_NMI_MASK | HF_IRET_MASK)) == HF_NMI_MASK)
c9a7953f 3612 return; /* IRET will cause a vm exit */
44c11430 3613
640bd6e5 3614 if (!gif_set(svm)) {
ea91559b 3615 if (vgif)
a284ba56 3616 svm_set_intercept(svm, INTERCEPT_STGI);
1a5e1852 3617 return; /* STGI will cause a vm exit */
640bd6e5 3618 }
1a5e1852 3619
e0231715
JR
3620 /*
3621 * Something prevents the NMI from being injected. Single step over the
3622 * possible problem (IRET, exception injection, or interrupt shadow).
3623 */
ab2f4d73 3624 svm->nmi_singlestep_guest_rflags = svm_get_rflags(vcpu);
6be7d306 3625 svm->nmi_singlestep = true;
44c11430 3626 svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
c1150d8c
DL
3627}
3628
4d9c83f5 3629static void svm_flush_tlb_current(struct kvm_vcpu *vcpu)
d9e368d6 3630{
38e5e92f
JR
3631 struct vcpu_svm *svm = to_svm(vcpu);
3632
4a41e43c
SC
3633 /*
3634 * Flush only the current ASID even if the TLB flush was invoked via
3635 * kvm_flush_remote_tlbs(). Although flushing remote TLBs requires all
3636 * ASIDs to be flushed, KVM uses a single ASID for L1 and L2, and
3637 * unconditionally does a TLB flush on both nested VM-Enter and nested
3638 * VM-Exit (via kvm_mmu_reset_context()).
3639 */
38e5e92f
JR
3640 if (static_cpu_has(X86_FEATURE_FLUSHBYASID))
3641 svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
3642 else
193015ad 3643 svm->current_vmcb->asid_generation--;
d9e368d6
AK
3644}
3645
faff8758
JS
3646static void svm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t gva)
3647{
3648 struct vcpu_svm *svm = to_svm(vcpu);
3649
3650 invlpga(gva, svm->vmcb->control.asid);
3651}
3652
d7bf8221
JR
3653static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
3654{
3655 struct vcpu_svm *svm = to_svm(vcpu);
3656
01c3b2b5 3657 if (nested_svm_virtualize_tpr(vcpu))
88ab24ad
JR
3658 return;
3659
830bd71f 3660 if (!svm_is_intercept(svm, INTERCEPT_CR8_WRITE)) {
d7bf8221 3661 int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK;
615d5193 3662 kvm_set_cr8(vcpu, cr8);
d7bf8221
JR
3663 }
3664}
3665
649d6864
JR
3666static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
3667{
3668 struct vcpu_svm *svm = to_svm(vcpu);
3669 u64 cr8;
3670
01c3b2b5 3671 if (nested_svm_virtualize_tpr(vcpu) ||
3bbf3565 3672 kvm_vcpu_apicv_active(vcpu))
88ab24ad
JR
3673 return;
3674
649d6864
JR
3675 cr8 = kvm_get_cr8(vcpu);
3676 svm->vmcb->control.int_ctl &= ~V_TPR_MASK;
3677 svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
3678}
3679
63129754 3680static void svm_complete_interrupts(struct kvm_vcpu *vcpu)
9222be18 3681{
63129754 3682 struct vcpu_svm *svm = to_svm(vcpu);
9222be18
GN
3683 u8 vector;
3684 int type;
3685 u32 exitintinfo = svm->vmcb->control.exit_int_info;
66b7138f
JK
3686 unsigned int3_injected = svm->int3_injected;
3687
3688 svm->int3_injected = 0;
9222be18 3689
bd3d1ec3
AK
3690 /*
3691 * If we've made progress since setting HF_IRET_MASK, we've
3692 * executed an IRET and can allow NMI injection.
3693 */
63129754
PB
3694 if ((vcpu->arch.hflags & HF_IRET_MASK) &&
3695 (sev_es_guest(vcpu->kvm) ||
3696 kvm_rip_read(vcpu) != svm->nmi_iret_rip)) {
3697 vcpu->arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
3698 kvm_make_request(KVM_REQ_EVENT, vcpu);
3842d135 3699 }
44c11430 3700
63129754
PB
3701 vcpu->arch.nmi_injected = false;
3702 kvm_clear_exception_queue(vcpu);
3703 kvm_clear_interrupt_queue(vcpu);
9222be18
GN
3704
3705 if (!(exitintinfo & SVM_EXITINTINFO_VALID))
3706 return;
3707
63129754 3708 kvm_make_request(KVM_REQ_EVENT, vcpu);
3842d135 3709
9222be18
GN
3710 vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK;
3711 type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK;
3712
cd9e6da8
SC
3713 /*
3714 * If NextRIP isn't enabled, KVM must manually advance RIP prior to
3715 * injecting the soft exception/interrupt. That advancement needs to
3741aec4 3716 * be unwound if vectoring didn't complete. Note, the new event may
cd9e6da8
SC
3717 * not be the injected event, e.g. if KVM injected an INTn, the INTn
3718 * hit a #NP in the guest, and the #NP encountered a #PF, the #NP will
3719 * be the reported vectored event, but RIP still needs to be unwound.
3720 */
3721 if (int3_injected && type == SVM_EXITINTINFO_TYPE_EXEPT &&
3722 kvm_is_linear_rip(vcpu, svm->int3_rip))
3723 kvm_rip_write(vcpu, kvm_rip_read(vcpu) - int3_injected);
3724
9222be18
GN
3725 switch (type) {
3726 case SVM_EXITINTINFO_TYPE_NMI:
63129754 3727 vcpu->arch.nmi_injected = true;
9222be18
GN
3728 break;
3729 case SVM_EXITINTINFO_TYPE_EXEPT:
f1c6366e
TL
3730 /*
3731 * Never re-inject a #VC exception.
3732 */
3733 if (vector == X86_TRAP_VC)
3734 break;
3735
66b7138f
JK
3736 /*
3737 * In case of software exceptions, do not reinject the vector,
cd9e6da8 3738 * but re-execute the instruction instead.
66b7138f 3739 */
cd9e6da8 3740 if (kvm_exception_is_soft(vector))
9222be18 3741 break;
cd9e6da8 3742
9222be18
GN
3743 if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
3744 u32 err = svm->vmcb->control.exit_int_info_err;
63129754 3745 kvm_requeue_exception_e(vcpu, vector, err);
9222be18
GN
3746
3747 } else
63129754 3748 kvm_requeue_exception(vcpu, vector);
9222be18
GN
3749 break;
3750 case SVM_EXITINTINFO_TYPE_INTR:
63129754 3751 kvm_queue_interrupt(vcpu, vector, false);
9222be18
GN
3752 break;
3753 default:
3754 break;
3755 }
3756}
3757
b463a6f7
AK
3758static void svm_cancel_injection(struct kvm_vcpu *vcpu)
3759{
3760 struct vcpu_svm *svm = to_svm(vcpu);
3761 struct vmcb_control_area *control = &svm->vmcb->control;
3762
3763 control->exit_int_info = control->event_inj;
3764 control->exit_int_info_err = control->event_inj_err;
3765 control->event_inj = 0;
63129754 3766 svm_complete_interrupts(vcpu);
b463a6f7
AK
3767}
3768
fc4fad79
SC
3769static int svm_vcpu_pre_run(struct kvm_vcpu *vcpu)
3770{
3771 return 1;
3772}
3773
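/*
 * Only the WRMSR fastpath is wired up on SVM: for SVM_EXIT_MSR,
 * exit_info_1 is 1 for WRMSR and 0 for RDMSR.
 */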
404d5d7b 3774static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
a9ab13ff 3775{
4e810adb 3776 if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR &&
a9ab13ff
WL
3777 to_svm(vcpu)->vmcb->control.exit_info_1)
3778 return handle_fastpath_set_msr_irqoff(vcpu);
3779
3780 return EXIT_FASTPATH_NONE;
3781}
3782
63129754 3783static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu)
135961e0 3784{
63129754 3785 struct vcpu_svm *svm = to_svm(vcpu);
d1788191 3786 unsigned long vmcb_pa = svm->current_vmcb->pa;
63129754 3787
b2d2af7e 3788 guest_state_enter_irqoff();
135961e0 3789
63129754 3790 if (sev_es_guest(vcpu->kvm)) {
d1788191 3791 __svm_sev_es_vcpu_run(vmcb_pa);
16809ecd 3792 } else {
e79b91bb
MR
3793 struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);
3794
d1788191
SC
3795 /*
3796 * Use a single vmcb (vmcb01 because it's always valid) for
3797 * context switching guest state via VMLOAD/VMSAVE, that way
3798 * the state doesn't need to be copied between vmcb01 and
3799 * vmcb02 when switching vmcbs for nested virtualization.
3800 */
cc3ed80a 3801 vmload(svm->vmcb01.pa);
d1788191 3802 __svm_vcpu_run(vmcb_pa, (unsigned long *)&vcpu->arch.regs);
cc3ed80a 3803 vmsave(svm->vmcb01.pa);
135961e0 3804
e79b91bb 3805 vmload(__sme_page_pa(sd->save_area));
16809ecd 3806 }
135961e0 3807
b2d2af7e 3808 guest_state_exit_irqoff();
135961e0
TG
3809}
3810
b95273f1 3811static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
6aa8b732 3812{
a2fa3e9f 3813 struct vcpu_svm *svm = to_svm(vcpu);
d9e368d6 3814
d95df951
LB
3815 trace_kvm_entry(vcpu);
3816
2041a06a
JR
3817 svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
3818 svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
3819 svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
3820
a12713c2
LP
3821 /*
3822 * Disable singlestep if we're injecting an interrupt/exception.
3823 * We don't want our modified rflags to be pushed on the stack where
3824 * we might not be able to easily reset them if we disabled NMI
3825 * singlestep later.
3826 */
3827 if (svm->nmi_singlestep && svm->vmcb->control.event_inj) {
3828 /*
3829 * Event injection happens before external interrupts cause a
3830 * vmexit and interrupts are disabled here, so smp_send_reschedule
3831 * is enough to force an immediate vmexit.
3832 */
3833 disable_nmi_singlestep(svm);
3834 smp_send_reschedule(vcpu->cpu);
3835 }
3836
63129754 3837 pre_svm_run(vcpu);
6aa8b732 3838
649d6864
JR
3839 sync_lapic_to_cr8(vcpu);
3840
7e8e6eed
CA
3841 if (unlikely(svm->asid != svm->vmcb->control.asid)) {
3842 svm->vmcb->control.asid = svm->asid;
3843 vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
3844 }
cda0ffdd 3845 svm->vmcb->save.cr2 = vcpu->arch.cr2;
6aa8b732 3846
1183646a
VP
3847 svm_hv_update_vp_id(svm->vmcb, vcpu);
3848
d67668e9
PB
3849 /*
3850 * Run with all-zero DR6 unless needed, so that we can get the exact cause
3851 * of a #DB.
3852 */
63129754 3853 if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
d67668e9
PB
3854 svm_set_dr6(svm, vcpu->arch.dr6);
3855 else
9a3ecd5e 3856 svm_set_dr6(svm, DR6_ACTIVE_LOW);
d67668e9 3857
04d2cc77 3858 clgi();
139a12cf 3859 kvm_load_guest_xsave_state(vcpu);
04d2cc77 3860
010fd37f 3861 kvm_wait_lapic_expire(vcpu);
b6c4bc65 3862
b2ac58f9
KA
3863 /*
3864 * If this vCPU has touched SPEC_CTRL, restore the guest's value if
3865 * it's non-zero. Since vmentry is serialising on affected CPUs, there
3866 * is no need to worry about the conditional branch over the wrmsr
3867 * being speculatively taken.
3868 */
d00b99c5
BM
3869 if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL))
3870 x86_spec_ctrl_set_guest(svm->spec_ctrl, svm->virt_spec_ctrl);
b2ac58f9 3871
63129754 3872 svm_vcpu_enter_exit(vcpu);
15e6c22f 3873
b2ac58f9
KA
3874 /*
3875 * We do not use IBRS in the kernel. If this vCPU has used the
3876 * SPEC_CTRL MSR it may have left it on; save the value and
3877 * turn it off. This is much more efficient than blindly adding
3878 * it to the atomic save/restore list. Especially as the former
3879 * (Saving guest MSRs on vmexit) doesn't even exist in KVM.
3880 *
3881 * For non-nested case:
3882 * If the L01 MSR bitmap does not intercept the MSR, then we need to
3883 * save it.
3884 *
3885 * For nested case:
3886 * If the L02 MSR bitmap does not intercept the MSR, then we need to
3887 * save it.
3888 */
d00b99c5
BM
3889 if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL) &&
3890 unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
ecb586bd 3891 svm->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);
b2ac58f9 3892
63129754 3893 if (!sev_es_guest(vcpu->kvm))
16809ecd 3894 reload_tss(vcpu);
6aa8b732 3895
d00b99c5
BM
3896 if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL))
3897 x86_spec_ctrl_restore_host(svm->spec_ctrl, svm->virt_spec_ctrl);
024d83ca 3898
63129754 3899 if (!sev_es_guest(vcpu->kvm)) {
16809ecd
TL
3900 vcpu->arch.cr2 = svm->vmcb->save.cr2;
3901 vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
3902 vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
3903 vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
3904 }
41e68b69 3905 vcpu->arch.regs_dirty = 0;
13c34e07 3906
3781c01c 3907 if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
db215756 3908 kvm_before_interrupt(vcpu, KVM_HANDLING_NMI);
3781c01c 3909
139a12cf 3910 kvm_load_host_xsave_state(vcpu);
3781c01c
JR
3911 stgi();
3912
3913 /* Any pending NMI will happen here */
3914
3915 if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
63129754 3916 kvm_after_interrupt(vcpu);
3781c01c 3917
d7bf8221
JR
3918 sync_cr8_to_lapic(vcpu);
3919
a2fa3e9f 3920 svm->next_rip = 0;
63129754 3921 if (is_guest_mode(vcpu)) {
9e8f0fbf 3922 nested_sync_control_from_vmcb02(svm);
b93af02c
KS
3923
3924 /* Track VMRUNs that have made past consistency checking */
3925 if (svm->nested.nested_run_pending &&
3926 svm->vmcb->control.exit_code != SVM_EXIT_ERR)
3927 ++vcpu->stat.nested_run;
3928
2d8a42be
PB
3929 svm->nested.nested_run_pending = 0;
3930 }
9222be18 3931
38e5e92f 3932 svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
e42c6828 3933 vmcb_mark_all_clean(svm->vmcb);
38e5e92f 3934
631bc487
GN
3935 /* if exit due to PF check for async PF */
3936 if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR)
63129754 3937 vcpu->arch.apf.host_apf_flags =
68fd66f1 3938 kvm_read_and_reset_apf_flags();
631bc487 3939
41e68b69 3940 vcpu->arch.regs_avail &= ~SVM_REGS_LAZY_LOAD_SET;
fe5913e4
JR
3941
3942 /*
3943 * We need to handle MC intercepts here before the vcpu has a chance to
3944 * change the physical cpu
3945 */
3946 if (unlikely(svm->vmcb->control.exit_code ==
3947 SVM_EXIT_EXCP_BASE + MC_VECTOR))
63129754 3948 svm_handle_mce(vcpu);
8d28fec4 3949
63129754 3950 svm_complete_interrupts(vcpu);
4e810adb
WL
3951
3952 if (is_guest_mode(vcpu))
3953 return EXIT_FASTPATH_NONE;
3954
3955 return svm_exit_handlers_fastpath(vcpu);
6aa8b732
AK
3956}
3957
e83bc09c 3958static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
2a40b900 3959 int root_level)
6aa8b732 3960{
a2fa3e9f 3961 struct vcpu_svm *svm = to_svm(vcpu);
689f3bf2 3962 unsigned long cr3;
a2fa3e9f 3963
689f3bf2 3964 if (npt_enabled) {
4a98623d 3965 svm->vmcb->control.nested_cr3 = __sme_set(root_hpa);
06e7852c 3966 vmcb_mark_dirty(svm->vmcb, VMCB_NPT);
1c97f0a0 3967
1e0c7d40
VP
3968 hv_track_root_tdp(vcpu, root_hpa);
3969
978ce583 3970 cr3 = vcpu->arch.cr3;
a972e29c 3971 } else if (vcpu->arch.mmu->root_role.level >= PT64_ROOT_4LEVEL) {
4a98623d 3972 cr3 = __sme_set(root_hpa) | kvm_get_active_pcid(vcpu);
e83bc09c
SC
3973 } else {
3974 /* PCID in the guest should be impossible with a 32-bit MMU. */
3975 WARN_ON_ONCE(kvm_get_active_pcid(vcpu));
3976 cr3 = root_hpa;
689f3bf2 3977 }
1c97f0a0 3978
978ce583 3979 svm->vmcb->save.cr3 = cr3;
06e7852c 3980 vmcb_mark_dirty(svm->vmcb, VMCB_CR);
1c97f0a0
JR
3981}
3982
6aa8b732
AK
3983static int is_disabled(void)
3984{
6031a61c
JR
3985 u64 vm_cr;
3986
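 /* Firmware can disable SVM via the SVMDIS bit in the VM_CR MSR. */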
3987 rdmsrl(MSR_VM_CR, vm_cr);
3988 if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE))
3989 return 1;
3990
6aa8b732
AK
3991 return 0;
3992}
3993
102d8325
IM
3994static void
3995svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
3996{
3997 /*
3998 * Patch in the VMMCALL instruction:
3999 */
4000 hypercall[0] = 0x0f;
4001 hypercall[1] = 0x01;
4002 hypercall[2] = 0xd9;
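 /* VMMCALL is encoded as 0F 01 D9. */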
102d8325
IM
4003}
4004
f257d6dc 4005static int __init svm_check_processor_compat(void)
002c7f7c 4006{
f257d6dc 4007 return 0;
002c7f7c
YS
4008}
4009
5719455f
TL
4010/*
4011 * The kvm parameter can be NULL (module initialization, or invocation before
4012 * VM creation). Be sure to check the kvm parameter before using it.
4013 */
4014static bool svm_has_emulated_msr(struct kvm *kvm, u32 index)
6d396b55 4015{
e87555e5
VK
4016 switch (index) {
4017 case MSR_IA32_MCG_EXT_CTL:
95c5c7c7 4018 case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
e87555e5 4019 return false;
5719455f
TL
4020 case MSR_IA32_SMBASE:
4021 /* SEV-ES guests do not support SMM, so report false */
4022 if (kvm && sev_es_guest(kvm))
4023 return false;
4024 break;
e87555e5
VK
4025 default:
4026 break;
4027 }
4028
6d396b55
PB
4029 return true;
4030}
4031
bf07be36
ML
4032static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
4033{
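 /*
  * Unlike EPT, NPT entries have no dedicated memory-type field for KVM
  * to fill in, so no extra bits are folded into the SPTE.
  */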
4034 return 0;
4035}
4036
7c1b761b 4037static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
0e851880 4038{
6092d3d3 4039 struct vcpu_svm *svm = to_svm(vcpu);
96308b06 4040 struct kvm_cpuid_entry2 *best;
320af55a 4041 struct kvm *kvm = vcpu->kvm;
6092d3d3 4042
7204160e 4043 vcpu->arch.xsaves_enabled = guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
96be4e06 4044 boot_cpu_has(X86_FEATURE_XSAVE) &&
7204160e
AL
4045 boot_cpu_has(X86_FEATURE_XSAVES);
4046
6092d3d3 4047 /* Update nrips enabled cache */
4eb87460 4048 svm->nrips_enabled = kvm_cpu_cap_has(X86_FEATURE_NRIPS) &&
63129754 4049 guest_cpuid_has(vcpu, X86_FEATURE_NRIPS);
46781eae 4050
5228eb96 4051 svm->tsc_scaling_enabled = tsc_scaling && guest_cpuid_has(vcpu, X86_FEATURE_TSCRATEMSR);
d20c796c 4052 svm->lbrv_enabled = lbrv && guest_cpuid_has(vcpu, X86_FEATURE_LBRV);
5228eb96 4053
b9f3973a
ML
4054 svm->v_vmload_vmsave_enabled = vls && guest_cpuid_has(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD);
4055
74fd41ed
ML
4056 svm->pause_filter_enabled = kvm_cpu_cap_has(X86_FEATURE_PAUSEFILTER) &&
4057 guest_cpuid_has(vcpu, X86_FEATURE_PAUSEFILTER);
4058
4059 svm->pause_threshold_enabled = kvm_cpu_cap_has(X86_FEATURE_PFTHRESHOLD) &&
4060 guest_cpuid_has(vcpu, X86_FEATURE_PFTHRESHOLD);
4061
0b349662 4062 svm->vgif_enabled = vgif && guest_cpuid_has(vcpu, X86_FEATURE_VGIF);
5228eb96 4063
3b195ac9 4064 svm_recalc_instruction_intercepts(vcpu, svm);
4407a797 4065
96308b06
BM
4066 /* For sev guests, the memory encryption bit is not reserved in CR3. */
4067 if (sev_guest(vcpu->kvm)) {
4068 best = kvm_find_cpuid_entry(vcpu, 0x8000001F, 0);
4069 if (best)
ca29e145 4070 vcpu->arch.reserved_gpa_bits &= ~(1UL << (best->ebx & 0x3f));
96308b06
BM
4071 }
4072
adc2a237
ML
4073 if (kvm_vcpu_apicv_active(vcpu)) {
4074 /*
4075 * AVIC does not work with an x2APIC mode guest. If the X2APIC feature
4076 * is exposed to the guest, disable AVIC.
4077 */
4078 if (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC))
320af55a 4079 kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_X2APIC);
adc2a237 4080 }
36e8194d 4081 init_vmcb_after_set_cpuid(vcpu);
0e851880
SY
4082}
4083
f5f48ee1
SY
4084static bool svm_has_wbinvd_exit(void)
4085{
4086 return true;
4087}
4088
8061252e 4089#define PRE_EX(exit) { .exit_code = (exit), \
40e19b51 4090 .stage = X86_ICPT_PRE_EXCEPT, }
cfec82cb 4091#define POST_EX(exit) { .exit_code = (exit), \
40e19b51 4092 .stage = X86_ICPT_POST_EXCEPT, }
d7eb8203 4093#define POST_MEM(exit) { .exit_code = (exit), \
40e19b51 4094 .stage = X86_ICPT_POST_MEMACCESS, }
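/*
 * Each entry below pairs an emulator intercept with the SVM exit code to
 * report and the emulation stage (pre/post exception, post memory access)
 * at which the check applies; see svm_check_intercept().
 */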
cfec82cb 4095
09941fbb 4096static const struct __x86_intercept {
cfec82cb
JR
4097 u32 exit_code;
4098 enum x86_intercept_stage stage;
cfec82cb
JR
4099} x86_intercept_map[] = {
4100 [x86_intercept_cr_read] = POST_EX(SVM_EXIT_READ_CR0),
4101 [x86_intercept_cr_write] = POST_EX(SVM_EXIT_WRITE_CR0),
4102 [x86_intercept_clts] = POST_EX(SVM_EXIT_WRITE_CR0),
4103 [x86_intercept_lmsw] = POST_EX(SVM_EXIT_WRITE_CR0),
4104 [x86_intercept_smsw] = POST_EX(SVM_EXIT_READ_CR0),
3b88e41a
JR
4105 [x86_intercept_dr_read] = POST_EX(SVM_EXIT_READ_DR0),
4106 [x86_intercept_dr_write] = POST_EX(SVM_EXIT_WRITE_DR0),
dee6bb70
JR
4107 [x86_intercept_sldt] = POST_EX(SVM_EXIT_LDTR_READ),
4108 [x86_intercept_str] = POST_EX(SVM_EXIT_TR_READ),
4109 [x86_intercept_lldt] = POST_EX(SVM_EXIT_LDTR_WRITE),
4110 [x86_intercept_ltr] = POST_EX(SVM_EXIT_TR_WRITE),
4111 [x86_intercept_sgdt] = POST_EX(SVM_EXIT_GDTR_READ),
4112 [x86_intercept_sidt] = POST_EX(SVM_EXIT_IDTR_READ),
4113 [x86_intercept_lgdt] = POST_EX(SVM_EXIT_GDTR_WRITE),
4114 [x86_intercept_lidt] = POST_EX(SVM_EXIT_IDTR_WRITE),
01de8b09
JR
4115 [x86_intercept_vmrun] = POST_EX(SVM_EXIT_VMRUN),
4116 [x86_intercept_vmmcall] = POST_EX(SVM_EXIT_VMMCALL),
4117 [x86_intercept_vmload] = POST_EX(SVM_EXIT_VMLOAD),
4118 [x86_intercept_vmsave] = POST_EX(SVM_EXIT_VMSAVE),
4119 [x86_intercept_stgi] = POST_EX(SVM_EXIT_STGI),
4120 [x86_intercept_clgi] = POST_EX(SVM_EXIT_CLGI),
4121 [x86_intercept_skinit] = POST_EX(SVM_EXIT_SKINIT),
4122 [x86_intercept_invlpga] = POST_EX(SVM_EXIT_INVLPGA),
d7eb8203
JR
4123 [x86_intercept_rdtscp] = POST_EX(SVM_EXIT_RDTSCP),
4124 [x86_intercept_monitor] = POST_MEM(SVM_EXIT_MONITOR),
4125 [x86_intercept_mwait] = POST_EX(SVM_EXIT_MWAIT),
8061252e
JR
4126 [x86_intercept_invlpg] = POST_EX(SVM_EXIT_INVLPG),
4127 [x86_intercept_invd] = POST_EX(SVM_EXIT_INVD),
4128 [x86_intercept_wbinvd] = POST_EX(SVM_EXIT_WBINVD),
4129 [x86_intercept_wrmsr] = POST_EX(SVM_EXIT_MSR),
4130 [x86_intercept_rdtsc] = POST_EX(SVM_EXIT_RDTSC),
4131 [x86_intercept_rdmsr] = POST_EX(SVM_EXIT_MSR),
4132 [x86_intercept_rdpmc] = POST_EX(SVM_EXIT_RDPMC),
4133 [x86_intercept_cpuid] = PRE_EX(SVM_EXIT_CPUID),
4134 [x86_intercept_rsm] = PRE_EX(SVM_EXIT_RSM),
bf608f88
JR
4135 [x86_intercept_pause] = PRE_EX(SVM_EXIT_PAUSE),
4136 [x86_intercept_pushf] = PRE_EX(SVM_EXIT_PUSHF),
4137 [x86_intercept_popf] = PRE_EX(SVM_EXIT_POPF),
4138 [x86_intercept_intn] = PRE_EX(SVM_EXIT_SWINT),
4139 [x86_intercept_iret] = PRE_EX(SVM_EXIT_IRET),
4140 [x86_intercept_icebp] = PRE_EX(SVM_EXIT_ICEBP),
4141 [x86_intercept_hlt] = POST_EX(SVM_EXIT_HLT),
f6511935
JR
4142 [x86_intercept_in] = POST_EX(SVM_EXIT_IOIO),
4143 [x86_intercept_ins] = POST_EX(SVM_EXIT_IOIO),
4144 [x86_intercept_out] = POST_EX(SVM_EXIT_IOIO),
4145 [x86_intercept_outs] = POST_EX(SVM_EXIT_IOIO),
02d4160f 4146 [x86_intercept_xsetbv] = PRE_EX(SVM_EXIT_XSETBV),
cfec82cb
JR
4147};
4148
8061252e 4149#undef PRE_EX
cfec82cb 4150#undef POST_EX
d7eb8203 4151#undef POST_MEM
cfec82cb 4152
8a76d7f2
JR
4153static int svm_check_intercept(struct kvm_vcpu *vcpu,
4154 struct x86_instruction_info *info,
21f1b8f2
SC
4155 enum x86_intercept_stage stage,
4156 struct x86_exception *exception)
8a76d7f2 4157{
cfec82cb
JR
4158 struct vcpu_svm *svm = to_svm(vcpu);
4159 int vmexit, ret = X86EMUL_CONTINUE;
4160 struct __x86_intercept icpt_info;
4161 struct vmcb *vmcb = svm->vmcb;
4162
4163 if (info->intercept >= ARRAY_SIZE(x86_intercept_map))
4164 goto out;
4165
4166 icpt_info = x86_intercept_map[info->intercept];
4167
40e19b51 4168 if (stage != icpt_info.stage)
cfec82cb
JR
4169 goto out;
4170
4171 switch (icpt_info.exit_code) {
4172 case SVM_EXIT_READ_CR0:
4173 if (info->intercept == x86_intercept_cr_read)
4174 icpt_info.exit_code += info->modrm_reg;
4175 break;
4176 case SVM_EXIT_WRITE_CR0: {
4177 unsigned long cr0, val;
cfec82cb
JR
4178
4179 if (info->intercept == x86_intercept_cr_write)
4180 icpt_info.exit_code += info->modrm_reg;
4181
62baf44c
JK
4182 if (icpt_info.exit_code != SVM_EXIT_WRITE_CR0 ||
4183 info->intercept == x86_intercept_clts)
cfec82cb
JR
4184 break;
4185
8fc78909 4186 if (!(vmcb12_is_intercept(&svm->nested.ctl,
c62e2e94 4187 INTERCEPT_SELECTIVE_CR0)))
cfec82cb
JR
4188 break;
4189
4190 cr0 = vcpu->arch.cr0 & ~SVM_CR0_SELECTIVE_MASK;
4191 val = info->src_val & ~SVM_CR0_SELECTIVE_MASK;
4192
4193 if (info->intercept == x86_intercept_lmsw) {
4194 cr0 &= 0xfUL;
4195 val &= 0xfUL;
4196 /* lmsw can't clear PE - catch this here */
4197 if (cr0 & X86_CR0_PE)
4198 val |= X86_CR0_PE;
4199 }
4200
4201 if (cr0 ^ val)
4202 icpt_info.exit_code = SVM_EXIT_CR0_SEL_WRITE;
4203
4204 break;
4205 }
3b88e41a
JR
4206 case SVM_EXIT_READ_DR0:
4207 case SVM_EXIT_WRITE_DR0:
4208 icpt_info.exit_code += info->modrm_reg;
4209 break;
8061252e
JR
4210 case SVM_EXIT_MSR:
4211 if (info->intercept == x86_intercept_wrmsr)
4212 vmcb->control.exit_info_1 = 1;
4213 else
4214 vmcb->control.exit_info_1 = 0;
4215 break;
bf608f88
JR
4216 case SVM_EXIT_PAUSE:
4217 /*
4218 * We get this intercept for plain NOP as well, but PAUSE is
4219 * REP NOP, so check for the REP prefix here.
4220 */
4221 if (info->rep_prefix != REPE_PREFIX)
4222 goto out;
49a8afca 4223 break;
f6511935
JR
4224 case SVM_EXIT_IOIO: {
4225 u64 exit_info;
4226 u32 bytes;
4227
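 /*
  * Rebuild exit_info_1 in the SVM IOIO intercept format: port number,
  * direction, string/REP flags, and operand/address size.
  */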
f6511935
JR
4228 if (info->intercept == x86_intercept_in ||
4229 info->intercept == x86_intercept_ins) {
6cbc5f5a
JK
4230 exit_info = ((info->src_val & 0xffff) << 16) |
4231 SVM_IOIO_TYPE_MASK;
f6511935 4232 bytes = info->dst_bytes;
6493f157 4233 } else {
6cbc5f5a 4234 exit_info = (info->dst_val & 0xffff) << 16;
6493f157 4235 bytes = info->src_bytes;
f6511935
JR
4236 }
4237
4238 if (info->intercept == x86_intercept_outs ||
4239 info->intercept == x86_intercept_ins)
4240 exit_info |= SVM_IOIO_STR_MASK;
4241
4242 if (info->rep_prefix)
4243 exit_info |= SVM_IOIO_REP_MASK;
4244
4245 bytes = min(bytes, 4u);
4246
4247 exit_info |= bytes << SVM_IOIO_SIZE_SHIFT;
4248
4249 exit_info |= (u32)info->ad_bytes << (SVM_IOIO_ASIZE_SHIFT - 1);
4250
4251 vmcb->control.exit_info_1 = exit_info;
4252 vmcb->control.exit_info_2 = info->next_rip;
4253
4254 break;
4255 }
cfec82cb
JR
4256 default:
4257 break;
4258 }
4259
f104765b
BD
4260 /* TODO: Advertise NRIPS to guest hypervisor unconditionally */
4261 if (static_cpu_has(X86_FEATURE_NRIPS))
4262 vmcb->control.next_rip = info->next_rip;
cfec82cb
JR
4263 vmcb->control.exit_code = icpt_info.exit_code;
4264 vmexit = nested_svm_exit_handled(svm);
4265
4266 ret = (vmexit == NESTED_EXIT_DONE) ? X86EMUL_INTERCEPTED
4267 : X86EMUL_CONTINUE;
4268
4269out:
4270 return ret;
8a76d7f2
JR
4271}
4272
a9ab13ff 4273static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu)
a547c6db 4274{
a547c6db
YZ
4275}
4276
ae97a3b8
RK
4277static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu)
4278{
830f01b0 4279 if (!kvm_pause_in_guest(vcpu->kvm))
8566ac8b 4280 shrink_ple_window(vcpu);
ae97a3b8
RK
4281}
4282
74f16909
BP
4283static void svm_setup_mce(struct kvm_vcpu *vcpu)
4284{
4285 /* [63:9] are reserved. */
4286 vcpu->arch.mcg_cap &= 0x1ff;
4287}
4288
cae96af1 4289bool svm_smi_blocked(struct kvm_vcpu *vcpu)
72d7b374 4290{
05cade71
LP
4291 struct vcpu_svm *svm = to_svm(vcpu);
4292
4293 /* Per APM Vol.2 15.22.2 "Response to SMI" */
4294 if (!gif_set(svm))
cae96af1
PB
4295 return true;
4296
4297 return is_smm(vcpu);
4298}
4299
c9d40913 4300static int svm_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
cae96af1
PB
4301{
4302 struct vcpu_svm *svm = to_svm(vcpu);
4303 if (svm->nested.nested_run_pending)
c9d40913 4304 return -EBUSY;
05cade71 4305
2b0ecccb
ML
4306 if (svm_smi_blocked(vcpu))
4307 return 0;
4308
c300ab9f
PB
4309 /* An SMI must not be injected into L2 if it's supposed to VM-Exit. */
4310 if (for_injection && is_guest_mode(vcpu) && nested_exit_on_smi(svm))
c9d40913 4311 return -EBUSY;
c300ab9f 4312
2b0ecccb 4313 return 1;
72d7b374
LP
4314}
4315
ecc513e5 4316static int svm_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
0234bf88 4317{
05cade71 4318 struct vcpu_svm *svm = to_svm(vcpu);
37be407b 4319 struct kvm_host_map map_save;
05cade71
LP
4320 int ret;
4321
136a55c0
ML
4322 if (!is_guest_mode(vcpu))
4323 return 0;
05cade71 4324
136a55c0
ML
4325 /* FED8h - SVM Guest */
4326 put_smstate(u64, smstate, 0x7ed8, 1);
4327 /* FEE0h - SVM Guest VMCB Physical Address */
4328 put_smstate(u64, smstate, 0x7ee0, svm->nested.vmcb12_gpa);
05cade71 4329
136a55c0
ML
4330 svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
4331 svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
4332 svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
37be407b 4333
249f3249 4334 ret = nested_svm_simple_vmexit(svm, SVM_EXIT_SW);
136a55c0
ML
4335 if (ret)
4336 return ret;
4337
4338 /*
4339 * KVM uses VMCB01 to store L1 host state while L2 runs but
4340 * VMCB01 is going to be used during SMM and thus the state will
4341 * be lost. Temporarily save non-VMLOAD/VMSAVE state to the host save
4342 * area pointed to by MSR_VM_HSAVE_PA. APM guarantees that the
4343 * format of the area is identical to the guest save area offset
4344 * by 0x400 (matches the offset of 'struct vmcb_save_area'
4345 * within 'struct vmcb'). Note: HSAVE area may also be used by
4346 * L1 hypervisor to save additional host context (e.g. KVM does
23e5092b 4347 * that, see svm_prepare_switch_to_guest()) which must be
136a55c0
ML
4348 * preserved.
4349 */
4350 if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr),
4351 &map_save) == -EINVAL)
4352 return 1;
37be407b 4353
136a55c0 4354 BUILD_BUG_ON(offsetof(struct vmcb, save) != 0x400);
37be407b 4355
136a55c0
ML
4356 svm_copy_vmrun_state(map_save.hva + 0x400,
4357 &svm->vmcb01.ptr->save);
37be407b 4358
136a55c0 4359 kvm_vcpu_unmap(vcpu, &map_save, true);
0234bf88
LP
4360 return 0;
4361}
4362
ecc513e5 4363static int svm_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
0234bf88 4364{
05cade71 4365 struct vcpu_svm *svm = to_svm(vcpu);
37be407b 4366 struct kvm_host_map map, map_save;
136a55c0
ML
4367 u64 saved_efer, vmcb12_gpa;
4368 struct vmcb *vmcb12;
4369 int ret;
05cade71 4370
136a55c0
ML
4371 if (!guest_cpuid_has(vcpu, X86_FEATURE_LM))
4372 return 0;
05cade71 4373
136a55c0
ML
4374 /* Non-zero if SMI arrived while vCPU was in guest mode. */
4375 if (!GET_SMSTATE(u64, smstate, 0x7ed8))
4376 return 0;
3ebb5d26 4377
136a55c0
ML
4378 if (!guest_cpuid_has(vcpu, X86_FEATURE_SVM))
4379 return 1;
3ebb5d26 4380
136a55c0
ML
4381 saved_efer = GET_SMSTATE(u64, smstate, 0x7ed0);
4382 if (!(saved_efer & EFER_SVME))
4383 return 1;
3ebb5d26 4384
136a55c0
ML
4385 vmcb12_gpa = GET_SMSTATE(u64, smstate, 0x7ee0);
4386 if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcb12_gpa), &map) == -EINVAL)
4387 return 1;
3ebb5d26 4388
136a55c0
ML
4389 ret = 1;
4390 if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), &map_save) == -EINVAL)
4391 goto unmap_map;
37be407b 4392
136a55c0
ML
4393 if (svm_allocate_nested(svm))
4394 goto unmap_save;
37be407b 4395
136a55c0
ML
4396 /*
4397 * Restore L1 host state from L1 HSAVE area as VMCB01 was
4398 * used during SMM (see svm_enter_smm())
4399 */
37be407b 4400
136a55c0 4401 svm_copy_vmrun_state(&svm->vmcb01.ptr->save, map_save.hva + 0x400);
e2e6e449 4402
136a55c0
ML
4403 /*
4404 * Enter the nested guest now
4405 */
59cd9bc5 4406
e8efa4ff
ML
4407 vmcb_mark_all_dirty(svm->vmcb01.ptr);
4408
136a55c0 4409 vmcb12 = map.hva;
7907160d 4410 nested_copy_vmcb_control_to_cache(svm, &vmcb12->control);
f2740a8d 4411 nested_copy_vmcb_save_to_cache(svm, &vmcb12->save);
136a55c0
ML
4412 ret = enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12, false);
4413
759cbd59
ML
4414 if (ret)
4415 goto unmap_save;
4416
4417 svm->nested.nested_run_pending = 1;
4418
136a55c0
ML
4419unmap_save:
4420 kvm_vcpu_unmap(vcpu, &map_save, true);
4421unmap_map:
4422 kvm_vcpu_unmap(vcpu, &map, true);
59cd9bc5 4423 return ret;
0234bf88
LP
4424}
4425
b6a7cc35 4426static void svm_enable_smi_window(struct kvm_vcpu *vcpu)
cc3d967f
LP
4427{
4428 struct vcpu_svm *svm = to_svm(vcpu);
4429
4430 if (!gif_set(svm)) {
ea91559b 4431 if (vgif)
a284ba56 4432 svm_set_intercept(svm, INTERCEPT_STGI);
cc3d967f 4433 /* STGI will cause a vm exit */
c9d40913
PB
4434 } else {
4435 /* We must be in SMM; RSM will cause a vmexit anyway. */
cc3d967f 4436 }
cc3d967f
LP
4437}
4438
4d31d9ef
SC
4439static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
4440 void *insn, int insn_len)
05d5a486 4441{
09e3e2a1
SC
4442 bool smep, smap, is_user;
4443 unsigned long cr4;
3280cc22 4444 u64 error_code;
e72436bc 4445
55467fcd
SC
4446 /* Emulation is always possible when KVM has access to all guest state. */
4447 if (!sev_guest(vcpu->kvm))
4448 return true;
4449
132627c6
SC
4450 /* #UD and #GP should never be intercepted for SEV guests. */
4451 WARN_ON_ONCE(emul_type & (EMULTYPE_TRAP_UD |
4452 EMULTYPE_TRAP_UD_FORCED |
4453 EMULTYPE_VMWARE_GP));
4454
bc624d9f 4455 /*
55467fcd
SC
4456 * Emulation is impossible for SEV-ES guests as KVM doesn't have access
4457 * to guest register state.
bc624d9f
TL
4458 */
4459 if (sev_es_guest(vcpu->kvm))
4460 return false;
4461
04c40f34
SC
4462 /*
4463 * Emulation is possible if the instruction is already decoded, e.g.
4464 * when completing I/O after returning from userspace.
4465 */
4466 if (emul_type & EMULTYPE_NO_DECODE)
4467 return true;
4468
4469 /*
4470 * Emulation is possible for SEV guests if and only if a prefilled
4471 * buffer containing the bytes of the intercepted instruction is
4472 * available. SEV guest memory is encrypted with a guest-specific key
4473 * and cannot be decrypted by KVM, i.e. KVM would read ciphertext and
4474 * decode garbage.
4475 *
4476 * Inject #UD if KVM reached this point without an instruction buffer.
4477 * In practice, this path should never be hit by a well-behaved guest,
4478 * e.g. KVM doesn't intercept #UD or #GP for SEV guests, but this path
4479 * is still theoretically reachable, e.g. via unaccelerated fault-like
4480 * AVIC access, and needs to be handled by KVM to avoid putting the
4481 * guest into an infinite loop. Injecting #UD is somewhat arbitrary,
4482 * but it's the least awful option given the lack of insight into the guest.
4483 */
4484 if (unlikely(!insn)) {
4485 kvm_queue_exception(vcpu, UD_VECTOR);
4486 return false;
4487 }
4488
4489 /*
4490 * Emulate for SEV guests if the insn buffer is not empty. The buffer
4491 * will be empty if the DecodeAssist microcode cannot fetch bytes for
4492 * the faulting instruction because the code fetch itself faulted, e.g.
4493 * the guest attempted to fetch from emulated MMIO or a guest page
4494 * table used to translate CS:RIP resides in emulated MMIO.
4495 */
4496 if (likely(insn_len))
4497 return true;
4498
05d5a486 4499 /*
118154bd
LA
4500 * Detect and workaround Errata 1096 Fam_17h_00_0Fh.
4501 *
4502 * Errata:
04c40f34
SC
4503 * When CPU raises #NPF on guest data access and vCPU CR4.SMAP=1, it is
4504 * possible that CPU microcode implementing DecodeAssist will fail to
4505 * read guest memory at CS:RIP and vmcb.GuestIntrBytes will incorrectly
4506 * be '0'. This happens because microcode reads CS:RIP using a _data_
4507 * load uop with CPL=0 privileges. If the load hits a SMAP #PF, ucode
4508 * gives up and does not fill the instruction bytes buffer.
118154bd 4509 *
3280cc22
SC
4510 * As above, KVM reaches this point iff the VM is an SEV guest, the CPU
4511 * supports DecodeAssist, a #NPF was raised, KVM's page fault handler
4512 * triggered emulation (e.g. for MMIO), and the CPU returned 0 in the
4513 * GuestIntrBytes field of the VMCB.
05d5a486 4514 *
04c40f34
SC
4515 * This does _not_ mean that the erratum has been encountered, as the
4516 * DecodeAssist will also fail if the load for CS:RIP hits a legitimate
4517 * #PF, e.g. if the guest attempted to execute from emulated MMIO and
4518 * encountered a reserved/not-present #PF.
05d5a486 4519 *
3280cc22
SC
4520 * To hit the erratum, the following conditions must be true:
4521 * 1. CR4.SMAP=1 (obviously).
4522 * 2. CR4.SMEP=0 || CPL=3. If SMEP=1 and CPL<3, the erratum cannot
4523 * have been hit as the guest would have encountered a SMEP
4524 * violation #PF, not a #NPF.
4525 * 3. The #NPF is not due to a code fetch, in which case failure to
4526 * retrieve the instruction bytes is legitimate (see above).
4527 *
4528 * In addition, don't apply the erratum workaround if the #NPF occurred
4529 * while translating guest page tables (see below).
05d5a486 4530 */
3280cc22
SC
4531 error_code = to_svm(vcpu)->vmcb->control.exit_info_1;
4532 if (error_code & (PFERR_GUEST_PAGE_MASK | PFERR_FETCH_MASK))
4533 goto resume_guest;
4534
09e3e2a1
SC
4535 cr4 = kvm_read_cr4(vcpu);
4536 smep = cr4 & X86_CR4_SMEP;
4537 smap = cr4 & X86_CR4_SMAP;
4538 is_user = svm_get_cpl(vcpu) == 3;
118154bd 4539 if (smap && (!smep || is_user)) {
118154bd 4540 pr_err_ratelimited("KVM: SEV Guest triggered AMD Erratum 1096\n");
cdf85e0c
SC
4541
4542 /*
4543 * If the fault occurred in userspace, arbitrarily inject #GP
4544 * to avoid killing the guest and to hopefully avoid confusing
4545 * the guest kernel too much, e.g. injecting #PF would not be
4546 * coherent with respect to the guest's page tables. Request
4547 * triple fault if the fault occurred in the kernel as there's
4548 * no fault that KVM can inject without confusing the guest.
4549 * In practice, the triple fault is moot as no sane SEV kernel
4550 * will execute from user memory while also running with SMAP=1.
4551 */
4552 if (is_user)
4553 kvm_inject_gp(vcpu, 0);
4554 else
4555 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
05d5a486
SB
4556 }
4557
3280cc22
SC
4558resume_guest:
4559 /*
4560 * If the erratum was not hit, simply resume the guest and let it fault
4561 * again. While awful, e.g. the vCPU may get stuck in an infinite loop
4562 * if the fault is at CPL=0, it's the lesser of all evils. Exiting to
4563 * userspace will kill the guest, and letting the emulator read garbage
4564 * will yield random behavior and potentially corrupt the guest.
4565 *
4566 * Simply resuming the guest is technically not a violation of the SEV
4567 * architecture. AMD's APM states that all code fetches and page table
4568 * accesses for SEV guest are encrypted, regardless of the C-Bit. The
4569 * APM also states that encrypted accesses to MMIO are "ignored", but
4570 * doesn't explicitly define "ignored", i.e. doing nothing and letting
4571 * the guest spin is technically "ignoring" the access.
4572 */
05d5a486
SB
4573 return false;
4574}
4575
4b9852f4
LA
4576static bool svm_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
4577{
4578 struct vcpu_svm *svm = to_svm(vcpu);
4579
4580 /*
4581 * TODO: The last condition latches INIT signals on the vCPU when the
4582 * vCPU is in guest mode and vmcb12 defines an intercept on INIT.
33b22172
PB
4583 * To properly emulate the INIT intercept,
4584 * svm_check_nested_events() should call nested_svm_vmexit()
4585 * if an INIT signal is pending.
4b9852f4
LA
4586 */
4587 return !gif_set(svm) ||
c62e2e94 4588 (vmcb_is_intercept(&svm->vmcb->control, INTERCEPT_INIT));
4b9852f4
LA
4589}
4590
647daca2
TL
4591static void svm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
4592{
4593 if (!sev_es_guest(vcpu->kvm))
4594 return kvm_vcpu_deliver_sipi_vector(vcpu, vector);
4595
4596 sev_vcpu_deliver_sipi_vector(vcpu, vector);
4597}
4598
eaf78265
JR
4599static void svm_vm_destroy(struct kvm *kvm)
4600{
4601 avic_vm_destroy(kvm);
4602 sev_vm_destroy(kvm);
4603}
4604
4605static int svm_vm_init(struct kvm *kvm)
4606{
830f01b0
WL
4607 if (!pause_filter_count || !pause_filter_thresh)
4608 kvm->arch.pause_in_guest = true;
4609
fdf513e3 4610 if (enable_apicv) {
eaf78265
JR
4611 int ret = avic_vm_init(kvm);
4612 if (ret)
4613 return ret;
4614 }
4615
eaf78265
JR
4616 return 0;
4617}
4618
9c14ee21 4619static struct kvm_x86_ops svm_x86_ops __initdata = {
9dadfc4a
SC
4620 .name = "kvm_amd",
4621
23e5092b 4622 .hardware_unsetup = svm_hardware_unsetup,
6aa8b732
AK
4623 .hardware_enable = svm_hardware_enable,
4624 .hardware_disable = svm_hardware_disable,
bc226f07 4625 .has_emulated_msr = svm_has_emulated_msr,
6aa8b732 4626
23e5092b
SC
4627 .vcpu_create = svm_vcpu_create,
4628 .vcpu_free = svm_vcpu_free,
04d2cc77 4629 .vcpu_reset = svm_vcpu_reset,
6aa8b732 4630
562b6b08 4631 .vm_size = sizeof(struct kvm_svm),
4e19c36f 4632 .vm_init = svm_vm_init,
1654efcb 4633 .vm_destroy = svm_vm_destroy,
44a95dae 4634
23e5092b 4635 .prepare_switch_to_guest = svm_prepare_switch_to_guest,
6aa8b732
AK
4636 .vcpu_load = svm_vcpu_load,
4637 .vcpu_put = svm_vcpu_put,
a3c19d5b
SC
4638 .vcpu_blocking = avic_vcpu_blocking,
4639 .vcpu_unblocking = avic_vcpu_unblocking,
6aa8b732 4640
b6a7cc35 4641 .update_exception_bitmap = svm_update_exception_bitmap,
801e459a 4642 .get_msr_feature = svm_get_msr_feature,
6aa8b732
AK
4643 .get_msr = svm_get_msr,
4644 .set_msr = svm_set_msr,
4645 .get_segment_base = svm_get_segment_base,
4646 .get_segment = svm_get_segment,
4647 .set_segment = svm_set_segment,
2e4d2653 4648 .get_cpl = svm_get_cpl,
872e0c53 4649 .get_cs_db_l_bits = svm_get_cs_db_l_bits,
6aa8b732 4650 .set_cr0 = svm_set_cr0,
559c7c75 4651 .post_set_cr3 = sev_post_set_cr3,
c2fe3cd4 4652 .is_valid_cr4 = svm_is_valid_cr4,
6aa8b732
AK
4653 .set_cr4 = svm_set_cr4,
4654 .set_efer = svm_set_efer,
4655 .get_idt = svm_get_idt,
4656 .set_idt = svm_set_idt,
4657 .get_gdt = svm_get_gdt,
4658 .set_gdt = svm_set_gdt,
020df079 4659 .set_dr7 = svm_set_dr7,
facb0139 4660 .sync_dirty_debug_regs = svm_sync_dirty_debug_regs,
6de4f3ad 4661 .cache_reg = svm_cache_reg,
6aa8b732
AK
4662 .get_rflags = svm_get_rflags,
4663 .set_rflags = svm_set_rflags,
c5063551 4664 .get_if_flag = svm_get_if_flag,
be94f6b7 4665
4d9c83f5
SC
4666 .flush_tlb_all = svm_flush_tlb_current,
4667 .flush_tlb_current = svm_flush_tlb_current,
e27bc044 4668 .flush_tlb_gva = svm_flush_tlb_gva,
4d9c83f5 4669 .flush_tlb_guest = svm_flush_tlb_current,
6aa8b732 4670
fc4fad79 4671 .vcpu_pre_run = svm_vcpu_pre_run,
e27bc044 4672 .vcpu_run = svm_vcpu_run,
23e5092b
SC
4673 .handle_exit = svm_handle_exit,
4674 .skip_emulated_instruction = svm_skip_emulated_instruction,
5ef8acbd 4675 .update_emulated_instruction = NULL,
2809f5d2
GC
4676 .set_interrupt_shadow = svm_set_interrupt_shadow,
4677 .get_interrupt_shadow = svm_get_interrupt_shadow,
102d8325 4678 .patch_hypercall = svm_patch_hypercall,
23e5092b 4679 .inject_irq = svm_inject_irq,
e27bc044 4680 .inject_nmi = svm_inject_nmi,
298101da 4681 .queue_exception = svm_queue_exception,
b463a6f7 4682 .cancel_injection = svm_cancel_injection,
78646121 4683 .interrupt_allowed = svm_interrupt_allowed,
95ba8273 4684 .nmi_allowed = svm_nmi_allowed,
3cfc3092
JK
4685 .get_nmi_mask = svm_get_nmi_mask,
4686 .set_nmi_mask = svm_set_nmi_mask,
b6a7cc35
JB
4687 .enable_nmi_window = svm_enable_nmi_window,
4688 .enable_irq_window = svm_enable_irq_window,
4689 .update_cr8_intercept = svm_update_cr8_intercept,
db6e7adf
SC
4690 .refresh_apicv_exec_ctrl = avic_refresh_apicv_exec_ctrl,
4691 .check_apicv_inhibit_reasons = avic_check_apicv_inhibit_reasons,
db6e7adf 4692 .apicv_post_state_restore = avic_apicv_post_state_restore,
cbc94022 4693
bf07be36 4694 .get_mt_mask = svm_get_mt_mask,
586f9607 4695 .get_exit_info = svm_get_exit_info,
586f9607 4696
7c1b761b 4697 .vcpu_after_set_cpuid = svm_vcpu_after_set_cpuid,
4e47c7a6 4698
f5f48ee1 4699 .has_wbinvd_exit = svm_has_wbinvd_exit,
99e3e30a 4700
307a94c7
IS
4701 .get_l2_tsc_offset = svm_get_l2_tsc_offset,
4702 .get_l2_tsc_multiplier = svm_get_l2_tsc_multiplier,
edcfe540 4703 .write_tsc_offset = svm_write_tsc_offset,
1ab9287a 4704 .write_tsc_multiplier = svm_write_tsc_multiplier,
1c97f0a0 4705
727a7e27 4706 .load_mmu_pgd = svm_load_mmu_pgd,
8a76d7f2
JR
4707
4708 .check_intercept = svm_check_intercept,
95b5a48c 4709 .handle_exit_irqoff = svm_handle_exit_irqoff,
ae97a3b8 4710
d264ee0c
SC
4711 .request_immediate_exit = __kvm_request_immediate_exit,
4712
ae97a3b8 4713 .sched_in = svm_sched_in,
25462f7f 4714
33b22172
PB
4715 .nested_ops = &svm_nested_ops,
4716
57dfd7b5 4717 .deliver_interrupt = svm_deliver_interrupt,
db6e7adf 4718 .pi_update_irte = avic_pi_update_irte,
74f16909 4719 .setup_mce = svm_setup_mce,
0234bf88 4720
72d7b374 4721 .smi_allowed = svm_smi_allowed,
ecc513e5
SC
4722 .enter_smm = svm_enter_smm,
4723 .leave_smm = svm_leave_smm,
b6a7cc35 4724 .enable_smi_window = svm_enable_smi_window,
1654efcb 4725
559c7c75
SC
4726 .mem_enc_ioctl = sev_mem_enc_ioctl,
4727 .mem_enc_register_region = sev_mem_enc_register_region,
4728 .mem_enc_unregister_region = sev_mem_enc_unregister_region,
683412cc 4729 .guest_memory_reclaimed = sev_guest_memory_reclaimed,
57b119da 4730
559c7c75
SC
4731 .vm_copy_enc_context_from = sev_vm_copy_enc_context_from,
4732 .vm_move_enc_context_from = sev_vm_move_enc_context_from,
54526d1f 4733
09e3e2a1 4734 .can_emulate_instruction = svm_can_emulate_instruction,
4b9852f4
LA
4735
4736 .apic_init_signal_blocked = svm_apic_init_signal_blocked,
fd6fa73d
AG
4737
4738 .msr_filter_changed = svm_msr_filter_changed,
f1c6366e 4739 .complete_emulated_msr = svm_complete_emulated_msr,
647daca2
TL
4740
4741 .vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector,
f44509f8 4742 .vcpu_get_apicv_inhibit_reasons = avic_vcpu_get_apicv_inhibit_reasons,
6aa8b732
AK
4743};
4744
54744e17
SC
4745/*
4746 * The default MMIO mask is a single bit (excluding the present bit),
4747 * which could conflict with the memory encryption bit. Check for
4748 * memory encryption support and override the default MMIO mask if
4749 * memory encryption is enabled.
4750 */
4751static __init void svm_adjust_mmio_mask(void)
4752{
4753 unsigned int enc_bit, mask_bit;
4754 u64 msr, mask;
4755
4756 /* If there is no memory encryption support, use existing mask */
4757 if (cpuid_eax(0x80000000) < 0x8000001f)
4758 return;
4759
4760 /* If memory encryption is not enabled, use existing mask */
4761 rdmsrl(MSR_AMD64_SYSCFG, msr);
4762 if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT))
4763 return;
4764
4765 enc_bit = cpuid_ebx(0x8000001f) & 0x3f;
4766 mask_bit = boot_cpu_data.x86_phys_bits;
4767
4768 /* Increment the mask bit if it is the same as the encryption bit */
4769 if (enc_bit == mask_bit)
4770 mask_bit++;
4771
4772 /*
4773 * If the mask bit location is below 52, then some bits above the
4774 * physical addressing limit will always be reserved, so use the
4775 * rsvd_bits() function to generate the mask. This mask, along with
4776 * the present bit, will be used to generate a page fault with
4777 * PFER.RSV = 1.
4778 *
4779 * If the mask bit location is 52 (or above), then clear the mask.
4780 */
4781 mask = (mask_bit < 52) ? rsvd_bits(mask_bit, 51) | PT_PRESENT_MASK : 0;
4782
4783 kvm_mmu_set_mmio_spte_mask(mask, mask, PT_WRITABLE_MASK | PT_USER_MASK);
4784}
4785
4786static __init void svm_set_cpu_caps(void)
4787{
4788 kvm_set_cpu_caps();
4789
4790 supported_xss = 0;
4791
4792 /* CPUID 0x80000001 and 0x8000000A (SVM features) */
4793 if (nested) {
4794 kvm_cpu_cap_set(X86_FEATURE_SVM);
91f673b3 4795 kvm_cpu_cap_set(X86_FEATURE_VMCBCLEAN);
54744e17
SC
4796
4797 if (nrips)
4798 kvm_cpu_cap_set(X86_FEATURE_NRIPS);
4799
4800 if (npt_enabled)
4801 kvm_cpu_cap_set(X86_FEATURE_NPT);
4802
4803 if (tsc_scaling)
4804 kvm_cpu_cap_set(X86_FEATURE_TSCRATEMSR);
4805
b9f3973a
ML
4806 if (vls)
4807 kvm_cpu_cap_set(X86_FEATURE_V_VMSAVE_VMLOAD);
d20c796c
ML
4808 if (lbrv)
4809 kvm_cpu_cap_set(X86_FEATURE_LBRV);
b9f3973a 4810
74fd41ed
ML
4811 if (boot_cpu_has(X86_FEATURE_PAUSEFILTER))
4812 kvm_cpu_cap_set(X86_FEATURE_PAUSEFILTER);
4813
4814 if (boot_cpu_has(X86_FEATURE_PFTHRESHOLD))
4815 kvm_cpu_cap_set(X86_FEATURE_PFTHRESHOLD);
4816
0b349662
ML
4817 if (vgif)
4818 kvm_cpu_cap_set(X86_FEATURE_VGIF);
4819
54744e17
SC
4820 /* Nested VM can receive #VMEXIT instead of triggering #GP */
4821 kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK);
4822 }
4823
4824 /* CPUID 0x80000008 */
4825 if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) ||
4826 boot_cpu_has(X86_FEATURE_AMD_SSBD))
4827 kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD);
4828
4829 /* AMD PMU PERFCTR_CORE CPUID */
4830 if (enable_pmu && boot_cpu_has(X86_FEATURE_PERFCTR_CORE))
4831 kvm_cpu_cap_set(X86_FEATURE_PERFCTR_CORE);
4832
4833 /* CPUID 0x8000001F (SME/SEV features) */
4834 sev_set_cpu_caps();
4835}
4836
4837static __init int svm_hardware_setup(void)
4838{
4839 int cpu;
4840 struct page *iopm_pages;
4841 void *iopm_va;
4842 int r;
4843 unsigned int order = get_order(IOPM_SIZE);
4844
4845 /*
4846 * NX is required for shadow paging and for NPT if the NX huge pages
4847 * mitigation is enabled.
4848 */
4849 if (!boot_cpu_has(X86_FEATURE_NX)) {
4850 pr_err_ratelimited("NX (Execute Disable) not supported\n");
4851 return -EOPNOTSUPP;
4852 }
4853 kvm_enable_efer_bits(EFER_NX);
4854
4855 iopm_pages = alloc_pages(GFP_KERNEL, order);
4856
4857 if (!iopm_pages)
4858 return -ENOMEM;
4859
4860 iopm_va = page_address(iopm_pages);
4861 memset(iopm_va, 0xff, PAGE_SIZE * (1 << order));
4862 iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
4863
4864 init_msrpm_offsets();
4865
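/*
 * AMD CPUs never implemented MPX, so its XSAVE state components
 * (BNDREGS/BNDCSR) are stripped from the supported xcr0 bits below.
 */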
4866 supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR);
4867
4868 if (boot_cpu_has(X86_FEATURE_FXSR_OPT))
4869 kvm_enable_efer_bits(EFER_FFXSR);
4870
4871 if (tsc_scaling) {
4872 if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
4873 tsc_scaling = false;
4874 } else {
4875 pr_info("TSC scaling supported\n");
4876 kvm_has_tsc_control = true;
54744e17
SC
4877 }
4878 }
88099313
ML
4879 kvm_max_tsc_scaling_ratio = SVM_TSC_RATIO_MAX;
4880 kvm_tsc_scaling_ratio_frac_bits = 32;
54744e17
SC
4881
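/*
 * MSR_TSC_AUX is context switched via KVM's user-return MSR mechanism:
 * the guest value stays loaded across VM-exits and the host value is
 * restored before returning to userspace.
 */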
4882 tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX);
4883
4884 /* Check for pause filtering support */
4885 if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
4886 pause_filter_count = 0;
4887 pause_filter_thresh = 0;
4888 } else if (!boot_cpu_has(X86_FEATURE_PFTHRESHOLD)) {
4889 pause_filter_thresh = 0;
4890 }
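/*
 * Net effect of the checks above: without PAUSE filtering support both the
 * filter count and threshold are forced to 0 (filtering disabled); filter
 * support without PFTHRESHOLD keeps the count but zeroes the threshold,
 * matching the behaviour of older CPUs.
 */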
4891
4892 if (nested) {
4893 printk(KERN_INFO "kvm: Nested Virtualization enabled\n");
4894 kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
4895 }
4896
4897 /*
4898 * KVM's MMU doesn't support using 2-level paging for itself, and thus
4899 * NPT isn't supported if the host is using 2-level paging since host
4900 * CR4 is unchanged on VMRUN.
4901 */
4902 if (!IS_ENABLED(CONFIG_X86_64) && !IS_ENABLED(CONFIG_X86_PAE))
4903 npt_enabled = false;
4904
4905 if (!boot_cpu_has(X86_FEATURE_NPT))
4906 npt_enabled = false;
4907
4908 /* Force VM NPT level equal to the host's paging level */
4909 kvm_configure_mmu(npt_enabled, get_npt_level(),
4910 get_npt_level(), PG_LEVEL_1G);
4911 pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis");
4912
e54f1ff2
KH
4913 /* Setup shadow_me_value and shadow_me_mask */
4914 kvm_mmu_set_me_spte_mask(sme_me_mask, sme_me_mask);
4915
54744e17
SC
4916 /* Note, SEV setup consumes npt_enabled. */
4917 sev_hardware_setup();
4918
4919 svm_hv_hardware_setup();
4920
4921 svm_adjust_mmio_mask();
4922
4923 for_each_possible_cpu(cpu) {
4924 r = svm_cpu_init(cpu);
4925 if (r)
4926 goto err;
4927 }
4928
4929 if (nrips) {
4930 if (!boot_cpu_has(X86_FEATURE_NRIPS))
4931 nrips = false;
4932 }
4933
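/*
 * AVIC stays enabled only if the avic module parameter was set, NPT is
 * enabled, and the CPU reports AVIC support (or force_avic overrides the
 * CPUID check); the chained assignment keeps enable_apicv in sync with avic.
 */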
edf72123 4934 enable_apicv = avic = avic && npt_enabled && (boot_cpu_has(X86_FEATURE_AVIC) || force_avic);
54744e17
SC
4935
4936 if (enable_apicv) {
edf72123
ML
4937 if (!boot_cpu_has(X86_FEATURE_AVIC)) {
4938 pr_warn("AVIC is not supported in CPUID but force enabled\n");
4939 pr_warn("Your system might crash and burn\n");
4940 } else
4941 pr_info("AVIC enabled\n");
54744e17
SC
4942
4943 amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
a3c19d5b
SC
4944 } else {
4945 svm_x86_ops.vcpu_blocking = NULL;
4946 svm_x86_ops.vcpu_unblocking = NULL;
f44509f8 4947 svm_x86_ops.vcpu_get_apicv_inhibit_reasons = NULL;
54744e17
SC
4948 }
4949
4950 if (vls) {
4951 if (!npt_enabled ||
4952 !boot_cpu_has(X86_FEATURE_V_VMSAVE_VMLOAD) ||
4953 !IS_ENABLED(CONFIG_X86_64)) {
4954 vls = false;
4955 } else {
4956 pr_info("Virtual VMLOAD VMSAVE supported\n");
4957 }
4958 }
4959
4960 if (boot_cpu_has(X86_FEATURE_SVME_ADDR_CHK))
4961 svm_gp_erratum_intercept = false;
4962
4963 if (vgif) {
4964 if (!boot_cpu_has(X86_FEATURE_VGIF))
4965 vgif = false;
4966 else
4967 pr_info("Virtual GIF supported\n");
4968 }
4969
4970 if (lbrv) {
4971 if (!boot_cpu_has(X86_FEATURE_LBRV))
4972 lbrv = false;
4973 else
4974 pr_info("LBR virtualization supported\n");
4975 }
4976
4977 if (!enable_pmu)
4978 pr_info("PMU virtualization is disabled\n");
4979
4980 svm_set_cpu_caps();
4981
4982 /*
4983 * On AMD processors, the PTE's accessed bit is set by the CPU
4984 * hardware before the NPF vmexit.
4985 * This is not the expected behaviour, and KVM's tests fail
4986 * because of it.
4987 * The workaround is to disable support for
4988 * GUEST_MAXPHYADDR < HOST_MAXPHYADDR when NPT is enabled.
4989 * Userspace can query whether the support is present via the
4990 * KVM_CAP_SMALLER_MAXPHYADDR extension and decide how to handle
4991 * it.
4992 * If future AMD CPU models change the behaviour described above,
4993 * this variable can be changed accordingly.
4994 */
4995 allow_smaller_maxphyaddr = !npt_enabled;
4996
4997 return 0;
4998
4999err:
23e5092b 5000 svm_hardware_unsetup();
54744e17
SC
5001 return r;
5002}
5003
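/*
 * Illustrative sketch, not part of the kernel source: svm_hardware_setup()
 * above configures TSC scaling with a 32-bit fractional part
 * (kvm_tsc_scaling_ratio_frac_bits = 32), matching the 8.32 fixed-point
 * format of the SVM TSC ratio MSR, so a multiplier of 1.0 is 1ULL << 32.
 * The helper name and kHz parameters below are made up for this example;
 * it only demonstrates the arithmetic.
 */
#include <stdint.h>

static uint64_t ex_svm_tsc_ratio(uint64_t guest_tsc_khz, uint64_t host_tsc_khz)
{
	/*
	 * e.g. a 1,500,000 kHz guest on a 3,000,000 kHz host yields
	 * (1500000 << 32) / 3000000 = 0x80000000, i.e. a 0.5x multiplier.
	 * Assumes realistic kHz values so the shift does not overflow.
	 */
	return (guest_tsc_khz << 32) / host_tsc_khz;
}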
5004
d008dfdb
SC
5005static struct kvm_x86_init_ops svm_init_ops __initdata = {
5006 .cpu_has_kvm_support = has_svm,
5007 .disabled_by_bios = is_disabled,
5008 .hardware_setup = svm_hardware_setup,
5009 .check_processor_compatibility = svm_check_processor_compat,
5010
5011 .runtime_ops = &svm_x86_ops,
34886e79 5012 .pmu_ops = &amd_pmu_ops,
6aa8b732
AK
5013};
5014
5015static int __init svm_init(void)
5016{
d07f46f9
TL
5017 __unused_size_checks();
5018
d008dfdb 5019 return kvm_init(&svm_init_ops, sizeof(struct vcpu_svm),
0ee75bea 5020 __alignof__(struct vcpu_svm), THIS_MODULE);
6aa8b732
AK
5021}
5022
5023static void __exit svm_exit(void)
5024{
cb498ea2 5025 kvm_exit();
6aa8b732
AK
5026}
5027
5028module_init(svm_init)
5029module_exit(svm_exit)