// SPDX-License-Identifier: GPL-2.0-only
/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * AMD SVM support
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *   Avi Kivity   <avi@qumranet.com>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/kvm_types.h>
#include <linux/hashtable.h>
#include <linux/amd-iommu.h>
#include <linux/kvm_host.h>

#include <asm/irq_remapping.h>

#include "trace.h"
#include "lapic.h"
#include "x86.h"
#include "irq.h"
#include "svm.h"

/*
 * Encode the arbitrary VM ID and the vCPU's default APIC ID, i.e. the vCPU ID,
 * into the GATag so that KVM can retrieve the correct vCPU from a GALog entry
 * if an interrupt can't be delivered, e.g. because the vCPU isn't running.
 *
 * For the vCPU ID, use however many bits are currently allowed for the max
 * guest physical APIC ID (limited by the size of the physical ID table), and
 * use whatever bits remain to assign arbitrary AVIC IDs to VMs.  Note, the
 * size of the GATag is defined by hardware (32 bits), but is an opaque value
 * as far as hardware is concerned.
 */
#define AVIC_VCPU_ID_MASK		AVIC_PHYSICAL_MAX_INDEX_MASK

#define AVIC_VM_ID_SHIFT		HWEIGHT32(AVIC_PHYSICAL_MAX_INDEX_MASK)
#define AVIC_VM_ID_MASK			(GENMASK(31, AVIC_VM_ID_SHIFT) >> AVIC_VM_ID_SHIFT)

#define AVIC_GATAG_TO_VMID(x)		((x >> AVIC_VM_ID_SHIFT) & AVIC_VM_ID_MASK)
#define AVIC_GATAG_TO_VCPUID(x)		(x & AVIC_VCPU_ID_MASK)

#define __AVIC_GATAG(vm_id, vcpu_id)	((((vm_id) & AVIC_VM_ID_MASK) << AVIC_VM_ID_SHIFT) | \
					 ((vcpu_id) & AVIC_VCPU_ID_MASK))
#define AVIC_GATAG(vm_id, vcpu_id)					\
({									\
	u32 ga_tag = __AVIC_GATAG(vm_id, vcpu_id);			\
									\
	WARN_ON_ONCE(AVIC_GATAG_TO_VCPUID(ga_tag) != (vcpu_id));	\
	WARN_ON_ONCE(AVIC_GATAG_TO_VMID(ga_tag) != (vm_id));		\
	ga_tag;								\
})

static_assert(__AVIC_GATAG(AVIC_VM_ID_MASK, AVIC_VCPU_ID_MASK) == -1u);
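
/*
 * Worked example, assuming the current 9-bit AVIC_PHYSICAL_MAX_INDEX_MASK
 * (GENMASK_ULL(8, 0)): AVIC_VM_ID_SHIFT is then 9, so vm_id = 0x12345 and
 * vcpu_id = 0x42 encode as ga_tag = (0x12345 << 9) | 0x42 = 0x2468a42, and
 * AVIC_GATAG_TO_VMID()/AVIC_GATAG_TO_VCPUID() recover 0x12345 and 0x42.
 */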

static bool force_avic;
module_param_unsafe(force_avic, bool, 0444);

/* Note:
 * This hash table is used to map VM_ID to a struct kvm_svm,
 * when handling AMD IOMMU GALOG notification to schedule in
 * a particular vCPU.
 */
#define SVM_VM_DATA_HASH_BITS	8
static DEFINE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS);
static u32 next_vm_id = 0;
static bool next_vm_id_wrapped = 0;
static DEFINE_SPINLOCK(svm_vm_data_hash_lock);
bool x2avic_enabled;

/*
 * This is a wrapper of struct amd_iommu_ir_data.
 */
struct amd_svm_iommu_ir {
	struct list_head node;	/* Used by SVM for per-vcpu ir_list */
	void *data;		/* Storing pointer to struct amd_ir_data */
};

static void avic_activate_vmcb(struct vcpu_svm *svm)
{
	struct vmcb *vmcb = svm->vmcb01.ptr;

	vmcb->control.int_ctl &= ~(AVIC_ENABLE_MASK | X2APIC_MODE_MASK);
	vmcb->control.avic_physical_id &= ~AVIC_PHYSICAL_MAX_INDEX_MASK;

	vmcb->control.int_ctl |= AVIC_ENABLE_MASK;

	/*
	 * Note: KVM supports hybrid-AVIC mode, where KVM emulates x2APIC MSR
	 * accesses, while interrupt injection to a running vCPU can be
	 * achieved using AVIC doorbell.  KVM disables the APIC access page
	 * (deletes the memslot) if any vCPU has x2APIC enabled, thus enabling
	 * AVIC in hybrid mode activates only the doorbell mechanism.
	 */
	if (x2avic_enabled && apic_x2apic_mode(svm->vcpu.arch.apic)) {
		vmcb->control.int_ctl |= X2APIC_MODE_MASK;
		vmcb->control.avic_physical_id |= X2AVIC_MAX_PHYSICAL_ID;
		/* Disabling MSR intercept for x2APIC registers */
		svm_set_x2apic_msr_interception(svm, false);
	} else {
		/*
		 * Flush the TLB, the guest may have inserted a non-APIC
		 * mapping into the TLB while AVIC was disabled.
		 */
		kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, &svm->vcpu);

		/* For xAVIC and hybrid-xAVIC modes */
		vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID;
		/* Enabling MSR intercept for x2APIC registers */
		svm_set_x2apic_msr_interception(svm, true);
	}
}

static void avic_deactivate_vmcb(struct vcpu_svm *svm)
{
	struct vmcb *vmcb = svm->vmcb01.ptr;

	vmcb->control.int_ctl &= ~(AVIC_ENABLE_MASK | X2APIC_MODE_MASK);
	vmcb->control.avic_physical_id &= ~AVIC_PHYSICAL_MAX_INDEX_MASK;

	/*
	 * If running nested and the guest uses its own MSR bitmap, there
	 * is no need to update L0's msr bitmap.
	 */
	if (is_guest_mode(&svm->vcpu) &&
	    vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT))
		return;

	/* Enabling MSR intercept for x2APIC registers */
	svm_set_x2apic_msr_interception(svm, true);
}

/* Note:
 * This function is called from IOMMU driver to notify
 * SVM to schedule in a particular vCPU of a particular VM.
 */
int avic_ga_log_notifier(u32 ga_tag)
{
	unsigned long flags;
	struct kvm_svm *kvm_svm;
	struct kvm_vcpu *vcpu = NULL;
	u32 vm_id = AVIC_GATAG_TO_VMID(ga_tag);
	u32 vcpu_id = AVIC_GATAG_TO_VCPUID(ga_tag);

	pr_debug("SVM: %s: vm_id=%#x, vcpu_id=%#x\n", __func__, vm_id, vcpu_id);
	trace_kvm_avic_ga_log(vm_id, vcpu_id);

	spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
	hash_for_each_possible(svm_vm_data_hash, kvm_svm, hnode, vm_id) {
		if (kvm_svm->avic_vm_id != vm_id)
			continue;
		vcpu = kvm_get_vcpu_by_id(&kvm_svm->kvm, vcpu_id);
		break;
	}
	spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);

	/* Note:
	 * At this point, the IOMMU should have already set the pending
	 * bit in the vAPIC backing page.  So, we just need to schedule
	 * in the vcpu.
	 */
	if (vcpu)
		kvm_vcpu_wake_up(vcpu);

	return 0;
}
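
/*
 * The GATag decoded above is the same tag that avic_pi_update_irte() below
 * programs into the IRTE via AVIC_GATAG(); the AMD IOMMU driver invokes this
 * callback, registered via amd_iommu_register_ga_log_notifier() in
 * avic_hardware_setup(), for each GA log entry it processes.
 */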

void avic_vm_destroy(struct kvm *kvm)
{
	unsigned long flags;
	struct kvm_svm *kvm_svm = to_kvm_svm(kvm);

	if (!enable_apicv)
		return;

	if (kvm_svm->avic_logical_id_table_page)
		__free_page(kvm_svm->avic_logical_id_table_page);
	if (kvm_svm->avic_physical_id_table_page)
		__free_page(kvm_svm->avic_physical_id_table_page);

	spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
	hash_del(&kvm_svm->hnode);
	spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
}

int avic_vm_init(struct kvm *kvm)
{
	unsigned long flags;
	int err = -ENOMEM;
	struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
	struct kvm_svm *k2;
	struct page *p_page;
	struct page *l_page;
	u32 vm_id;

	if (!enable_apicv)
		return 0;

	/* Allocating physical APIC ID table (4KB) */
	p_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
	if (!p_page)
		goto free_avic;

	kvm_svm->avic_physical_id_table_page = p_page;

	/* Allocating logical APIC ID table (4KB) */
	l_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
	if (!l_page)
		goto free_avic;

	kvm_svm->avic_logical_id_table_page = l_page;

	spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
again:
	vm_id = next_vm_id = (next_vm_id + 1) & AVIC_VM_ID_MASK;
	if (vm_id == 0) { /* id is 1-based, zero is not okay */
		next_vm_id_wrapped = 1;
		goto again;
	}
	/* Is it still in use? Only possible if wrapped at least once */
	if (next_vm_id_wrapped) {
		hash_for_each_possible(svm_vm_data_hash, k2, hnode, vm_id) {
			if (k2->avic_vm_id == vm_id)
				goto again;
		}
	}
	kvm_svm->avic_vm_id = vm_id;
	hash_add(svm_vm_data_hash, &kvm_svm->hnode, kvm_svm->avic_vm_id);
	spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);

	return 0;

free_avic:
	avic_vm_destroy(kvm);
	return err;
}

void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb)
{
	struct kvm_svm *kvm_svm = to_kvm_svm(svm->vcpu.kvm);
	phys_addr_t bpa = __sme_set(page_to_phys(svm->avic_backing_page));
	phys_addr_t lpa = __sme_set(page_to_phys(kvm_svm->avic_logical_id_table_page));
	phys_addr_t ppa = __sme_set(page_to_phys(kvm_svm->avic_physical_id_table_page));

	vmcb->control.avic_backing_page = bpa & AVIC_HPA_MASK;
	vmcb->control.avic_logical_id = lpa & AVIC_HPA_MASK;
	vmcb->control.avic_physical_id = ppa & AVIC_HPA_MASK;
	vmcb->control.avic_vapic_bar = APIC_DEFAULT_PHYS_BASE & VMCB_AVIC_APIC_BAR_MASK;

	if (kvm_apicv_activated(svm->vcpu.kvm))
		avic_activate_vmcb(svm);
	else
		avic_deactivate_vmcb(svm);
}

static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu,
				       unsigned int index)
{
	u64 *avic_physical_id_table;
	struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);

	if ((!x2avic_enabled && index > AVIC_MAX_PHYSICAL_ID) ||
	    (index > X2AVIC_MAX_PHYSICAL_ID))
		return NULL;

	avic_physical_id_table = page_address(kvm_svm->avic_physical_id_table_page);

	return &avic_physical_id_table[index];
}
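
/*
 * Each 64-bit Physical APIC ID table entry, per the AVIC_PHYSICAL_ID_ENTRY_*
 * masks used below (exact bit positions are defined in asm/svm.h), holds the
 * host physical APIC ID of the pCPU the vCPU last ran on, the host physical
 * address of the vAPIC backing page, the IsRunning bit consumed by
 * doorbell/IOMMU delivery, and a Valid bit set by avic_init_backing_page().
 */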

static int avic_init_backing_page(struct kvm_vcpu *vcpu)
{
	u64 *entry, new_entry;
	int id = vcpu->vcpu_id;
	struct vcpu_svm *svm = to_svm(vcpu);

	if ((!x2avic_enabled && id > AVIC_MAX_PHYSICAL_ID) ||
	    (id > X2AVIC_MAX_PHYSICAL_ID))
		return -EINVAL;

	if (!vcpu->arch.apic->regs)
		return -EINVAL;

	if (kvm_apicv_activated(vcpu->kvm)) {
		int ret;

		/*
		 * Note, AVIC hardware walks the nested page table to check
		 * permissions, but does not use the SPA address specified in
		 * the leaf SPTE since it uses the address in the
		 * AVIC_BACKING_PAGE pointer field of the VMCB.
		 */
		ret = kvm_alloc_apic_access_page(vcpu->kvm);
		if (ret)
			return ret;
	}

	svm->avic_backing_page = virt_to_page(vcpu->arch.apic->regs);

	/* Setting AVIC backing page address in the phy APIC ID table */
	entry = avic_get_physical_id_entry(vcpu, id);
	if (!entry)
		return -EINVAL;

	new_entry = __sme_set((page_to_phys(svm->avic_backing_page) &
			       AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK) |
			      AVIC_PHYSICAL_ID_ENTRY_VALID_MASK);
	WRITE_ONCE(*entry, new_entry);

	svm->avic_physical_id_cache = entry;

	return 0;
}

void avic_ring_doorbell(struct kvm_vcpu *vcpu)
{
	/*
	 * Note, the vCPU could get migrated to a different pCPU at any point,
	 * which could result in signalling the wrong/previous pCPU.  But if
	 * that happens the vCPU is guaranteed to do a VMRUN (after being
	 * migrated) and thus will process pending interrupts, i.e. a doorbell
	 * is not needed (and the spurious one is harmless).
	 */
	int cpu = READ_ONCE(vcpu->cpu);

	if (cpu != get_cpu()) {
		wrmsrl(MSR_AMD64_SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpu));
		trace_kvm_avic_doorbell(vcpu->vcpu_id, kvm_cpu_get_apicid(cpu));
	}
	put_cpu();
}
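
/*
 * Per AMD's AVIC documentation, writing a physical APIC ID to
 * MSR_AMD64_SVM_AVIC_DOORBELL prods that pCPU to immediately evaluate the
 * vIRR of the vCPU it is currently running.  No doorbell is needed for the
 * local pCPU, as pending vIRR bits are evaluated at the next VMRUN anyway.
 */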

static void avic_kick_vcpu(struct kvm_vcpu *vcpu, u32 icrl)
{
	vcpu->arch.apic->irr_pending = true;
	svm_complete_interrupt_delivery(vcpu,
					icrl & APIC_MODE_MASK,
					icrl & APIC_INT_LEVELTRIG,
					icrl & APIC_VECTOR_MASK);
}

static void avic_kick_vcpu_by_physical_id(struct kvm *kvm, u32 physical_id,
					  u32 icrl)
{
	/*
	 * KVM inhibits AVIC if any vCPU ID diverges from the vCPU's APIC ID,
	 * i.e. APIC ID == vCPU ID.
	 */
	struct kvm_vcpu *target_vcpu = kvm_get_vcpu_by_id(kvm, physical_id);

	/* Once again, nothing to do if the target vCPU doesn't exist. */
	if (unlikely(!target_vcpu))
		return;

	avic_kick_vcpu(target_vcpu, icrl);
}

static void avic_kick_vcpu_by_logical_id(struct kvm *kvm, u32 *avic_logical_id_table,
					 u32 logid_index, u32 icrl)
{
	u32 physical_id;

	if (avic_logical_id_table) {
		u32 logid_entry = avic_logical_id_table[logid_index];

		/* Nothing to do if the logical destination is invalid. */
		if (unlikely(!(logid_entry & AVIC_LOGICAL_ID_ENTRY_VALID_MASK)))
			return;

		physical_id = logid_entry &
			      AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK;
	} else {
		/*
		 * For x2APIC, the logical APIC ID is a read-only value that is
		 * derived from the x2APIC ID, thus the x2APIC ID can be found
		 * by reversing the calculation (stored in logid_index).  Note,
		 * bits 31:20 of the x2APIC ID aren't propagated to the logical
		 * ID, but KVM limits the x2APIC ID to KVM_MAX_VCPU_IDS.
		 */
		physical_id = logid_index;
	}

	avic_kick_vcpu_by_physical_id(kvm, physical_id, icrl);
}
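
/*
 * Worked example of the x2APIC reversal above: the x2APIC logical ID is
 * architecturally ((id >> 4) << 16) | (1 << (id & 0xf)), so x2APIC ID 0x23
 * yields logical ID 0x20008 (cluster 2, bit 3), and the fast path below
 * passes logid_index = cluster * 16 + bit = 0x23, i.e. the x2APIC ID itself.
 */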

/*
 * A fast-path version of avic_kick_target_vcpus(), which attempts to match
 * destination APIC ID to vCPU without looping through all vCPUs.
 */
static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source,
				       u32 icrl, u32 icrh, u32 index)
{
	int dest_mode = icrl & APIC_DEST_MASK;
	int shorthand = icrl & APIC_SHORT_MASK;
	struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
	u32 dest;

	if (shorthand != APIC_DEST_NOSHORT)
		return -EINVAL;

	if (apic_x2apic_mode(source))
		dest = icrh;
	else
		dest = GET_XAPIC_DEST_FIELD(icrh);

	if (dest_mode == APIC_DEST_PHYSICAL) {
		/* broadcast destination, use slow path */
		if (apic_x2apic_mode(source) && dest == X2APIC_BROADCAST)
			return -EINVAL;
		if (!apic_x2apic_mode(source) && dest == APIC_BROADCAST)
			return -EINVAL;

		if (WARN_ON_ONCE(dest != index))
			return -EINVAL;

		avic_kick_vcpu_by_physical_id(kvm, dest, icrl);
	} else {
		u32 *avic_logical_id_table;
		unsigned long bitmap, i;
		u32 cluster;

		if (apic_x2apic_mode(source)) {
			/* 16 bit dest mask, 16 bit cluster id */
			bitmap = dest & 0xFFFF;
			cluster = (dest >> 16) << 4;
		} else if (kvm_lapic_get_reg(source, APIC_DFR) == APIC_DFR_FLAT) {
			/* 8 bit dest mask */
			bitmap = dest;
			cluster = 0;
		} else {
			/* 4 bit dest mask, 4 bit cluster id */
			bitmap = dest & 0xF;
			cluster = (dest >> 4) << 2;
		}

		/* Nothing to do if there are no destinations in the cluster. */
		if (unlikely(!bitmap))
			return 0;

		if (apic_x2apic_mode(source))
			avic_logical_id_table = NULL;
		else
			avic_logical_id_table = page_address(kvm_svm->avic_logical_id_table_page);

		/*
		 * AVIC is inhibited if vCPUs aren't mapped 1:1 with logical
		 * IDs, thus each bit in the destination is guaranteed to map
		 * to at most one vCPU.
		 */
		for_each_set_bit(i, &bitmap, 16)
			avic_kick_vcpu_by_logical_id(kvm, avic_logical_id_table,
						     cluster + i, icrl);
	}

	return 0;
}
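
/*
 * Worked example of the xAPIC cluster-mode decode above: dest = 0x21 is
 * cluster 2 with destination bit 0 set, yielding bitmap = 0x1 and
 * cluster = 0x2 << 2 = 8, i.e. logical ID table index 8.  This matches the
 * "index = __ffs(ldr) + (cluster << 2)" encoding used by
 * avic_get_logical_id_entry() below.
 */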

static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source,
				   u32 icrl, u32 icrh, u32 index)
{
	u32 dest = apic_x2apic_mode(source) ? icrh : GET_XAPIC_DEST_FIELD(icrh);
	unsigned long i;
	struct kvm_vcpu *vcpu;

	if (!avic_kick_target_vcpus_fast(kvm, source, icrl, icrh, index))
		return;

	trace_kvm_avic_kick_vcpu_slowpath(icrh, icrl, index);

	/*
	 * Wake any target vCPUs that are blocking, i.e. waiting for a wake
	 * event.  There's no need to signal doorbells, as hardware has handled
	 * vCPUs that were in guest at the time of the IPI, and vCPUs that have
	 * since entered the guest will have processed pending IRQs at VMRUN.
	 */
	kvm_for_each_vcpu(i, vcpu, kvm) {
		if (kvm_apic_match_dest(vcpu, source, icrl & APIC_SHORT_MASK,
					dest, icrl & APIC_DEST_MASK))
			avic_kick_vcpu(vcpu, icrl);
	}
}

int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	u32 icrh = svm->vmcb->control.exit_info_1 >> 32;
	u32 icrl = svm->vmcb->control.exit_info_1;
	u32 id = svm->vmcb->control.exit_info_2 >> 32;
	u32 index = svm->vmcb->control.exit_info_2 & 0x1FF;
	struct kvm_lapic *apic = vcpu->arch.apic;

	trace_kvm_avic_incomplete_ipi(vcpu->vcpu_id, icrh, icrl, id, index);

	switch (id) {
	case AVIC_IPI_FAILURE_INVALID_TARGET:
	case AVIC_IPI_FAILURE_INVALID_INT_TYPE:
		/*
		 * Emulate IPIs that are not handled by AVIC hardware, which
		 * only virtualizes Fixed, Edge-Triggered INTRs, and falls over
		 * if _any_ targets are invalid, e.g. if the logical mode mask
		 * is a superset of running vCPUs.
		 *
		 * The exit is a trap, e.g. ICR holds the correct value and RIP
		 * has been advanced, KVM is responsible only for emulating the
		 * IPI.  Sadly, hardware may sometimes leave the BUSY flag set,
		 * in which case KVM needs to emulate the ICR write as well in
		 * order to clear the BUSY flag.
		 */
		if (icrl & APIC_ICR_BUSY)
			kvm_apic_write_nodecode(vcpu, APIC_ICR);
		else
			kvm_apic_send_ipi(apic, icrl, icrh);
		break;
	case AVIC_IPI_FAILURE_TARGET_NOT_RUNNING:
		/*
		 * At this point, we expect that the AVIC HW has already
		 * set the appropriate IRR bits on the valid target
		 * vcpus.  So, we just need to kick the appropriate vcpu.
		 */
		avic_kick_target_vcpus(vcpu->kvm, apic, icrl, icrh, index);
		break;
	case AVIC_IPI_FAILURE_INVALID_BACKING_PAGE:
		WARN_ONCE(1, "Invalid backing page\n");
		break;
	case AVIC_IPI_FAILURE_INVALID_IPI_VECTOR:
		/* Invalid IPI with vector < 16 */
		break;
	default:
		vcpu_unimpl(vcpu, "Unknown avic incomplete IPI interception\n");
	}

	return 1;
}

unsigned long avic_vcpu_get_apicv_inhibit_reasons(struct kvm_vcpu *vcpu)
{
	if (is_guest_mode(vcpu))
		return APICV_INHIBIT_REASON_NESTED;
	return 0;
}

static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat)
{
	struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
	u32 *logical_apic_id_table;
	u32 cluster, index;

	ldr = GET_APIC_LOGICAL_ID(ldr);

	if (flat) {
		cluster = 0;
	} else {
		cluster = (ldr >> 4);
		if (cluster >= 0xf)
			return NULL;
		ldr &= 0xf;
	}
	if (!ldr || !is_power_of_2(ldr))
		return NULL;

	index = __ffs(ldr);
	if (WARN_ON_ONCE(index > 7))
		return NULL;
	index += (cluster << 2);

	logical_apic_id_table = (u32 *) page_address(kvm_svm->avic_logical_id_table_page);

	return &logical_apic_id_table[index];
}
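
/*
 * Worked examples of the index math above: in flat mode, ldr = 0x08 (bit 3)
 * gives index 3; in cluster mode, ldr = 0x21 (cluster 2, destination bit 0)
 * gives index 0 + (2 << 2) = 8.  An LDR with zero or multiple destination
 * bits set is rejected, as AVIC requires a 1:1 logical-ID-to-vCPU mapping.
 */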

static void avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr)
{
	bool flat;
	u32 *entry, new_entry;

	flat = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR) == APIC_DFR_FLAT;
	entry = avic_get_logical_id_entry(vcpu, ldr, flat);
	if (!entry)
		return;

	new_entry = READ_ONCE(*entry);
	new_entry &= ~AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK;
	new_entry |= (g_physical_id & AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK);
	new_entry |= AVIC_LOGICAL_ID_ENTRY_VALID_MASK;
	WRITE_ONCE(*entry, new_entry);
}

static void avic_invalidate_logical_id_entry(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	bool flat = svm->dfr_reg == APIC_DFR_FLAT;
	u32 *entry;

	/* Note: x2AVIC does not use logical APIC ID table */
	if (apic_x2apic_mode(vcpu->arch.apic))
		return;

	entry = avic_get_logical_id_entry(vcpu, svm->ldr_reg, flat);
	if (entry)
		clear_bit(AVIC_LOGICAL_ID_ENTRY_VALID_BIT, (unsigned long *)entry);
}

static void avic_handle_ldr_update(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	u32 ldr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LDR);
	u32 id = kvm_xapic_id(vcpu->arch.apic);

	/* AVIC does not support LDR update for x2APIC */
	if (apic_x2apic_mode(vcpu->arch.apic))
		return;

	if (ldr == svm->ldr_reg)
		return;

	avic_invalidate_logical_id_entry(vcpu);

	svm->ldr_reg = ldr;
	avic_ldr_write(vcpu, id, ldr);
}

static void avic_handle_dfr_update(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	u32 dfr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR);

	if (svm->dfr_reg == dfr)
		return;

	avic_invalidate_logical_id_entry(vcpu);
	svm->dfr_reg = dfr;
}

static int avic_unaccel_trap_write(struct kvm_vcpu *vcpu)
{
	u32 offset = to_svm(vcpu)->vmcb->control.exit_info_1 &
		     AVIC_UNACCEL_ACCESS_OFFSET_MASK;

	switch (offset) {
	case APIC_LDR:
		avic_handle_ldr_update(vcpu);
		break;
	case APIC_DFR:
		avic_handle_dfr_update(vcpu);
		break;
	case APIC_RRR:
		/* Ignore writes to Read Remote Data, it's read-only. */
		return 1;
	default:
		break;
	}

	kvm_apic_write_nodecode(vcpu, offset);
	return 1;
}

static bool is_avic_unaccelerated_access_trap(u32 offset)
{
	bool ret = false;

	switch (offset) {
	case APIC_ID:
	case APIC_EOI:
	case APIC_RRR:
	case APIC_LDR:
	case APIC_DFR:
	case APIC_SPIV:
	case APIC_ESR:
	case APIC_ICR:
	case APIC_LVTT:
	case APIC_LVTTHMR:
	case APIC_LVTPC:
	case APIC_LVT0:
	case APIC_LVT1:
	case APIC_LVTERR:
	case APIC_TMICT:
	case APIC_TDCR:
		ret = true;
		break;
	default:
		break;
	}
	return ret;
}
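
/*
 * For the registers listed above, an unaccelerated write is a trap-style
 * exit: hardware has already completed the register write and advanced RIP,
 * so avic_unaccel_trap_write() only reacts to the new value.  Accesses to
 * any other offset are fault-style exits and the instruction is fully
 * emulated below.
 */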

int avic_unaccelerated_access_interception(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	int ret = 0;
	u32 offset = svm->vmcb->control.exit_info_1 &
		     AVIC_UNACCEL_ACCESS_OFFSET_MASK;
	u32 vector = svm->vmcb->control.exit_info_2 &
		     AVIC_UNACCEL_ACCESS_VECTOR_MASK;
	bool write = (svm->vmcb->control.exit_info_1 >> 32) &
		     AVIC_UNACCEL_ACCESS_WRITE_MASK;
	bool trap = is_avic_unaccelerated_access_trap(offset);

	trace_kvm_avic_unaccelerated_access(vcpu->vcpu_id, offset,
					    trap, write, vector);
	if (trap) {
		/* Handling Trap */
		WARN_ONCE(!write, "svm: Handling trap read.\n");
		ret = avic_unaccel_trap_write(vcpu);
	} else {
		/* Handling Fault */
		ret = kvm_emulate_instruction(vcpu, 0);
	}

	return ret;
}

int avic_init_vcpu(struct vcpu_svm *svm)
{
	int ret;
	struct kvm_vcpu *vcpu = &svm->vcpu;

	if (!enable_apicv || !irqchip_in_kernel(vcpu->kvm))
		return 0;

	ret = avic_init_backing_page(vcpu);
	if (ret)
		return ret;

	INIT_LIST_HEAD(&svm->ir_list);
	spin_lock_init(&svm->ir_list_lock);
	svm->dfr_reg = APIC_DFR_FLAT;

	return ret;
}

void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu)
{
	avic_handle_dfr_update(vcpu);
	avic_handle_ldr_update(vcpu);
}

static int avic_set_pi_irte_mode(struct kvm_vcpu *vcpu, bool activate)
{
	int ret = 0;
	unsigned long flags;
	struct amd_svm_iommu_ir *ir;
	struct vcpu_svm *svm = to_svm(vcpu);

	if (!kvm_arch_has_assigned_device(vcpu->kvm))
		return 0;

	/*
	 * Here, we go through the per-vcpu ir_list to update all existing
	 * interrupt remapping table entries targeting this vcpu.
	 */
	spin_lock_irqsave(&svm->ir_list_lock, flags);

	if (list_empty(&svm->ir_list))
		goto out;

	list_for_each_entry(ir, &svm->ir_list, node) {
		if (activate)
			ret = amd_iommu_activate_guest_mode(ir->data);
		else
			ret = amd_iommu_deactivate_guest_mode(ir->data);
		if (ret)
			break;
	}
out:
	spin_unlock_irqrestore(&svm->ir_list_lock, flags);
	return ret;
}

static void svm_ir_list_del(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
{
	unsigned long flags;
	struct amd_svm_iommu_ir *cur;

	spin_lock_irqsave(&svm->ir_list_lock, flags);
	list_for_each_entry(cur, &svm->ir_list, node) {
		if (cur->data != pi->ir_data)
			continue;
		list_del(&cur->node);
		kfree(cur);
		break;
	}
	spin_unlock_irqrestore(&svm->ir_list_lock, flags);
}

static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
{
	int ret = 0;
	unsigned long flags;
	struct amd_svm_iommu_ir *ir;
	u64 entry;

	/*
	 * In some cases, the existing irte is updated and re-set,
	 * so we need to check here if it's already been added
	 * to the ir_list.
	 */
	if (pi->ir_data && (pi->prev_ga_tag != 0)) {
		struct kvm *kvm = svm->vcpu.kvm;
		u32 vcpu_id = AVIC_GATAG_TO_VCPUID(pi->prev_ga_tag);
		struct kvm_vcpu *prev_vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id);
		struct vcpu_svm *prev_svm;

		if (!prev_vcpu) {
			ret = -EINVAL;
			goto out;
		}

		prev_svm = to_svm(prev_vcpu);
		svm_ir_list_del(prev_svm, pi);
	}

	/*
	 * Allocating new amd_iommu_pi_data, which will get
	 * added to the per-vcpu ir_list.
	 */
	ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_KERNEL_ACCOUNT);
	if (!ir) {
		ret = -ENOMEM;
		goto out;
	}
	ir->data = pi->ir_data;

	spin_lock_irqsave(&svm->ir_list_lock, flags);

	/*
	 * Update the target pCPU for IOMMU doorbells if the vCPU is running.
	 * If the vCPU is NOT running, i.e. is blocking or scheduled out, KVM
	 * will update the pCPU info when the vCPU is awakened and/or
	 * scheduled in.  See also avic_vcpu_load().
	 */
	entry = READ_ONCE(*(svm->avic_physical_id_cache));
	if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)
		amd_iommu_update_ga(entry & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK,
				    true, pi->ir_data);

	list_add(&ir->node, &svm->ir_list);
	spin_unlock_irqrestore(&svm->ir_list_lock, flags);
out:
	return ret;
}

/*
 * Note:
 * The HW cannot support posting multicast/broadcast
 * interrupts to a vCPU.  So, we still use legacy interrupt
 * remapping for these kinds of interrupts.
 *
 * For lowest-priority interrupts, we only support
 * those with a single CPU as the destination, e.g. user
 * configures the interrupts via /proc/irq or uses
 * irqbalance to make the interrupts single-CPU.
 */
static int
get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
		 struct vcpu_data *vcpu_info, struct vcpu_svm **svm)
{
	struct kvm_lapic_irq irq;
	struct kvm_vcpu *vcpu = NULL;

	kvm_set_msi_irq(kvm, e, &irq);

	if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) ||
	    !kvm_irq_is_postable(&irq)) {
		pr_debug("SVM: %s: use legacy intr remap mode for irq %u\n",
			 __func__, irq.vector);
		return -1;
	}

	pr_debug("SVM: %s: use GA mode for irq %u\n", __func__,
		 irq.vector);
	*svm = to_svm(vcpu);
	vcpu_info->pi_desc_addr = __sme_set(page_to_phys((*svm)->avic_backing_page));
	vcpu_info->vector = irq.vector;

	return 0;
}

/*
 * avic_pi_update_irte - set IRTE for Posted-Interrupts
 *
 * @kvm: kvm
 * @host_irq: host irq of the interrupt
 * @guest_irq: gsi of the interrupt
 * @set: set or unset PI
 * returns 0 on success, < 0 on failure
 */
int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
			uint32_t guest_irq, bool set)
{
	struct kvm_kernel_irq_routing_entry *e;
	struct kvm_irq_routing_table *irq_rt;
	int idx, ret = 0;

	if (!kvm_arch_has_assigned_device(kvm) ||
	    !irq_remapping_cap(IRQ_POSTING_CAP))
		return 0;

	pr_debug("SVM: %s: host_irq=%#x, guest_irq=%#x, set=%#x\n",
		 __func__, host_irq, guest_irq, set);

	idx = srcu_read_lock(&kvm->irq_srcu);
	irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);

	if (guest_irq >= irq_rt->nr_rt_entries ||
	    hlist_empty(&irq_rt->map[guest_irq])) {
		pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n",
			     guest_irq, irq_rt->nr_rt_entries);
		goto out;
	}

	hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
		struct vcpu_data vcpu_info;
		struct vcpu_svm *svm = NULL;

		if (e->type != KVM_IRQ_ROUTING_MSI)
			continue;

		/*
		 * Here, we set up with legacy mode in the following cases:
		 * 1. When the interrupt cannot be targeted to a specific vcpu.
		 * 2. Unsetting posted interrupt.
		 * 3. APIC virtualization is disabled for the vcpu.
		 * 4. IRQ has an incompatible delivery mode (SMI, INIT, etc.)
		 */
		if (!get_pi_vcpu_info(kvm, e, &vcpu_info, &svm) && set &&
		    kvm_vcpu_apicv_active(&svm->vcpu)) {
			struct amd_iommu_pi_data pi;

			/* Try to enable guest_mode in IRTE */
			pi.base = __sme_set(page_to_phys(svm->avic_backing_page) &
					    AVIC_HPA_MASK);
			pi.ga_tag = AVIC_GATAG(to_kvm_svm(kvm)->avic_vm_id,
					       svm->vcpu.vcpu_id);
			pi.is_guest_mode = true;
			pi.vcpu_data = &vcpu_info;
			ret = irq_set_vcpu_affinity(host_irq, &pi);

			/*
			 * Here, we have successfully set up vcpu affinity in
			 * IOMMU guest mode.  Now, we need to store the posted
			 * interrupt information in a per-vcpu ir_list so that
			 * we can reference it directly when we update the vcpu
			 * scheduling information in the IOMMU irte.
			 */
			if (!ret && pi.is_guest_mode)
				svm_ir_list_add(svm, &pi);
		} else {
			/* Use legacy mode in IRTE */
			struct amd_iommu_pi_data pi;

			/*
			 * Here, pi is used to:
			 * - Tell IOMMU to use legacy mode for this interrupt.
			 * - Retrieve ga_tag of prior interrupt remapping data.
			 */
			pi.prev_ga_tag = 0;
			pi.is_guest_mode = false;
			ret = irq_set_vcpu_affinity(host_irq, &pi);

			/*
			 * Check if the posted interrupt was previously
			 * set up with the guest_mode by checking if the ga_tag
			 * was cached.  If so, we need to clean up the per-vcpu
			 * ir_list.
			 */
			if (!ret && pi.prev_ga_tag) {
				int id = AVIC_GATAG_TO_VCPUID(pi.prev_ga_tag);
				struct kvm_vcpu *vcpu;

				vcpu = kvm_get_vcpu_by_id(kvm, id);
				if (vcpu)
					svm_ir_list_del(to_svm(vcpu), &pi);
			}
		}

		if (!ret && svm) {
			trace_kvm_pi_irte_update(host_irq, svm->vcpu.vcpu_id,
						 e->gsi, vcpu_info.vector,
						 vcpu_info.pi_desc_addr, set);
		}

		if (ret < 0) {
			pr_err("%s: failed to update PI IRTE\n", __func__);
			goto out;
		}
	}

	ret = 0;
out:
	srcu_read_unlock(&kvm->irq_srcu, idx);
	return ret;
}

static inline int
avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r)
{
	int ret = 0;
	struct amd_svm_iommu_ir *ir;
	struct vcpu_svm *svm = to_svm(vcpu);

	lockdep_assert_held(&svm->ir_list_lock);

	if (!kvm_arch_has_assigned_device(vcpu->kvm))
		return 0;

	/*
	 * Here, we go through the per-vcpu ir_list to update all existing
	 * interrupt remapping table entries targeting this vcpu.
	 */
	if (list_empty(&svm->ir_list))
		return 0;

	list_for_each_entry(ir, &svm->ir_list, node) {
		ret = amd_iommu_update_ga(cpu, r, ir->data);
		if (ret)
			return ret;
	}
	return 0;
}

void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
	u64 entry;
	int h_physical_id = kvm_cpu_get_apicid(cpu);
	struct vcpu_svm *svm = to_svm(vcpu);
	unsigned long flags;

	lockdep_assert_preemption_disabled();

	if (WARN_ON(h_physical_id & ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK))
		return;

	/*
	 * No need to update anything if the vCPU is blocking, i.e. if the vCPU
	 * is being scheduled in after being preempted.  The CPU entries in the
	 * Physical APIC table and IRTE are consumed iff IsRun{ning} is '1'.
	 * If the vCPU was migrated, its new CPU value will be stuffed when the
	 * vCPU unblocks.
	 */
	if (kvm_vcpu_is_blocking(vcpu))
		return;

	/*
	 * Grab the per-vCPU interrupt remapping lock even if the VM doesn't
	 * _currently_ have assigned devices, as that can change.  Holding
	 * ir_list_lock ensures that either svm_ir_list_add() will consume
	 * up-to-date entry information, or that this task will wait until
	 * svm_ir_list_add() completes to set the new target pCPU.
	 */
	spin_lock_irqsave(&svm->ir_list_lock, flags);

	entry = READ_ONCE(*(svm->avic_physical_id_cache));
	WARN_ON_ONCE(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);

	entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK;
	entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK);
	entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;

	WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
	avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, true);

	spin_unlock_irqrestore(&svm->ir_list_lock, flags);
}

void avic_vcpu_put(struct kvm_vcpu *vcpu)
{
	u64 entry;
	struct vcpu_svm *svm = to_svm(vcpu);
	unsigned long flags;

	lockdep_assert_preemption_disabled();

	/*
	 * Note, reading the Physical ID entry outside of ir_list_lock is safe
	 * as only the pCPU that has loaded (or is loading) the vCPU is allowed
	 * to modify the entry, and preemption is disabled.  I.e. the vCPU
	 * can't be scheduled out and thus avic_vcpu_{put,load}() can't run
	 * recursively.
	 */
	entry = READ_ONCE(*(svm->avic_physical_id_cache));

	/* Nothing to do if IsRunning == '0' due to vCPU blocking. */
	if (!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK))
		return;

	/*
	 * Take and hold the per-vCPU interrupt remapping lock while updating
	 * the Physical ID entry even though the lock doesn't protect against
	 * multiple writers (see above).  Holding ir_list_lock ensures that
	 * either svm_ir_list_add() will consume up-to-date entry information,
	 * or that this task will wait until svm_ir_list_add() completes to
	 * mark the vCPU as not running.
	 */
	spin_lock_irqsave(&svm->ir_list_lock, flags);

	avic_update_iommu_vcpu_affinity(vcpu, -1, 0);

	entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
	WRITE_ONCE(*(svm->avic_physical_id_cache), entry);

	spin_unlock_irqrestore(&svm->ir_list_lock, flags);
}

void avic_refresh_virtual_apic_mode(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct vmcb *vmcb = svm->vmcb01.ptr;

	if (!lapic_in_kernel(vcpu) || !enable_apicv)
		return;

	if (kvm_vcpu_apicv_active(vcpu)) {
		/*
		 * During AVIC temporary deactivation, the guest could update
		 * the APIC ID, DFR and LDR registers, which would not be
		 * trapped by avic_unaccelerated_access_interception().  In
		 * this case, we need to check and update the AVIC logical
		 * APIC ID table accordingly before re-activating.
		 */
		avic_apicv_post_state_restore(vcpu);
		avic_activate_vmcb(svm);
	} else {
		avic_deactivate_vmcb(svm);
	}
	vmcb_mark_dirty(vmcb, VMCB_AVIC);
}

void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
{
	bool activated = kvm_vcpu_apicv_active(vcpu);

	if (!enable_apicv)
		return;

	avic_refresh_virtual_apic_mode(vcpu);

	if (activated)
		avic_vcpu_load(vcpu, vcpu->cpu);
	else
		avic_vcpu_put(vcpu);

	avic_set_pi_irte_mode(vcpu, activated);
}

void avic_vcpu_blocking(struct kvm_vcpu *vcpu)
{
	if (!kvm_vcpu_apicv_active(vcpu))
		return;

	/*
	 * Unload the AVIC when the vCPU is about to block, _before_
	 * the vCPU actually blocks.
	 *
	 * Any IRQs that arrive before IsRunning=0 will not cause an
	 * incomplete IPI vmexit on the source, therefore vIRR will also
	 * be checked by kvm_vcpu_check_block() before blocking.  The
	 * memory barrier implicit in set_current_state orders writing
	 * IsRunning=0 before reading the vIRR.  The processor needs a
	 * matching memory barrier on interrupt delivery between writing
	 * IRR and reading IsRunning; the lack of this barrier might be
	 * the cause of errata #1235.
	 */
	avic_vcpu_put(vcpu);
}

void avic_vcpu_unblocking(struct kvm_vcpu *vcpu)
{
	if (!kvm_vcpu_apicv_active(vcpu))
		return;

	avic_vcpu_load(vcpu, vcpu->cpu);
}

/*
 * Note:
 * - The module param avic enables both xAPIC and x2APIC mode.
 * - Hypervisor can support both xAVIC and x2AVIC in the same guest.
 * - The mode can be switched at run-time.
 */
bool avic_hardware_setup(void)
{
	if (!npt_enabled)
		return false;

	/* AVIC is a prerequisite for x2AVIC. */
	if (!boot_cpu_has(X86_FEATURE_AVIC) && !force_avic) {
		if (boot_cpu_has(X86_FEATURE_X2AVIC)) {
			pr_warn(FW_BUG "Cannot support x2AVIC because AVIC is disabled");
			pr_warn(FW_BUG "Try enabling AVIC using the force_avic option");
		}
		return false;
	}

	if (boot_cpu_has(X86_FEATURE_AVIC)) {
		pr_info("AVIC enabled\n");
	} else if (force_avic) {
		/*
		 * Some older systems do not advertise AVIC support.
		 * See the Revision Guide for the specific AMD processor
		 * for more detail.
		 */
		pr_warn("AVIC is not supported in CPUID but force enabled");
		pr_warn("Your system might crash and burn");
	}

	/* AVIC is a prerequisite for x2AVIC. */
	x2avic_enabled = boot_cpu_has(X86_FEATURE_X2AVIC);
	if (x2avic_enabled)
		pr_info("x2AVIC enabled\n");

	amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);

	return true;
}