Commit | Line | Data |
---|---|---|
23200b7a JM |
1 | // SPDX-License-Identifier: GPL-2.0 |
2 | /* | |
3 | * Copyright © 2019 Oracle and/or its affiliates. All rights reserved. | |
4 | * Copyright © 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. | |
5 | * | |
6 | * KVM Xen emulation | |
7 | */ | |
8 | ||
9 | #include "x86.h" | |
10 | #include "xen.h" | |
79033beb | 11 | #include "hyperv.h" |
1a65105a | 12 | #include "lapic.h" |
23200b7a | 13 | |
2fd6df2f | 14 | #include <linux/eventfd.h> |
23200b7a | 15 | #include <linux/kvm_host.h> |
30b5c851 | 16 | #include <linux/sched/stat.h> |
23200b7a JM |
17 | |
18 | #include <trace/events/kvm.h> | |
13ffb97a | 19 | #include <xen/interface/xen.h> |
30b5c851 | 20 | #include <xen/interface/vcpu.h> |
28d1629f | 21 | #include <xen/interface/version.h> |
14243b38 | 22 | #include <xen/interface/event_channel.h> |
0ec6c5c5 | 23 | #include <xen/interface/sched.h> |
23200b7a JM |
24 | |
25 | #include "trace.h" | |
26 | ||
/*
 * Forward declarations of static helpers that are referenced before
 * their definitions appear in this file.
 */
static int kvm_xen_set_evtchn(struct kvm_xen_evtchn *xe, struct kvm *kvm);
static int kvm_xen_setattr_evtchn(struct kvm *kvm, struct kvm_xen_hvm_attr *data);
static bool kvm_xen_hcall_evtchn_send(struct kvm_vcpu *vcpu, u64 param, u64 *r);

/*
 * Deferred static key, initially false, declared with HZ as its second
 * macro argument.
 * NOTE(review): presumably enabled while any VM has Xen emulation
 * configured -- confirm against the users of this key.
 */
DEFINE_STATIC_KEY_DEFERRED_FALSE(kvm_xen_enabled, HZ);
32 | ||
13ffb97a JM |
/*
 * Set up (or tear down) the pfn cache that maps the guest's Xen shared_info
 * page and publish the wall clock into it.
 *
 * @kvm: the VM whose shared_info cache is being configured.
 * @gfn: guest frame number of the shared_info page, or GPA_INVALID to
 *       destroy the existing cache.
 *
 * Returns 0 on success (including the destroy case), or the error returned
 * by kvm_gfn_to_pfn_cache_init().
 *
 * The whole operation runs inside an SRCU read-side critical section on
 * kvm->srcu.
 */
static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn)
{
	struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache;
	struct pvclock_wall_clock *wc;
	gpa_t gpa = gfn_to_gpa(gfn);
	u32 *wc_sec_hi;
	u32 wc_version;
	u64 wall_nsec;
	int ret = 0;
	int idx = srcu_read_lock(&kvm->srcu);

	/* GPA_INVALID means "unmap": drop the cache and report success. */
	if (gfn == GPA_INVALID) {
		kvm_gfn_to_pfn_cache_destroy(kvm, gpc);
		goto out;
	}

	/*
	 * Keep (re)initialising the cache until it is observed valid while
	 * the read lock is held. The loop is left with gpc->lock still held
	 * for reading, which is what protects the gpc->khva accesses below.
	 */
	do {
		ret = kvm_gfn_to_pfn_cache_init(kvm, gpc, NULL, KVM_HOST_USES_PFN,
						gpa, PAGE_SIZE);
		if (ret)
			goto out;

		/*
		 * This code mirrors kvm_write_wall_clock() except that it writes
		 * directly through the pfn cache and doesn't mark the page dirty.
		 */
		wall_nsec = ktime_get_real_ns() - get_kvmclock_ns(kvm);

		/* It could be invalid again already, so we need to check */
		read_lock_irq(&gpc->lock);

		if (gpc->valid)
			break;

		read_unlock_irq(&gpc->lock);
	} while (1);

	/* Paranoia checks on the 32-bit struct layout */
	BUILD_BUG_ON(offsetof(struct compat_shared_info, wc) != 0x900);
	BUILD_BUG_ON(offsetof(struct compat_shared_info, arch.wc_sec_hi) != 0x924);
	BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);

#ifdef CONFIG_X86_64
	/* Paranoia checks on the 64-bit struct layout */
	BUILD_BUG_ON(offsetof(struct shared_info, wc) != 0xc00);
	BUILD_BUG_ON(offsetof(struct shared_info, wc_sec_hi) != 0xc0c);

	/* Pick the wall clock fields from the 64-bit layout in long mode... */
	if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) {
		struct shared_info *shinfo = gpc->khva;

		wc_sec_hi = &shinfo->wc_sec_hi;
		wc = &shinfo->wc;
	} else
#endif
	/* ...and from the compat (32-bit) layout otherwise. */
	{
		struct compat_shared_info *shinfo = gpc->khva;

		wc_sec_hi = &shinfo->arch.wc_sec_hi;
		wc = &shinfo->wc;
	}

	/* Increment and ensure an odd value */
	wc_version = wc->version = (wc->version + 1) | 1;
	/* Order the odd version store before the payload stores. */
	smp_wmb();

	/*
	 * do_div() leaves the quotient (seconds) in wall_nsec and returns
	 * the remainder (nanoseconds); the seconds are then split into the
	 * low 32 bits (wc->sec) and the high 32 bits (*wc_sec_hi).
	 */
	wc->nsec = do_div(wall_nsec, 1000000000);
	wc->sec = (u32)wall_nsec;
	*wc_sec_hi = wall_nsec >> 32;
	/* Order the payload stores before the final version store. */
	smp_wmb();

	/* The odd value plus one is even again: the update is complete. */
	wc->version = wc_version + 1;
	read_unlock_irq(&gpc->lock);

	kvm_make_all_cpus_request(kvm, KVM_REQ_MASTERCLOCK_UPDATE);

out:
	srcu_read_unlock(&kvm->srcu, idx);
	return ret;
}
112 | ||
53639526 JM |
113 | void kvm_xen_inject_timer_irqs(struct kvm_vcpu *vcpu) |
114 | { | |
115 | if (atomic_read(&vcpu->arch.xen.timer_pending) > 0) { | |
116 | struct kvm_xen_evtchn e; | |
117 | ||
118 | e.vcpu_id = vcpu->vcpu_id; | |
119 | e.vcpu_idx = vcpu->vcpu_idx; | |
120 | e.port = vcpu->arch.xen.timer_virq; | |
121 | e.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL; | |
122 | ||
123 | kvm_xen_set_evtchn(&e, vcpu->kvm); | |
124 | ||
125 | vcpu->arch.xen.timer_expires = 0; | |
126 | atomic_set(&vcpu->arch.xen.timer_pending, 0); | |
127 | } | |
128 | } | |
129 | ||
130 | static enum hrtimer_restart xen_timer_callback(struct hrtimer *timer) | |
131 | { | |
132 | struct kvm_vcpu *vcpu = container_of(timer, struct kvm_vcpu, | |
133 | arch.xen.timer); | |
134 | if (atomic_read(&vcpu->arch.xen.timer_pending)) | |
135 | return HRTIMER_NORESTART; | |
136 | ||
137 | atomic_inc(&vcpu->arch.xen.timer_pending); | |
138 | kvm_make_request(KVM_REQ_UNBLOCK, vcpu); | |
139 | kvm_vcpu_kick(vcpu); | |
140 | ||
141 | return HRTIMER_NORESTART; | |
142 | } | |
143 | ||
144 | static void kvm_xen_start_timer(struct kvm_vcpu *vcpu, u64 guest_abs, s64 delta_ns) | |
145 | { | |
146 | atomic_set(&vcpu->arch.xen.timer_pending, 0); | |
147 | vcpu->arch.xen.timer_expires = guest_abs; | |
148 | ||
149 | if (delta_ns <= 0) { | |
150 | xen_timer_callback(&vcpu->arch.xen.timer); | |
151 | } else { | |
152 | ktime_t ktime_now = ktime_get(); | |
153 | hrtimer_start(&vcpu->arch.xen.timer, | |
154 | ktime_add_ns(ktime_now, delta_ns), | |
155 | HRTIMER_MODE_ABS_HARD); | |
156 | } | |
157 | } | |
158 | ||
159 | static void kvm_xen_stop_timer(struct kvm_vcpu *vcpu) | |
160 | { | |
161 | hrtimer_cancel(&vcpu->arch.xen.timer); | |
162 | vcpu->arch.xen.timer_expires = 0; | |
163 | atomic_set(&vcpu->arch.xen.timer_pending, 0); | |
164 | } | |
165 | ||
166 | static void kvm_xen_init_timer(struct kvm_vcpu *vcpu) | |
167 | { | |
168 | hrtimer_init(&vcpu->arch.xen.timer, CLOCK_MONOTONIC, | |
169 | HRTIMER_MODE_ABS_HARD); | |
170 | vcpu->arch.xen.timer.function = xen_timer_callback; | |
171 | } | |
172 | ||
30b5c851 DW |
173 | static void kvm_xen_update_runstate(struct kvm_vcpu *v, int state) |
174 | { | |
175 | struct kvm_vcpu_xen *vx = &v->arch.xen; | |
176 | u64 now = get_kvmclock_ns(v->kvm); | |
177 | u64 delta_ns = now - vx->runstate_entry_time; | |
178 | u64 run_delay = current->sched_info.run_delay; | |
179 | ||
180 | if (unlikely(!vx->runstate_entry_time)) | |
181 | vx->current_runstate = RUNSTATE_offline; | |
182 | ||
183 | /* | |
184 | * Time waiting for the scheduler isn't "stolen" if the | |
185 | * vCPU wasn't running anyway. | |
186 | */ | |
187 | if (vx->current_runstate == RUNSTATE_running) { | |
188 | u64 steal_ns = run_delay - vx->last_steal; | |
189 | ||
190 | delta_ns -= steal_ns; | |
191 | ||
192 | vx->runstate_times[RUNSTATE_runnable] += steal_ns; | |
193 | } | |
194 | vx->last_steal = run_delay; | |
195 | ||
196 | vx->runstate_times[vx->current_runstate] += delta_ns; | |
197 | vx->current_runstate = state; | |
198 | vx->runstate_entry_time = now; | |
199 | } | |
200 | ||
/*
 * Switch the vCPU to runstate @state and, if a runstate area has been
 * registered, copy the updated runstate information into guest memory
 * through the runstate pfn cache.
 *
 * The guest copy is skipped silently when the cache is inactive, when the
 * mapping needs refreshing but @state is RUNSTATE_runnable (no sleeping
 * allowed on that path), or when refreshing the mapping fails. The
 * in-kernel accounting done by kvm_xen_update_runstate() happens regardless.
 */
void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
{
	struct kvm_vcpu_xen *vx = &v->arch.xen;
	struct gfn_to_pfn_cache *gpc = &vx->runstate_cache;
	uint64_t *user_times;
	unsigned long flags;
	size_t user_len;
	int *user_state;

	kvm_xen_update_runstate(v, state);

	if (!vx->runstate_cache.active)
		return;

	/* The size of the guest structure depends on the guest's mode. */
	if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode)
		user_len = sizeof(struct vcpu_runstate_info);
	else
		user_len = sizeof(struct compat_vcpu_runstate_info);

	/*
	 * Validate the mapping under the read lock; if it is stale, drop the
	 * lock, refresh it and try again. The loop exits with the lock held.
	 */
	read_lock_irqsave(&gpc->lock, flags);
	while (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc, gpc->gpa,
					   user_len)) {
		read_unlock_irqrestore(&gpc->lock, flags);

		/* When invoked from kvm_sched_out() we cannot sleep */
		if (state == RUNSTATE_runnable)
			return;

		if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc, gpc->gpa, user_len))
			return;

		read_lock_irqsave(&gpc->lock, flags);
	}

	/*
	 * The only difference between 32-bit and 64-bit versions of the
	 * runstate struct is the alignment of uint64_t in 32-bit, which
	 * means that the 64-bit version has an additional 4 bytes of
	 * padding after the first field 'state'.
	 *
	 * So we use 'int *user_state' to point to the state field, and
	 * 'uint64_t *user_times' for state_entry_time, both through the
	 * kernel mapping in gpc->khva. So the actual array of time[] in
	 * each state starts at user_times[1].
	 */
	BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state) != 0);
	BUILD_BUG_ON(offsetof(struct compat_vcpu_runstate_info, state) != 0);
	BUILD_BUG_ON(sizeof(struct compat_vcpu_runstate_info) != 0x2c);
#ifdef CONFIG_X86_64
	BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state_entry_time) !=
		     offsetof(struct compat_vcpu_runstate_info, state_entry_time) + 4);
	BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, time) !=
		     offsetof(struct compat_vcpu_runstate_info, time) + 4);
#endif

	user_state = gpc->khva;

	if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode)
		user_times = gpc->khva + offsetof(struct vcpu_runstate_info,
						  state_entry_time);
	else
		user_times = gpc->khva + offsetof(struct compat_vcpu_runstate_info,
						  state_entry_time);

	/*
	 * First write the updated state_entry_time, with the
	 * XEN_RUNSTATE_UPDATE flag set, at the location that user_times
	 * points to.
	 */
	BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, state_entry_time) !=
		     sizeof(user_times[0]));
	BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state_entry_time) !=
		     sizeof(user_times[0]));

	user_times[0] = vx->runstate_entry_time | XEN_RUNSTATE_UPDATE;
	/* Order the flagged entry time before the state and times stores. */
	smp_wmb();

	/*
	 * Next, write the new runstate. This is in the *same* place
	 * for 32-bit and 64-bit guests, asserted here for paranoia.
	 */
	BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state) !=
		     offsetof(struct compat_vcpu_runstate_info, state));
	BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, state) !=
		     sizeof(vx->current_runstate));
	BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state) !=
		     sizeof(vx->current_runstate));

	*user_state = vx->current_runstate;

	/*
	 * Write the actual runstate times immediately after the
	 * runstate_entry_time.
	 */
	BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state_entry_time) !=
		     offsetof(struct vcpu_runstate_info, time) - sizeof(u64));
	BUILD_BUG_ON(offsetof(struct compat_vcpu_runstate_info, state_entry_time) !=
		     offsetof(struct compat_vcpu_runstate_info, time) - sizeof(u64));
	BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, time) !=
		     sizeof_field(struct compat_vcpu_runstate_info, time));
	BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, time) !=
		     sizeof(vx->runstate_times));

	memcpy(user_times + 1, vx->runstate_times, sizeof(vx->runstate_times));
	/* Order the state and times stores before clearing the update flag. */
	smp_wmb();

	/*
	 * Finally, clear the XEN_RUNSTATE_UPDATE bit in the guest's
	 * runstate_entry_time field.
	 */
	user_times[0] &= ~XEN_RUNSTATE_UPDATE;
	smp_wmb();

	read_unlock_irqrestore(&gpc->lock, flags);

	/* The guest page was written through the kernel mapping; mark it dirty. */
	mark_page_dirty_in_slot(v->kvm, gpc->memslot, gpc->gpa >> PAGE_SHIFT);
}
316 | ||
fde0451b DW |
317 | static void kvm_xen_inject_vcpu_vector(struct kvm_vcpu *v) |
318 | { | |
319 | struct kvm_lapic_irq irq = { }; | |
320 | int r; | |
321 | ||
322 | irq.dest_id = v->vcpu_id; | |
323 | irq.vector = v->arch.xen.upcall_vector; | |
324 | irq.dest_mode = APIC_DEST_PHYSICAL; | |
325 | irq.shorthand = APIC_DEST_NOSHORT; | |
326 | irq.delivery_mode = APIC_DM_FIXED; | |
327 | irq.level = 1; | |
328 | ||
329 | /* The fast version will always work for physical unicast */ | |
330 | WARN_ON_ONCE(!kvm_irq_delivery_to_apic_fast(v->kvm, NULL, &irq, &r, NULL)); | |
331 | } | |
332 | ||
7caf9571 DW |
333 | /* |
334 | * On event channel delivery, the vcpu_info may not have been accessible. | |
335 | * In that case, there are bits in vcpu->arch.xen.evtchn_pending_sel which | |
336 | * need to be marked into the vcpu_info (and evtchn_upcall_pending set). | |
337 | * Do so now that we can sleep in the context of the vCPU to bring the | |
338 | * page in, and refresh the pfn cache for it. | |
339 | */ | |
340 | void kvm_xen_inject_pending_events(struct kvm_vcpu *v) | |
40da8ccd | 341 | { |
14243b38 | 342 | unsigned long evtchn_pending_sel = READ_ONCE(v->arch.xen.evtchn_pending_sel); |
7caf9571 DW |
343 | struct gfn_to_pfn_cache *gpc = &v->arch.xen.vcpu_info_cache; |
344 | unsigned long flags; | |
345 | ||
346 | if (!evtchn_pending_sel) | |
347 | return; | |
348 | ||
349 | /* | |
350 | * Yes, this is an open-coded loop. But that's just what put_user() | |
351 | * does anyway. Page it in and retry the instruction. We're just a | |
352 | * little more honest about it. | |
353 | */ | |
354 | read_lock_irqsave(&gpc->lock, flags); | |
355 | while (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc, gpc->gpa, | |
356 | sizeof(struct vcpu_info))) { | |
357 | read_unlock_irqrestore(&gpc->lock, flags); | |
358 | ||
359 | if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc, gpc->gpa, | |
360 | sizeof(struct vcpu_info))) | |
361 | return; | |
362 | ||
363 | read_lock_irqsave(&gpc->lock, flags); | |
364 | } | |
365 | ||
366 | /* Now gpc->khva is a valid kernel address for the vcpu_info */ | |
367 | if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode) { | |
368 | struct vcpu_info *vi = gpc->khva; | |
369 | ||
370 | asm volatile(LOCK_PREFIX "orq %0, %1\n" | |
371 | "notq %0\n" | |
372 | LOCK_PREFIX "andq %0, %2\n" | |
373 | : "=r" (evtchn_pending_sel), | |
374 | "+m" (vi->evtchn_pending_sel), | |
375 | "+m" (v->arch.xen.evtchn_pending_sel) | |
376 | : "0" (evtchn_pending_sel)); | |
377 | WRITE_ONCE(vi->evtchn_upcall_pending, 1); | |
378 | } else { | |
379 | u32 evtchn_pending_sel32 = evtchn_pending_sel; | |
380 | struct compat_vcpu_info *vi = gpc->khva; | |
381 | ||
382 | asm volatile(LOCK_PREFIX "orl %0, %1\n" | |
383 | "notl %0\n" | |
384 | LOCK_PREFIX "andl %0, %2\n" | |
385 | : "=r" (evtchn_pending_sel32), | |
386 | "+m" (vi->evtchn_pending_sel), | |
387 | "+m" (v->arch.xen.evtchn_pending_sel) | |
388 | : "0" (evtchn_pending_sel32)); | |
389 | WRITE_ONCE(vi->evtchn_upcall_pending, 1); | |
390 | } | |
391 | read_unlock_irqrestore(&gpc->lock, flags); | |
392 | ||
fde0451b DW |
393 | /* For the per-vCPU lapic vector, deliver it as MSI. */ |
394 | if (v->arch.xen.upcall_vector) | |
395 | kvm_xen_inject_vcpu_vector(v); | |
396 | ||
7caf9571 DW |
397 | mark_page_dirty_in_slot(v->kvm, gpc->memslot, gpc->gpa >> PAGE_SHIFT); |
398 | } | |
399 | ||
400 | int __kvm_xen_has_interrupt(struct kvm_vcpu *v) | |
401 | { | |
402 | struct gfn_to_pfn_cache *gpc = &v->arch.xen.vcpu_info_cache; | |
403 | unsigned long flags; | |
40da8ccd DW |
404 | u8 rc = 0; |
405 | ||
406 | /* | |
407 | * If the global upcall vector (HVMIRQ_callback_vector) is set and | |
408 | * the vCPU's evtchn_upcall_pending flag is set, the IRQ is pending. | |
409 | */ | |
40da8ccd DW |
410 | |
411 | /* No need for compat handling here */ | |
412 | BUILD_BUG_ON(offsetof(struct vcpu_info, evtchn_upcall_pending) != | |
413 | offsetof(struct compat_vcpu_info, evtchn_upcall_pending)); | |
414 | BUILD_BUG_ON(sizeof(rc) != | |
6a834754 | 415 | sizeof_field(struct vcpu_info, evtchn_upcall_pending)); |
40da8ccd | 416 | BUILD_BUG_ON(sizeof(rc) != |
6a834754 | 417 | sizeof_field(struct compat_vcpu_info, evtchn_upcall_pending)); |
40da8ccd | 418 | |
7caf9571 DW |
419 | read_lock_irqsave(&gpc->lock, flags); |
420 | while (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc, gpc->gpa, | |
421 | sizeof(struct vcpu_info))) { | |
422 | read_unlock_irqrestore(&gpc->lock, flags); | |
0985dba8 | 423 | |
7caf9571 DW |
424 | /* |
425 | * This function gets called from kvm_vcpu_block() after setting the | |
426 | * task to TASK_INTERRUPTIBLE, to see if it needs to wake immediately | |
427 | * from a HLT. So we really mustn't sleep. If the page ended up absent | |
428 | * at that point, just return 1 in order to trigger an immediate wake, | |
429 | * and we'll end up getting called again from a context where we *can* | |
430 | * fault in the page and wait for it. | |
431 | */ | |
432 | if (in_atomic() || !task_is_running(current)) | |
433 | return 1; | |
0985dba8 | 434 | |
7caf9571 DW |
435 | if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc, gpc->gpa, |
436 | sizeof(struct vcpu_info))) { | |
14243b38 DW |
437 | /* |
438 | * If this failed, userspace has screwed up the | |
439 | * vcpu_info mapping. No interrupts for you. | |
440 | */ | |
441 | return 0; | |
442 | } | |
7caf9571 | 443 | read_lock_irqsave(&gpc->lock, flags); |
14243b38 DW |
444 | } |
445 | ||
7caf9571 DW |
446 | rc = ((struct vcpu_info *)gpc->khva)->evtchn_upcall_pending; |
447 | read_unlock_irqrestore(&gpc->lock, flags); | |
40da8ccd DW |
448 | return rc; |
449 | } | |
450 | ||
a76b9641 JM |
451 | int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) |
452 | { | |
453 | int r = -ENOENT; | |
454 | ||
13ffb97a | 455 | |
a76b9641 | 456 | switch (data->type) { |
a3833b81 | 457 | case KVM_XEN_ATTR_TYPE_LONG_MODE: |
13ffb97a JM |
458 | if (!IS_ENABLED(CONFIG_64BIT) && data->u.long_mode) { |
459 | r = -EINVAL; | |
460 | } else { | |
2fd6df2f | 461 | mutex_lock(&kvm->lock); |
13ffb97a | 462 | kvm->arch.xen.long_mode = !!data->u.long_mode; |
2fd6df2f | 463 | mutex_unlock(&kvm->lock); |
13ffb97a JM |
464 | r = 0; |
465 | } | |
466 | break; | |
a3833b81 | 467 | |
13ffb97a | 468 | case KVM_XEN_ATTR_TYPE_SHARED_INFO: |
2fd6df2f | 469 | mutex_lock(&kvm->lock); |
13ffb97a | 470 | r = kvm_xen_shared_info_init(kvm, data->u.shared_info.gfn); |
2fd6df2f | 471 | mutex_unlock(&kvm->lock); |
a3833b81 | 472 | break; |
13ffb97a | 473 | |
40da8ccd | 474 | case KVM_XEN_ATTR_TYPE_UPCALL_VECTOR: |
0c165b3c | 475 | if (data->u.vector && data->u.vector < 0x10) |
40da8ccd DW |
476 | r = -EINVAL; |
477 | else { | |
2fd6df2f | 478 | mutex_lock(&kvm->lock); |
40da8ccd | 479 | kvm->arch.xen.upcall_vector = data->u.vector; |
2fd6df2f | 480 | mutex_unlock(&kvm->lock); |
40da8ccd DW |
481 | r = 0; |
482 | } | |
483 | break; | |
484 | ||
2fd6df2f JM |
485 | case KVM_XEN_ATTR_TYPE_EVTCHN: |
486 | r = kvm_xen_setattr_evtchn(kvm, data); | |
487 | break; | |
488 | ||
28d1629f DW |
489 | case KVM_XEN_ATTR_TYPE_XEN_VERSION: |
490 | mutex_lock(&kvm->lock); | |
491 | kvm->arch.xen.xen_version = data->u.xen_version; | |
492 | mutex_unlock(&kvm->lock); | |
493 | r = 0; | |
494 | break; | |
495 | ||
a76b9641 JM |
496 | default: |
497 | break; | |
498 | } | |
499 | ||
a76b9641 JM |
500 | return r; |
501 | } | |
502 | ||
503 | int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) | |
504 | { | |
505 | int r = -ENOENT; | |
506 | ||
507 | mutex_lock(&kvm->lock); | |
508 | ||
509 | switch (data->type) { | |
a3833b81 DW |
510 | case KVM_XEN_ATTR_TYPE_LONG_MODE: |
511 | data->u.long_mode = kvm->arch.xen.long_mode; | |
512 | r = 0; | |
513 | break; | |
13ffb97a JM |
514 | |
515 | case KVM_XEN_ATTR_TYPE_SHARED_INFO: | |
1cfc9c4b DW |
516 | if (kvm->arch.xen.shinfo_cache.active) |
517 | data->u.shared_info.gfn = gpa_to_gfn(kvm->arch.xen.shinfo_cache.gpa); | |
518 | else | |
519 | data->u.shared_info.gfn = GPA_INVALID; | |
0c165b3c | 520 | r = 0; |
13ffb97a JM |
521 | break; |
522 | ||
40da8ccd DW |
523 | case KVM_XEN_ATTR_TYPE_UPCALL_VECTOR: |
524 | data->u.vector = kvm->arch.xen.upcall_vector; | |
525 | r = 0; | |
526 | break; | |
527 | ||
28d1629f DW |
528 | case KVM_XEN_ATTR_TYPE_XEN_VERSION: |
529 | data->u.xen_version = kvm->arch.xen.xen_version; | |
530 | r = 0; | |
531 | break; | |
532 | ||
a76b9641 JM |
533 | default: |
534 | break; | |
535 | } | |
536 | ||
537 | mutex_unlock(&kvm->lock); | |
538 | return r; | |
539 | } | |
540 | ||
3e324615 DW |
541 | int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) |
542 | { | |
73e69a86 | 543 | int idx, r = -ENOENT; |
3e324615 DW |
544 | |
545 | mutex_lock(&vcpu->kvm->lock); | |
73e69a86 | 546 | idx = srcu_read_lock(&vcpu->kvm->srcu); |
3e324615 DW |
547 | |
548 | switch (data->type) { | |
73e69a86 JM |
549 | case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO: |
550 | /* No compat necessary here. */ | |
551 | BUILD_BUG_ON(sizeof(struct vcpu_info) != | |
552 | sizeof(struct compat_vcpu_info)); | |
7d7c5f76 DW |
553 | BUILD_BUG_ON(offsetof(struct vcpu_info, time) != |
554 | offsetof(struct compat_vcpu_info, time)); | |
73e69a86 | 555 | |
0c165b3c | 556 | if (data->u.gpa == GPA_INVALID) { |
7caf9571 | 557 | kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, &vcpu->arch.xen.vcpu_info_cache); |
7d7c5f76 | 558 | r = 0; |
0c165b3c DW |
559 | break; |
560 | } | |
561 | ||
7caf9571 | 562 | r = kvm_gfn_to_pfn_cache_init(vcpu->kvm, |
73e69a86 | 563 | &vcpu->arch.xen.vcpu_info_cache, |
7caf9571 | 564 | NULL, KVM_HOST_USES_PFN, data->u.gpa, |
73e69a86 | 565 | sizeof(struct vcpu_info)); |
7caf9571 | 566 | if (!r) |
aa096aa0 | 567 | kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); |
7caf9571 | 568 | |
73e69a86 JM |
569 | break; |
570 | ||
f2340cd9 | 571 | case KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO: |
0c165b3c | 572 | if (data->u.gpa == GPA_INVALID) { |
69d413cf DW |
573 | kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, |
574 | &vcpu->arch.xen.vcpu_time_info_cache); | |
7d7c5f76 | 575 | r = 0; |
0c165b3c DW |
576 | break; |
577 | } | |
578 | ||
69d413cf | 579 | r = kvm_gfn_to_pfn_cache_init(vcpu->kvm, |
f2340cd9 | 580 | &vcpu->arch.xen.vcpu_time_info_cache, |
69d413cf | 581 | NULL, KVM_HOST_USES_PFN, data->u.gpa, |
f2340cd9 | 582 | sizeof(struct pvclock_vcpu_time_info)); |
69d413cf | 583 | if (!r) |
f2340cd9 | 584 | kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); |
f2340cd9 JM |
585 | break; |
586 | ||
30b5c851 DW |
587 | case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR: |
588 | if (!sched_info_on()) { | |
589 | r = -EOPNOTSUPP; | |
590 | break; | |
591 | } | |
592 | if (data->u.gpa == GPA_INVALID) { | |
a795cd43 DW |
593 | kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, |
594 | &vcpu->arch.xen.runstate_cache); | |
30b5c851 DW |
595 | r = 0; |
596 | break; | |
597 | } | |
598 | ||
a795cd43 | 599 | r = kvm_gfn_to_pfn_cache_init(vcpu->kvm, |
30b5c851 | 600 | &vcpu->arch.xen.runstate_cache, |
a795cd43 | 601 | NULL, KVM_HOST_USES_PFN, data->u.gpa, |
30b5c851 | 602 | sizeof(struct vcpu_runstate_info)); |
30b5c851 DW |
603 | break; |
604 | ||
605 | case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT: | |
606 | if (!sched_info_on()) { | |
607 | r = -EOPNOTSUPP; | |
608 | break; | |
609 | } | |
610 | if (data->u.runstate.state > RUNSTATE_offline) { | |
611 | r = -EINVAL; | |
612 | break; | |
613 | } | |
614 | ||
615 | kvm_xen_update_runstate(vcpu, data->u.runstate.state); | |
616 | r = 0; | |
617 | break; | |
618 | ||
619 | case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA: | |
620 | if (!sched_info_on()) { | |
621 | r = -EOPNOTSUPP; | |
622 | break; | |
623 | } | |
624 | if (data->u.runstate.state > RUNSTATE_offline) { | |
625 | r = -EINVAL; | |
626 | break; | |
627 | } | |
628 | if (data->u.runstate.state_entry_time != | |
629 | (data->u.runstate.time_running + | |
630 | data->u.runstate.time_runnable + | |
631 | data->u.runstate.time_blocked + | |
632 | data->u.runstate.time_offline)) { | |
633 | r = -EINVAL; | |
634 | break; | |
635 | } | |
636 | if (get_kvmclock_ns(vcpu->kvm) < | |
637 | data->u.runstate.state_entry_time) { | |
638 | r = -EINVAL; | |
639 | break; | |
640 | } | |
641 | ||
642 | vcpu->arch.xen.current_runstate = data->u.runstate.state; | |
643 | vcpu->arch.xen.runstate_entry_time = | |
644 | data->u.runstate.state_entry_time; | |
645 | vcpu->arch.xen.runstate_times[RUNSTATE_running] = | |
646 | data->u.runstate.time_running; | |
647 | vcpu->arch.xen.runstate_times[RUNSTATE_runnable] = | |
648 | data->u.runstate.time_runnable; | |
649 | vcpu->arch.xen.runstate_times[RUNSTATE_blocked] = | |
650 | data->u.runstate.time_blocked; | |
651 | vcpu->arch.xen.runstate_times[RUNSTATE_offline] = | |
652 | data->u.runstate.time_offline; | |
653 | vcpu->arch.xen.last_steal = current->sched_info.run_delay; | |
654 | r = 0; | |
655 | break; | |
656 | ||
657 | case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST: | |
658 | if (!sched_info_on()) { | |
659 | r = -EOPNOTSUPP; | |
660 | break; | |
661 | } | |
662 | if (data->u.runstate.state > RUNSTATE_offline && | |
663 | data->u.runstate.state != (u64)-1) { | |
664 | r = -EINVAL; | |
665 | break; | |
666 | } | |
667 | /* The adjustment must add up */ | |
668 | if (data->u.runstate.state_entry_time != | |
669 | (data->u.runstate.time_running + | |
670 | data->u.runstate.time_runnable + | |
671 | data->u.runstate.time_blocked + | |
672 | data->u.runstate.time_offline)) { | |
673 | r = -EINVAL; | |
674 | break; | |
675 | } | |
676 | ||
677 | if (get_kvmclock_ns(vcpu->kvm) < | |
678 | (vcpu->arch.xen.runstate_entry_time + | |
679 | data->u.runstate.state_entry_time)) { | |
680 | r = -EINVAL; | |
681 | break; | |
682 | } | |
683 | ||
684 | vcpu->arch.xen.runstate_entry_time += | |
685 | data->u.runstate.state_entry_time; | |
686 | vcpu->arch.xen.runstate_times[RUNSTATE_running] += | |
687 | data->u.runstate.time_running; | |
688 | vcpu->arch.xen.runstate_times[RUNSTATE_runnable] += | |
689 | data->u.runstate.time_runnable; | |
690 | vcpu->arch.xen.runstate_times[RUNSTATE_blocked] += | |
691 | data->u.runstate.time_blocked; | |
692 | vcpu->arch.xen.runstate_times[RUNSTATE_offline] += | |
693 | data->u.runstate.time_offline; | |
694 | ||
695 | if (data->u.runstate.state <= RUNSTATE_offline) | |
696 | kvm_xen_update_runstate(vcpu, data->u.runstate.state); | |
697 | r = 0; | |
698 | break; | |
699 | ||
942c2490 DW |
700 | case KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID: |
701 | if (data->u.vcpu_id >= KVM_MAX_VCPUS) | |
702 | r = -EINVAL; | |
703 | else { | |
704 | vcpu->arch.xen.vcpu_id = data->u.vcpu_id; | |
705 | r = 0; | |
706 | } | |
707 | break; | |
708 | ||
53639526 JM |
709 | case KVM_XEN_VCPU_ATTR_TYPE_TIMER: |
710 | if (data->u.timer.port) { | |
711 | if (data->u.timer.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL) { | |
712 | r = -EINVAL; | |
713 | break; | |
714 | } | |
715 | vcpu->arch.xen.timer_virq = data->u.timer.port; | |
af735db3 CD |
716 | |
717 | if (!vcpu->arch.xen.timer.function) | |
718 | kvm_xen_init_timer(vcpu); | |
53639526 JM |
719 | |
720 | /* Restart the timer if it's set */ | |
721 | if (data->u.timer.expires_ns) | |
722 | kvm_xen_start_timer(vcpu, data->u.timer.expires_ns, | |
723 | data->u.timer.expires_ns - | |
724 | get_kvmclock_ns(vcpu->kvm)); | |
725 | } else if (kvm_xen_timer_enabled(vcpu)) { | |
726 | kvm_xen_stop_timer(vcpu); | |
727 | vcpu->arch.xen.timer_virq = 0; | |
728 | } | |
729 | ||
730 | r = 0; | |
731 | break; | |
732 | ||
fde0451b DW |
733 | case KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR: |
734 | if (data->u.vector && data->u.vector < 0x10) | |
735 | r = -EINVAL; | |
736 | else { | |
737 | vcpu->arch.xen.upcall_vector = data->u.vector; | |
738 | r = 0; | |
739 | } | |
740 | break; | |
741 | ||
3e324615 DW |
742 | default: |
743 | break; | |
744 | } | |
745 | ||
73e69a86 | 746 | srcu_read_unlock(&vcpu->kvm->srcu, idx); |
3e324615 DW |
747 | mutex_unlock(&vcpu->kvm->lock); |
748 | return r; | |
749 | } | |
750 | ||
751 | int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) | |
752 | { | |
753 | int r = -ENOENT; | |
754 | ||
755 | mutex_lock(&vcpu->kvm->lock); | |
756 | ||
757 | switch (data->type) { | |
73e69a86 | 758 | case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO: |
7caf9571 | 759 | if (vcpu->arch.xen.vcpu_info_cache.active) |
73e69a86 | 760 | data->u.gpa = vcpu->arch.xen.vcpu_info_cache.gpa; |
0c165b3c DW |
761 | else |
762 | data->u.gpa = GPA_INVALID; | |
763 | r = 0; | |
73e69a86 JM |
764 | break; |
765 | ||
f2340cd9 | 766 | case KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO: |
69d413cf | 767 | if (vcpu->arch.xen.vcpu_time_info_cache.active) |
f2340cd9 | 768 | data->u.gpa = vcpu->arch.xen.vcpu_time_info_cache.gpa; |
0c165b3c DW |
769 | else |
770 | data->u.gpa = GPA_INVALID; | |
771 | r = 0; | |
f2340cd9 JM |
772 | break; |
773 | ||
30b5c851 DW |
774 | case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR: |
775 | if (!sched_info_on()) { | |
776 | r = -EOPNOTSUPP; | |
777 | break; | |
778 | } | |
a795cd43 | 779 | if (vcpu->arch.xen.runstate_cache.active) { |
30b5c851 DW |
780 | data->u.gpa = vcpu->arch.xen.runstate_cache.gpa; |
781 | r = 0; | |
782 | } | |
783 | break; | |
784 | ||
785 | case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT: | |
786 | if (!sched_info_on()) { | |
787 | r = -EOPNOTSUPP; | |
788 | break; | |
789 | } | |
790 | data->u.runstate.state = vcpu->arch.xen.current_runstate; | |
791 | r = 0; | |
792 | break; | |
793 | ||
794 | case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA: | |
795 | if (!sched_info_on()) { | |
796 | r = -EOPNOTSUPP; | |
797 | break; | |
798 | } | |
799 | data->u.runstate.state = vcpu->arch.xen.current_runstate; | |
800 | data->u.runstate.state_entry_time = | |
801 | vcpu->arch.xen.runstate_entry_time; | |
802 | data->u.runstate.time_running = | |
803 | vcpu->arch.xen.runstate_times[RUNSTATE_running]; | |
804 | data->u.runstate.time_runnable = | |
805 | vcpu->arch.xen.runstate_times[RUNSTATE_runnable]; | |
806 | data->u.runstate.time_blocked = | |
807 | vcpu->arch.xen.runstate_times[RUNSTATE_blocked]; | |
808 | data->u.runstate.time_offline = | |
809 | vcpu->arch.xen.runstate_times[RUNSTATE_offline]; | |
810 | r = 0; | |
811 | break; | |
812 | ||
813 | case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST: | |
814 | r = -EINVAL; | |
815 | break; | |
816 | ||
942c2490 DW |
817 | case KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID: |
818 | data->u.vcpu_id = vcpu->arch.xen.vcpu_id; | |
819 | r = 0; | |
820 | break; | |
821 | ||
53639526 JM |
822 | case KVM_XEN_VCPU_ATTR_TYPE_TIMER: |
823 | data->u.timer.port = vcpu->arch.xen.timer_virq; | |
824 | data->u.timer.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL; | |
825 | data->u.timer.expires_ns = vcpu->arch.xen.timer_expires; | |
826 | r = 0; | |
827 | break; | |
828 | ||
fde0451b DW |
829 | case KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR: |
830 | data->u.vector = vcpu->arch.xen.upcall_vector; | |
831 | r = 0; | |
832 | break; | |
833 | ||
3e324615 DW |
834 | default: |
835 | break; | |
836 | } | |
837 | ||
838 | mutex_unlock(&vcpu->kvm->lock); | |
839 | return r; | |
840 | } | |
841 | ||
23200b7a JM |
842 | int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data) |
843 | { | |
844 | struct kvm *kvm = vcpu->kvm; | |
845 | u32 page_num = data & ~PAGE_MASK; | |
846 | u64 page_addr = data & PAGE_MASK; | |
a3833b81 DW |
847 | bool lm = is_long_mode(vcpu); |
848 | ||
849 | /* Latch long_mode for shared_info pages etc. */ | |
850 | vcpu->kvm->arch.xen.long_mode = lm; | |
23200b7a JM |
851 | |
852 | /* | |
853 | * If Xen hypercall intercept is enabled, fill the hypercall | |
854 | * page with VMCALL/VMMCALL instructions since that's what | |
855 | * we catch. Else the VMM has provided the hypercall pages | |
856 | * with instructions of its own choosing, so use those. | |
857 | */ | |
858 | if (kvm_xen_hypercall_enabled(kvm)) { | |
859 | u8 instructions[32]; | |
860 | int i; | |
861 | ||
862 | if (page_num) | |
863 | return 1; | |
864 | ||
865 | /* mov imm32, %eax */ | |
866 | instructions[0] = 0xb8; | |
867 | ||
868 | /* vmcall / vmmcall */ | |
0264a351 | 869 | static_call(kvm_x86_patch_hypercall)(vcpu, instructions + 5); |
23200b7a JM |
870 | |
871 | /* ret */ | |
872 | instructions[8] = 0xc3; | |
873 | ||
874 | /* int3 to pad */ | |
875 | memset(instructions + 9, 0xcc, sizeof(instructions) - 9); | |
876 | ||
877 | for (i = 0; i < PAGE_SIZE / sizeof(instructions); i++) { | |
878 | *(u32 *)&instructions[1] = i; | |
879 | if (kvm_vcpu_write_guest(vcpu, | |
880 | page_addr + (i * sizeof(instructions)), | |
881 | instructions, sizeof(instructions))) | |
882 | return 1; | |
883 | } | |
884 | } else { | |
448841f0 SC |
885 | /* |
886 | * Note, truncation is a non-issue as 'lm' is guaranteed to be | |
887 | * false for a 32-bit kernel, i.e. when hva_t is only 4 bytes. | |
888 | */ | |
889 | hva_t blob_addr = lm ? kvm->arch.xen_hvm_config.blob_addr_64 | |
890 | : kvm->arch.xen_hvm_config.blob_addr_32; | |
23200b7a JM |
891 | u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64 |
892 | : kvm->arch.xen_hvm_config.blob_size_32; | |
893 | u8 *page; | |
894 | ||
895 | if (page_num >= blob_size) | |
896 | return 1; | |
897 | ||
898 | blob_addr += page_num * PAGE_SIZE; | |
899 | ||
900 | page = memdup_user((u8 __user *)blob_addr, PAGE_SIZE); | |
901 | if (IS_ERR(page)) | |
902 | return PTR_ERR(page); | |
903 | ||
904 | if (kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE)) { | |
905 | kfree(page); | |
906 | return 1; | |
907 | } | |
908 | } | |
909 | return 0; | |
910 | } | |
911 | ||
78e9878c DW |
912 | int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc) |
913 | { | |
661a20fa DW |
914 | /* Only some feature flags need to be *enabled* by userspace */ |
915 | u32 permitted_flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL | | |
916 | KVM_XEN_HVM_CONFIG_EVTCHN_SEND; | |
917 | ||
918 | if (xhc->flags & ~permitted_flags) | |
78e9878c DW |
919 | return -EINVAL; |
920 | ||
921 | /* | |
922 | * With hypercall interception the kernel generates its own | |
923 | * hypercall page so it must not be provided. | |
924 | */ | |
925 | if ((xhc->flags & KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL) && | |
926 | (xhc->blob_addr_32 || xhc->blob_addr_64 || | |
927 | xhc->blob_size_32 || xhc->blob_size_64)) | |
928 | return -EINVAL; | |
929 | ||
7d6bbebb DW |
930 | mutex_lock(&kvm->lock); |
931 | ||
932 | if (xhc->msr && !kvm->arch.xen_hvm_config.msr) | |
933 | static_branch_inc(&kvm_xen_enabled.key); | |
934 | else if (!xhc->msr && kvm->arch.xen_hvm_config.msr) | |
935 | static_branch_slow_dec_deferred(&kvm_xen_enabled); | |
936 | ||
78e9878c | 937 | memcpy(&kvm->arch.xen_hvm_config, xhc, sizeof(*xhc)); |
7d6bbebb DW |
938 | |
939 | mutex_unlock(&kvm->lock); | |
78e9878c DW |
940 | return 0; |
941 | } | |
942 | ||
23200b7a JM |
943 | static int kvm_xen_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result) |
944 | { | |
945 | kvm_rax_write(vcpu, result); | |
946 | return kvm_skip_emulated_instruction(vcpu); | |
947 | } | |
948 | ||
949 | static int kvm_xen_hypercall_complete_userspace(struct kvm_vcpu *vcpu) | |
950 | { | |
951 | struct kvm_run *run = vcpu->run; | |
952 | ||
953 | if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.xen.hypercall_rip))) | |
954 | return 1; | |
955 | ||
956 | return kvm_xen_hypercall_set_result(vcpu, run->xen.u.hcall.result); | |
957 | } | |
958 | ||
1a65105a BO |
959 | static bool wait_pending_event(struct kvm_vcpu *vcpu, int nr_ports, |
960 | evtchn_port_t *ports) | |
961 | { | |
962 | struct kvm *kvm = vcpu->kvm; | |
963 | struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache; | |
964 | unsigned long *pending_bits; | |
965 | unsigned long flags; | |
966 | bool ret = true; | |
967 | int idx, i; | |
968 | ||
969 | read_lock_irqsave(&gpc->lock, flags); | |
970 | idx = srcu_read_lock(&kvm->srcu); | |
971 | if (!kvm_gfn_to_pfn_cache_check(kvm, gpc, gpc->gpa, PAGE_SIZE)) | |
972 | goto out_rcu; | |
973 | ||
974 | ret = false; | |
975 | if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) { | |
976 | struct shared_info *shinfo = gpc->khva; | |
977 | pending_bits = (unsigned long *)&shinfo->evtchn_pending; | |
978 | } else { | |
979 | struct compat_shared_info *shinfo = gpc->khva; | |
980 | pending_bits = (unsigned long *)&shinfo->evtchn_pending; | |
981 | } | |
982 | ||
983 | for (i = 0; i < nr_ports; i++) { | |
984 | if (test_bit(ports[i], pending_bits)) { | |
985 | ret = true; | |
986 | break; | |
987 | } | |
988 | } | |
989 | ||
990 | out_rcu: | |
991 | srcu_read_unlock(&kvm->srcu, idx); | |
992 | read_unlock_irqrestore(&gpc->lock, flags); | |
993 | ||
994 | return ret; | |
995 | } | |
996 | ||
997 | static bool kvm_xen_schedop_poll(struct kvm_vcpu *vcpu, bool longmode, | |
998 | u64 param, u64 *r) | |
999 | { | |
1000 | int idx, i; | |
1001 | struct sched_poll sched_poll; | |
1002 | evtchn_port_t port, *ports; | |
1003 | gpa_t gpa; | |
1004 | ||
1005 | if (!longmode || !lapic_in_kernel(vcpu) || | |
1006 | !(vcpu->kvm->arch.xen_hvm_config.flags & KVM_XEN_HVM_CONFIG_EVTCHN_SEND)) | |
1007 | return false; | |
1008 | ||
1009 | idx = srcu_read_lock(&vcpu->kvm->srcu); | |
1010 | gpa = kvm_mmu_gva_to_gpa_system(vcpu, param, NULL); | |
1011 | srcu_read_unlock(&vcpu->kvm->srcu, idx); | |
1012 | ||
1013 | if (!gpa || kvm_vcpu_read_guest(vcpu, gpa, &sched_poll, | |
1014 | sizeof(sched_poll))) { | |
1015 | *r = -EFAULT; | |
1016 | return true; | |
1017 | } | |
1018 | ||
1019 | if (unlikely(sched_poll.nr_ports > 1)) { | |
1020 | /* Xen (unofficially) limits number of pollers to 128 */ | |
1021 | if (sched_poll.nr_ports > 128) { | |
1022 | *r = -EINVAL; | |
1023 | return true; | |
1024 | } | |
1025 | ||
1026 | ports = kmalloc_array(sched_poll.nr_ports, | |
1027 | sizeof(*ports), GFP_KERNEL); | |
1028 | if (!ports) { | |
1029 | *r = -ENOMEM; | |
1030 | return true; | |
1031 | } | |
1032 | } else | |
1033 | ports = &port; | |
1034 | ||
1035 | for (i = 0; i < sched_poll.nr_ports; i++) { | |
1036 | idx = srcu_read_lock(&vcpu->kvm->srcu); | |
1037 | gpa = kvm_mmu_gva_to_gpa_system(vcpu, | |
1038 | (gva_t)(sched_poll.ports + i), | |
1039 | NULL); | |
1040 | srcu_read_unlock(&vcpu->kvm->srcu, idx); | |
1041 | ||
1042 | if (!gpa || kvm_vcpu_read_guest(vcpu, gpa, | |
1043 | &ports[i], sizeof(port))) { | |
1044 | *r = -EFAULT; | |
1045 | goto out; | |
1046 | } | |
1047 | } | |
1048 | ||
1049 | if (sched_poll.nr_ports == 1) | |
1050 | vcpu->arch.xen.poll_evtchn = port; | |
1051 | else | |
1052 | vcpu->arch.xen.poll_evtchn = -1; | |
1053 | ||
79f772b9 | 1054 | set_bit(vcpu->vcpu_idx, vcpu->kvm->arch.xen.poll_mask); |
1a65105a BO |
1055 | |
1056 | if (!wait_pending_event(vcpu, sched_poll.nr_ports, ports)) { | |
1057 | vcpu->arch.mp_state = KVM_MP_STATE_HALTED; | |
1058 | ||
1059 | if (sched_poll.timeout) | |
1060 | mod_timer(&vcpu->arch.xen.poll_timer, | |
1061 | jiffies + nsecs_to_jiffies(sched_poll.timeout)); | |
1062 | ||
1063 | kvm_vcpu_halt(vcpu); | |
1064 | ||
1065 | if (sched_poll.timeout) | |
1066 | del_timer(&vcpu->arch.xen.poll_timer); | |
1067 | ||
1068 | vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; | |
1069 | kvm_clear_request(KVM_REQ_UNHALT, vcpu); | |
1070 | } | |
1071 | ||
1072 | vcpu->arch.xen.poll_evtchn = 0; | |
1073 | *r = 0; | |
1074 | out: | |
1075 | /* Really, this is only needed in case of timeout */ | |
79f772b9 | 1076 | clear_bit(vcpu->vcpu_idx, vcpu->kvm->arch.xen.poll_mask); |
1a65105a BO |
1077 | |
1078 | if (unlikely(sched_poll.nr_ports > 1)) | |
1079 | kfree(ports); | |
1080 | return true; | |
1081 | } | |
1082 | ||
1083 | static void cancel_evtchn_poll(struct timer_list *t) | |
1084 | { | |
1085 | struct kvm_vcpu *vcpu = from_timer(vcpu, t, arch.xen.poll_timer); | |
1086 | ||
1087 | kvm_make_request(KVM_REQ_UNBLOCK, vcpu); | |
1088 | kvm_vcpu_kick(vcpu); | |
1089 | } | |
1090 | ||
1091 | static bool kvm_xen_hcall_sched_op(struct kvm_vcpu *vcpu, bool longmode, | |
1092 | int cmd, u64 param, u64 *r) | |
0ec6c5c5 JM |
1093 | { |
1094 | switch (cmd) { | |
1a65105a BO |
1095 | case SCHEDOP_poll: |
1096 | if (kvm_xen_schedop_poll(vcpu, longmode, param, r)) | |
1097 | return true; | |
1098 | fallthrough; | |
0ec6c5c5 JM |
1099 | case SCHEDOP_yield: |
1100 | kvm_vcpu_on_spin(vcpu, true); | |
1101 | *r = 0; | |
1102 | return true; | |
1103 | default: | |
1104 | break; | |
1105 | } | |
1106 | ||
1107 | return false; | |
1108 | } | |
1109 | ||
53639526 JM |
1110 | struct compat_vcpu_set_singleshot_timer { |
1111 | uint64_t timeout_abs_ns; | |
1112 | uint32_t flags; | |
1113 | } __attribute__((packed)); | |
1114 | ||
1115 | static bool kvm_xen_hcall_vcpu_op(struct kvm_vcpu *vcpu, bool longmode, int cmd, | |
1116 | int vcpu_id, u64 param, u64 *r) | |
1117 | { | |
1118 | struct vcpu_set_singleshot_timer oneshot; | |
1119 | s64 delta; | |
1120 | gpa_t gpa; | |
1121 | int idx; | |
1122 | ||
1123 | if (!kvm_xen_timer_enabled(vcpu)) | |
1124 | return false; | |
1125 | ||
1126 | switch (cmd) { | |
1127 | case VCPUOP_set_singleshot_timer: | |
1128 | if (vcpu->arch.xen.vcpu_id != vcpu_id) { | |
1129 | *r = -EINVAL; | |
1130 | return true; | |
1131 | } | |
1132 | idx = srcu_read_lock(&vcpu->kvm->srcu); | |
1133 | gpa = kvm_mmu_gva_to_gpa_system(vcpu, param, NULL); | |
1134 | srcu_read_unlock(&vcpu->kvm->srcu, idx); | |
1135 | ||
1136 | /* | |
1137 | * The only difference for 32-bit compat is the 4 bytes of | |
1138 | * padding after the interesting part of the structure. So | |
1139 | * for a faithful emulation of Xen we have to *try* to copy | |
1140 | * the padding and return -EFAULT if we can't. Otherwise we | |
1141 | * might as well just have copied the 12-byte 32-bit struct. | |
1142 | */ | |
1143 | BUILD_BUG_ON(offsetof(struct compat_vcpu_set_singleshot_timer, timeout_abs_ns) != | |
1144 | offsetof(struct vcpu_set_singleshot_timer, timeout_abs_ns)); | |
1145 | BUILD_BUG_ON(sizeof_field(struct compat_vcpu_set_singleshot_timer, timeout_abs_ns) != | |
1146 | sizeof_field(struct vcpu_set_singleshot_timer, timeout_abs_ns)); | |
1147 | BUILD_BUG_ON(offsetof(struct compat_vcpu_set_singleshot_timer, flags) != | |
1148 | offsetof(struct vcpu_set_singleshot_timer, flags)); | |
1149 | BUILD_BUG_ON(sizeof_field(struct compat_vcpu_set_singleshot_timer, flags) != | |
1150 | sizeof_field(struct vcpu_set_singleshot_timer, flags)); | |
1151 | ||
1152 | if (!gpa || | |
1153 | kvm_vcpu_read_guest(vcpu, gpa, &oneshot, longmode ? sizeof(oneshot) : | |
1154 | sizeof(struct compat_vcpu_set_singleshot_timer))) { | |
1155 | *r = -EFAULT; | |
1156 | return true; | |
1157 | } | |
1158 | ||
1159 | delta = oneshot.timeout_abs_ns - get_kvmclock_ns(vcpu->kvm); | |
1160 | if ((oneshot.flags & VCPU_SSHOTTMR_future) && delta < 0) { | |
1161 | *r = -ETIME; | |
1162 | return true; | |
1163 | } | |
1164 | ||
1165 | kvm_xen_start_timer(vcpu, oneshot.timeout_abs_ns, delta); | |
1166 | *r = 0; | |
1167 | return true; | |
1168 | ||
1169 | case VCPUOP_stop_singleshot_timer: | |
1170 | if (vcpu->arch.xen.vcpu_id != vcpu_id) { | |
1171 | *r = -EINVAL; | |
1172 | return true; | |
1173 | } | |
1174 | kvm_xen_stop_timer(vcpu); | |
1175 | *r = 0; | |
1176 | return true; | |
1177 | } | |
1178 | ||
1179 | return false; | |
1180 | } | |
1181 | ||
1182 | static bool kvm_xen_hcall_set_timer_op(struct kvm_vcpu *vcpu, uint64_t timeout, | |
1183 | u64 *r) | |
1184 | { | |
1185 | if (!kvm_xen_timer_enabled(vcpu)) | |
1186 | return false; | |
1187 | ||
1188 | if (timeout) { | |
1189 | uint64_t guest_now = get_kvmclock_ns(vcpu->kvm); | |
1190 | int64_t delta = timeout - guest_now; | |
1191 | ||
1192 | /* Xen has a 'Linux workaround' in do_set_timer_op() which | |
1193 | * checks for negative absolute timeout values (caused by | |
1194 | * integer overflow), and for values about 13 days in the | |
1195 | * future (2^50ns) which would be caused by jiffies | |
1196 | * overflow. For those cases, it sets the timeout 100ms in | |
1197 | * the future (not *too* soon, since if a guest really did | |
1198 | * set a long timeout on purpose we don't want to keep | |
1199 | * churning CPU time by waking it up). | |
1200 | */ | |
1201 | if (unlikely((int64_t)timeout < 0 || | |
1202 | (delta > 0 && (uint32_t) (delta >> 50) != 0))) { | |
1203 | delta = 100 * NSEC_PER_MSEC; | |
1204 | timeout = guest_now + delta; | |
1205 | } | |
1206 | ||
1207 | kvm_xen_start_timer(vcpu, timeout, delta); | |
1208 | } else { | |
1209 | kvm_xen_stop_timer(vcpu); | |
1210 | } | |
1211 | ||
1212 | *r = 0; | |
1213 | return true; | |
1214 | } | |
1215 | ||
23200b7a JM |
1216 | int kvm_xen_hypercall(struct kvm_vcpu *vcpu) |
1217 | { | |
1218 | bool longmode; | |
2fd6df2f JM |
1219 | u64 input, params[6], r = -ENOSYS; |
1220 | bool handled = false; | |
23200b7a JM |
1221 | |
1222 | input = (u64)kvm_register_read(vcpu, VCPU_REGS_RAX); | |
1223 | ||
79033beb JM |
1224 | /* Hyper-V hypercalls get bit 31 set in EAX */ |
1225 | if ((input & 0x80000000) && | |
8f014550 | 1226 | kvm_hv_hypercall_enabled(vcpu)) |
79033beb JM |
1227 | return kvm_hv_hypercall(vcpu); |
1228 | ||
b5aead00 | 1229 | longmode = is_64_bit_hypercall(vcpu); |
23200b7a JM |
1230 | if (!longmode) { |
1231 | params[0] = (u32)kvm_rbx_read(vcpu); | |
1232 | params[1] = (u32)kvm_rcx_read(vcpu); | |
1233 | params[2] = (u32)kvm_rdx_read(vcpu); | |
1234 | params[3] = (u32)kvm_rsi_read(vcpu); | |
1235 | params[4] = (u32)kvm_rdi_read(vcpu); | |
1236 | params[5] = (u32)kvm_rbp_read(vcpu); | |
1237 | } | |
1238 | #ifdef CONFIG_X86_64 | |
1239 | else { | |
1240 | params[0] = (u64)kvm_rdi_read(vcpu); | |
1241 | params[1] = (u64)kvm_rsi_read(vcpu); | |
1242 | params[2] = (u64)kvm_rdx_read(vcpu); | |
1243 | params[3] = (u64)kvm_r10_read(vcpu); | |
1244 | params[4] = (u64)kvm_r8_read(vcpu); | |
1245 | params[5] = (u64)kvm_r9_read(vcpu); | |
1246 | } | |
1247 | #endif | |
1248 | trace_kvm_xen_hypercall(input, params[0], params[1], params[2], | |
1249 | params[3], params[4], params[5]); | |
1250 | ||
2fd6df2f | 1251 | switch (input) { |
28d1629f DW |
1252 | case __HYPERVISOR_xen_version: |
1253 | if (params[0] == XENVER_version && vcpu->kvm->arch.xen.xen_version) { | |
1254 | r = vcpu->kvm->arch.xen.xen_version; | |
1255 | handled = true; | |
1256 | } | |
1257 | break; | |
2fd6df2f JM |
1258 | case __HYPERVISOR_event_channel_op: |
1259 | if (params[0] == EVTCHNOP_send) | |
1260 | handled = kvm_xen_hcall_evtchn_send(vcpu, params[1], &r); | |
1261 | break; | |
0ec6c5c5 | 1262 | case __HYPERVISOR_sched_op: |
1a65105a BO |
1263 | handled = kvm_xen_hcall_sched_op(vcpu, longmode, params[0], |
1264 | params[1], &r); | |
0ec6c5c5 | 1265 | break; |
53639526 JM |
1266 | case __HYPERVISOR_vcpu_op: |
1267 | handled = kvm_xen_hcall_vcpu_op(vcpu, longmode, params[0], params[1], | |
1268 | params[2], &r); | |
1269 | break; | |
1270 | case __HYPERVISOR_set_timer_op: { | |
1271 | u64 timeout = params[0]; | |
1272 | /* In 32-bit mode, the 64-bit timeout is in two 32-bit params. */ | |
1273 | if (!longmode) | |
1274 | timeout |= params[1] << 32; | |
1275 | handled = kvm_xen_hcall_set_timer_op(vcpu, timeout, &r); | |
1276 | break; | |
1277 | } | |
2fd6df2f JM |
1278 | default: |
1279 | break; | |
1280 | } | |
1281 | ||
1282 | if (handled) | |
1283 | return kvm_xen_hypercall_set_result(vcpu, r); | |
1284 | ||
23200b7a JM |
1285 | vcpu->run->exit_reason = KVM_EXIT_XEN; |
1286 | vcpu->run->xen.type = KVM_EXIT_XEN_HCALL; | |
1287 | vcpu->run->xen.u.hcall.longmode = longmode; | |
0264a351 | 1288 | vcpu->run->xen.u.hcall.cpl = static_call(kvm_x86_get_cpl)(vcpu); |
23200b7a JM |
1289 | vcpu->run->xen.u.hcall.input = input; |
1290 | vcpu->run->xen.u.hcall.params[0] = params[0]; | |
1291 | vcpu->run->xen.u.hcall.params[1] = params[1]; | |
1292 | vcpu->run->xen.u.hcall.params[2] = params[2]; | |
1293 | vcpu->run->xen.u.hcall.params[3] = params[3]; | |
1294 | vcpu->run->xen.u.hcall.params[4] = params[4]; | |
1295 | vcpu->run->xen.u.hcall.params[5] = params[5]; | |
1296 | vcpu->arch.xen.hypercall_rip = kvm_get_linear_rip(vcpu); | |
1297 | vcpu->arch.complete_userspace_io = | |
1298 | kvm_xen_hypercall_complete_userspace; | |
1299 | ||
1300 | return 0; | |
1301 | } | |
14243b38 DW |
1302 | |
1303 | static inline int max_evtchn_port(struct kvm *kvm) | |
1304 | { | |
1305 | if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) | |
1306 | return EVTCHN_2L_NR_CHANNELS; | |
1307 | else | |
1308 | return COMPAT_EVTCHN_2L_NR_CHANNELS; | |
1309 | } | |
1310 | ||
1a65105a BO |
1311 | static void kvm_xen_check_poller(struct kvm_vcpu *vcpu, int port) |
1312 | { | |
1313 | int poll_evtchn = vcpu->arch.xen.poll_evtchn; | |
1314 | ||
1315 | if ((poll_evtchn == port || poll_evtchn == -1) && | |
79f772b9 | 1316 | test_and_clear_bit(vcpu->vcpu_idx, vcpu->kvm->arch.xen.poll_mask)) { |
1a65105a BO |
1317 | kvm_make_request(KVM_REQ_UNBLOCK, vcpu); |
1318 | kvm_vcpu_kick(vcpu); | |
1319 | } | |
1320 | } | |
1321 | ||
14243b38 | 1322 | /* |
8733068b DW |
1323 | * The return value from this function is propagated to kvm_set_irq() API, |
1324 | * so it returns: | |
14243b38 DW |
1325 | * < 0 Interrupt was ignored (masked or not delivered for other reasons) |
1326 | * = 0 Interrupt was coalesced (previous irq is still pending) | |
1327 | * > 0 Number of CPUs interrupt was delivered to | |
8733068b DW |
1328 | * |
1329 | * It is also called directly from kvm_arch_set_irq_inatomic(), where the | |
1330 | * only check on its return value is a comparison with -EWOULDBLOCK'. | |
14243b38 | 1331 | */ |
8733068b | 1332 | int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe, struct kvm *kvm) |
14243b38 DW |
1333 | { |
1334 | struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache; | |
1335 | struct kvm_vcpu *vcpu; | |
1336 | unsigned long *pending_bits, *mask_bits; | |
1337 | unsigned long flags; | |
1338 | int port_word_bit; | |
1339 | bool kick_vcpu = false; | |
8733068b | 1340 | int vcpu_idx, idx, rc; |
14243b38 | 1341 | |
8733068b DW |
1342 | vcpu_idx = READ_ONCE(xe->vcpu_idx); |
1343 | if (vcpu_idx >= 0) | |
1344 | vcpu = kvm_get_vcpu(kvm, vcpu_idx); | |
1345 | else { | |
1346 | vcpu = kvm_get_vcpu_by_id(kvm, xe->vcpu_id); | |
1347 | if (!vcpu) | |
1348 | return -EINVAL; | |
79f772b9 | 1349 | WRITE_ONCE(xe->vcpu_idx, vcpu->vcpu_idx); |
8733068b | 1350 | } |
14243b38 | 1351 | |
7caf9571 | 1352 | if (!vcpu->arch.xen.vcpu_info_cache.active) |
8733068b | 1353 | return -EINVAL; |
14243b38 | 1354 | |
8733068b DW |
1355 | if (xe->port >= max_evtchn_port(kvm)) |
1356 | return -EINVAL; | |
14243b38 DW |
1357 | |
1358 | rc = -EWOULDBLOCK; | |
14243b38 DW |
1359 | |
1360 | idx = srcu_read_lock(&kvm->srcu); | |
7caf9571 DW |
1361 | |
1362 | read_lock_irqsave(&gpc->lock, flags); | |
14243b38 DW |
1363 | if (!kvm_gfn_to_pfn_cache_check(kvm, gpc, gpc->gpa, PAGE_SIZE)) |
1364 | goto out_rcu; | |
1365 | ||
1366 | if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) { | |
1367 | struct shared_info *shinfo = gpc->khva; | |
1368 | pending_bits = (unsigned long *)&shinfo->evtchn_pending; | |
1369 | mask_bits = (unsigned long *)&shinfo->evtchn_mask; | |
8733068b | 1370 | port_word_bit = xe->port / 64; |
14243b38 DW |
1371 | } else { |
1372 | struct compat_shared_info *shinfo = gpc->khva; | |
1373 | pending_bits = (unsigned long *)&shinfo->evtchn_pending; | |
1374 | mask_bits = (unsigned long *)&shinfo->evtchn_mask; | |
8733068b | 1375 | port_word_bit = xe->port / 32; |
14243b38 DW |
1376 | } |
1377 | ||
1378 | /* | |
1379 | * If this port wasn't already set, and if it isn't masked, then | |
1380 | * we try to set the corresponding bit in the in-kernel shadow of | |
1381 | * evtchn_pending_sel for the target vCPU. And if *that* wasn't | |
1382 | * already set, then we kick the vCPU in question to write to the | |
1383 | * *real* evtchn_pending_sel in its own guest vcpu_info struct. | |
1384 | */ | |
8733068b | 1385 | if (test_and_set_bit(xe->port, pending_bits)) { |
14243b38 | 1386 | rc = 0; /* It was already raised */ |
8733068b DW |
1387 | } else if (test_bit(xe->port, mask_bits)) { |
1388 | rc = -ENOTCONN; /* Masked */ | |
1a65105a | 1389 | kvm_xen_check_poller(vcpu, xe->port); |
14243b38 | 1390 | } else { |
7caf9571 DW |
1391 | rc = 1; /* Delivered to the bitmap in shared_info. */ |
1392 | /* Now switch to the vCPU's vcpu_info to set the index and pending_sel */ | |
1393 | read_unlock_irqrestore(&gpc->lock, flags); | |
1394 | gpc = &vcpu->arch.xen.vcpu_info_cache; | |
1395 | ||
1396 | read_lock_irqsave(&gpc->lock, flags); | |
1397 | if (!kvm_gfn_to_pfn_cache_check(kvm, gpc, gpc->gpa, sizeof(struct vcpu_info))) { | |
1398 | /* | |
1399 | * Could not access the vcpu_info. Set the bit in-kernel | |
1400 | * and prod the vCPU to deliver it for itself. | |
1401 | */ | |
1402 | if (!test_and_set_bit(port_word_bit, &vcpu->arch.xen.evtchn_pending_sel)) | |
1403 | kick_vcpu = true; | |
1404 | goto out_rcu; | |
1405 | } | |
1406 | ||
1407 | if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) { | |
1408 | struct vcpu_info *vcpu_info = gpc->khva; | |
1409 | if (!test_and_set_bit(port_word_bit, &vcpu_info->evtchn_pending_sel)) { | |
1410 | WRITE_ONCE(vcpu_info->evtchn_upcall_pending, 1); | |
1411 | kick_vcpu = true; | |
1412 | } | |
1413 | } else { | |
1414 | struct compat_vcpu_info *vcpu_info = gpc->khva; | |
1415 | if (!test_and_set_bit(port_word_bit, | |
1416 | (unsigned long *)&vcpu_info->evtchn_pending_sel)) { | |
1417 | WRITE_ONCE(vcpu_info->evtchn_upcall_pending, 1); | |
1418 | kick_vcpu = true; | |
1419 | } | |
1420 | } | |
fde0451b DW |
1421 | |
1422 | /* For the per-vCPU lapic vector, deliver it as MSI. */ | |
1423 | if (kick_vcpu && vcpu->arch.xen.upcall_vector) { | |
1424 | kvm_xen_inject_vcpu_vector(vcpu); | |
1425 | kick_vcpu = false; | |
1426 | } | |
14243b38 DW |
1427 | } |
1428 | ||
1429 | out_rcu: | |
14243b38 | 1430 | read_unlock_irqrestore(&gpc->lock, flags); |
7caf9571 | 1431 | srcu_read_unlock(&kvm->srcu, idx); |
14243b38 DW |
1432 | |
1433 | if (kick_vcpu) { | |
7caf9571 | 1434 | kvm_make_request(KVM_REQ_UNBLOCK, vcpu); |
14243b38 DW |
1435 | kvm_vcpu_kick(vcpu); |
1436 | } | |
1437 | ||
1438 | return rc; | |
1439 | } | |
1440 | ||
8733068b | 1441 | static int kvm_xen_set_evtchn(struct kvm_xen_evtchn *xe, struct kvm *kvm) |
14243b38 DW |
1442 | { |
1443 | bool mm_borrowed = false; | |
1444 | int rc; | |
1445 | ||
8733068b | 1446 | rc = kvm_xen_set_evtchn_fast(xe, kvm); |
14243b38 DW |
1447 | if (rc != -EWOULDBLOCK) |
1448 | return rc; | |
1449 | ||
1450 | if (current->mm != kvm->mm) { | |
1451 | /* | |
1452 | * If not on a thread which already belongs to this KVM, | |
1453 | * we'd better be in the irqfd workqueue. | |
1454 | */ | |
1455 | if (WARN_ON_ONCE(current->mm)) | |
1456 | return -EINVAL; | |
1457 | ||
1458 | kthread_use_mm(kvm->mm); | |
1459 | mm_borrowed = true; | |
1460 | } | |
1461 | ||
1462 | /* | |
1463 | * For the irqfd workqueue, using the main kvm->lock mutex is | |
1464 | * fine since this function is invoked from kvm_set_irq() with | |
1465 | * no other lock held, no srcu. In future if it will be called | |
1466 | * directly from a vCPU thread (e.g. on hypercall for an IPI) | |
1467 | * then it may need to switch to using a leaf-node mutex for | |
1468 | * serializing the shared_info mapping. | |
1469 | */ | |
1470 | mutex_lock(&kvm->lock); | |
1471 | ||
1472 | /* | |
1473 | * It is theoretically possible for the page to be unmapped | |
1474 | * and the MMU notifier to invalidate the shared_info before | |
1475 | * we even get to use it. In that case, this looks like an | |
1476 | * infinite loop. It was tempting to do it via the userspace | |
1477 | * HVA instead... but that just *hides* the fact that it's | |
1478 | * an infinite loop, because if a fault occurs and it waits | |
1479 | * for the page to come back, it can *still* immediately | |
1480 | * fault and have to wait again, repeatedly. | |
1481 | * | |
1482 | * Conversely, the page could also have been reinstated by | |
1483 | * another thread before we even obtain the mutex above, so | |
1484 | * check again *first* before remapping it. | |
1485 | */ | |
1486 | do { | |
1487 | struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache; | |
1488 | int idx; | |
1489 | ||
8733068b | 1490 | rc = kvm_xen_set_evtchn_fast(xe, kvm); |
14243b38 DW |
1491 | if (rc != -EWOULDBLOCK) |
1492 | break; | |
1493 | ||
1494 | idx = srcu_read_lock(&kvm->srcu); | |
cf1d88b3 | 1495 | rc = kvm_gfn_to_pfn_cache_refresh(kvm, gpc, gpc->gpa, PAGE_SIZE); |
14243b38 DW |
1496 | srcu_read_unlock(&kvm->srcu, idx); |
1497 | } while(!rc); | |
1498 | ||
1499 | mutex_unlock(&kvm->lock); | |
1500 | ||
1501 | if (mm_borrowed) | |
1502 | kthread_unuse_mm(kvm->mm); | |
1503 | ||
1504 | return rc; | |
1505 | } | |
1506 | ||
8733068b DW |
1507 | /* This is the version called from kvm_set_irq() as the .set function */ |
1508 | static int evtchn_set_fn(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, | |
1509 | int irq_source_id, int level, bool line_status) | |
1510 | { | |
1511 | if (!level) | |
1512 | return -EINVAL; | |
1513 | ||
1514 | return kvm_xen_set_evtchn(&e->xen_evtchn, kvm); | |
1515 | } | |
1516 | ||
1517 | /* | |
1518 | * Set up an event channel interrupt from the KVM IRQ routing table. | |
1519 | * Used for e.g. PIRQ from passed through physical devices. | |
1520 | */ | |
14243b38 DW |
1521 | int kvm_xen_setup_evtchn(struct kvm *kvm, |
1522 | struct kvm_kernel_irq_routing_entry *e, | |
1523 | const struct kvm_irq_routing_entry *ue) | |
1524 | ||
1525 | { | |
8733068b DW |
1526 | struct kvm_vcpu *vcpu; |
1527 | ||
14243b38 DW |
1528 | if (ue->u.xen_evtchn.port >= max_evtchn_port(kvm)) |
1529 | return -EINVAL; | |
1530 | ||
1531 | /* We only support 2 level event channels for now */ | |
1532 | if (ue->u.xen_evtchn.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL) | |
1533 | return -EINVAL; | |
1534 | ||
8733068b DW |
1535 | /* |
1536 | * Xen gives us interesting mappings from vCPU index to APIC ID, | |
1537 | * which means kvm_get_vcpu_by_id() has to iterate over all vCPUs | |
1538 | * to find it. Do that once at setup time, instead of every time. | |
1539 | * But beware that on live update / live migration, the routing | |
1540 | * table might be reinstated before the vCPU threads have finished | |
1541 | * recreating their vCPUs. | |
1542 | */ | |
1543 | vcpu = kvm_get_vcpu_by_id(kvm, ue->u.xen_evtchn.vcpu); | |
1544 | if (vcpu) | |
79f772b9 | 1545 | e->xen_evtchn.vcpu_idx = vcpu->vcpu_idx; |
8733068b DW |
1546 | else |
1547 | e->xen_evtchn.vcpu_idx = -1; | |
1548 | ||
14243b38 | 1549 | e->xen_evtchn.port = ue->u.xen_evtchn.port; |
8733068b | 1550 | e->xen_evtchn.vcpu_id = ue->u.xen_evtchn.vcpu; |
14243b38 DW |
1551 | e->xen_evtchn.priority = ue->u.xen_evtchn.priority; |
1552 | e->set = evtchn_set_fn; | |
1553 | ||
1554 | return 0; | |
1555 | } | |
a795cd43 | 1556 | |
35025735 DW |
1557 | /* |
1558 | * Explicit event sending from userspace with KVM_XEN_HVM_EVTCHN_SEND ioctl. | |
1559 | */ | |
1560 | int kvm_xen_hvm_evtchn_send(struct kvm *kvm, struct kvm_irq_routing_xen_evtchn *uxe) | |
1561 | { | |
1562 | struct kvm_xen_evtchn e; | |
1563 | int ret; | |
1564 | ||
1565 | if (!uxe->port || uxe->port >= max_evtchn_port(kvm)) | |
1566 | return -EINVAL; | |
1567 | ||
1568 | /* We only support 2 level event channels for now */ | |
1569 | if (uxe->priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL) | |
1570 | return -EINVAL; | |
1571 | ||
1572 | e.port = uxe->port; | |
1573 | e.vcpu_id = uxe->vcpu; | |
1574 | e.vcpu_idx = -1; | |
1575 | e.priority = uxe->priority; | |
1576 | ||
1577 | ret = kvm_xen_set_evtchn(&e, kvm); | |
1578 | ||
1579 | /* | |
1580 | * None of that 'return 1 if it actually got delivered' nonsense. | |
1581 | * We don't care if it was masked (-ENOTCONN) either. | |
1582 | */ | |
1583 | if (ret > 0 || ret == -ENOTCONN) | |
1584 | ret = 0; | |
1585 | ||
1586 | return ret; | |
1587 | } | |
1588 | ||
2fd6df2f JM |
1589 | /* |
1590 | * Support for *outbound* event channel events via the EVTCHNOP_send hypercall. | |
1591 | */ | |
1592 | struct evtchnfd { | |
1593 | u32 send_port; | |
1594 | u32 type; | |
1595 | union { | |
1596 | struct kvm_xen_evtchn port; | |
1597 | struct { | |
1598 | u32 port; /* zero */ | |
1599 | struct eventfd_ctx *ctx; | |
1600 | } eventfd; | |
1601 | } deliver; | |
1602 | }; | |
1603 | ||
1604 | /* | |
1605 | * Update target vCPU or priority for a registered sending channel. | |
1606 | */ | |
1607 | static int kvm_xen_eventfd_update(struct kvm *kvm, | |
1608 | struct kvm_xen_hvm_attr *data) | |
1609 | { | |
1610 | u32 port = data->u.evtchn.send_port; | |
1611 | struct evtchnfd *evtchnfd; | |
1612 | ||
1613 | if (!port || port >= max_evtchn_port(kvm)) | |
1614 | return -EINVAL; | |
1615 | ||
1616 | mutex_lock(&kvm->lock); | |
1617 | evtchnfd = idr_find(&kvm->arch.xen.evtchn_ports, port); | |
1618 | mutex_unlock(&kvm->lock); | |
1619 | ||
1620 | if (!evtchnfd) | |
1621 | return -ENOENT; | |
1622 | ||
1623 | /* For an UPDATE, nothing may change except the priority/vcpu */ | |
1624 | if (evtchnfd->type != data->u.evtchn.type) | |
1625 | return -EINVAL; | |
1626 | ||
1627 | /* | |
1628 | * Port cannot change, and if it's zero that was an eventfd | |
1629 | * which can't be changed either. | |
1630 | */ | |
1631 | if (!evtchnfd->deliver.port.port || | |
1632 | evtchnfd->deliver.port.port != data->u.evtchn.deliver.port.port) | |
1633 | return -EINVAL; | |
1634 | ||
1635 | /* We only support 2 level event channels for now */ | |
1636 | if (data->u.evtchn.deliver.port.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL) | |
1637 | return -EINVAL; | |
1638 | ||
1639 | mutex_lock(&kvm->lock); | |
1640 | evtchnfd->deliver.port.priority = data->u.evtchn.deliver.port.priority; | |
1641 | if (evtchnfd->deliver.port.vcpu_id != data->u.evtchn.deliver.port.vcpu) { | |
1642 | evtchnfd->deliver.port.vcpu_id = data->u.evtchn.deliver.port.vcpu; | |
1643 | evtchnfd->deliver.port.vcpu_idx = -1; | |
1644 | } | |
1645 | mutex_unlock(&kvm->lock); | |
1646 | return 0; | |
1647 | } | |
1648 | ||
1649 | /* | |
1650 | * Configure the target (eventfd or local port delivery) for sending on | |
1651 | * a given event channel. | |
1652 | */ | |
1653 | static int kvm_xen_eventfd_assign(struct kvm *kvm, | |
1654 | struct kvm_xen_hvm_attr *data) | |
1655 | { | |
1656 | u32 port = data->u.evtchn.send_port; | |
1657 | struct eventfd_ctx *eventfd = NULL; | |
1658 | struct evtchnfd *evtchnfd = NULL; | |
1659 | int ret = -EINVAL; | |
1660 | ||
1661 | if (!port || port >= max_evtchn_port(kvm)) | |
1662 | return -EINVAL; | |
1663 | ||
1664 | evtchnfd = kzalloc(sizeof(struct evtchnfd), GFP_KERNEL); | |
1665 | if (!evtchnfd) | |
1666 | return -ENOMEM; | |
1667 | ||
1668 | switch(data->u.evtchn.type) { | |
1669 | case EVTCHNSTAT_ipi: | |
1670 | /* IPI must map back to the same port# */ | |
1671 | if (data->u.evtchn.deliver.port.port != data->u.evtchn.send_port) | |
1672 | goto out; /* -EINVAL */ | |
1673 | break; | |
1674 | ||
1675 | case EVTCHNSTAT_interdomain: | |
1676 | if (data->u.evtchn.deliver.port.port) { | |
1677 | if (data->u.evtchn.deliver.port.port >= max_evtchn_port(kvm)) | |
1678 | goto out; /* -EINVAL */ | |
1679 | } else { | |
1680 | eventfd = eventfd_ctx_fdget(data->u.evtchn.deliver.eventfd.fd); | |
1681 | if (IS_ERR(eventfd)) { | |
1682 | ret = PTR_ERR(eventfd); | |
1683 | goto out; | |
1684 | } | |
1685 | } | |
1686 | break; | |
1687 | ||
1688 | case EVTCHNSTAT_virq: | |
1689 | case EVTCHNSTAT_closed: | |
1690 | case EVTCHNSTAT_unbound: | |
1691 | case EVTCHNSTAT_pirq: | |
1692 | default: /* Unknown event channel type */ | |
1693 | goto out; /* -EINVAL */ | |
1694 | } | |
1695 | ||
1696 | evtchnfd->send_port = data->u.evtchn.send_port; | |
1697 | evtchnfd->type = data->u.evtchn.type; | |
1698 | if (eventfd) { | |
1699 | evtchnfd->deliver.eventfd.ctx = eventfd; | |
1700 | } else { | |
1701 | /* We only support 2 level event channels for now */ | |
1702 | if (data->u.evtchn.deliver.port.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL) | |
1703 | goto out; /* -EINVAL; */ | |
1704 | ||
1705 | evtchnfd->deliver.port.port = data->u.evtchn.deliver.port.port; | |
1706 | evtchnfd->deliver.port.vcpu_id = data->u.evtchn.deliver.port.vcpu; | |
1707 | evtchnfd->deliver.port.vcpu_idx = -1; | |
1708 | evtchnfd->deliver.port.priority = data->u.evtchn.deliver.port.priority; | |
1709 | } | |
1710 | ||
1711 | mutex_lock(&kvm->lock); | |
1712 | ret = idr_alloc(&kvm->arch.xen.evtchn_ports, evtchnfd, port, port + 1, | |
1713 | GFP_KERNEL); | |
1714 | mutex_unlock(&kvm->lock); | |
1715 | if (ret >= 0) | |
1716 | return 0; | |
1717 | ||
1718 | if (ret == -ENOSPC) | |
1719 | ret = -EEXIST; | |
1720 | out: | |
1721 | if (eventfd) | |
1722 | eventfd_ctx_put(eventfd); | |
1723 | kfree(evtchnfd); | |
1724 | return ret; | |
1725 | } | |
1726 | ||
1727 | static int kvm_xen_eventfd_deassign(struct kvm *kvm, u32 port) | |
1728 | { | |
1729 | struct evtchnfd *evtchnfd; | |
1730 | ||
1731 | mutex_lock(&kvm->lock); | |
1732 | evtchnfd = idr_remove(&kvm->arch.xen.evtchn_ports, port); | |
1733 | mutex_unlock(&kvm->lock); | |
1734 | ||
1735 | if (!evtchnfd) | |
1736 | return -ENOENT; | |
1737 | ||
1738 | if (kvm) | |
1739 | synchronize_srcu(&kvm->srcu); | |
1740 | if (!evtchnfd->deliver.port.port) | |
1741 | eventfd_ctx_put(evtchnfd->deliver.eventfd.ctx); | |
1742 | kfree(evtchnfd); | |
1743 | return 0; | |
1744 | } | |
1745 | ||
1746 | static int kvm_xen_eventfd_reset(struct kvm *kvm) | |
1747 | { | |
1748 | struct evtchnfd *evtchnfd; | |
1749 | int i; | |
1750 | ||
1751 | mutex_lock(&kvm->lock); | |
1752 | idr_for_each_entry(&kvm->arch.xen.evtchn_ports, evtchnfd, i) { | |
1753 | idr_remove(&kvm->arch.xen.evtchn_ports, evtchnfd->send_port); | |
1754 | synchronize_srcu(&kvm->srcu); | |
1755 | if (!evtchnfd->deliver.port.port) | |
1756 | eventfd_ctx_put(evtchnfd->deliver.eventfd.ctx); | |
1757 | kfree(evtchnfd); | |
1758 | } | |
1759 | mutex_unlock(&kvm->lock); | |
1760 | ||
1761 | return 0; | |
1762 | } | |
1763 | ||
1764 | static int kvm_xen_setattr_evtchn(struct kvm *kvm, struct kvm_xen_hvm_attr *data) | |
1765 | { | |
1766 | u32 port = data->u.evtchn.send_port; | |
1767 | ||
1768 | if (data->u.evtchn.flags == KVM_XEN_EVTCHN_RESET) | |
1769 | return kvm_xen_eventfd_reset(kvm); | |
1770 | ||
1771 | if (!port || port >= max_evtchn_port(kvm)) | |
1772 | return -EINVAL; | |
1773 | ||
1774 | if (data->u.evtchn.flags == KVM_XEN_EVTCHN_DEASSIGN) | |
1775 | return kvm_xen_eventfd_deassign(kvm, port); | |
1776 | if (data->u.evtchn.flags == KVM_XEN_EVTCHN_UPDATE) | |
1777 | return kvm_xen_eventfd_update(kvm, data); | |
1778 | if (data->u.evtchn.flags) | |
1779 | return -EINVAL; | |
1780 | ||
1781 | return kvm_xen_eventfd_assign(kvm, data); | |
1782 | } | |
1783 | ||
1784 | static bool kvm_xen_hcall_evtchn_send(struct kvm_vcpu *vcpu, u64 param, u64 *r) | |
1785 | { | |
1786 | struct evtchnfd *evtchnfd; | |
1787 | struct evtchn_send send; | |
1788 | gpa_t gpa; | |
1789 | int idx; | |
1790 | ||
1791 | idx = srcu_read_lock(&vcpu->kvm->srcu); | |
1792 | gpa = kvm_mmu_gva_to_gpa_system(vcpu, param, NULL); | |
1793 | srcu_read_unlock(&vcpu->kvm->srcu, idx); | |
1794 | ||
1795 | if (!gpa || kvm_vcpu_read_guest(vcpu, gpa, &send, sizeof(send))) { | |
1796 | *r = -EFAULT; | |
1797 | return true; | |
1798 | } | |
1799 | ||
1800 | /* The evtchn_ports idr is protected by vcpu->kvm->srcu */ | |
1801 | evtchnfd = idr_find(&vcpu->kvm->arch.xen.evtchn_ports, send.port); | |
1802 | if (!evtchnfd) | |
1803 | return false; | |
1804 | ||
1805 | if (evtchnfd->deliver.port.port) { | |
1806 | int ret = kvm_xen_set_evtchn(&evtchnfd->deliver.port, vcpu->kvm); | |
1807 | if (ret < 0 && ret != -ENOTCONN) | |
1808 | return false; | |
1809 | } else { | |
1810 | eventfd_signal(evtchnfd->deliver.eventfd.ctx, 1); | |
1811 | } | |
1812 | ||
1813 | *r = 0; | |
1814 | return true; | |
1815 | } | |
1816 | ||
942c2490 DW |
1817 | void kvm_xen_init_vcpu(struct kvm_vcpu *vcpu) |
1818 | { | |
1819 | vcpu->arch.xen.vcpu_id = vcpu->vcpu_idx; | |
1a65105a BO |
1820 | vcpu->arch.xen.poll_evtchn = 0; |
1821 | timer_setup(&vcpu->arch.xen.poll_timer, cancel_evtchn_poll, 0); | |
942c2490 DW |
1822 | } |
1823 | ||
a795cd43 DW |
1824 | void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu) |
1825 | { | |
53639526 JM |
1826 | if (kvm_xen_timer_enabled(vcpu)) |
1827 | kvm_xen_stop_timer(vcpu); | |
1828 | ||
a795cd43 DW |
1829 | kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, |
1830 | &vcpu->arch.xen.runstate_cache); | |
7caf9571 DW |
1831 | kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, |
1832 | &vcpu->arch.xen.vcpu_info_cache); | |
69d413cf DW |
1833 | kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, |
1834 | &vcpu->arch.xen.vcpu_time_info_cache); | |
1a65105a | 1835 | del_timer_sync(&vcpu->arch.xen.poll_timer); |
a795cd43 | 1836 | } |
2fd6df2f JM |
1837 | |
1838 | void kvm_xen_init_vm(struct kvm *kvm) | |
1839 | { | |
1840 | idr_init(&kvm->arch.xen.evtchn_ports); | |
1841 | } | |
1842 | ||
1843 | void kvm_xen_destroy_vm(struct kvm *kvm) | |
1844 | { | |
1845 | struct evtchnfd *evtchnfd; | |
1846 | int i; | |
1847 | ||
1848 | kvm_gfn_to_pfn_cache_destroy(kvm, &kvm->arch.xen.shinfo_cache); | |
1849 | ||
1850 | idr_for_each_entry(&kvm->arch.xen.evtchn_ports, evtchnfd, i) { | |
1851 | if (!evtchnfd->deliver.port.port) | |
1852 | eventfd_ctx_put(evtchnfd->deliver.eventfd.ctx); | |
1853 | kfree(evtchnfd); | |
1854 | } | |
1855 | idr_destroy(&kvm->arch.xen.evtchn_ports); | |
1856 | ||
1857 | if (kvm->arch.xen_hvm_config.msr) | |
1858 | static_branch_slow_dec_deferred(&kvm_xen_enabled); | |
1859 | } |