KVM: Remove kvm_make_all_cpus_request_except()
virt/kvm/kvm_main.c

// SPDX-License-Identifier: GPL-2.0-only
/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Avi Kivity   <avi@qumranet.com>
 *   Yaniv Kamay  <yaniv@qumranet.com>
 */

#include <kvm/iodev.h>

#include <linux/kvm_host.h>
#include <linux/kvm.h>
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/percpu.h>
#include <linux/mm.h>
#include <linux/miscdevice.h>
#include <linux/vmalloc.h>
#include <linux/reboot.h>
#include <linux/debugfs.h>
#include <linux/highmem.h>
#include <linux/file.h>
#include <linux/syscore_ops.h>
#include <linux/cpu.h>
#include <linux/sched/signal.h>
#include <linux/sched/mm.h>
#include <linux/sched/stat.h>
#include <linux/cpumask.h>
#include <linux/smp.h>
#include <linux/anon_inodes.h>
#include <linux/profile.h>
#include <linux/kvm_para.h>
#include <linux/pagemap.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/bitops.h>
#include <linux/spinlock.h>
#include <linux/compat.h>
#include <linux/srcu.h>
#include <linux/hugetlb.h>
#include <linux/slab.h>
#include <linux/sort.h>
#include <linux/bsearch.h>
#include <linux/io.h>
#include <linux/lockdep.h>
#include <linux/kthread.h>
#include <linux/suspend.h>

#include <asm/processor.h>
#include <asm/ioctl.h>
#include <linux/uaccess.h>

#include "coalesced_mmio.h"
#include "async_pf.h"
#include "kvm_mm.h"
#include "vfio.h"

#include <trace/events/ipi.h>

#define CREATE_TRACE_POINTS
#include <trace/events/kvm.h>

#include <linux/kvm_dirty_ring.h>


/* Worst case buffer size needed for holding an integer. */
#define ITOA_MAX_LEN 12

MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");

/* Architectures should define their poll value according to the halt latency */
unsigned int halt_poll_ns = KVM_HALT_POLL_NS_DEFAULT;
module_param(halt_poll_ns, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns);

/* Default doubles per-vcpu halt_poll_ns. */
unsigned int halt_poll_ns_grow = 2;
module_param(halt_poll_ns_grow, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns_grow);

/* The start value to grow halt_poll_ns from */
unsigned int halt_poll_ns_grow_start = 10000; /* 10us */
module_param(halt_poll_ns_grow_start, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns_grow_start);

/* Default resets per-vcpu halt_poll_ns. */
unsigned int halt_poll_ns_shrink;
module_param(halt_poll_ns_shrink, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns_shrink);

/*
 * Ordering of locks:
 *
 *	kvm->lock --> kvm->slots_lock --> kvm->irq_lock
 */

DEFINE_MUTEX(kvm_lock);
LIST_HEAD(vm_list);

static struct kmem_cache *kvm_vcpu_cache;

static __read_mostly struct preempt_ops kvm_preempt_ops;
static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_running_vcpu);

struct dentry *kvm_debugfs_dir;
EXPORT_SYMBOL_GPL(kvm_debugfs_dir);

static const struct file_operations stat_fops_per_vm;

static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
			   unsigned long arg);
#ifdef CONFIG_KVM_COMPAT
static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl,
				  unsigned long arg);
#define KVM_COMPAT(c)	.compat_ioctl	= (c)
#else
/*
 * For architectures that don't implement a compat infrastructure,
 * adopt a double line of defense:
 * - Prevent a compat task from opening /dev/kvm
 * - If the open has been done by a 64bit task, and the KVM fd
 *   passed to a compat task, let the ioctls fail.
 */
static long kvm_no_compat_ioctl(struct file *file, unsigned int ioctl,
				unsigned long arg) { return -EINVAL; }

static int kvm_no_compat_open(struct inode *inode, struct file *file)
{
	return is_compat_task() ? -ENODEV : 0;
}
#define KVM_COMPAT(c)	.compat_ioctl	= kvm_no_compat_ioctl,	\
			.open		= kvm_no_compat_open
#endif
static int hardware_enable_all(void);
static void hardware_disable_all(void);

static void kvm_io_bus_destroy(struct kvm_io_bus *bus);

#define KVM_EVENT_CREATE_VM 0
#define KVM_EVENT_DESTROY_VM 1
static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm);
static unsigned long long kvm_createvm_count;
static unsigned long long kvm_active_vms;

static DEFINE_PER_CPU(cpumask_var_t, cpu_kick_mask);

__weak void kvm_arch_guest_memory_reclaimed(struct kvm *kvm)
{
}

bool kvm_is_zone_device_page(struct page *page)
{
	/*
	 * The metadata used by is_zone_device_page() to determine whether or
	 * not a page is ZONE_DEVICE is guaranteed to be valid if and only if
	 * the device has been pinned, e.g. by get_user_pages().  WARN if the
	 * page_count() is zero to help detect bad usage of this helper.
	 */
	if (WARN_ON_ONCE(!page_count(page)))
		return false;

	return is_zone_device_page(page);
}

/*
 * Returns a 'struct page' if the pfn is "valid" and backed by a refcounted
 * page, NULL otherwise.  Note, the list of refcounted PG_reserved page types
 * is likely incomplete; it has been compiled purely through people wanting to
 * back guests with a certain type of memory and encountering issues.
 */
struct page *kvm_pfn_to_refcounted_page(kvm_pfn_t pfn)
{
	struct page *page;

	if (!pfn_valid(pfn))
		return NULL;

	page = pfn_to_page(pfn);
	if (!PageReserved(page))
		return page;

	/* The ZERO_PAGE(s) is marked PG_reserved, but is refcounted. */
	if (is_zero_pfn(pfn))
		return page;

	/*
	 * ZONE_DEVICE pages currently set PG_reserved, but from a refcounting
	 * perspective they are "normal" pages, albeit with slightly different
	 * usage rules.
	 */
	if (kvm_is_zone_device_page(page))
		return page;

	return NULL;
}

/*
 * Switches to specified vcpu, until a matching vcpu_put()
 */
void vcpu_load(struct kvm_vcpu *vcpu)
{
	int cpu = get_cpu();

	__this_cpu_write(kvm_running_vcpu, vcpu);
	preempt_notifier_register(&vcpu->preempt_notifier);
	kvm_arch_vcpu_load(vcpu, cpu);
	put_cpu();
}
EXPORT_SYMBOL_GPL(vcpu_load);

void vcpu_put(struct kvm_vcpu *vcpu)
{
	preempt_disable();
	kvm_arch_vcpu_put(vcpu);
	preempt_notifier_unregister(&vcpu->preempt_notifier);
	__this_cpu_write(kvm_running_vcpu, NULL);
	preempt_enable();
}
EXPORT_SYMBOL_GPL(vcpu_put);

/* TODO: merge with kvm_arch_vcpu_should_kick */
static bool kvm_request_needs_ipi(struct kvm_vcpu *vcpu, unsigned req)
{
	int mode = kvm_vcpu_exiting_guest_mode(vcpu);

	/*
	 * We need to wait for the VCPU to reenable interrupts and get out of
	 * READING_SHADOW_PAGE_TABLES mode.
	 */
	if (req & KVM_REQUEST_WAIT)
		return mode != OUTSIDE_GUEST_MODE;

	/*
	 * Need to kick a running VCPU, but otherwise there is nothing to do.
	 */
	return mode == IN_GUEST_MODE;
}

static void ack_kick(void *_completed)
{
}

static inline bool kvm_kick_many_cpus(struct cpumask *cpus, bool wait)
{
	if (cpumask_empty(cpus))
		return false;

	smp_call_function_many(cpus, ack_kick, NULL, wait);
	return true;
}

static void kvm_make_vcpu_request(struct kvm_vcpu *vcpu, unsigned int req,
				  struct cpumask *tmp, int current_cpu)
{
	int cpu;

	if (likely(!(req & KVM_REQUEST_NO_ACTION)))
		__kvm_make_request(req, vcpu);

	if (!(req & KVM_REQUEST_NO_WAKEUP) && kvm_vcpu_wake_up(vcpu))
		return;

	/*
	 * Note, the vCPU could get migrated to a different pCPU at any point
	 * after kvm_request_needs_ipi(), which could result in sending an IPI
	 * to the previous pCPU.  But, that's OK because the purpose of the IPI
	 * is to ensure the vCPU returns to OUTSIDE_GUEST_MODE, which is
	 * satisfied if the vCPU migrates. Entering READING_SHADOW_PAGE_TABLES
	 * after this point is also OK, as the requirement is only that KVM wait
	 * for vCPUs that were reading SPTEs _before_ any changes were
	 * finalized. See kvm_vcpu_kick() for more details on handling requests.
	 */
	if (kvm_request_needs_ipi(vcpu, req)) {
		cpu = READ_ONCE(vcpu->cpu);
		if (cpu != -1 && cpu != current_cpu)
			__cpumask_set_cpu(cpu, tmp);
	}
}

bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req,
				 unsigned long *vcpu_bitmap)
{
	struct kvm_vcpu *vcpu;
	struct cpumask *cpus;
	int i, me;
	bool called;

	me = get_cpu();

	cpus = this_cpu_cpumask_var_ptr(cpu_kick_mask);
	cpumask_clear(cpus);

	for_each_set_bit(i, vcpu_bitmap, KVM_MAX_VCPUS) {
		vcpu = kvm_get_vcpu(kvm, i);
		if (!vcpu)
			continue;
		kvm_make_vcpu_request(vcpu, req, cpus, me);
	}

	called = kvm_kick_many_cpus(cpus, !!(req & KVM_REQUEST_WAIT));
	put_cpu();

	return called;
}

bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req)
{
	struct kvm_vcpu *vcpu;
	struct cpumask *cpus;
	unsigned long i;
	bool called;
	int me;

	me = get_cpu();

	cpus = this_cpu_cpumask_var_ptr(cpu_kick_mask);
	cpumask_clear(cpus);

	kvm_for_each_vcpu(i, vcpu, kvm)
		kvm_make_vcpu_request(vcpu, req, cpus, me);

	called = kvm_kick_many_cpus(cpus, !!(req & KVM_REQUEST_WAIT));
	put_cpu();

	return called;
}
EXPORT_SYMBOL_GPL(kvm_make_all_cpus_request);

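/*
 * Usage sketch (illustrative note, not part of the upstream file): requests
 * raised via kvm_make_all_cpus_request() are consumed in the arch vCPU run
 * loop, typically along the lines of:
 *
 *	if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
 *		flush_guest_tlb(vcpu);		// hypothetical arch handler
 *
 * kvm_check_request() clears the pending bit, so the kick/IPI above only has
 * to force the target vCPU out of guest mode; the actual work runs later in
 * the vCPU's own context.
 */
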
void kvm_flush_remote_tlbs(struct kvm *kvm)
{
	++kvm->stat.generic.remote_tlb_flush_requests;

	/*
	 * We want to publish modifications to the page tables before reading
	 * mode. Pairs with a memory barrier in arch-specific code.
	 * - x86: smp_mb__after_srcu_read_unlock in vcpu_enter_guest
	 *   and smp_mb in walk_shadow_page_lockless_begin/end.
	 * - powerpc: smp_mb in kvmppc_prepare_to_enter.
	 *
	 * There is already an smp_mb__after_atomic() before
	 * kvm_make_all_cpus_request() reads vcpu->mode. We reuse that
	 * barrier here.
	 */
	if (!kvm_arch_flush_remote_tlbs(kvm)
	    || kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
		++kvm->stat.generic.remote_tlb_flush;
}
EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs);

void kvm_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, u64 nr_pages)
{
	if (!kvm_arch_flush_remote_tlbs_range(kvm, gfn, nr_pages))
		return;

	/*
	 * Fall back to flushing the entire TLB if the architecture's
	 * range-based TLB invalidation is unsupported or can't be performed
	 * for whatever reason.
	 */
	kvm_flush_remote_tlbs(kvm);
}

void kvm_flush_remote_tlbs_memslot(struct kvm *kvm,
				   const struct kvm_memory_slot *memslot)
{
	/*
	 * All current use cases for flushing the TLBs for a specific memslot
	 * are related to dirty logging, and many do the TLB flush out of
	 * mmu_lock. The interaction between the various operations on a
	 * memslot must be serialized by slots_lock to ensure the TLB flush
	 * from one operation is observed by any other operation on the same
	 * memslot.
	 */
	lockdep_assert_held(&kvm->slots_lock);
	kvm_flush_remote_tlbs_range(kvm, memslot->base_gfn, memslot->npages);
}

static void kvm_flush_shadow_all(struct kvm *kvm)
{
	kvm_arch_flush_shadow_all(kvm);
	kvm_arch_guest_memory_reclaimed(kvm);
}

#ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE
static inline void *mmu_memory_cache_alloc_obj(struct kvm_mmu_memory_cache *mc,
					       gfp_t gfp_flags)
{
	gfp_flags |= mc->gfp_zero;

	if (mc->kmem_cache)
		return kmem_cache_alloc(mc->kmem_cache, gfp_flags);
	else
		return (void *)__get_free_page(gfp_flags);
}

int __kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int capacity, int min)
{
	gfp_t gfp = mc->gfp_custom ? mc->gfp_custom : GFP_KERNEL_ACCOUNT;
	void *obj;

	if (mc->nobjs >= min)
		return 0;

	if (unlikely(!mc->objects)) {
		if (WARN_ON_ONCE(!capacity))
			return -EIO;

		mc->objects = kvmalloc_array(capacity, sizeof(void *), gfp);
		if (!mc->objects)
			return -ENOMEM;

		mc->capacity = capacity;
	}

	/* It is illegal to request a different capacity across topups. */
	if (WARN_ON_ONCE(mc->capacity != capacity))
		return -EIO;

	while (mc->nobjs < mc->capacity) {
		obj = mmu_memory_cache_alloc_obj(mc, gfp);
		if (!obj)
			return mc->nobjs >= min ? 0 : -ENOMEM;
		mc->objects[mc->nobjs++] = obj;
	}
	return 0;
}

int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min)
{
	return __kvm_mmu_topup_memory_cache(mc, KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE, min);
}

int kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache *mc)
{
	return mc->nobjs;
}

void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
{
	while (mc->nobjs) {
		if (mc->kmem_cache)
			kmem_cache_free(mc->kmem_cache, mc->objects[--mc->nobjs]);
		else
			free_page((unsigned long)mc->objects[--mc->nobjs]);
	}

	kvfree(mc->objects);

	mc->objects = NULL;
	mc->capacity = 0;
}

void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
{
	void *p;

	if (WARN_ON(!mc->nobjs))
		p = mmu_memory_cache_alloc_obj(mc, GFP_ATOMIC | __GFP_ACCOUNT);
	else
		p = mc->objects[--mc->nobjs];
	BUG_ON(!p);
	return p;
}
#endif

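/*
 * Usage sketch (illustrative note, not part of the upstream file): arch MMU
 * code typically tops up a cache while sleeping is still allowed, then
 * allocates from it in the non-sleepable fault path, e.g.:
 *
 *	r = kvm_mmu_topup_memory_cache(&cache, min);	// may sleep
 *	if (r)
 *		return r;
 *	...						// later, under mmu_lock
 *	obj = kvm_mmu_memory_cache_alloc(&cache);	// won't fail after topup
 *
 * The GFP_ATOMIC path in kvm_mmu_memory_cache_alloc() is only a WARN-guarded
 * last resort for callers that failed to top up beforehand.
 */
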
static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
{
	mutex_init(&vcpu->mutex);
	vcpu->cpu = -1;
	vcpu->kvm = kvm;
	vcpu->vcpu_id = id;
	vcpu->pid = NULL;
#ifndef __KVM_HAVE_ARCH_WQP
	rcuwait_init(&vcpu->wait);
#endif
	kvm_async_pf_vcpu_init(vcpu);

	kvm_vcpu_set_in_spin_loop(vcpu, false);
	kvm_vcpu_set_dy_eligible(vcpu, false);
	vcpu->preempted = false;
	vcpu->ready = false;
	preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
	vcpu->last_used_slot = NULL;

	/* Fill the stats id string for the vcpu */
	snprintf(vcpu->stats_id, sizeof(vcpu->stats_id), "kvm-%d/vcpu-%d",
		 task_pid_nr(current), id);
}

static void kvm_vcpu_destroy(struct kvm_vcpu *vcpu)
{
	kvm_arch_vcpu_destroy(vcpu);
	kvm_dirty_ring_free(&vcpu->dirty_ring);

	/*
	 * No need for rcu_read_lock as VCPU_RUN is the only place that changes
	 * the vcpu->pid pointer, and at destruction time all file descriptors
	 * are already gone.
	 */
	put_pid(rcu_dereference_protected(vcpu->pid, 1));

	free_page((unsigned long)vcpu->run);
	kmem_cache_free(kvm_vcpu_cache, vcpu);
}

void kvm_destroy_vcpus(struct kvm *kvm)
{
	unsigned long i;
	struct kvm_vcpu *vcpu;

	kvm_for_each_vcpu(i, vcpu, kvm) {
		kvm_vcpu_destroy(vcpu);
		xa_erase(&kvm->vcpu_array, i);
	}

	atomic_set(&kvm->online_vcpus, 0);
}
EXPORT_SYMBOL_GPL(kvm_destroy_vcpus);

#ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER
static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
{
	return container_of(mn, struct kvm, mmu_notifier);
}

typedef bool (*gfn_handler_t)(struct kvm *kvm, struct kvm_gfn_range *range);

typedef void (*on_lock_fn_t)(struct kvm *kvm);

struct kvm_mmu_notifier_range {
	/*
	 * 64-bit addresses, as KVM notifiers can operate on host virtual
	 * addresses (unsigned long) and guest physical addresses (64-bit).
	 */
	u64 start;
	u64 end;
	union kvm_mmu_notifier_arg arg;
	gfn_handler_t handler;
	on_lock_fn_t on_lock;
	bool flush_on_ret;
	bool may_block;
};

/*
 * The inner-most helper returns a tuple containing the return value from the
 * arch- and action-specific handler, plus a flag indicating whether or not at
 * least one memslot was found, i.e. if the handler found guest memory.
 *
 * Note, most notifiers are averse to booleans, so even though KVM tracks the
 * return from arch code as a bool, outer helpers will cast it to an int. :-(
 */
typedef struct kvm_mmu_notifier_return {
	bool ret;
	bool found_memslot;
} kvm_mn_ret_t;

/*
 * Use a dedicated stub instead of NULL to indicate that there is no callback
 * function/handler.  The compiler technically can't guarantee that a real
 * function will have a non-zero address, and so it will generate code to
 * check for !NULL, whereas comparing against a stub will be elided at compile
 * time (unless the compiler is getting long in the tooth, e.g. gcc 4.9).
 */
static void kvm_null_fn(void)
{

}
#define IS_KVM_NULL_FN(fn) ((fn) == (void *)kvm_null_fn)

static const union kvm_mmu_notifier_arg KVM_MMU_NOTIFIER_NO_ARG;

/* Iterate over each memslot intersecting [start, last] (inclusive) range */
#define kvm_for_each_memslot_in_hva_range(node, slots, start, last)	     \
	for (node = interval_tree_iter_first(&slots->hva_tree, start, last); \
	     node;							     \
	     node = interval_tree_iter_next(node, start, last))	     \

static __always_inline kvm_mn_ret_t __kvm_handle_hva_range(struct kvm *kvm,
							   const struct kvm_mmu_notifier_range *range)
{
	struct kvm_mmu_notifier_return r = {
		.ret = false,
		.found_memslot = false,
	};
	struct kvm_gfn_range gfn_range;
	struct kvm_memory_slot *slot;
	struct kvm_memslots *slots;
	int i, idx;

	if (WARN_ON_ONCE(range->end <= range->start))
		return r;

	/* A null handler is allowed if and only if on_lock() is provided. */
	if (WARN_ON_ONCE(IS_KVM_NULL_FN(range->on_lock) &&
			 IS_KVM_NULL_FN(range->handler)))
		return r;

	idx = srcu_read_lock(&kvm->srcu);

	for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
		struct interval_tree_node *node;

		slots = __kvm_memslots(kvm, i);
		kvm_for_each_memslot_in_hva_range(node, slots,
						  range->start, range->end - 1) {
			unsigned long hva_start, hva_end;

			slot = container_of(node, struct kvm_memory_slot, hva_node[slots->node_idx]);
			hva_start = max_t(unsigned long, range->start, slot->userspace_addr);
			hva_end = min_t(unsigned long, range->end,
					slot->userspace_addr + (slot->npages << PAGE_SHIFT));

			/*
			 * To optimize for the likely case where the address
			 * range is covered by zero or one memslots, don't
			 * bother making these conditional (to avoid writes on
			 * the second or later invocation of the handler).
			 */
			gfn_range.arg = range->arg;
			gfn_range.may_block = range->may_block;

			/*
			 * {gfn(page) | page intersects with [hva_start, hva_end)} =
			 * {gfn_start, gfn_start+1, ..., gfn_end-1}.
			 */
			gfn_range.start = hva_to_gfn_memslot(hva_start, slot);
			gfn_range.end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, slot);
			gfn_range.slot = slot;

			if (!r.found_memslot) {
				r.found_memslot = true;
				KVM_MMU_LOCK(kvm);
				if (!IS_KVM_NULL_FN(range->on_lock))
					range->on_lock(kvm);

				if (IS_KVM_NULL_FN(range->handler))
					break;
			}
			r.ret |= range->handler(kvm, &gfn_range);
		}
	}

	if (range->flush_on_ret && r.ret)
		kvm_flush_remote_tlbs(kvm);

	if (r.found_memslot)
		KVM_MMU_UNLOCK(kvm);

	srcu_read_unlock(&kvm->srcu, idx);

	return r;
}

static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn,
						unsigned long start,
						unsigned long end,
						union kvm_mmu_notifier_arg arg,
						gfn_handler_t handler)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	const struct kvm_mmu_notifier_range range = {
		.start		= start,
		.end		= end,
		.arg		= arg,
		.handler	= handler,
		.on_lock	= (void *)kvm_null_fn,
		.flush_on_ret	= true,
		.may_block	= false,
	};

	return __kvm_handle_hva_range(kvm, &range).ret;
}

static __always_inline int kvm_handle_hva_range_no_flush(struct mmu_notifier *mn,
							 unsigned long start,
							 unsigned long end,
							 gfn_handler_t handler)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	const struct kvm_mmu_notifier_range range = {
		.start		= start,
		.end		= end,
		.handler	= handler,
		.on_lock	= (void *)kvm_null_fn,
		.flush_on_ret	= false,
		.may_block	= false,
	};

	return __kvm_handle_hva_range(kvm, &range).ret;
}

static bool kvm_change_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	/*
	 * Skipping invalid memslots is correct if and only if change_pte() is
	 * surrounded by invalidate_range_{start,end}(), which is currently
	 * guaranteed by the primary MMU.  If that ever changes, KVM needs to
	 * unmap the memslot instead of skipping the memslot to ensure that KVM
	 * doesn't hold references to the old PFN.
	 */
	WARN_ON_ONCE(!READ_ONCE(kvm->mn_active_invalidate_count));

	if (range->slot->flags & KVM_MEMSLOT_INVALID)
		return false;

	return kvm_set_spte_gfn(kvm, range);
}

static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
					struct mm_struct *mm,
					unsigned long address,
					pte_t pte)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	const union kvm_mmu_notifier_arg arg = { .pte = pte };

	trace_kvm_set_spte_hva(address);

	/*
	 * .change_pte() must be surrounded by .invalidate_range_{start,end}().
	 * If mmu_invalidate_in_progress is zero, then no in-progress
	 * invalidations, including this one, found a relevant memslot at
	 * start(); rechecking memslots here is unnecessary.  Note, a false
	 * positive (count elevated by a different invalidation) is sub-optimal
	 * but functionally ok.
	 */
	WARN_ON_ONCE(!READ_ONCE(kvm->mn_active_invalidate_count));
	if (!READ_ONCE(kvm->mmu_invalidate_in_progress))
		return;

	kvm_handle_hva_range(mn, address, address + 1, arg, kvm_change_spte_gfn);
}

void kvm_mmu_invalidate_begin(struct kvm *kvm)
{
	lockdep_assert_held_write(&kvm->mmu_lock);
	/*
	 * The count increase must become visible at unlock time as no
	 * spte can be established without taking the mmu_lock and
	 * count is also read inside the mmu_lock critical section.
	 */
	kvm->mmu_invalidate_in_progress++;

	if (likely(kvm->mmu_invalidate_in_progress == 1)) {
		kvm->mmu_invalidate_range_start = INVALID_GPA;
		kvm->mmu_invalidate_range_end = INVALID_GPA;
	}
}

void kvm_mmu_invalidate_range_add(struct kvm *kvm, gfn_t start, gfn_t end)
{
	lockdep_assert_held_write(&kvm->mmu_lock);

	WARN_ON_ONCE(!kvm->mmu_invalidate_in_progress);

	if (likely(kvm->mmu_invalidate_range_start == INVALID_GPA)) {
		kvm->mmu_invalidate_range_start = start;
		kvm->mmu_invalidate_range_end = end;
	} else {
		/*
		 * Fully tracking multiple concurrent ranges has diminishing
		 * returns. Keep things simple and just find the minimal range
		 * which includes the current and new ranges. As there won't be
		 * enough information to subtract a range after its invalidate
		 * completes, any ranges invalidated concurrently will
		 * accumulate and persist until all outstanding invalidates
		 * complete.
		 */
		kvm->mmu_invalidate_range_start =
			min(kvm->mmu_invalidate_range_start, start);
		kvm->mmu_invalidate_range_end =
			max(kvm->mmu_invalidate_range_end, end);
	}
}

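/*
 * Usage sketch (illustrative note, not part of the upstream file): fault
 * handlers pair mmu_invalidate_seq with the invalidate begin()/range_add()/
 * end() helpers, roughly:
 *
 *	mmu_seq = kvm->mmu_invalidate_seq;
 *	smp_rmb();
 *	pfn = ...;			// resolved without mmu_lock held
 *	...				// take mmu_lock
 *	if (mmu_invalidate_retry(kvm, mmu_seq))
 *		goto retry;		// an invalidation raced with the fault
 *
 * i.e. the sequence count lets a fault handler detect that the pfn it
 * resolved may have been invalidated while mmu_lock was not held.
 */
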
bool kvm_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
	kvm_mmu_invalidate_range_add(kvm, range->start, range->end);
	return kvm_unmap_gfn_range(kvm, range);
}

static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
					const struct mmu_notifier_range *range)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	const struct kvm_mmu_notifier_range hva_range = {
		.start		= range->start,
		.end		= range->end,
		.handler	= kvm_mmu_unmap_gfn_range,
		.on_lock	= kvm_mmu_invalidate_begin,
		.flush_on_ret	= true,
		.may_block	= mmu_notifier_range_blockable(range),
	};

	trace_kvm_unmap_hva_range(range->start, range->end);

	/*
	 * Prevent memslot modification between range_start() and range_end()
	 * so that conditionally locking provides the same result in both
	 * functions.  Without that guarantee, the mmu_invalidate_in_progress
	 * adjustments will be imbalanced.
	 *
	 * Pairs with the decrement in range_end().
	 */
	spin_lock(&kvm->mn_invalidate_lock);
	kvm->mn_active_invalidate_count++;
	spin_unlock(&kvm->mn_invalidate_lock);

	/*
	 * Invalidate pfn caches _before_ invalidating the secondary MMUs, i.e.
	 * before acquiring mmu_lock, to avoid holding mmu_lock while acquiring
	 * each cache's lock.  There are relatively few caches in existence at
	 * any given time, and the caches themselves can check for hva overlap,
	 * i.e. don't need to rely on memslot overlap checks for performance.
	 * Because this runs without holding mmu_lock, the pfn caches must use
	 * mn_active_invalidate_count (see above) instead of
	 * mmu_invalidate_in_progress.
	 */
	gfn_to_pfn_cache_invalidate_start(kvm, range->start, range->end,
					  hva_range.may_block);

	/*
	 * If one or more memslots were found and thus zapped, notify arch code
	 * that guest memory has been reclaimed.  This needs to be done *after*
	 * dropping mmu_lock, as x86's reclaim path is slooooow.
	 */
	if (__kvm_handle_hva_range(kvm, &hva_range).found_memslot)
		kvm_arch_guest_memory_reclaimed(kvm);

	return 0;
}

void kvm_mmu_invalidate_end(struct kvm *kvm)
{
	lockdep_assert_held_write(&kvm->mmu_lock);

	/*
	 * This sequence increase will notify the kvm page fault that
	 * the page that is going to be mapped in the spte could have
	 * been freed.
	 */
	kvm->mmu_invalidate_seq++;
	smp_wmb();
	/*
	 * The above sequence increase must be visible before the
	 * below count decrease, which is ensured by the smp_wmb above
	 * in conjunction with the smp_rmb in mmu_invalidate_retry().
	 */
	kvm->mmu_invalidate_in_progress--;
	KVM_BUG_ON(kvm->mmu_invalidate_in_progress < 0, kvm);

	/*
	 * Assert that at least one range was added between start() and end().
	 * Not adding a range isn't fatal, but it is a KVM bug.
	 */
	WARN_ON_ONCE(kvm->mmu_invalidate_range_start == INVALID_GPA);
}

static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
					const struct mmu_notifier_range *range)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	const struct kvm_mmu_notifier_range hva_range = {
		.start		= range->start,
		.end		= range->end,
		.handler	= (void *)kvm_null_fn,
		.on_lock	= kvm_mmu_invalidate_end,
		.flush_on_ret	= false,
		.may_block	= mmu_notifier_range_blockable(range),
	};
	bool wake;

	__kvm_handle_hva_range(kvm, &hva_range);

	/* Pairs with the increment in range_start(). */
	spin_lock(&kvm->mn_invalidate_lock);
	if (!WARN_ON_ONCE(!kvm->mn_active_invalidate_count))
		--kvm->mn_active_invalidate_count;
	wake = !kvm->mn_active_invalidate_count;
	spin_unlock(&kvm->mn_invalidate_lock);

	/*
	 * There can only be one waiter, since the wait happens under
	 * slots_lock.
	 */
	if (wake)
		rcuwait_wake_up(&kvm->mn_memslots_update_rcuwait);
}

static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
					      struct mm_struct *mm,
					      unsigned long start,
					      unsigned long end)
{
	trace_kvm_age_hva(start, end);

	return kvm_handle_hva_range(mn, start, end, KVM_MMU_NOTIFIER_NO_ARG,
				    kvm_age_gfn);
}

static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
					struct mm_struct *mm,
					unsigned long start,
					unsigned long end)
{
	trace_kvm_age_hva(start, end);

	/*
	 * Even though we do not flush TLB, this will still adversely
	 * affect performance on pre-Haswell Intel EPT, where there is
	 * no EPT Access Bit to clear so that we have to tear down EPT
	 * tables instead. If we find this unacceptable, we can always
	 * add a parameter to kvm_age_hva so that it effectively doesn't
	 * do anything on clear_young.
	 *
	 * Also note that currently we never issue secondary TLB flushes
	 * from clear_young, leaving this job up to the regular system
	 * cadence. If we find this inaccurate, we might come up with a
	 * more sophisticated heuristic later.
	 */
	return kvm_handle_hva_range_no_flush(mn, start, end, kvm_age_gfn);
}

static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
				       struct mm_struct *mm,
				       unsigned long address)
{
	trace_kvm_test_age_hva(address);

	return kvm_handle_hva_range_no_flush(mn, address, address + 1,
					     kvm_test_age_gfn);
}

static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
				     struct mm_struct *mm)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int idx;

	idx = srcu_read_lock(&kvm->srcu);
	kvm_flush_shadow_all(kvm);
	srcu_read_unlock(&kvm->srcu, idx);
}

static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
	.invalidate_range_start	= kvm_mmu_notifier_invalidate_range_start,
	.invalidate_range_end	= kvm_mmu_notifier_invalidate_range_end,
	.clear_flush_young	= kvm_mmu_notifier_clear_flush_young,
	.clear_young		= kvm_mmu_notifier_clear_young,
	.test_young		= kvm_mmu_notifier_test_young,
	.change_pte		= kvm_mmu_notifier_change_pte,
	.release		= kvm_mmu_notifier_release,
};

static int kvm_init_mmu_notifier(struct kvm *kvm)
{
	kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
	return mmu_notifier_register(&kvm->mmu_notifier, current->mm);
}

#else  /* !CONFIG_KVM_GENERIC_MMU_NOTIFIER */

static int kvm_init_mmu_notifier(struct kvm *kvm)
{
	return 0;
}

#endif /* CONFIG_KVM_GENERIC_MMU_NOTIFIER */

#ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
static int kvm_pm_notifier_call(struct notifier_block *bl,
				unsigned long state,
				void *unused)
{
	struct kvm *kvm = container_of(bl, struct kvm, pm_notifier);

	return kvm_arch_pm_notifier(kvm, state);
}

static void kvm_init_pm_notifier(struct kvm *kvm)
{
	kvm->pm_notifier.notifier_call = kvm_pm_notifier_call;
	/* Suspend KVM before we suspend ftrace, RCU, etc. */
	kvm->pm_notifier.priority = INT_MAX;
	register_pm_notifier(&kvm->pm_notifier);
}

static void kvm_destroy_pm_notifier(struct kvm *kvm)
{
	unregister_pm_notifier(&kvm->pm_notifier);
}
#else  /* !CONFIG_HAVE_KVM_PM_NOTIFIER */
static void kvm_init_pm_notifier(struct kvm *kvm)
{
}

static void kvm_destroy_pm_notifier(struct kvm *kvm)
{
}
#endif /* CONFIG_HAVE_KVM_PM_NOTIFIER */

static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
{
	if (!memslot->dirty_bitmap)
		return;

	kvfree(memslot->dirty_bitmap);
	memslot->dirty_bitmap = NULL;
}

/* This does not remove the slot from struct kvm_memslots data structures */
static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
{
	if (slot->flags & KVM_MEM_GUEST_MEMFD)
		kvm_gmem_unbind(slot);

	kvm_destroy_dirty_bitmap(slot);

	kvm_arch_free_memslot(kvm, slot);

	kfree(slot);
}

static void kvm_free_memslots(struct kvm *kvm, struct kvm_memslots *slots)
{
	struct hlist_node *idnode;
	struct kvm_memory_slot *memslot;
	int bkt;

	/*
	 * The same memslot objects live in both active and inactive sets,
	 * arbitrarily free using index '1' so the second invocation of this
	 * function isn't operating over a structure with dangling pointers
	 * (even though this function isn't actually touching them).
	 */
	if (!slots->node_idx)
		return;

	hash_for_each_safe(slots->id_hash, bkt, idnode, memslot, id_node[1])
		kvm_free_memslot(kvm, memslot);
}

static umode_t kvm_stats_debugfs_mode(const struct _kvm_stats_desc *pdesc)
{
	switch (pdesc->desc.flags & KVM_STATS_TYPE_MASK) {
	case KVM_STATS_TYPE_INSTANT:
		return 0444;
	case KVM_STATS_TYPE_CUMULATIVE:
	case KVM_STATS_TYPE_PEAK:
	default:
		return 0644;
	}
}


static void kvm_destroy_vm_debugfs(struct kvm *kvm)
{
	int i;
	int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc +
				      kvm_vcpu_stats_header.num_desc;

	if (IS_ERR(kvm->debugfs_dentry))
		return;

	debugfs_remove_recursive(kvm->debugfs_dentry);

	if (kvm->debugfs_stat_data) {
		for (i = 0; i < kvm_debugfs_num_entries; i++)
			kfree(kvm->debugfs_stat_data[i]);
		kfree(kvm->debugfs_stat_data);
	}
}

static int kvm_create_vm_debugfs(struct kvm *kvm, const char *fdname)
{
	static DEFINE_MUTEX(kvm_debugfs_lock);
	struct dentry *dent;
	char dir_name[ITOA_MAX_LEN * 2];
	struct kvm_stat_data *stat_data;
	const struct _kvm_stats_desc *pdesc;
	int i, ret = -ENOMEM;
	int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc +
				      kvm_vcpu_stats_header.num_desc;

	if (!debugfs_initialized())
		return 0;

	snprintf(dir_name, sizeof(dir_name), "%d-%s", task_pid_nr(current), fdname);
	mutex_lock(&kvm_debugfs_lock);
	dent = debugfs_lookup(dir_name, kvm_debugfs_dir);
	if (dent) {
		pr_warn_ratelimited("KVM: debugfs: duplicate directory %s\n", dir_name);
		dput(dent);
		mutex_unlock(&kvm_debugfs_lock);
		return 0;
	}
	dent = debugfs_create_dir(dir_name, kvm_debugfs_dir);
	mutex_unlock(&kvm_debugfs_lock);
	if (IS_ERR(dent))
		return 0;

	kvm->debugfs_dentry = dent;
	kvm->debugfs_stat_data = kcalloc(kvm_debugfs_num_entries,
					 sizeof(*kvm->debugfs_stat_data),
					 GFP_KERNEL_ACCOUNT);
	if (!kvm->debugfs_stat_data)
		goto out_err;

	for (i = 0; i < kvm_vm_stats_header.num_desc; ++i) {
		pdesc = &kvm_vm_stats_desc[i];
		stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT);
		if (!stat_data)
			goto out_err;

		stat_data->kvm = kvm;
		stat_data->desc = pdesc;
		stat_data->kind = KVM_STAT_VM;
		kvm->debugfs_stat_data[i] = stat_data;
		debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
				    kvm->debugfs_dentry, stat_data,
				    &stat_fops_per_vm);
	}

	for (i = 0; i < kvm_vcpu_stats_header.num_desc; ++i) {
		pdesc = &kvm_vcpu_stats_desc[i];
		stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT);
		if (!stat_data)
			goto out_err;

		stat_data->kvm = kvm;
		stat_data->desc = pdesc;
		stat_data->kind = KVM_STAT_VCPU;
		kvm->debugfs_stat_data[i + kvm_vm_stats_header.num_desc] = stat_data;
		debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
				    kvm->debugfs_dentry, stat_data,
				    &stat_fops_per_vm);
	}

	kvm_arch_create_vm_debugfs(kvm);
	return 0;
out_err:
	kvm_destroy_vm_debugfs(kvm);
	return ret;
}

/*
 * Called after the VM is otherwise initialized, but just before adding it to
 * the vm_list.
 */
int __weak kvm_arch_post_init_vm(struct kvm *kvm)
{
	return 0;
}

/*
 * Called just after removing the VM from the vm_list, but before doing any
 * other destruction.
 */
void __weak kvm_arch_pre_destroy_vm(struct kvm *kvm)
{
}

/*
 * Called after the per-VM debugfs directory is created.  When called,
 * kvm->debugfs_dentry should already be set up, so arch-specific debugfs
 * entries can be created under it.  Cleanup is done automatically and
 * recursively in kvm_destroy_vm_debugfs(), so a per-arch destroy interface
 * is not needed.
 */
void __weak kvm_arch_create_vm_debugfs(struct kvm *kvm)
{
}

static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
{
	struct kvm *kvm = kvm_arch_alloc_vm();
	struct kvm_memslots *slots;
	int r = -ENOMEM;
	int i, j;

	if (!kvm)
		return ERR_PTR(-ENOMEM);

	KVM_MMU_LOCK_INIT(kvm);
	mmgrab(current->mm);
	kvm->mm = current->mm;
	kvm_eventfd_init(kvm);
	mutex_init(&kvm->lock);
	mutex_init(&kvm->irq_lock);
	mutex_init(&kvm->slots_lock);
	mutex_init(&kvm->slots_arch_lock);
	spin_lock_init(&kvm->mn_invalidate_lock);
	rcuwait_init(&kvm->mn_memslots_update_rcuwait);
	xa_init(&kvm->vcpu_array);
#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
	xa_init(&kvm->mem_attr_array);
#endif

	INIT_LIST_HEAD(&kvm->gpc_list);
	spin_lock_init(&kvm->gpc_lock);

	INIT_LIST_HEAD(&kvm->devices);
	kvm->max_vcpus = KVM_MAX_VCPUS;

	BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);

	/*
	 * Force subsequent debugfs file creations to fail if the VM directory
	 * is not created (by kvm_create_vm_debugfs()).
	 */
	kvm->debugfs_dentry = ERR_PTR(-ENOENT);

	snprintf(kvm->stats_id, sizeof(kvm->stats_id), "kvm-%d",
		 task_pid_nr(current));

	if (init_srcu_struct(&kvm->srcu))
		goto out_err_no_srcu;
	if (init_srcu_struct(&kvm->irq_srcu))
		goto out_err_no_irq_srcu;

	refcount_set(&kvm->users_count, 1);
	for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
		for (j = 0; j < 2; j++) {
			slots = &kvm->__memslots[i][j];

			atomic_long_set(&slots->last_used_slot, (unsigned long)NULL);
			slots->hva_tree = RB_ROOT_CACHED;
			slots->gfn_tree = RB_ROOT;
			hash_init(slots->id_hash);
			slots->node_idx = j;

			/* Generations must be different for each address space. */
			slots->generation = i;
		}

		rcu_assign_pointer(kvm->memslots[i], &kvm->__memslots[i][0]);
	}

	for (i = 0; i < KVM_NR_BUSES; i++) {
		rcu_assign_pointer(kvm->buses[i],
			kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL_ACCOUNT));
		if (!kvm->buses[i])
			goto out_err_no_arch_destroy_vm;
	}

	r = kvm_arch_init_vm(kvm, type);
	if (r)
		goto out_err_no_arch_destroy_vm;

	r = hardware_enable_all();
	if (r)
		goto out_err_no_disable;

#ifdef CONFIG_HAVE_KVM_IRQCHIP
	INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
#endif

	r = kvm_init_mmu_notifier(kvm);
	if (r)
		goto out_err_no_mmu_notifier;

	r = kvm_coalesced_mmio_init(kvm);
	if (r < 0)
		goto out_no_coalesced_mmio;

	r = kvm_create_vm_debugfs(kvm, fdname);
	if (r)
		goto out_err_no_debugfs;

	r = kvm_arch_post_init_vm(kvm);
	if (r)
		goto out_err;

	mutex_lock(&kvm_lock);
	list_add(&kvm->vm_list, &vm_list);
	mutex_unlock(&kvm_lock);

	preempt_notifier_inc();
	kvm_init_pm_notifier(kvm);

	return kvm;

out_err:
	kvm_destroy_vm_debugfs(kvm);
out_err_no_debugfs:
	kvm_coalesced_mmio_free(kvm);
out_no_coalesced_mmio:
#ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER
	if (kvm->mmu_notifier.ops)
		mmu_notifier_unregister(&kvm->mmu_notifier, current->mm);
#endif
out_err_no_mmu_notifier:
	hardware_disable_all();
out_err_no_disable:
	kvm_arch_destroy_vm(kvm);
out_err_no_arch_destroy_vm:
	WARN_ON_ONCE(!refcount_dec_and_test(&kvm->users_count));
	for (i = 0; i < KVM_NR_BUSES; i++)
		kfree(kvm_get_bus(kvm, i));
	cleanup_srcu_struct(&kvm->irq_srcu);
out_err_no_irq_srcu:
	cleanup_srcu_struct(&kvm->srcu);
out_err_no_srcu:
	kvm_arch_free_vm(kvm);
	mmdrop(current->mm);
	return ERR_PTR(r);
}

static void kvm_destroy_devices(struct kvm *kvm)
{
	struct kvm_device *dev, *tmp;

	/*
	 * We do not need to take the kvm->lock here, because nobody else
	 * has a reference to the struct kvm at this point and therefore
	 * cannot access the devices list anyhow.
	 */
	list_for_each_entry_safe(dev, tmp, &kvm->devices, vm_node) {
		list_del(&dev->vm_node);
		dev->ops->destroy(dev);
	}
}

static void kvm_destroy_vm(struct kvm *kvm)
{
	int i;
	struct mm_struct *mm = kvm->mm;

	kvm_destroy_pm_notifier(kvm);
	kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm);
	kvm_destroy_vm_debugfs(kvm);
	kvm_arch_sync_events(kvm);
	mutex_lock(&kvm_lock);
	list_del(&kvm->vm_list);
	mutex_unlock(&kvm_lock);
	kvm_arch_pre_destroy_vm(kvm);

	kvm_free_irq_routing(kvm);
	for (i = 0; i < KVM_NR_BUSES; i++) {
		struct kvm_io_bus *bus = kvm_get_bus(kvm, i);

		if (bus)
			kvm_io_bus_destroy(bus);
		kvm->buses[i] = NULL;
	}
	kvm_coalesced_mmio_free(kvm);
#ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER
	mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
	/*
	 * At this point, pending calls to invalidate_range_start()
	 * have completed but no more MMU notifiers will run, so
	 * mn_active_invalidate_count may remain unbalanced.
	 * No threads can be waiting in kvm_swap_active_memslots() as the
	 * last reference on KVM has been dropped, but freeing
	 * memslots would deadlock without this manual intervention.
	 *
	 * If the count isn't unbalanced, i.e. KVM did NOT unregister its MMU
	 * notifier between a start() and end(), then there shouldn't be any
	 * in-progress invalidations.
	 */
	WARN_ON(rcuwait_active(&kvm->mn_memslots_update_rcuwait));
	if (kvm->mn_active_invalidate_count)
		kvm->mn_active_invalidate_count = 0;
	else
		WARN_ON(kvm->mmu_invalidate_in_progress);
#else
	kvm_flush_shadow_all(kvm);
#endif
	kvm_arch_destroy_vm(kvm);
	kvm_destroy_devices(kvm);
	for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
		kvm_free_memslots(kvm, &kvm->__memslots[i][0]);
		kvm_free_memslots(kvm, &kvm->__memslots[i][1]);
	}
	cleanup_srcu_struct(&kvm->irq_srcu);
	cleanup_srcu_struct(&kvm->srcu);
#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
	xa_destroy(&kvm->mem_attr_array);
#endif
	kvm_arch_free_vm(kvm);
	preempt_notifier_dec();
	hardware_disable_all();
	mmdrop(mm);
}

void kvm_get_kvm(struct kvm *kvm)
{
	refcount_inc(&kvm->users_count);
}
EXPORT_SYMBOL_GPL(kvm_get_kvm);

/*
 * Make sure the VM is not in the middle of destruction; this is a safe version
 * of kvm_get_kvm().  Return true if kvm was referenced successfully, false
 * otherwise.
 */
bool kvm_get_kvm_safe(struct kvm *kvm)
{
	return refcount_inc_not_zero(&kvm->users_count);
}
EXPORT_SYMBOL_GPL(kvm_get_kvm_safe);

void kvm_put_kvm(struct kvm *kvm)
{
	if (refcount_dec_and_test(&kvm->users_count))
		kvm_destroy_vm(kvm);
}
EXPORT_SYMBOL_GPL(kvm_put_kvm);

/*
 * Used to put a reference that was taken on behalf of an object associated
 * with a user-visible file descriptor, e.g. a vcpu or device, if installation
 * of the new file descriptor fails and the reference cannot be transferred to
 * its final owner.  In such cases, the caller is still actively using @kvm and
 * will fail miserably if the refcount unexpectedly hits zero.
 */
void kvm_put_kvm_no_destroy(struct kvm *kvm)
{
	WARN_ON(refcount_dec_and_test(&kvm->users_count));
}
EXPORT_SYMBOL_GPL(kvm_put_kvm_no_destroy);

static int kvm_vm_release(struct inode *inode, struct file *filp)
{
	struct kvm *kvm = filp->private_data;

	kvm_irqfd_release(kvm);

	kvm_put_kvm(kvm);
	return 0;
}

/*
 * Allocation size is twice as large as the actual dirty bitmap size.
 * See kvm_vm_ioctl_get_dirty_log() for why this is needed.
 */
static int kvm_alloc_dirty_bitmap(struct kvm_memory_slot *memslot)
{
	unsigned long dirty_bytes = kvm_dirty_bitmap_bytes(memslot);

	memslot->dirty_bitmap = __vcalloc(2, dirty_bytes, GFP_KERNEL_ACCOUNT);
	if (!memslot->dirty_bitmap)
		return -ENOMEM;

	return 0;
}

static struct kvm_memslots *kvm_get_inactive_memslots(struct kvm *kvm, int as_id)
{
	struct kvm_memslots *active = __kvm_memslots(kvm, as_id);
	int node_idx_inactive = active->node_idx ^ 1;

	return &kvm->__memslots[as_id][node_idx_inactive];
}

/*
 * Helper to get the address space ID when one of memslot pointers may be NULL.
 * This also serves as a sanity check that at least one of the pointers is
 * non-NULL, and that their address space IDs don't diverge.
 */
static int kvm_memslots_get_as_id(struct kvm_memory_slot *a,
				  struct kvm_memory_slot *b)
{
	if (WARN_ON_ONCE(!a && !b))
		return 0;

	if (!a)
		return b->as_id;
	if (!b)
		return a->as_id;

	WARN_ON_ONCE(a->as_id != b->as_id);
	return a->as_id;
}

static void kvm_insert_gfn_node(struct kvm_memslots *slots,
				struct kvm_memory_slot *slot)
{
	struct rb_root *gfn_tree = &slots->gfn_tree;
	struct rb_node **node, *parent;
	int idx = slots->node_idx;

	parent = NULL;
	for (node = &gfn_tree->rb_node; *node; ) {
		struct kvm_memory_slot *tmp;

		tmp = container_of(*node, struct kvm_memory_slot, gfn_node[idx]);
		parent = *node;
		if (slot->base_gfn < tmp->base_gfn)
			node = &(*node)->rb_left;
		else if (slot->base_gfn > tmp->base_gfn)
			node = &(*node)->rb_right;
		else
			BUG();
	}

	rb_link_node(&slot->gfn_node[idx], parent, node);
	rb_insert_color(&slot->gfn_node[idx], gfn_tree);
}

static void kvm_erase_gfn_node(struct kvm_memslots *slots,
			       struct kvm_memory_slot *slot)
{
	rb_erase(&slot->gfn_node[slots->node_idx], &slots->gfn_tree);
}

static void kvm_replace_gfn_node(struct kvm_memslots *slots,
				 struct kvm_memory_slot *old,
				 struct kvm_memory_slot *new)
{
	int idx = slots->node_idx;

	WARN_ON_ONCE(old->base_gfn != new->base_gfn);

	rb_replace_node(&old->gfn_node[idx], &new->gfn_node[idx],
			&slots->gfn_tree);
}

/*
 * Replace @old with @new in the inactive memslots.
 *
 * With NULL @old this simply adds @new.
 * With NULL @new this simply removes @old.
 *
 * If @new is non-NULL its hva_node[slots_idx] range has to be set
 * appropriately.
 */
static void kvm_replace_memslot(struct kvm *kvm,
				struct kvm_memory_slot *old,
				struct kvm_memory_slot *new)
{
	int as_id = kvm_memslots_get_as_id(old, new);
	struct kvm_memslots *slots = kvm_get_inactive_memslots(kvm, as_id);
	int idx = slots->node_idx;

	if (old) {
		hash_del(&old->id_node[idx]);
		interval_tree_remove(&old->hva_node[idx], &slots->hva_tree);

		if ((long)old == atomic_long_read(&slots->last_used_slot))
			atomic_long_set(&slots->last_used_slot, (long)new);

		if (!new) {
			kvm_erase_gfn_node(slots, old);
			return;
		}
	}

	/*
	 * Initialize @new's hva range.  Do this even when replacing an @old
	 * slot, kvm_copy_memslot() deliberately does not touch node data.
	 */
	new->hva_node[idx].start = new->userspace_addr;
	new->hva_node[idx].last = new->userspace_addr +
				  (new->npages << PAGE_SHIFT) - 1;

	/*
	 * (Re)Add the new memslot.  There is no O(1) interval_tree_replace(),
	 * hva_node needs to be swapped with remove+insert even though hva
	 * can't change when replacing an existing slot.
	 */
	hash_add(slots->id_hash, &new->id_node[idx], new->id);
	interval_tree_insert(&new->hva_node[idx], &slots->hva_tree);

	/*
	 * If the memslot gfn is unchanged, rb_replace_node() can be used to
	 * switch the node in the gfn tree instead of removing the old and
	 * inserting the new as two separate operations. Replacement is a
	 * single O(1) operation versus two O(log(n)) operations for
	 * remove+insert.
	 */
	if (old && old->base_gfn == new->base_gfn) {
		kvm_replace_gfn_node(slots, old, new);
	} else {
		if (old)
			kvm_erase_gfn_node(slots, old);
		kvm_insert_gfn_node(slots, new);
	}
}

bb58b90b
SC
1586/*
1587 * Flags that do not access any of the extra space of struct
1588 * kvm_userspace_memory_region2. KVM_SET_USER_MEMORY_REGION_V1_FLAGS
1589 * only allows these.
1590 */
1591#define KVM_SET_USER_MEMORY_REGION_V1_FLAGS \
1592 (KVM_MEM_LOG_DIRTY_PAGES | KVM_MEM_READONLY)
1593
a7800aa8
SC
1594static int check_memory_region_flags(struct kvm *kvm,
1595 const struct kvm_userspace_memory_region2 *mem)
a50d64d6 1596{
4d8b81ab
XG
1597 u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
1598
a7800aa8
SC
1599 if (kvm_arch_has_private_mem(kvm))
1600 valid_flags |= KVM_MEM_GUEST_MEMFD;
1601
1602 /* Dirty logging private memory is not currently supported. */
1603 if (mem->flags & KVM_MEM_GUEST_MEMFD)
1604 valid_flags &= ~KVM_MEM_LOG_DIRTY_PAGES;
1605
8886640d 1606#ifdef CONFIG_HAVE_KVM_READONLY_MEM
e5635922
SC
1607 /*
1608 * GUEST_MEMFD is incompatible with read-only memslots, as writes to
1609 * read-only memslots have emulated MMIO, not page fault, semantics,
1610 * and KVM doesn't allow emulated MMIO for private memory.
1611 */
1612 if (!(mem->flags & KVM_MEM_GUEST_MEMFD))
1613 valid_flags |= KVM_MEM_READONLY;
4d8b81ab
XG
1614#endif
1615
1616 if (mem->flags & ~valid_flags)
a50d64d6
XG
1617 return -EINVAL;
1618
1619 return 0;
1620}
1621
a54d8066 1622static void kvm_swap_active_memslots(struct kvm *kvm, int as_id)
7ec4fb44 1623{
a54d8066
MS
1624 struct kvm_memslots *slots = kvm_get_inactive_memslots(kvm, as_id);
1625
1626	/* Grab the generation from the active memslots. */
1627 u64 gen = __kvm_memslots(kvm, as_id)->generation;
7ec4fb44 1628
361209e0
SC
1629 WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
1630 slots->generation = gen | KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
ee3d1570 1631
52ac8b35
PB
1632 /*
1633 * Do not store the new memslots while there are invalidations in
071064f1
PB
1634 * progress, otherwise the locking in invalidate_range_start and
1635 * invalidate_range_end will be unbalanced.
52ac8b35
PB
1636 */
1637 spin_lock(&kvm->mn_invalidate_lock);
1638 prepare_to_rcuwait(&kvm->mn_memslots_update_rcuwait);
1639 while (kvm->mn_active_invalidate_count) {
1640 set_current_state(TASK_UNINTERRUPTIBLE);
1641 spin_unlock(&kvm->mn_invalidate_lock);
1642 schedule();
1643 spin_lock(&kvm->mn_invalidate_lock);
1644 }
1645 finish_rcuwait(&kvm->mn_memslots_update_rcuwait);
f481b069 1646 rcu_assign_pointer(kvm->memslots[as_id], slots);
52ac8b35 1647 spin_unlock(&kvm->mn_invalidate_lock);
b10a038e
BG
1648
1649 /*
1650	 * Acquired in kvm_set_memslot. Must be released before the SRCU
1651	 * synchronization below in order to avoid deadlock with another thread
1652	 * acquiring the slots_arch_lock in an srcu critical section.
1653 */
1654 mutex_unlock(&kvm->slots_arch_lock);
1655
7ec4fb44 1656 synchronize_srcu_expedited(&kvm->srcu);
e59dbe09 1657
ee3d1570 1658 /*
361209e0 1659 * Increment the new memslot generation a second time, dropping the
00116795 1660 * update in-progress flag and incrementing the generation based on
361209e0
SC
1661 * the number of address spaces. This provides a unique and easily
1662 * identifiable generation number while the memslots are in flux.
1663 */
1664 gen = slots->generation & ~KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
1665
1666 /*
4bd518f1
PB
1667 * Generations must be unique even across address spaces. We do not need
1668 * a global counter for that, instead the generation space is evenly split
1669 * across address spaces. For example, with two address spaces, address
164bf7e5
SC
1670 * space 0 will use generations 0, 2, 4, ... while address space 1 will
1671 * use generations 1, 3, 5, ...
ee3d1570 1672 */
eed52e43 1673 gen += kvm_arch_nr_memslot_as_ids(kvm);
ee3d1570 1674
15248258 1675 kvm_arch_memslots_updated(kvm, gen);
ee3d1570 1676
15248258 1677 slots->generation = gen;
7ec4fb44
GN
1678}
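To make the even/odd interleaving described above concrete, here is a small hypothetical helper (not part of the kernel) that mirrors the update rule implemented in kvm_swap_active_memslots(): with two address spaces, AS0 observes generations 0, 2, 4, ... while AS1 observes 1, 3, 5, ...

/* Hypothetical sketch of the generation progression, assuming kernel types. */
static u64 next_memslot_generation(u64 gen, int nr_memslot_as_ids)
{
	/*
	 * Drop the in-progress flag, then step over the other address
	 * spaces so each space keeps a disjoint, increasing sequence.
	 */
	gen &= ~KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
	return gen + nr_memslot_as_ids;
}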
1679
07921665
SC
1680static int kvm_prepare_memory_region(struct kvm *kvm,
1681 const struct kvm_memory_slot *old,
1682 struct kvm_memory_slot *new,
1683 enum kvm_mr_change change)
ddc12f2a 1684{
07921665
SC
1685 int r;
1686
1687 /*
1688 * If dirty logging is disabled, nullify the bitmap; the old bitmap
1689 * will be freed on "commit". If logging is enabled in both old and
1690 * new, reuse the existing bitmap. If logging is enabled only in the
1691 * new and KVM isn't using a ring buffer, allocate and initialize a
1692 * new bitmap.
1693 */
244893fa
SC
1694 if (change != KVM_MR_DELETE) {
1695 if (!(new->flags & KVM_MEM_LOG_DIRTY_PAGES))
1696 new->dirty_bitmap = NULL;
1697 else if (old && old->dirty_bitmap)
1698 new->dirty_bitmap = old->dirty_bitmap;
86bdf3eb 1699 else if (kvm_use_dirty_bitmap(kvm)) {
244893fa
SC
1700 r = kvm_alloc_dirty_bitmap(new);
1701 if (r)
1702 return r;
1703
1704 if (kvm_dirty_log_manual_protect_and_init_set(kvm))
1705 bitmap_set(new->dirty_bitmap, 0, new->npages);
1706 }
07921665
SC
1707 }
1708
1709 r = kvm_arch_prepare_memory_region(kvm, old, new, change);
1710
1711 /* Free the bitmap on failure if it was allocated above. */
c87661f8 1712 if (r && new && new->dirty_bitmap && (!old || !old->dirty_bitmap))
07921665
SC
1713 kvm_destroy_dirty_bitmap(new);
1714
1715 return r;
ddc12f2a
BG
1716}
1717
07921665
SC
1718static void kvm_commit_memory_region(struct kvm *kvm,
1719 struct kvm_memory_slot *old,
1720 const struct kvm_memory_slot *new,
1721 enum kvm_mr_change change)
ddc12f2a 1722{
6c7b2202
PB
1723 int old_flags = old ? old->flags : 0;
1724 int new_flags = new ? new->flags : 0;
07921665
SC
1725 /*
1726 * Update the total number of memslot pages before calling the arch
1727 * hook so that architectures can consume the result directly.
1728 */
1729 if (change == KVM_MR_DELETE)
1730 kvm->nr_memslot_pages -= old->npages;
1731 else if (change == KVM_MR_CREATE)
1732 kvm->nr_memslot_pages += new->npages;
1733
6c7b2202
PB
1734 if ((old_flags ^ new_flags) & KVM_MEM_LOG_DIRTY_PAGES) {
1735 int change = (new_flags & KVM_MEM_LOG_DIRTY_PAGES) ? 1 : -1;
1736 atomic_set(&kvm->nr_memslots_dirty_logging,
1737 atomic_read(&kvm->nr_memslots_dirty_logging) + change);
1738 }
1739
07921665
SC
1740 kvm_arch_commit_memory_region(kvm, old, new, change);
1741
a54d8066
MS
1742 switch (change) {
1743 case KVM_MR_CREATE:
1744 /* Nothing more to do. */
1745 break;
1746 case KVM_MR_DELETE:
1747 /* Free the old memslot and all its metadata. */
1748 kvm_free_memslot(kvm, old);
1749 break;
1750 case KVM_MR_MOVE:
1751 case KVM_MR_FLAGS_ONLY:
1752 /*
1753 * Free the dirty bitmap as needed; the below check encompasses
1754	 * both the flags and whether a ring buffer is being used.
1755 */
1756 if (old->dirty_bitmap && !new->dirty_bitmap)
1757 kvm_destroy_dirty_bitmap(old);
1758
1759 /*
1760 * The final quirk. Free the detached, old slot, but only its
1761 * memory, not any metadata. Metadata, including arch specific
1762 * data, may be reused by @new.
1763 */
1764 kfree(old);
1765 break;
1766 default:
1767 BUG();
1768 }
ddc12f2a
BG
1769}
1770
36947254 1771/*
a54d8066
MS
1772 * Activate @new, which must be installed in the inactive slots by the caller,
1773 * by swapping the active slots and then propagating @new to @old once @old is
1774 * unreachable and can be safely modified.
1775 *
1776 * With NULL @old this simply adds @new to @active (while swapping the sets).
1777 * With NULL @new this simply removes @old from @active and frees it
1778 * (while also swapping the sets).
36947254 1779 */
a54d8066
MS
1780static void kvm_activate_memslot(struct kvm *kvm,
1781 struct kvm_memory_slot *old,
1782 struct kvm_memory_slot *new)
36947254 1783{
a54d8066 1784 int as_id = kvm_memslots_get_as_id(old, new);
36947254 1785
a54d8066
MS
1786 kvm_swap_active_memslots(kvm, as_id);
1787
1788 /* Propagate the new memslot to the now inactive memslots. */
1789 kvm_replace_memslot(kvm, old, new);
1790}
1791
1792static void kvm_copy_memslot(struct kvm_memory_slot *dest,
1793 const struct kvm_memory_slot *src)
1794{
1795 dest->base_gfn = src->base_gfn;
1796 dest->npages = src->npages;
1797 dest->dirty_bitmap = src->dirty_bitmap;
1798 dest->arch = src->arch;
1799 dest->userspace_addr = src->userspace_addr;
1800 dest->flags = src->flags;
1801 dest->id = src->id;
1802 dest->as_id = src->as_id;
1803}
1804
1805static void kvm_invalidate_memslot(struct kvm *kvm,
1806 struct kvm_memory_slot *old,
244893fa 1807 struct kvm_memory_slot *invalid_slot)
a54d8066 1808{
07921665 1809 /*
a54d8066
MS
1810 * Mark the current slot INVALID. As with all memslot modifications,
1811 * this must be done on an unreachable slot to avoid modifying the
1812 * current slot in the active tree.
07921665 1813 */
244893fa
SC
1814 kvm_copy_memslot(invalid_slot, old);
1815 invalid_slot->flags |= KVM_MEMSLOT_INVALID;
1816 kvm_replace_memslot(kvm, old, invalid_slot);
a54d8066
MS
1817
1818 /*
1819 * Activate the slot that is now marked INVALID, but don't propagate
1820 * the slot to the now inactive slots. The slot is either going to be
1821 * deleted or recreated as a new slot.
1822 */
1823 kvm_swap_active_memslots(kvm, old->as_id);
1824
1825 /*
1826 * From this point no new shadow pages pointing to a deleted, or moved,
1827 * memslot will be created. Validation of sp->gfn happens in:
1828 * - gfn_to_hva (kvm_read_guest, gfn_to_pfn)
1829 * - kvm_is_visible_gfn (mmu_check_root)
1830 */
bcb63dcd 1831 kvm_arch_flush_shadow_memslot(kvm, old);
683412cc 1832 kvm_arch_guest_memory_reclaimed(kvm);
a54d8066 1833
b0d23708 1834 /* Was released by kvm_swap_active_memslots(), reacquire. */
a54d8066
MS
1835 mutex_lock(&kvm->slots_arch_lock);
1836
1837 /*
1838 * Copy the arch-specific field of the newly-installed slot back to the
1839 * old slot as the arch data could have changed between releasing
b0d23708 1840 * slots_arch_lock in kvm_swap_active_memslots() and re-acquiring the lock
a54d8066
MS
1841 * above. Writers are required to retrieve memslots *after* acquiring
1842 * slots_arch_lock, thus the active slot's data is guaranteed to be fresh.
1843 */
244893fa 1844 old->arch = invalid_slot->arch;
a54d8066
MS
1845}
1846
1847static void kvm_create_memslot(struct kvm *kvm,
244893fa 1848 struct kvm_memory_slot *new)
a54d8066 1849{
244893fa
SC
1850 /* Add the new memslot to the inactive set and activate. */
1851 kvm_replace_memslot(kvm, NULL, new);
1852 kvm_activate_memslot(kvm, NULL, new);
a54d8066
MS
1853}
1854
1855static void kvm_delete_memslot(struct kvm *kvm,
1856 struct kvm_memory_slot *old,
1857 struct kvm_memory_slot *invalid_slot)
1858{
1859 /*
1860 * Remove the old memslot (in the inactive memslots) by passing NULL as
244893fa 1861	 * the "new" slot, and do the same for the invalid version in the active slots.
a54d8066
MS
1862 */
1863 kvm_replace_memslot(kvm, old, NULL);
a54d8066 1864 kvm_activate_memslot(kvm, invalid_slot, NULL);
a54d8066 1865}
36947254 1866
244893fa
SC
1867static void kvm_move_memslot(struct kvm *kvm,
1868 struct kvm_memory_slot *old,
1869 struct kvm_memory_slot *new,
1870 struct kvm_memory_slot *invalid_slot)
a54d8066 1871{
a54d8066 1872 /*
244893fa
SC
1873 * Replace the old memslot in the inactive slots, and then swap slots
1874 * and replace the current INVALID with the new as well.
a54d8066 1875 */
244893fa
SC
1876 kvm_replace_memslot(kvm, old, new);
1877 kvm_activate_memslot(kvm, invalid_slot, new);
a54d8066 1878}
36947254 1879
a54d8066
MS
1880static void kvm_update_flags_memslot(struct kvm *kvm,
1881 struct kvm_memory_slot *old,
244893fa 1882 struct kvm_memory_slot *new)
a54d8066
MS
1883{
1884 /*
1885 * Similar to the MOVE case, but the slot doesn't need to be zapped as
1886 * an intermediate step. Instead, the old memslot is simply replaced
1887 * with a new, updated copy in both memslot sets.
1888 */
244893fa
SC
1889 kvm_replace_memslot(kvm, old, new);
1890 kvm_activate_memslot(kvm, old, new);
36947254
SC
1891}
1892
cf47f50b 1893static int kvm_set_memslot(struct kvm *kvm,
a54d8066 1894 struct kvm_memory_slot *old,
ce5f0215 1895 struct kvm_memory_slot *new,
cf47f50b
SC
1896 enum kvm_mr_change change)
1897{
244893fa 1898 struct kvm_memory_slot *invalid_slot;
cf47f50b
SC
1899 int r;
1900
b10a038e 1901 /*
b0d23708 1902 * Released in kvm_swap_active_memslots().
b10a038e 1903 *
b0d23708
JM
1904 * Must be held from before the current memslots are copied until after
1905 * the new memslots are installed with rcu_assign_pointer, then
1906 * released before the synchronize srcu in kvm_swap_active_memslots().
b10a038e
BG
1907 *
1908 * When modifying memslots outside of the slots_lock, must be held
1909 * before reading the pointer to the current memslots until after all
1910 * changes to those memslots are complete.
1911 *
1912 * These rules ensure that installing new memslots does not lose
1913 * changes made to the previous memslots.
1914 */
1915 mutex_lock(&kvm->slots_arch_lock);
1916
a54d8066
MS
1917 /*
1918 * Invalidate the old slot if it's being deleted or moved. This is
1919 * done prior to actually deleting/moving the memslot to allow vCPUs to
1920 * continue running by ensuring there are no mappings or shadow pages
1921 * for the memslot when it is deleted/moved. Without pre-invalidation
1922 * (and without a lock), a window would exist between effecting the
1923 * delete/move and committing the changes in arch code where KVM or a
1924 * guest could access a non-existent memslot.
244893fa
SC
1925 *
1926 * Modifications are done on a temporary, unreachable slot. The old
1927 * slot needs to be preserved in case a later step fails and the
1928 * invalidation needs to be reverted.
a54d8066 1929 */
cf47f50b 1930 if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) {
244893fa
SC
1931 invalid_slot = kzalloc(sizeof(*invalid_slot), GFP_KERNEL_ACCOUNT);
1932 if (!invalid_slot) {
1933 mutex_unlock(&kvm->slots_arch_lock);
1934 return -ENOMEM;
1935 }
1936 kvm_invalidate_memslot(kvm, old, invalid_slot);
1937 }
b10a038e 1938
a54d8066
MS
1939 r = kvm_prepare_memory_region(kvm, old, new, change);
1940 if (r) {
b10a038e 1941 /*
a54d8066
MS
1942 * For DELETE/MOVE, revert the above INVALID change. No
1943 * modifications required since the original slot was preserved
1944 * in the inactive slots. Changing the active memslots also
1945	 * releases slots_arch_lock.
b10a038e 1946 */
244893fa
SC
1947 if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) {
1948 kvm_activate_memslot(kvm, invalid_slot, old);
1949 kfree(invalid_slot);
1950 } else {
a54d8066 1951 mutex_unlock(&kvm->slots_arch_lock);
244893fa 1952 }
a54d8066 1953 return r;
cf47f50b
SC
1954 }
1955
bda44d84 1956 /*
a54d8066
MS
1957	 * For DELETE and MOVE, invalid_slot (the snapshot of the old slot taken
1958	 * in kvm_invalidate_memslot()) is now active as the INVALID version of
1959	 * the old slot, and the old slot itself is no longer reachable from the
1960	 * memslot sets.  For CREATE, there is no old slot.  For DELETE and
1961	 * FLAGS_ONLY, the old slot is detached but otherwise preserved.
bda44d84 1962 */
a54d8066 1963 if (change == KVM_MR_CREATE)
244893fa 1964 kvm_create_memslot(kvm, new);
a54d8066 1965 else if (change == KVM_MR_DELETE)
244893fa 1966 kvm_delete_memslot(kvm, old, invalid_slot);
a54d8066 1967 else if (change == KVM_MR_MOVE)
244893fa 1968 kvm_move_memslot(kvm, old, new, invalid_slot);
a54d8066 1969 else if (change == KVM_MR_FLAGS_ONLY)
244893fa 1970 kvm_update_flags_memslot(kvm, old, new);
a54d8066
MS
1971 else
1972 BUG();
cf47f50b 1973
244893fa
SC
1974 /* Free the temporary INVALID slot used for DELETE and MOVE. */
1975 if (change == KVM_MR_DELETE || change == KVM_MR_MOVE)
1976 kfree(invalid_slot);
bda44d84 1977
a54d8066
MS
1978 /*
1979 * No need to refresh new->arch, changes after dropping slots_arch_lock
a413a625 1980 * will directly hit the final, active memslot. Architectures are
a54d8066
MS
1981 * responsible for knowing that new->arch may be stale.
1982 */
1983 kvm_commit_memory_region(kvm, old, new, change);
cf47f50b 1984
cf47f50b 1985 return 0;
cf47f50b
SC
1986}
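/*
 * Editorial summary (not in the kernel source) of the flow above for a MOVE,
 * the most involved case:
 *   kvm_invalidate_memslot()     - the active set now holds an INVALID
 *                                  snapshot of @old; shadow pages are zapped
 *   kvm_prepare_memory_region()  - arch hook; on failure the INVALID change
 *                                  is reverted via kvm_activate_memslot()
 *   kvm_move_memslot()           - install @new in the inactive set, swap the
 *                                  sets, then replace the INVALID copy
 *   kvm_commit_memory_region()   - arch commit; the detached @old is freed
 */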
1987
44401a20
MS
1988static bool kvm_check_memslot_overlap(struct kvm_memslots *slots, int id,
1989 gfn_t start, gfn_t end)
5c0b4f3d 1990{
44401a20 1991 struct kvm_memslot_iter iter;
5c0b4f3d 1992
44401a20
MS
1993 kvm_for_each_memslot_in_gfn_range(&iter, slots, start, end) {
1994 if (iter.slot->id != id)
1995 return true;
1996 }
5c0b4f3d 1997
44401a20 1998 return false;
5c0b4f3d
SC
1999}
2000
6aa8b732
AK
2001/*
2002 * Allocate some memory and give it an address in the guest physical address
2003 * space.
2004 *
2005 * Discontiguous memory is allowed, mostly for framebuffers.
f78e0e2e 2006 *
02d5d55b 2007 * Must be called holding kvm->slots_lock for write.
6aa8b732 2008 */
f78e0e2e 2009int __kvm_set_memory_region(struct kvm *kvm,
bb58b90b 2010 const struct kvm_userspace_memory_region2 *mem)
6aa8b732 2011{
244893fa 2012 struct kvm_memory_slot *old, *new;
44401a20 2013 struct kvm_memslots *slots;
f64c0398 2014 enum kvm_mr_change change;
0f9bdef3
SC
2015 unsigned long npages;
2016 gfn_t base_gfn;
163da372
SC
2017 int as_id, id;
2018 int r;
6aa8b732 2019
a7800aa8 2020 r = check_memory_region_flags(kvm, mem);
a50d64d6 2021 if (r)
71a4c30b 2022 return r;
a50d64d6 2023
f481b069
PB
2024 as_id = mem->slot >> 16;
2025 id = (u16)mem->slot;
2026
6aa8b732 2027 /* General sanity checks */
6b285a55
SC
2028 if ((mem->memory_size & (PAGE_SIZE - 1)) ||
2029 (mem->memory_size != (unsigned long)mem->memory_size))
71a4c30b 2030 return -EINVAL;
6aa8b732 2031 if (mem->guest_phys_addr & (PAGE_SIZE - 1))
71a4c30b 2032 return -EINVAL;
fa3d315a 2033 /* We can read the guest memory with __xxx_user() later on. */
09d952c9 2034 if ((mem->userspace_addr & (PAGE_SIZE - 1)) ||
139bc8a6 2035 (mem->userspace_addr != untagged_addr(mem->userspace_addr)) ||
96d4f267 2036 !access_ok((void __user *)(unsigned long)mem->userspace_addr,
09d952c9 2037 mem->memory_size))
71a4c30b 2038 return -EINVAL;
a7800aa8
SC
2039 if (mem->flags & KVM_MEM_GUEST_MEMFD &&
2040 (mem->guest_memfd_offset & (PAGE_SIZE - 1) ||
2041 mem->guest_memfd_offset + mem->memory_size < mem->guest_memfd_offset))
2042 return -EINVAL;
eed52e43 2043 if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_MEM_SLOTS_NUM)
71a4c30b 2044 return -EINVAL;
6aa8b732 2045 if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
71a4c30b 2046 return -EINVAL;
0f9bdef3
SC
2047 if ((mem->memory_size >> PAGE_SHIFT) > KVM_MEM_MAX_NR_PAGES)
2048 return -EINVAL;
6aa8b732 2049
44401a20 2050 slots = __kvm_memslots(kvm, as_id);
6aa8b732 2051
5c0b4f3d 2052 /*
7cd08553
SC
2053 * Note, the old memslot (and the pointer itself!) may be invalidated
2054 * and/or destroyed by kvm_set_memslot().
5c0b4f3d 2055 */
44401a20 2056 old = id_to_memslot(slots, id);
163da372 2057
47ea7d90 2058 if (!mem->memory_size) {
7cd08553 2059 if (!old || !old->npages)
47ea7d90 2060 return -EINVAL;
5c0b4f3d 2061
7cd08553 2062 if (WARN_ON_ONCE(kvm->nr_memslot_pages < old->npages))
47ea7d90 2063 return -EIO;
6aa8b732 2064
244893fa 2065 return kvm_set_memslot(kvm, old, NULL, KVM_MR_DELETE);
47ea7d90 2066 }
5c0b4f3d 2067
0f9bdef3
SC
2068 base_gfn = (mem->guest_phys_addr >> PAGE_SHIFT);
2069 npages = (mem->memory_size >> PAGE_SHIFT);
163da372 2070
7cd08553 2071 if (!old || !old->npages) {
5c0b4f3d 2072 change = KVM_MR_CREATE;
afa319a5
SC
2073
2074 /*
2075 * To simplify KVM internals, the total number of pages across
2076 * all memslots must fit in an unsigned long.
2077 */
0f9bdef3 2078 if ((kvm->nr_memslot_pages + npages) < kvm->nr_memslot_pages)
afa319a5 2079 return -EINVAL;
5c0b4f3d 2080 } else { /* Modify an existing slot. */
a7800aa8
SC
2081 /* Private memslots are immutable, they can only be deleted. */
2082 if (mem->flags & KVM_MEM_GUEST_MEMFD)
2083 return -EINVAL;
0f9bdef3
SC
2084 if ((mem->userspace_addr != old->userspace_addr) ||
2085 (npages != old->npages) ||
2086 ((mem->flags ^ old->flags) & KVM_MEM_READONLY))
71a4c30b 2087 return -EINVAL;
09170a49 2088
0f9bdef3 2089 if (base_gfn != old->base_gfn)
5c0b4f3d 2090 change = KVM_MR_MOVE;
0f9bdef3 2091 else if (mem->flags != old->flags)
5c0b4f3d
SC
2092 change = KVM_MR_FLAGS_ONLY;
2093 else /* Nothing to change. */
2094 return 0;
09170a49 2095 }
6aa8b732 2096
44401a20 2097 if ((change == KVM_MR_CREATE || change == KVM_MR_MOVE) &&
0f9bdef3 2098 kvm_check_memslot_overlap(slots, id, base_gfn, base_gfn + npages))
44401a20 2099 return -EEXIST;
6aa8b732 2100
244893fa
SC
2101 /* Allocate a slot that will persist in the memslot. */
2102 new = kzalloc(sizeof(*new), GFP_KERNEL_ACCOUNT);
2103 if (!new)
2104 return -ENOMEM;
3c9bd400 2105
244893fa
SC
2106 new->as_id = as_id;
2107 new->id = id;
2108 new->base_gfn = base_gfn;
2109 new->npages = npages;
2110 new->flags = mem->flags;
2111 new->userspace_addr = mem->userspace_addr;
a7800aa8
SC
2112 if (mem->flags & KVM_MEM_GUEST_MEMFD) {
2113 r = kvm_gmem_bind(kvm, new, mem->guest_memfd, mem->guest_memfd_offset);
2114 if (r)
2115 goto out;
2116 }
6aa8b732 2117
244893fa 2118 r = kvm_set_memslot(kvm, old, new, change);
cf47f50b 2119 if (r)
a7800aa8
SC
2120 goto out_unbind;
2121
2122 return 0;
2123
2124out_unbind:
2125 if (mem->flags & KVM_MEM_GUEST_MEMFD)
2126 kvm_gmem_unbind(new);
2127out:
2128 kfree(new);
6aa8b732 2129 return r;
210c7c4d 2130}
f78e0e2e
SY
2131EXPORT_SYMBOL_GPL(__kvm_set_memory_region);
2132
2133int kvm_set_memory_region(struct kvm *kvm,
bb58b90b 2134 const struct kvm_userspace_memory_region2 *mem)
f78e0e2e
SY
2135{
2136 int r;
2137
79fac95e 2138 mutex_lock(&kvm->slots_lock);
47ae31e2 2139 r = __kvm_set_memory_region(kvm, mem);
79fac95e 2140 mutex_unlock(&kvm->slots_lock);
f78e0e2e
SY
2141 return r;
2142}
210c7c4d
IE
2143EXPORT_SYMBOL_GPL(kvm_set_memory_region);
2144
7940876e 2145static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
bb58b90b 2146 struct kvm_userspace_memory_region2 *mem)
210c7c4d 2147{
f481b069 2148 if ((u16)mem->slot >= KVM_USER_MEM_SLOTS)
e0d62c7f 2149 return -EINVAL;
09170a49 2150
47ae31e2 2151 return kvm_set_memory_region(kvm, mem);
6aa8b732
AK
2152}
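For context, the handler above is reached from userspace roughly as follows. A minimal VMM-side sketch, assuming the struct kvm_userspace_memory_region2 layout and the KVM_SET_USER_MEMORY_REGION2 ioctl exported by <linux/kvm.h>; the slot id, size and error handling are illustrative only:

#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/kvm.h>

/* Register 2 MiB of anonymous host memory as guest RAM in slot 0 at GPA 0. */
static int set_guest_ram(int vm_fd)
{
	size_t size = 2ULL << 20;
	struct kvm_userspace_memory_region2 region = {
		.slot            = 0,	/* address space 0, slot id 0 */
		.flags           = 0,	/* e.g. KVM_MEM_LOG_DIRTY_PAGES */
		.guest_phys_addr = 0,
		.memory_size     = size,
	};
	void *host = mmap(NULL, size, PROT_READ | PROT_WRITE,
			  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (host == MAP_FAILED)
		return -1;

	region.userspace_addr = (unsigned long)host;
	return ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION2, &region);
}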
2153
0dff0846 2154#ifndef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
2a49f61d
SC
2155/**
2156 * kvm_get_dirty_log - get a snapshot of dirty pages
2157 * @kvm: pointer to kvm instance
2158 * @log: slot id and address to which we copy the log
2159 * @is_dirty: set to '1' if any dirty pages were found
2160 * @memslot: set to the associated memslot, always valid on success
2161 */
2162int kvm_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log,
2163 int *is_dirty, struct kvm_memory_slot **memslot)
6aa8b732 2164{
9f6b8029 2165 struct kvm_memslots *slots;
843574a3 2166 int i, as_id, id;
87bf6e7d 2167 unsigned long n;
6aa8b732
AK
2168 unsigned long any = 0;
2169
86bdf3eb
GS
2170 /* Dirty ring tracking may be exclusive to dirty log tracking */
2171 if (!kvm_use_dirty_bitmap(kvm))
b2cc64c4
PX
2172 return -ENXIO;
2173
2a49f61d
SC
2174 *memslot = NULL;
2175 *is_dirty = 0;
2176
f481b069
PB
2177 as_id = log->slot >> 16;
2178 id = (u16)log->slot;
eed52e43 2179 if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_USER_MEM_SLOTS)
843574a3 2180 return -EINVAL;
6aa8b732 2181
f481b069 2182 slots = __kvm_memslots(kvm, as_id);
2a49f61d 2183 *memslot = id_to_memslot(slots, id);
0577d1ab 2184 if (!(*memslot) || !(*memslot)->dirty_bitmap)
843574a3 2185 return -ENOENT;
6aa8b732 2186
2a49f61d
SC
2187 kvm_arch_sync_dirty_log(kvm, *memslot);
2188
2189 n = kvm_dirty_bitmap_bytes(*memslot);
6aa8b732 2190
cd1a4a98 2191 for (i = 0; !any && i < n/sizeof(long); ++i)
2a49f61d 2192 any = (*memslot)->dirty_bitmap[i];
6aa8b732 2193
2a49f61d 2194 if (copy_to_user(log->dirty_bitmap, (*memslot)->dirty_bitmap, n))
843574a3 2195 return -EFAULT;
6aa8b732 2196
5bb064dc
ZX
2197 if (any)
2198 *is_dirty = 1;
843574a3 2199 return 0;
6aa8b732 2200}
2ba9f0d8 2201EXPORT_SYMBOL_GPL(kvm_get_dirty_log);
6aa8b732 2202
0dff0846 2203#else /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
ba0513b5 2204/**
b8b00220 2205 * kvm_get_dirty_log_protect - get a snapshot of dirty pages
2a31b9db 2206 * and reenable dirty page tracking for the corresponding pages.
ba0513b5
MS
2207 * @kvm: pointer to kvm instance
2208 * @log: slot id and address to which we copy the log
ba0513b5
MS
2209 *
2210 * We need to keep it in mind that VCPU threads can write to the bitmap
2211 * concurrently. So, to avoid losing track of dirty pages we keep the
2212 * following order:
2213 *
2214 * 1. Take a snapshot of the bit and clear it if needed.
2215 * 2. Write protect the corresponding page.
2216 * 3. Copy the snapshot to the userspace.
2217 * 4. Upon return caller flushes TLB's if needed.
2218 *
2219 * Between 2 and 4, the guest may write to the page using the remaining TLB
2220 * entry. This is not a problem because the page is reported dirty using
2221 * the snapshot taken before and step 4 ensures that writes done after
2222 * exiting to userspace will be logged for the next call.
2223 *
2224 */
0dff0846 2225static int kvm_get_dirty_log_protect(struct kvm *kvm, struct kvm_dirty_log *log)
ba0513b5 2226{
9f6b8029 2227 struct kvm_memslots *slots;
ba0513b5 2228 struct kvm_memory_slot *memslot;
58d6db34 2229 int i, as_id, id;
ba0513b5
MS
2230 unsigned long n;
2231 unsigned long *dirty_bitmap;
2232 unsigned long *dirty_bitmap_buffer;
0dff0846 2233 bool flush;
ba0513b5 2234
86bdf3eb
GS
2235 /* Dirty ring tracking may be exclusive to dirty log tracking */
2236 if (!kvm_use_dirty_bitmap(kvm))
b2cc64c4
PX
2237 return -ENXIO;
2238
f481b069
PB
2239 as_id = log->slot >> 16;
2240 id = (u16)log->slot;
eed52e43 2241 if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_USER_MEM_SLOTS)
58d6db34 2242 return -EINVAL;
ba0513b5 2243
f481b069
PB
2244 slots = __kvm_memslots(kvm, as_id);
2245 memslot = id_to_memslot(slots, id);
0577d1ab
SC
2246 if (!memslot || !memslot->dirty_bitmap)
2247 return -ENOENT;
ba0513b5
MS
2248
2249 dirty_bitmap = memslot->dirty_bitmap;
ba0513b5 2250
0dff0846
SC
2251 kvm_arch_sync_dirty_log(kvm, memslot);
2252
ba0513b5 2253 n = kvm_dirty_bitmap_bytes(memslot);
0dff0846 2254 flush = false;
2a31b9db
PB
2255 if (kvm->manual_dirty_log_protect) {
2256 /*
2257		 * Unlike kvm_get_dirty_log, we always leave flush unset here,
2258		 * because no flush is needed until KVM_CLEAR_DIRTY_LOG. There
2259		 * is some code duplication between this function and
2260		 * kvm_get_dirty_log, but hopefully all architectures will
2261		 * transition to kvm_get_dirty_log_protect so that kvm_get_dirty_log
2262		 * can be eliminated.
2263 */
2264 dirty_bitmap_buffer = dirty_bitmap;
2265 } else {
2266 dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
2267 memset(dirty_bitmap_buffer, 0, n);
ba0513b5 2268
531810ca 2269 KVM_MMU_LOCK(kvm);
2a31b9db
PB
2270 for (i = 0; i < n / sizeof(long); i++) {
2271 unsigned long mask;
2272 gfn_t offset;
ba0513b5 2273
2a31b9db
PB
2274 if (!dirty_bitmap[i])
2275 continue;
2276
0dff0846 2277 flush = true;
2a31b9db
PB
2278 mask = xchg(&dirty_bitmap[i], 0);
2279 dirty_bitmap_buffer[i] = mask;
2280
a67794ca
LT
2281 offset = i * BITS_PER_LONG;
2282 kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
2283 offset, mask);
2a31b9db 2284 }
531810ca 2285 KVM_MMU_UNLOCK(kvm);
2a31b9db
PB
2286 }
2287
0dff0846 2288 if (flush)
619b5072 2289 kvm_flush_remote_tlbs_memslot(kvm, memslot);
0dff0846 2290
2a31b9db
PB
2291 if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
2292 return -EFAULT;
2293 return 0;
2294}
0dff0846
SC
2295
2296
2297/**
2298 * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot
2299 * @kvm: kvm instance
2300 * @log: slot id and address to which we copy the log
2301 *
2302 * Steps 1-4 below provide general overview of dirty page logging. See
2303 * kvm_get_dirty_log_protect() function description for additional details.
2304 *
2305 * We call kvm_get_dirty_log_protect() to handle steps 1-3; upon return we
2306 * always flush the TLB (step 4) even if a previous step failed and the dirty
2307 * bitmap may be corrupt. Regardless of the previous outcome, the KVM logging
2308 * API does not preclude a subsequent dirty log read by userspace. Flushing
2309 * the TLB ensures writes will be marked dirty for the next log read.
2310 *
2311 * 1. Take a snapshot of the bit and clear it if needed.
2312 * 2. Write protect the corresponding page.
2313 * 3. Copy the snapshot to the userspace.
2314 * 4. Flush TLB's if needed.
2315 */
2316static int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
2317 struct kvm_dirty_log *log)
2318{
2319 int r;
2320
2321 mutex_lock(&kvm->slots_lock);
2322
2323 r = kvm_get_dirty_log_protect(kvm, log);
2324
2325 mutex_unlock(&kvm->slots_lock);
2326 return r;
2327}
2a31b9db
PB
2328
2329/**
2330 * kvm_clear_dirty_log_protect - clear dirty bits in the bitmap
2331 * and reenable dirty page tracking for the corresponding pages.
2332 * @kvm: pointer to kvm instance
2333 * @log: slot id and address from which to fetch the bitmap of dirty pages
2334 */
0dff0846
SC
2335static int kvm_clear_dirty_log_protect(struct kvm *kvm,
2336 struct kvm_clear_dirty_log *log)
2a31b9db
PB
2337{
2338 struct kvm_memslots *slots;
2339 struct kvm_memory_slot *memslot;
98938aa8 2340 int as_id, id;
2a31b9db 2341 gfn_t offset;
98938aa8 2342 unsigned long i, n;
2a31b9db
PB
2343 unsigned long *dirty_bitmap;
2344 unsigned long *dirty_bitmap_buffer;
0dff0846 2345 bool flush;
2a31b9db 2346
86bdf3eb
GS
2347 /* Dirty ring tracking may be exclusive to dirty log tracking */
2348 if (!kvm_use_dirty_bitmap(kvm))
b2cc64c4
PX
2349 return -ENXIO;
2350
2a31b9db
PB
2351 as_id = log->slot >> 16;
2352 id = (u16)log->slot;
eed52e43 2353 if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_USER_MEM_SLOTS)
2a31b9db
PB
2354 return -EINVAL;
2355
76d58e0f 2356 if (log->first_page & 63)
2a31b9db
PB
2357 return -EINVAL;
2358
2359 slots = __kvm_memslots(kvm, as_id);
2360 memslot = id_to_memslot(slots, id);
0577d1ab
SC
2361 if (!memslot || !memslot->dirty_bitmap)
2362 return -ENOENT;
2a31b9db
PB
2363
2364 dirty_bitmap = memslot->dirty_bitmap;
2a31b9db 2365
4ddc9204 2366 n = ALIGN(log->num_pages, BITS_PER_LONG) / 8;
98938aa8
TB
2367
2368 if (log->first_page > memslot->npages ||
76d58e0f
PB
2369 log->num_pages > memslot->npages - log->first_page ||
2370 (log->num_pages < memslot->npages - log->first_page && (log->num_pages & 63)))
2371 return -EINVAL;
98938aa8 2372
0dff0846
SC
2373 kvm_arch_sync_dirty_log(kvm, memslot);
2374
2375 flush = false;
2a31b9db
PB
2376 dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
2377 if (copy_from_user(dirty_bitmap_buffer, log->dirty_bitmap, n))
2378 return -EFAULT;
ba0513b5 2379
531810ca 2380 KVM_MMU_LOCK(kvm);
53eac7a8
PX
2381 for (offset = log->first_page, i = offset / BITS_PER_LONG,
2382 n = DIV_ROUND_UP(log->num_pages, BITS_PER_LONG); n--;
2a31b9db
PB
2383 i++, offset += BITS_PER_LONG) {
2384 unsigned long mask = *dirty_bitmap_buffer++;
2385 atomic_long_t *p = (atomic_long_t *) &dirty_bitmap[i];
2386 if (!mask)
ba0513b5
MS
2387 continue;
2388
2a31b9db 2389 mask &= atomic_long_fetch_andnot(mask, p);
ba0513b5 2390
2a31b9db
PB
2391 /*
2392 * mask contains the bits that really have been cleared. This
2393 * never includes any bits beyond the length of the memslot (if
2394 * the length is not aligned to 64 pages), therefore it is not
2395 * a problem if userspace sets them in log->dirty_bitmap.
2396 */
58d2930f 2397 if (mask) {
0dff0846 2398 flush = true;
58d2930f
TY
2399 kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
2400 offset, mask);
2401 }
ba0513b5 2402 }
531810ca 2403 KVM_MMU_UNLOCK(kvm);
2a31b9db 2404
0dff0846 2405 if (flush)
619b5072 2406 kvm_flush_remote_tlbs_memslot(kvm, memslot);
0dff0846 2407
58d6db34 2408 return 0;
ba0513b5 2409}
0dff0846
SC
2410
2411static int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm,
2412 struct kvm_clear_dirty_log *log)
2413{
2414 int r;
2415
2416 mutex_lock(&kvm->slots_lock);
2417
2418 r = kvm_clear_dirty_log_protect(kvm, log);
2419
2420 mutex_unlock(&kvm->slots_lock);
2421 return r;
2422}
2423#endif /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
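For orientation, the two ioctls handled above are driven from userspace along these lines. A hedged sketch, assuming a memslot created with KVM_MEM_LOG_DIRTY_PAGES and, for the clear path, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 enabled; bitmap sizing and error reporting are left to the caller:

#include <sys/ioctl.h>
#include <linux/kvm.h>

/*
 * Snapshot the dirty bitmap of @slot, then re-arm write protection for
 * @num_pages pages starting at @first_page (64-page aligned, matching the
 * checks above).  @bitmap must cover the whole memslot.
 */
static int harvest_dirty_pages(int vm_fd, unsigned int slot,
			       unsigned long *bitmap,
			       unsigned long long first_page,
			       unsigned int num_pages)
{
	struct kvm_dirty_log get = {
		.slot = slot,
		.dirty_bitmap = bitmap,
	};
	struct kvm_clear_dirty_log clear = {
		.slot = slot,
		.num_pages = num_pages,
		.first_page = first_page,
		.dirty_bitmap = bitmap,
	};

	if (ioctl(vm_fd, KVM_GET_DIRTY_LOG, &get))
		return -1;

	/* ... consume the harvested bits, e.g. copy out the dirty pages ... */

	return ioctl(vm_fd, KVM_CLEAR_DIRTY_LOG, &clear);
}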
ba0513b5 2424
5a475554
CP
2425#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
2426/*
2427 * Returns true if _all_ gfns in the range [@start, @end) have attributes
2428 * matching @attrs.
2429 */
2430bool kvm_range_has_memory_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
2431 unsigned long attrs)
2432{
2433 XA_STATE(xas, &kvm->mem_attr_array, start);
2434 unsigned long index;
2435 bool has_attrs;
2436 void *entry;
2437
2438 rcu_read_lock();
2439
2440 if (!attrs) {
2441 has_attrs = !xas_find(&xas, end - 1);
2442 goto out;
2443 }
2444
2445 has_attrs = true;
2446 for (index = start; index < end; index++) {
2447 do {
2448 entry = xas_next(&xas);
2449 } while (xas_retry(&xas, entry));
2450
2451 if (xas.xa_index != index || xa_to_value(entry) != attrs) {
2452 has_attrs = false;
2453 break;
2454 }
2455 }
2456
2457out:
2458 rcu_read_unlock();
2459 return has_attrs;
2460}
2461
2462static u64 kvm_supported_mem_attributes(struct kvm *kvm)
2463{
a7800aa8 2464 if (!kvm || kvm_arch_has_private_mem(kvm))
5a475554
CP
2465 return KVM_MEMORY_ATTRIBUTE_PRIVATE;
2466
2467 return 0;
2468}
2469
2470static __always_inline void kvm_handle_gfn_range(struct kvm *kvm,
2471 struct kvm_mmu_notifier_range *range)
2472{
2473 struct kvm_gfn_range gfn_range;
2474 struct kvm_memory_slot *slot;
2475 struct kvm_memslots *slots;
2476 struct kvm_memslot_iter iter;
2477 bool found_memslot = false;
2478 bool ret = false;
2479 int i;
2480
2481 gfn_range.arg = range->arg;
2482 gfn_range.may_block = range->may_block;
2483
eed52e43 2484 for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
5a475554
CP
2485 slots = __kvm_memslots(kvm, i);
2486
2487 kvm_for_each_memslot_in_gfn_range(&iter, slots, range->start, range->end) {
2488 slot = iter.slot;
2489 gfn_range.slot = slot;
2490
2491 gfn_range.start = max(range->start, slot->base_gfn);
2492 gfn_range.end = min(range->end, slot->base_gfn + slot->npages);
2493 if (gfn_range.start >= gfn_range.end)
2494 continue;
2495
2496 if (!found_memslot) {
2497 found_memslot = true;
2498 KVM_MMU_LOCK(kvm);
2499 if (!IS_KVM_NULL_FN(range->on_lock))
2500 range->on_lock(kvm);
2501 }
2502
2503 ret |= range->handler(kvm, &gfn_range);
2504 }
2505 }
2506
2507 if (range->flush_on_ret && ret)
2508 kvm_flush_remote_tlbs(kvm);
2509
2510 if (found_memslot)
2511 KVM_MMU_UNLOCK(kvm);
2512}
2513
2514static bool kvm_pre_set_memory_attributes(struct kvm *kvm,
2515 struct kvm_gfn_range *range)
2516{
2517 /*
2518 * Unconditionally add the range to the invalidation set, regardless of
2519 * whether or not the arch callback actually needs to zap SPTEs. E.g.
2520 * if KVM supports RWX attributes in the future and the attributes are
2521 * going from R=>RW, zapping isn't strictly necessary. Unconditionally
2522 * adding the range allows KVM to require that MMU invalidations add at
2523 * least one range between begin() and end(), e.g. allows KVM to detect
2524 * bugs where the add() is missed. Relaxing the rule *might* be safe,
2525 * but it's not obvious that allowing new mappings while the attributes
2526 * are in flux is desirable or worth the complexity.
2527 */
2528 kvm_mmu_invalidate_range_add(kvm, range->start, range->end);
2529
2530 return kvm_arch_pre_set_memory_attributes(kvm, range);
2531}
2532
2533/* Set @attributes for the gfn range [@start, @end). */
2534static int kvm_vm_set_mem_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
2535 unsigned long attributes)
2536{
2537 struct kvm_mmu_notifier_range pre_set_range = {
2538 .start = start,
2539 .end = end,
2540 .handler = kvm_pre_set_memory_attributes,
2541 .on_lock = kvm_mmu_invalidate_begin,
2542 .flush_on_ret = true,
2543 .may_block = true,
2544 };
2545 struct kvm_mmu_notifier_range post_set_range = {
2546 .start = start,
2547 .end = end,
2548 .arg.attributes = attributes,
2549 .handler = kvm_arch_post_set_memory_attributes,
2550 .on_lock = kvm_mmu_invalidate_end,
2551 .may_block = true,
2552 };
2553 unsigned long i;
2554 void *entry;
2555 int r = 0;
2556
2557 entry = attributes ? xa_mk_value(attributes) : NULL;
2558
2559 mutex_lock(&kvm->slots_lock);
2560
2561	/* Nothing to do if the entire range has the desired attributes. */
2562 if (kvm_range_has_memory_attributes(kvm, start, end, attributes))
2563 goto out_unlock;
2564
2565 /*
2566 * Reserve memory ahead of time to avoid having to deal with failures
2567 * partway through setting the new attributes.
2568 */
2569 for (i = start; i < end; i++) {
2570 r = xa_reserve(&kvm->mem_attr_array, i, GFP_KERNEL_ACCOUNT);
2571 if (r)
2572 goto out_unlock;
2573 }
2574
2575 kvm_handle_gfn_range(kvm, &pre_set_range);
2576
2577 for (i = start; i < end; i++) {
2578 r = xa_err(xa_store(&kvm->mem_attr_array, i, entry,
2579 GFP_KERNEL_ACCOUNT));
2580 KVM_BUG_ON(r, kvm);
2581 }
2582
2583 kvm_handle_gfn_range(kvm, &post_set_range);
2584
2585out_unlock:
2586 mutex_unlock(&kvm->slots_lock);
2587
2588 return r;
2589}
2590static int kvm_vm_ioctl_set_mem_attributes(struct kvm *kvm,
2591 struct kvm_memory_attributes *attrs)
2592{
2593 gfn_t start, end;
2594
2595 /* flags is currently not used. */
2596 if (attrs->flags)
2597 return -EINVAL;
2598 if (attrs->attributes & ~kvm_supported_mem_attributes(kvm))
2599 return -EINVAL;
2600 if (attrs->size == 0 || attrs->address + attrs->size < attrs->address)
2601 return -EINVAL;
2602 if (!PAGE_ALIGNED(attrs->address) || !PAGE_ALIGNED(attrs->size))
2603 return -EINVAL;
2604
2605 start = attrs->address >> PAGE_SHIFT;
2606 end = (attrs->address + attrs->size) >> PAGE_SHIFT;
2607
2608 /*
2609 * xarray tracks data using "unsigned long", and as a result so does
2610 * KVM. For simplicity, supports generic attributes only on 64-bit
2611 * architectures.
2612 */
2613 BUILD_BUG_ON(sizeof(attrs->attributes) != sizeof(unsigned long));
2614
2615 return kvm_vm_set_mem_attributes(kvm, start, end, attrs->attributes);
2616}
2617#endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */
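As a usage note, the attribute ioctl above is what a VMM uses to flip guest ranges between shared and private when guest_memfd is in play. A minimal sketch, assuming struct kvm_memory_attributes and KVM_SET_MEMORY_ATTRIBUTES as exposed by <linux/kvm.h>:

#include <sys/ioctl.h>
#include <linux/kvm.h>

/*
 * Mark a page-aligned GPA range as private (guest_memfd backed).  Passing
 * attributes == 0 instead converts the range back to shared.
 */
static int set_range_private(int vm_fd, unsigned long long gpa,
			     unsigned long long size)
{
	struct kvm_memory_attributes attrs = {
		.address    = gpa,
		.size       = size,
		.attributes = KVM_MEMORY_ATTRIBUTE_PRIVATE,
		.flags      = 0,	/* must be zero, see the check above */
	};

	return ioctl(vm_fd, KVM_SET_MEMORY_ATTRIBUTES, &attrs);
}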
2618
49c7754c
GN
2619struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
2620{
2621 return __gfn_to_memslot(kvm_memslots(kvm), gfn);
2622}
a1f4d395 2623EXPORT_SYMBOL_GPL(gfn_to_memslot);
6aa8b732 2624
8e73485c
PB
2625struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn)
2626{
fe22ed82 2627 struct kvm_memslots *slots = kvm_vcpu_memslots(vcpu);
a54d8066 2628 u64 gen = slots->generation;
fe22ed82 2629 struct kvm_memory_slot *slot;
fe22ed82 2630
a54d8066
MS
2631 /*
2632 * This also protects against using a memslot from a different address space,
2633 * since different address spaces have different generation numbers.
2634 */
2635 if (unlikely(gen != vcpu->last_used_slot_gen)) {
2636 vcpu->last_used_slot = NULL;
2637 vcpu->last_used_slot_gen = gen;
2638 }
2639
2640 slot = try_get_memslot(vcpu->last_used_slot, gfn);
fe22ed82
DM
2641 if (slot)
2642 return slot;
2643
2644 /*
2645 * Fall back to searching all memslots. We purposely use
2646 * search_memslots() instead of __gfn_to_memslot() to avoid
a54d8066 2647 * thrashing the VM-wide last_used_slot in kvm_memslots.
fe22ed82 2648 */
a54d8066 2649 slot = search_memslots(slots, gfn, false);
fe22ed82 2650 if (slot) {
a54d8066 2651 vcpu->last_used_slot = slot;
fe22ed82
DM
2652 return slot;
2653 }
2654
2655 return NULL;
8e73485c
PB
2656}
2657
33e94154 2658bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
e0d62c7f 2659{
bf3e05bc 2660 struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn);
e0d62c7f 2661
c36b7150 2662 return kvm_is_visible_memslot(memslot);
e0d62c7f
IE
2663}
2664EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
2665
995decb6
VK
2666bool kvm_vcpu_is_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
2667{
2668 struct kvm_memory_slot *memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2669
2670 return kvm_is_visible_memslot(memslot);
2671}
2672EXPORT_SYMBOL_GPL(kvm_vcpu_is_visible_gfn);
2673
f9b84e19 2674unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn)
8f0b1ab6
JR
2675{
2676 struct vm_area_struct *vma;
2677 unsigned long addr, size;
2678
2679 size = PAGE_SIZE;
2680
42cde48b 2681 addr = kvm_vcpu_gfn_to_hva_prot(vcpu, gfn, NULL);
8f0b1ab6
JR
2682 if (kvm_is_error_hva(addr))
2683 return PAGE_SIZE;
2684
d8ed45c5 2685 mmap_read_lock(current->mm);
8f0b1ab6
JR
2686 vma = find_vma(current->mm, addr);
2687 if (!vma)
2688 goto out;
2689
2690 size = vma_kernel_pagesize(vma);
2691
2692out:
d8ed45c5 2693 mmap_read_unlock(current->mm);
8f0b1ab6
JR
2694
2695 return size;
2696}
2697
8283e36a 2698static bool memslot_is_readonly(const struct kvm_memory_slot *slot)
4d8b81ab
XG
2699{
2700 return slot->flags & KVM_MEM_READONLY;
2701}
2702
8283e36a 2703static unsigned long __gfn_to_hva_many(const struct kvm_memory_slot *slot, gfn_t gfn,
4d8b81ab 2704 gfn_t *nr_pages, bool write)
539cb660 2705{
bc6678a3 2706 if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
ca3a490c 2707 return KVM_HVA_ERR_BAD;
48987781 2708
4d8b81ab
XG
2709 if (memslot_is_readonly(slot) && write)
2710 return KVM_HVA_ERR_RO_BAD;
48987781
XG
2711
2712 if (nr_pages)
2713 *nr_pages = slot->npages - (gfn - slot->base_gfn);
2714
4d8b81ab 2715 return __gfn_to_hva_memslot(slot, gfn);
539cb660 2716}
48987781 2717
4d8b81ab
XG
2718static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
2719 gfn_t *nr_pages)
2720{
2721 return __gfn_to_hva_many(slot, gfn, nr_pages, true);
539cb660 2722}
48987781 2723
4d8b81ab 2724unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
7940876e 2725 gfn_t gfn)
4d8b81ab
XG
2726{
2727 return gfn_to_hva_many(slot, gfn, NULL);
2728}
2729EXPORT_SYMBOL_GPL(gfn_to_hva_memslot);
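The translation done by gfn_to_hva_many() is plain offset arithmetic against the memslot; a standalone restatement (hypothetical helper, valid only when @gfn actually lies inside @slot and assuming kernel context for the types):

/* hva = slot's userspace base + byte offset of the gfn within the slot. */
static unsigned long example_gfn_to_hva(const struct kvm_memory_slot *slot,
					gfn_t gfn)
{
	return slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE;
}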
2730
48987781
XG
2731unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
2732{
49c7754c 2733 return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL);
48987781 2734}
0d150298 2735EXPORT_SYMBOL_GPL(gfn_to_hva);
539cb660 2736
8e73485c
PB
2737unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn)
2738{
2739 return gfn_to_hva_many(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn, NULL);
2740}
2741EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_hva);
2742
86ab8cff 2743/*
970c0d4b
WY
2744 * Return the hva of a @gfn and the R/W attribute if possible.
2745 *
2746 * @slot: the kvm_memory_slot which contains @gfn
2747 * @gfn: the gfn to be translated
2748 * @writable: used to return the read/write attribute of the @slot if the hva
2749 * is valid and @writable is not NULL
86ab8cff 2750 */
64d83126
CD
2751unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot,
2752 gfn_t gfn, bool *writable)
86ab8cff 2753{
a2ac07fe
GN
2754 unsigned long hva = __gfn_to_hva_many(slot, gfn, NULL, false);
2755
2756 if (!kvm_is_error_hva(hva) && writable)
ba6a3541
PB
2757 *writable = !memslot_is_readonly(slot);
2758
a2ac07fe 2759 return hva;
86ab8cff
XG
2760}
2761
64d83126
CD
2762unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable)
2763{
2764 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
2765
2766 return gfn_to_hva_memslot_prot(slot, gfn, writable);
2767}
2768
8e73485c
PB
2769unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable)
2770{
2771 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2772
2773 return gfn_to_hva_memslot_prot(slot, gfn, writable);
2774}
2775
fafc3dba
HY
2776static inline int check_user_page_hwpoison(unsigned long addr)
2777{
0d731759 2778 int rc, flags = FOLL_HWPOISON | FOLL_WRITE;
fafc3dba 2779
54d02069 2780 rc = get_user_pages(addr, 1, flags, NULL);
fafc3dba
HY
2781 return rc == -EHWPOISON;
2782}
2783
2fc84311 2784/*
b9b33da2
PB
2785 * The fast path to get the writable pfn which will be stored in @pfn;
2786 * true indicates success, otherwise false is returned.  It's also the
311497e0 2787 * only part that can run in atomic context.
2fc84311 2788 */
b9b33da2
PB
2789static bool hva_to_pfn_fast(unsigned long addr, bool write_fault,
2790 bool *writable, kvm_pfn_t *pfn)
954bbbc2 2791{
8d4e1288 2792 struct page *page[1];
954bbbc2 2793
12ce13fe
XG
2794 /*
2795 * Fast pin a writable pfn only if it is a write fault request
2796 * or the caller allows to map a writable pfn for a read fault
2797 * request.
2798 */
2799 if (!(write_fault || writable))
2800 return false;
612819c3 2801
dadbb612 2802 if (get_user_page_fast_only(addr, FOLL_WRITE, page)) {
2fc84311 2803 *pfn = page_to_pfn(page[0]);
612819c3 2804
2fc84311
XG
2805 if (writable)
2806 *writable = true;
2807 return true;
2808 }
af585b92 2809
2fc84311
XG
2810 return false;
2811}
612819c3 2812
2fc84311
XG
2813/*
2814 * The slow path to get the pfn of the specified host virtual address;
2815 * 1 indicates success, -errno is returned if an error is detected.
2816 */
2817static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
c8b88b33 2818 bool interruptible, bool *writable, kvm_pfn_t *pfn)
2fc84311 2819{
b1e1296d
DH
2820 /*
2821 * When a VCPU accesses a page that is not mapped into the secondary
2822 * MMU, we lookup the page using GUP to map it, so the guest VCPU can
2823 * make progress. We always want to honor NUMA hinting faults in that
2824 * case, because GUP usage corresponds to memory accesses from the VCPU.
2825 * Otherwise, we'd not trigger NUMA hinting faults once a page is
2826 * mapped into the secondary MMU and gets accessed by a VCPU.
2827 *
2828 * Note that get_user_page_fast_only() and FOLL_WRITE for now
2829 * implicitly honor NUMA hinting faults and don't need this flag.
2830 */
2831 unsigned int flags = FOLL_HWPOISON | FOLL_HONOR_NUMA_FAULT;
ce53053c 2832 struct page *page;
28249139 2833 int npages;
612819c3 2834
2fc84311
XG
2835 might_sleep();
2836
2837 if (writable)
2838 *writable = write_fault;
2839
ce53053c
AV
2840 if (write_fault)
2841 flags |= FOLL_WRITE;
2842 if (async)
2843 flags |= FOLL_NOWAIT;
c8b88b33
PX
2844 if (interruptible)
2845 flags |= FOLL_INTERRUPTIBLE;
d4944b0e 2846
ce53053c 2847 npages = get_user_pages_unlocked(addr, 1, &page, flags);
2fc84311
XG
2848 if (npages != 1)
2849 return npages;
2850
2851 /* map read fault as writable if possible */
12ce13fe 2852 if (unlikely(!write_fault) && writable) {
ce53053c 2853 struct page *wpage;
2fc84311 2854
dadbb612 2855 if (get_user_page_fast_only(addr, FOLL_WRITE, &wpage)) {
2fc84311 2856 *writable = true;
ce53053c
AV
2857 put_page(page);
2858 page = wpage;
612819c3 2859 }
887c08ac 2860 }
ce53053c 2861 *pfn = page_to_pfn(page);
2fc84311
XG
2862 return npages;
2863}
539cb660 2864
4d8b81ab
XG
2865static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault)
2866{
2867 if (unlikely(!(vma->vm_flags & VM_READ)))
2868 return false;
2e2e3738 2869
4d8b81ab
XG
2870 if (write_fault && (unlikely(!(vma->vm_flags & VM_WRITE))))
2871 return false;
887c08ac 2872
4d8b81ab
XG
2873 return true;
2874}
bf998156 2875
f8be156b
NP
2876static int kvm_try_get_pfn(kvm_pfn_t pfn)
2877{
b14b2690
SC
2878 struct page *page = kvm_pfn_to_refcounted_page(pfn);
2879
2880 if (!page)
f8be156b 2881 return 1;
b14b2690
SC
2882
2883 return get_page_unless_zero(page);
f8be156b
NP
2884}
2885
92176a8e 2886static int hva_to_pfn_remapped(struct vm_area_struct *vma,
1625566e
XT
2887 unsigned long addr, bool write_fault,
2888 bool *writable, kvm_pfn_t *p_pfn)
92176a8e 2889{
a9545779 2890 kvm_pfn_t pfn;
bd2fae8d 2891 pte_t *ptep;
c33c7948 2892 pte_t pte;
bd2fae8d 2893 spinlock_t *ptl;
add6a0cd
PB
2894 int r;
2895
9fd6dad1 2896 r = follow_pte(vma->vm_mm, addr, &ptep, &ptl);
add6a0cd
PB
2897 if (r) {
2898 /*
2899 * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does
2900 * not call the fault handler, so do it here.
2901 */
2902 bool unlocked = false;
64019a2e 2903 r = fixup_user_fault(current->mm, addr,
add6a0cd
PB
2904 (write_fault ? FAULT_FLAG_WRITE : 0),
2905 &unlocked);
a8387d0b
PB
2906 if (unlocked)
2907 return -EAGAIN;
add6a0cd
PB
2908 if (r)
2909 return r;
2910
9fd6dad1 2911 r = follow_pte(vma->vm_mm, addr, &ptep, &ptl);
add6a0cd
PB
2912 if (r)
2913 return r;
bd2fae8d 2914 }
add6a0cd 2915
c33c7948
RR
2916 pte = ptep_get(ptep);
2917
2918 if (write_fault && !pte_write(pte)) {
bd2fae8d
PB
2919 pfn = KVM_PFN_ERR_RO_FAULT;
2920 goto out;
add6a0cd
PB
2921 }
2922
a340b3e2 2923 if (writable)
c33c7948
RR
2924 *writable = pte_write(pte);
2925 pfn = pte_pfn(pte);
add6a0cd
PB
2926
2927 /*
2928 * Get a reference here because callers of *hva_to_pfn* and
2929 * *gfn_to_pfn* ultimately call kvm_release_pfn_clean on the
2930 * returned pfn. This is only needed if the VMA has VM_MIXEDMAP
36c3ce6c 2931 * set, but the kvm_try_get_pfn/kvm_release_pfn_clean pair will
add6a0cd
PB
2932 * simply do nothing for reserved pfns.
2933 *
2934 * Whoever called remap_pfn_range is also going to call e.g.
2935 * unmap_mapping_range before the underlying pages are freed,
2936 * causing a call to our MMU notifier.
f8be156b
NP
2937 *
2938 * Certain IO or PFNMAP mappings can be backed with valid
2939 * struct pages, but be allocated without refcounting e.g.,
2940 * tail pages of non-compound higher order allocations, which
2941 * would then underflow the refcount when the caller does the
2942 * required put_page. Don't allow those pages here.
c33c7948 2943 */
f8be156b
NP
2944 if (!kvm_try_get_pfn(pfn))
2945 r = -EFAULT;
add6a0cd 2946
bd2fae8d
PB
2947out:
2948 pte_unmap_unlock(ptep, ptl);
add6a0cd 2949 *p_pfn = pfn;
f8be156b
NP
2950
2951 return r;
92176a8e
PB
2952}
2953
12ce13fe
XG
2954/*
2955 * Pin guest page in memory and return its pfn.
2956 * @addr: host virtual address which maps memory to the guest
ed2f049f 2957 * @atomic: whether this function is forbidden from sleeping
c8b88b33 2958 * @interruptible: whether the process can be interrupted by non-fatal signals
12ce13fe
XG
2959 * @async: whether this function needs to wait for IO to complete if the
2960 *         host page is not in memory
2961 * @write_fault: whether we should get a writable host page
2962 * @writable: whether it allows to map a writable host page for !@write_fault
2963 *
2964 * The function will map a writable host page for these two cases:
2965 * 1): @write_fault = true
2966 * 2): @write_fault = false && @writable, @writable will tell the caller
2967 * whether the mapping is writable.
2968 */
c8b88b33
PX
2969kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool interruptible,
2970 bool *async, bool write_fault, bool *writable)
2fc84311
XG
2971{
2972 struct vm_area_struct *vma;
943dfea8 2973 kvm_pfn_t pfn;
92176a8e 2974 int npages, r;
2e2e3738 2975
2fc84311
XG
2976 /* we can do it either atomically or asynchronously, not both */
2977 BUG_ON(atomic && async);
8d4e1288 2978
b9b33da2 2979 if (hva_to_pfn_fast(addr, write_fault, writable, &pfn))
2fc84311
XG
2980 return pfn;
2981
2982 if (atomic)
2983 return KVM_PFN_ERR_FAULT;
2984
c8b88b33
PX
2985 npages = hva_to_pfn_slow(addr, async, write_fault, interruptible,
2986 writable, &pfn);
2fc84311
XG
2987 if (npages == 1)
2988 return pfn;
fe5ed56c
PX
2989 if (npages == -EINTR)
2990 return KVM_PFN_ERR_SIGPENDING;
8d4e1288 2991
d8ed45c5 2992 mmap_read_lock(current->mm);
2fc84311
XG
2993 if (npages == -EHWPOISON ||
2994 (!async && check_user_page_hwpoison(addr))) {
2995 pfn = KVM_PFN_ERR_HWPOISON;
2996 goto exit;
2997 }
2998
a8387d0b 2999retry:
fc98c03b 3000 vma = vma_lookup(current->mm, addr);
2fc84311
XG
3001
3002 if (vma == NULL)
3003 pfn = KVM_PFN_ERR_FAULT;
92176a8e 3004 else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) {
1625566e 3005 r = hva_to_pfn_remapped(vma, addr, write_fault, writable, &pfn);
a8387d0b
PB
3006 if (r == -EAGAIN)
3007 goto retry;
92176a8e
PB
3008 if (r < 0)
3009 pfn = KVM_PFN_ERR_FAULT;
2fc84311 3010 } else {
4d8b81ab 3011 if (async && vma_is_valid(vma, write_fault))
2fc84311
XG
3012 *async = true;
3013 pfn = KVM_PFN_ERR_FAULT;
3014 }
3015exit:
d8ed45c5 3016 mmap_read_unlock(current->mm);
2e2e3738 3017 return pfn;
35149e21
AL
3018}
3019
8283e36a 3020kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn,
c8b88b33
PX
3021 bool atomic, bool interruptible, bool *async,
3022 bool write_fault, bool *writable, hva_t *hva)
887c08ac 3023{
4d8b81ab
XG
3024 unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);
3025
4a42d848
DS
3026 if (hva)
3027 *hva = addr;
3028
b2740d35
PB
3029 if (kvm_is_error_hva(addr)) {
3030 if (writable)
3031 *writable = false;
f588557a
AM
3032
3033 return addr == KVM_HVA_ERR_RO_BAD ? KVM_PFN_ERR_RO_FAULT :
3034 KVM_PFN_NOSLOT;
b2740d35 3035 }
4d8b81ab
XG
3036
3037 /* Do not map writable pfn in the readonly memslot. */
3038 if (writable && memslot_is_readonly(slot)) {
3039 *writable = false;
3040 writable = NULL;
3041 }
3042
c8b88b33 3043 return hva_to_pfn(addr, atomic, interruptible, async, write_fault,
4d8b81ab 3044 writable);
887c08ac 3045}
3520469d 3046EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot);
887c08ac 3047
ba049e93 3048kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
612819c3
MT
3049 bool *writable)
3050{
c8b88b33
PX
3051 return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, false,
3052 NULL, write_fault, writable, NULL);
612819c3
MT
3053}
3054EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
3055
8283e36a 3056kvm_pfn_t gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn)
506f0d6f 3057{
c8b88b33
PX
3058 return __gfn_to_pfn_memslot(slot, gfn, false, false, NULL, true,
3059 NULL, NULL);
506f0d6f 3060}
e37afc6e 3061EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot);
506f0d6f 3062
8283e36a 3063kvm_pfn_t gfn_to_pfn_memslot_atomic(const struct kvm_memory_slot *slot, gfn_t gfn)
506f0d6f 3064{
c8b88b33
PX
3065 return __gfn_to_pfn_memslot(slot, gfn, true, false, NULL, true,
3066 NULL, NULL);
506f0d6f 3067}
037d92dc 3068EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);
506f0d6f 3069
ba049e93 3070kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn)
8e73485c
PB
3071{
3072 return gfn_to_pfn_memslot_atomic(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn);
3073}
3074EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn_atomic);
3075
ba049e93 3076kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
e37afc6e
PB
3077{
3078 return gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn);
3079}
3080EXPORT_SYMBOL_GPL(gfn_to_pfn);
3081
ba049e93 3082kvm_pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn)
8e73485c
PB
3083{
3084 return gfn_to_pfn_memslot(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn);
3085}
3086EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn);
3087
d9ef13c2
PB
3088int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
3089 struct page **pages, int nr_pages)
48987781
XG
3090{
3091 unsigned long addr;
076b925d 3092 gfn_t entry = 0;
48987781 3093
d9ef13c2 3094 addr = gfn_to_hva_many(slot, gfn, &entry);
48987781
XG
3095 if (kvm_is_error_hva(addr))
3096 return -1;
3097
3098 if (entry < nr_pages)
3099 return 0;
3100
dadbb612 3101 return get_user_pages_fast_only(addr, nr_pages, FOLL_WRITE, pages);
48987781
XG
3102}
3103EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);
3104
b1624f99
SC
3105/*
3106 * Do not use this helper unless you are absolutely certain the gfn _must_ be
3107 * backed by 'struct page'. A valid example is if the backing memslot is
3108 * controlled by KVM.  Note, if the returned page is valid, its refcount has
3109 * been elevated by gfn_to_pfn().
3110 */
35149e21
AL
3111struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
3112{
b14b2690 3113 struct page *page;
ba049e93 3114 kvm_pfn_t pfn;
2e2e3738
AL
3115
3116 pfn = gfn_to_pfn(kvm, gfn);
2e2e3738 3117
81c52c56 3118 if (is_error_noslot_pfn(pfn))
cb9aaa30 3119 return KVM_ERR_PTR_BAD_PAGE;
a2766325 3120
b14b2690
SC
3121 page = kvm_pfn_to_refcounted_page(pfn);
3122 if (!page)
6cede2e6 3123 return KVM_ERR_PTR_BAD_PAGE;
a2766325 3124
b14b2690 3125 return page;
954bbbc2
AK
3126}
3127EXPORT_SYMBOL_GPL(gfn_to_page);
3128
357a18ad 3129void kvm_release_pfn(kvm_pfn_t pfn, bool dirty)
91724814 3130{
91724814
BO
3131 if (dirty)
3132 kvm_release_pfn_dirty(pfn);
3133 else
3134 kvm_release_pfn_clean(pfn);
3135}
3136
357a18ad 3137int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map)
e45adf66
KA
3138{
3139 kvm_pfn_t pfn;
3140 void *hva = NULL;
3141 struct page *page = KVM_UNMAPPED_PAGE;
3142
3143 if (!map)
3144 return -EINVAL;
3145
357a18ad 3146 pfn = gfn_to_pfn(vcpu->kvm, gfn);
e45adf66
KA
3147 if (is_error_noslot_pfn(pfn))
3148 return -EINVAL;
3149
3150 if (pfn_valid(pfn)) {
3151 page = pfn_to_page(pfn);
357a18ad 3152 hva = kmap(page);
d30b214d 3153#ifdef CONFIG_HAS_IOMEM
91724814 3154 } else {
357a18ad 3155 hva = memremap(pfn_to_hpa(pfn), PAGE_SIZE, MEMREMAP_WB);
d30b214d 3156#endif
e45adf66
KA
3157 }
3158
3159 if (!hva)
3160 return -EFAULT;
3161
3162 map->page = page;
3163 map->hva = hva;
3164 map->pfn = pfn;
3165 map->gfn = gfn;
3166
3167 return 0;
3168}
e45adf66
KA
3169EXPORT_SYMBOL_GPL(kvm_vcpu_map);
3170
357a18ad 3171void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty)
e45adf66
KA
3172{
3173 if (!map)
3174 return;
3175
3176 if (!map->hva)
3177 return;
3178
357a18ad
DW
3179 if (map->page != KVM_UNMAPPED_PAGE)
3180 kunmap(map->page);
eb1f2f38 3181#ifdef CONFIG_HAS_IOMEM
91724814 3182 else
357a18ad 3183 memunmap(map->hva);
eb1f2f38 3184#endif
e45adf66 3185
91724814 3186 if (dirty)
357a18ad 3187 kvm_vcpu_mark_page_dirty(vcpu, map->gfn);
91724814 3188
357a18ad 3189 kvm_release_pfn(map->pfn, dirty);
e45adf66
KA
3190
3191 map->hva = NULL;
3192 map->page = NULL;
3193}
3194EXPORT_SYMBOL_GPL(kvm_vcpu_unmap);
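A typical in-kernel pairing of the two helpers above, shown as a hedged sketch (hypothetical caller; assumes kernel context and that the page is actually written):

static int example_zero_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn)
{
	struct kvm_host_map map;

	if (kvm_vcpu_map(vcpu, gfn, &map))
		return -EFAULT;		/* gfn not backed by a mappable pfn */

	/* Access the guest page through the temporary kernel mapping. */
	memset(map.hva, 0, PAGE_SIZE);

	/* dirty == true: the page was written, so mark it dirty on unmap. */
	kvm_vcpu_unmap(vcpu, &map, true);
	return 0;
}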
3195
8e1c6914 3196static bool kvm_is_ad_tracked_page(struct page *page)
8e73485c 3197{
8e1c6914
SC
3198 /*
3199 * Per page-flags.h, pages tagged PG_reserved "should in general not be
3200 * touched (e.g. set dirty) except by its owner".
3201 */
3202 return !PageReserved(page);
3203}
8e73485c 3204
8e1c6914
SC
3205static void kvm_set_page_dirty(struct page *page)
3206{
3207 if (kvm_is_ad_tracked_page(page))
3208 SetPageDirty(page);
3209}
8e73485c 3210
8e1c6914
SC
3211static void kvm_set_page_accessed(struct page *page)
3212{
3213 if (kvm_is_ad_tracked_page(page))
3214 mark_page_accessed(page);
8e73485c 3215}
8e73485c 3216
b4231d61
IE
3217void kvm_release_page_clean(struct page *page)
3218{
32cad84f
XG
3219 WARN_ON(is_error_page(page));
3220
8e1c6914
SC
3221 kvm_set_page_accessed(page);
3222 put_page(page);
b4231d61
IE
3223}
3224EXPORT_SYMBOL_GPL(kvm_release_page_clean);
3225
ba049e93 3226void kvm_release_pfn_clean(kvm_pfn_t pfn)
35149e21 3227{
b14b2690
SC
3228 struct page *page;
3229
3230 if (is_error_noslot_pfn(pfn))
3231 return;
3232
3233 page = kvm_pfn_to_refcounted_page(pfn);
3234 if (!page)
3235 return;
3236
3237 kvm_release_page_clean(page);
35149e21
AL
3238}
3239EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);
3240
b4231d61 3241void kvm_release_page_dirty(struct page *page)
8a7ae055 3242{
a2766325
XG
3243 WARN_ON(is_error_page(page));
3244
8e1c6914
SC
3245 kvm_set_page_dirty(page);
3246 kvm_release_page_clean(page);
35149e21
AL
3247}
3248EXPORT_SYMBOL_GPL(kvm_release_page_dirty);
3249
f7a6509f 3250void kvm_release_pfn_dirty(kvm_pfn_t pfn)
35149e21 3251{
b14b2690
SC
3252 struct page *page;
3253
3254 if (is_error_noslot_pfn(pfn))
3255 return;
3256
3257 page = kvm_pfn_to_refcounted_page(pfn);
3258 if (!page)
3259 return;
3260
3261 kvm_release_page_dirty(page);
35149e21 3262}
f7a6509f 3263EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty);
35149e21 3264
8e1c6914
SC
3265/*
3266 * Note, checking for an error/noslot pfn is the caller's responsibility when
3267 * directly marking a page dirty/accessed. Unlike the "release" helpers, the
3268 * "set" helpers are not to be used when the pfn might point at garbage.
3269 */
ba049e93 3270void kvm_set_pfn_dirty(kvm_pfn_t pfn)
35149e21 3271{
8e1c6914
SC
3272 if (WARN_ON(is_error_noslot_pfn(pfn)))
3273 return;
3274
3275 if (pfn_valid(pfn))
3276 kvm_set_page_dirty(pfn_to_page(pfn));
8a7ae055 3277}
35149e21
AL
3278EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);
3279
ba049e93 3280void kvm_set_pfn_accessed(kvm_pfn_t pfn)
35149e21 3281{
8e1c6914
SC
3282 if (WARN_ON(is_error_noslot_pfn(pfn)))
3283 return;
3284
3285 if (pfn_valid(pfn))
3286 kvm_set_page_accessed(pfn_to_page(pfn));
35149e21
AL
3287}
3288EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);
3289
195aefde
IE
3290static int next_segment(unsigned long len, int offset)
3291{
3292 if (len > PAGE_SIZE - offset)
3293 return PAGE_SIZE - offset;
3294 else
3295 return len;
3296}
3297
a3bd2f7e 3298/* Copy @len bytes from guest memory at '(@gfn * PAGE_SIZE) + @offset' to @data */
8e73485c
PB
3299static int __kvm_read_guest_page(struct kvm_memory_slot *slot, gfn_t gfn,
3300 void *data, int offset, int len)
195aefde 3301{
e0506bcb
IE
3302 int r;
3303 unsigned long addr;
195aefde 3304
8e73485c 3305 addr = gfn_to_hva_memslot_prot(slot, gfn, NULL);
e0506bcb
IE
3306 if (kvm_is_error_hva(addr))
3307 return -EFAULT;
3180a7fc 3308 r = __copy_from_user(data, (void __user *)addr + offset, len);
e0506bcb 3309 if (r)
195aefde 3310 return -EFAULT;
195aefde
IE
3311 return 0;
3312}
8e73485c
PB
3313
3314int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
3315 int len)
3316{
3317 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
3318
3319 return __kvm_read_guest_page(slot, gfn, data, offset, len);
3320}
195aefde
IE
3321EXPORT_SYMBOL_GPL(kvm_read_guest_page);
3322
8e73485c
PB
3323int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data,
3324 int offset, int len)
3325{
3326 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
3327
3328 return __kvm_read_guest_page(slot, gfn, data, offset, len);
3329}
3330EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_page);
3331
195aefde
IE
3332int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len)
3333{
3334 gfn_t gfn = gpa >> PAGE_SHIFT;
3335 int seg;
3336 int offset = offset_in_page(gpa);
3337 int ret;
3338
3339 while ((seg = next_segment(len, offset)) != 0) {
3340 ret = kvm_read_guest_page(kvm, gfn, data, offset, seg);
3341 if (ret < 0)
3342 return ret;
3343 offset = 0;
3344 len -= seg;
3345 data += seg;
3346 ++gfn;
3347 }
3348 return 0;
3349}
3350EXPORT_SYMBOL_GPL(kvm_read_guest);
3351
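/*
 * Illustrative sketch, not part of the upstream file: kvm_read_guest() hides
 * the page-by-page splitting done above, so a caller can pull in a structure
 * that may straddle a page boundary with a single call. The structure layout
 * is invented purely for the example.
 */
struct example_guest_hdr {
	u32 magic;
	u32 version;
	u64 flags;
};

static int __maybe_unused example_read_hdr(struct kvm *kvm, gpa_t gpa,
					   struct example_guest_hdr *hdr)
{
	/* Returns 0 on success, -EFAULT if any part of the range is unmapped. */
	return kvm_read_guest(kvm, gpa, hdr, sizeof(*hdr));
}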
8e73485c 3352int kvm_vcpu_read_guest(struct kvm_vcpu *vcpu, gpa_t gpa, void *data, unsigned long len)
7ec54588 3353{
7ec54588 3354 gfn_t gfn = gpa >> PAGE_SHIFT;
8e73485c 3355 int seg;
7ec54588 3356 int offset = offset_in_page(gpa);
8e73485c
PB
3357 int ret;
3358
3359 while ((seg = next_segment(len, offset)) != 0) {
3360 ret = kvm_vcpu_read_guest_page(vcpu, gfn, data, offset, seg);
3361 if (ret < 0)
3362 return ret;
3363 offset = 0;
3364 len -= seg;
3365 data += seg;
3366 ++gfn;
3367 }
3368 return 0;
3369}
3370EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest);
7ec54588 3371
8e73485c
PB
3372static int __kvm_read_guest_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
3373 void *data, int offset, unsigned long len)
3374{
3375 int r;
3376 unsigned long addr;
3377
3378 addr = gfn_to_hva_memslot_prot(slot, gfn, NULL);
7ec54588
MT
3379 if (kvm_is_error_hva(addr))
3380 return -EFAULT;
0aac03f0 3381 pagefault_disable();
3180a7fc 3382 r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len);
0aac03f0 3383 pagefault_enable();
7ec54588
MT
3384 if (r)
3385 return -EFAULT;
3386 return 0;
3387}
7ec54588 3388
8e73485c
PB
3389int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa,
3390 void *data, unsigned long len)
3391{
3392 gfn_t gfn = gpa >> PAGE_SHIFT;
3393 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
3394 int offset = offset_in_page(gpa);
3395
3396 return __kvm_read_guest_atomic(slot, gfn, data, offset, len);
3397}
3398EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_atomic);
3399
a3bd2f7e 3400/* Copy @len bytes from @data into guest memory at '(@gfn * PAGE_SIZE) + @offset' */
28bd726a
PX
3401static int __kvm_write_guest_page(struct kvm *kvm,
3402 struct kvm_memory_slot *memslot, gfn_t gfn,
8e73485c 3403 const void *data, int offset, int len)
195aefde 3404{
e0506bcb
IE
3405 int r;
3406 unsigned long addr;
195aefde 3407
251eb841 3408 addr = gfn_to_hva_memslot(memslot, gfn);
e0506bcb
IE
3409 if (kvm_is_error_hva(addr))
3410 return -EFAULT;
8b0cedff 3411 r = __copy_to_user((void __user *)addr + offset, data, len);
e0506bcb 3412 if (r)
195aefde 3413 return -EFAULT;
28bd726a 3414 mark_page_dirty_in_slot(kvm, memslot, gfn);
195aefde
IE
3415 return 0;
3416}
8e73485c
PB
3417
3418int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn,
3419 const void *data, int offset, int len)
3420{
3421 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
3422
28bd726a 3423 return __kvm_write_guest_page(kvm, slot, gfn, data, offset, len);
8e73485c 3424}
195aefde
IE
3425EXPORT_SYMBOL_GPL(kvm_write_guest_page);
3426
8e73485c
PB
3427int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
3428 const void *data, int offset, int len)
3429{
3430 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
3431
28bd726a 3432 return __kvm_write_guest_page(vcpu->kvm, slot, gfn, data, offset, len);
8e73485c
PB
3433}
3434EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_page);
3435
195aefde
IE
3436int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
3437 unsigned long len)
3438{
3439 gfn_t gfn = gpa >> PAGE_SHIFT;
3440 int seg;
3441 int offset = offset_in_page(gpa);
3442 int ret;
3443
3444 while ((seg = next_segment(len, offset)) != 0) {
3445 ret = kvm_write_guest_page(kvm, gfn, data, offset, seg);
3446 if (ret < 0)
3447 return ret;
3448 offset = 0;
3449 len -= seg;
3450 data += seg;
3451 ++gfn;
3452 }
3453 return 0;
3454}
ff651cb6 3455EXPORT_SYMBOL_GPL(kvm_write_guest);
195aefde 3456
8e73485c
PB
3457int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data,
3458 unsigned long len)
3459{
3460 gfn_t gfn = gpa >> PAGE_SHIFT;
3461 int seg;
3462 int offset = offset_in_page(gpa);
3463 int ret;
3464
3465 while ((seg = next_segment(len, offset)) != 0) {
3466 ret = kvm_vcpu_write_guest_page(vcpu, gfn, data, offset, seg);
3467 if (ret < 0)
3468 return ret;
3469 offset = 0;
3470 len -= seg;
3471 data += seg;
3472 ++gfn;
3473 }
3474 return 0;
3475}
3476EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest);
3477
5a2d4365
PB
3478static int __kvm_gfn_to_hva_cache_init(struct kvm_memslots *slots,
3479 struct gfn_to_hva_cache *ghc,
3480 gpa_t gpa, unsigned long len)
49c7754c 3481{
49c7754c 3482 int offset = offset_in_page(gpa);
8f964525
AH
3483 gfn_t start_gfn = gpa >> PAGE_SHIFT;
3484 gfn_t end_gfn = (gpa + len - 1) >> PAGE_SHIFT;
3485 gfn_t nr_pages_needed = end_gfn - start_gfn + 1;
3486 gfn_t nr_pages_avail;
49c7754c 3487
6ad1e29f 3488 /* Update ghc->generation before performing any error checks. */
49c7754c 3489 ghc->generation = slots->generation;
6ad1e29f
SC
3490
3491 if (start_gfn > end_gfn) {
3492 ghc->hva = KVM_HVA_ERR_BAD;
3493 return -EINVAL;
3494 }
f1b9dd5e
JM
3495
3496 /*
3497 * If the requested region crosses two memslots, we still
3498 * verify that the entire region is valid here.
3499 */
6ad1e29f 3500 for ( ; start_gfn <= end_gfn; start_gfn += nr_pages_avail) {
f1b9dd5e
JM
3501 ghc->memslot = __gfn_to_memslot(slots, start_gfn);
3502 ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn,
3503 &nr_pages_avail);
3504 if (kvm_is_error_hva(ghc->hva))
6ad1e29f 3505 return -EFAULT;
f1b9dd5e
JM
3506 }
3507
3508 /* Use the slow path for cross page reads and writes. */
6ad1e29f 3509 if (nr_pages_needed == 1)
49c7754c 3510 ghc->hva += offset;
f1b9dd5e 3511 else
8f964525 3512 ghc->memslot = NULL;
f1b9dd5e 3513
6ad1e29f
SC
3514 ghc->gpa = gpa;
3515 ghc->len = len;
3516 return 0;
49c7754c 3517}
5a2d4365 3518
4e335d9e 3519int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
5a2d4365
PB
3520 gpa_t gpa, unsigned long len)
3521{
4e335d9e 3522 struct kvm_memslots *slots = kvm_memslots(kvm);
5a2d4365
PB
3523 return __kvm_gfn_to_hva_cache_init(slots, ghc, gpa, len);
3524}
4e335d9e 3525EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init);
49c7754c 3526
4e335d9e 3527int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
7a86dab8
JM
3528 void *data, unsigned int offset,
3529 unsigned long len)
49c7754c 3530{
4e335d9e 3531 struct kvm_memslots *slots = kvm_memslots(kvm);
49c7754c 3532 int r;
4ec6e863 3533 gpa_t gpa = ghc->gpa + offset;
49c7754c 3534
5f25e71e
PB
3535 if (WARN_ON_ONCE(len + offset > ghc->len))
3536 return -EINVAL;
8f964525 3537
dc9ce71e
SC
3538 if (slots->generation != ghc->generation) {
3539 if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len))
3540 return -EFAULT;
3541 }
8f964525 3542
49c7754c
GN
3543 if (kvm_is_error_hva(ghc->hva))
3544 return -EFAULT;
3545
fcfbc617
SC
3546 if (unlikely(!ghc->memslot))
3547 return kvm_write_guest(kvm, gpa, data, len);
3548
4ec6e863 3549 r = __copy_to_user((void __user *)ghc->hva + offset, data, len);
49c7754c
GN
3550 if (r)
3551 return -EFAULT;
28bd726a 3552 mark_page_dirty_in_slot(kvm, ghc->memslot, gpa >> PAGE_SHIFT);
49c7754c
GN
3553
3554 return 0;
3555}
4e335d9e 3556EXPORT_SYMBOL_GPL(kvm_write_guest_offset_cached);
4ec6e863 3557
4e335d9e
PB
3558int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
3559 void *data, unsigned long len)
4ec6e863 3560{
4e335d9e 3561 return kvm_write_guest_offset_cached(kvm, ghc, data, 0, len);
4ec6e863 3562}
4e335d9e 3563EXPORT_SYMBOL_GPL(kvm_write_guest_cached);
49c7754c 3564
0958f0ce
VK
3565int kvm_read_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
3566 void *data, unsigned int offset,
3567 unsigned long len)
e03b644f 3568{
4e335d9e 3569 struct kvm_memslots *slots = kvm_memslots(kvm);
e03b644f 3570 int r;
0958f0ce 3571 gpa_t gpa = ghc->gpa + offset;
e03b644f 3572
5f25e71e
PB
3573 if (WARN_ON_ONCE(len + offset > ghc->len))
3574 return -EINVAL;
8f964525 3575
dc9ce71e
SC
3576 if (slots->generation != ghc->generation) {
3577 if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len))
3578 return -EFAULT;
3579 }
8f964525 3580
e03b644f
GN
3581 if (kvm_is_error_hva(ghc->hva))
3582 return -EFAULT;
3583
fcfbc617 3584 if (unlikely(!ghc->memslot))
0958f0ce 3585 return kvm_read_guest(kvm, gpa, data, len);
fcfbc617 3586
0958f0ce 3587 r = __copy_from_user(data, (void __user *)ghc->hva + offset, len);
e03b644f
GN
3588 if (r)
3589 return -EFAULT;
3590
3591 return 0;
3592}
0958f0ce
VK
3593EXPORT_SYMBOL_GPL(kvm_read_guest_offset_cached);
3594
3595int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
3596 void *data, unsigned long len)
3597{
3598 return kvm_read_guest_offset_cached(kvm, ghc, data, 0, len);
3599}
4e335d9e 3600EXPORT_SYMBOL_GPL(kvm_read_guest_cached);
e03b644f 3601
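/*
 * Illustrative sketch, not part of the upstream file: the gfn_to_hva_cache
 * helpers above are intended for guest addresses that are accessed
 * repeatedly (e.g. a shared-info style page). The cache is initialized once
 * and then reused; the cached read/write helpers revalidate it automatically
 * when the memslot generation changes. The structure and gpa are
 * hypothetical.
 */
struct example_shared {
	u64 seqno;
	u64 payload;
};

static int __maybe_unused example_register_shared(struct kvm *kvm,
						  struct gfn_to_hva_cache *ghc,
						  gpa_t gpa)
{
	/* One-time setup, typically when the guest registers @gpa. */
	return kvm_gfn_to_hva_cache_init(kvm, ghc, gpa,
					 sizeof(struct example_shared));
}

static int __maybe_unused example_publish(struct kvm *kvm,
					  struct gfn_to_hva_cache *ghc, u64 payload)
{
	struct example_shared s = { .seqno = 1, .payload = payload };

	/* Fast path on every subsequent update. */
	return kvm_write_guest_cached(kvm, ghc, &s, sizeof(s));
}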
195aefde
IE
3602int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
3603{
2f541442 3604 const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
195aefde
IE
3605 gfn_t gfn = gpa >> PAGE_SHIFT;
3606 int seg;
3607 int offset = offset_in_page(gpa);
3608 int ret;
3609
bfda0e84 3610 while ((seg = next_segment(len, offset)) != 0) {
2f541442 3611 ret = kvm_write_guest_page(kvm, gfn, zero_page, offset, seg);
195aefde
IE
3612 if (ret < 0)
3613 return ret;
3614 offset = 0;
3615 len -= seg;
3616 ++gfn;
3617 }
3618 return 0;
3619}
3620EXPORT_SYMBOL_GPL(kvm_clear_guest);
3621
28bd726a 3622void mark_page_dirty_in_slot(struct kvm *kvm,
8283e36a 3623 const struct kvm_memory_slot *memslot,
28bd726a 3624 gfn_t gfn)
6aa8b732 3625{
2efd61a6
DW
3626 struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
3627
e09fccb5 3628#ifdef CONFIG_HAVE_KVM_DIRTY_RING
86bdf3eb 3629 if (WARN_ON_ONCE(vcpu && vcpu->kvm != kvm))
2efd61a6 3630 return;
86bdf3eb 3631
c57351a7 3632 WARN_ON_ONCE(!vcpu && !kvm_arch_allow_write_without_running_vcpu(kvm));
e09fccb5 3633#endif
2efd61a6 3634
044c59c4 3635 if (memslot && kvm_slot_dirty_track_enabled(memslot)) {
7e9d619d 3636 unsigned long rel_gfn = gfn - memslot->base_gfn;
fb04a1ed 3637 u32 slot = (memslot->as_id << 16) | memslot->id;
6aa8b732 3638
86bdf3eb 3639 if (kvm->dirty_ring_size && vcpu)
cf87ac73 3640 kvm_dirty_ring_push(vcpu, slot, rel_gfn);
c57351a7 3641 else if (memslot->dirty_bitmap)
fb04a1ed 3642 set_bit_le(rel_gfn, memslot->dirty_bitmap);
6aa8b732
AK
3643 }
3644}
a6a0b05d 3645EXPORT_SYMBOL_GPL(mark_page_dirty_in_slot);
6aa8b732 3646
49c7754c
GN
3647void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
3648{
3649 struct kvm_memory_slot *memslot;
3650
3651 memslot = gfn_to_memslot(kvm, gfn);
28bd726a 3652 mark_page_dirty_in_slot(kvm, memslot, gfn);
49c7754c 3653}
2ba9f0d8 3654EXPORT_SYMBOL_GPL(mark_page_dirty);
49c7754c 3655
8e73485c
PB
3656void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn)
3657{
3658 struct kvm_memory_slot *memslot;
3659
3660 memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
28bd726a 3661 mark_page_dirty_in_slot(vcpu->kvm, memslot, gfn);
8e73485c
PB
3662}
3663EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty);
3664
20b7035c
JS
3665void kvm_sigset_activate(struct kvm_vcpu *vcpu)
3666{
3667 if (!vcpu->sigset_active)
3668 return;
3669
3670 /*
3671 * This does a lockless modification of ->real_blocked, which is fine
3672 * because only current can change ->real_blocked and all readers of
3673 * ->real_blocked don't care as long as ->real_blocked is always a subset
3674 * of ->blocked.
3675 */
3676 sigprocmask(SIG_SETMASK, &vcpu->sigset, &current->real_blocked);
3677}
3678
3679void kvm_sigset_deactivate(struct kvm_vcpu *vcpu)
3680{
3681 if (!vcpu->sigset_active)
3682 return;
3683
3684 sigprocmask(SIG_SETMASK, &current->real_blocked, NULL);
3685 sigemptyset(&current->real_blocked);
3686}
3687
aca6ff29
WL
3688static void grow_halt_poll_ns(struct kvm_vcpu *vcpu)
3689{
dee339b5 3690 unsigned int old, val, grow, grow_start;
aca6ff29 3691
2cbd7824 3692 old = val = vcpu->halt_poll_ns;
dee339b5 3693 grow_start = READ_ONCE(halt_poll_ns_grow_start);
6b6de68c 3694 grow = READ_ONCE(halt_poll_ns_grow);
7fa08e71
NW
3695 if (!grow)
3696 goto out;
3697
dee339b5
NW
3698 val *= grow;
3699 if (val < grow_start)
3700 val = grow_start;
aca6ff29
WL
3701
3702 vcpu->halt_poll_ns = val;
7fa08e71 3703out:
2cbd7824 3704 trace_kvm_halt_poll_ns_grow(vcpu->vcpu_id, val, old);
aca6ff29
WL
3705}
3706
3707static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu)
3708{
ae232ea4 3709 unsigned int old, val, shrink, grow_start;
aca6ff29 3710
2cbd7824 3711 old = val = vcpu->halt_poll_ns;
6b6de68c 3712 shrink = READ_ONCE(halt_poll_ns_shrink);
ae232ea4 3713 grow_start = READ_ONCE(halt_poll_ns_grow_start);
6b6de68c 3714 if (shrink == 0)
aca6ff29
WL
3715 val = 0;
3716 else
6b6de68c 3717 val /= shrink;
aca6ff29 3718
ae232ea4
SS
3719 if (val < grow_start)
3720 val = 0;
3721
aca6ff29 3722 vcpu->halt_poll_ns = val;
2cbd7824 3723 trace_kvm_halt_poll_ns_shrink(vcpu->vcpu_id, val, old);
aca6ff29
WL
3724}
3725
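/*
 * Worked example, not part of the upstream file, using the default module
 * parameters (halt_poll_ns_grow = 2, halt_poll_ns_grow_start = 10000,
 * halt_poll_ns_shrink = 0): a vCPU that keeps waking up quickly grows its
 * window 0 -> 10000 -> 20000 -> 40000 ... ns, clamped by the VM's maximum
 * halt polling time, while a single long block resets it straight to 0,
 * because a shrink divisor of 0 means "drop to zero" rather than "divide".
 */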
f7819512
PB
3726static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu)
3727{
50c28f21
JS
3728 int ret = -EINTR;
3729 int idx = srcu_read_lock(&vcpu->kvm->srcu);
3730
c59fb127 3731 if (kvm_arch_vcpu_runnable(vcpu))
50c28f21 3732 goto out;
f7819512 3733 if (kvm_cpu_has_pending_timer(vcpu))
50c28f21 3734 goto out;
f7819512 3735 if (signal_pending(current))
50c28f21 3736 goto out;
084071d5
MT
3737 if (kvm_check_request(KVM_REQ_UNBLOCK, vcpu))
3738 goto out;
f7819512 3739
50c28f21
JS
3740 ret = 0;
3741out:
3742 srcu_read_unlock(&vcpu->kvm->srcu, idx);
3743 return ret;
f7819512
PB
3744}
3745
fac42688
SC
3746/*
3747 * Block the vCPU until the vCPU is runnable, an event arrives, or a signal is
3748 * pending. This is mostly used when halting a vCPU, but may also be used
3749 * directly for other vCPU non-runnable states, e.g. x86's Wait-For-SIPI.
3750 */
3751bool kvm_vcpu_block(struct kvm_vcpu *vcpu)
cb953129 3752{
fac42688
SC
3753 struct rcuwait *wait = kvm_arch_vcpu_get_wait(vcpu);
3754 bool waited = false;
3755
c3858335
JZ
3756 vcpu->stat.generic.blocking = 1;
3757
18869f26 3758 preempt_disable();
fac42688 3759 kvm_arch_vcpu_blocking(vcpu);
fac42688 3760 prepare_to_rcuwait(wait);
18869f26
ML
3761 preempt_enable();
3762
fac42688
SC
3763 for (;;) {
3764 set_current_state(TASK_INTERRUPTIBLE);
3765
3766 if (kvm_vcpu_check_block(vcpu) < 0)
3767 break;
3768
3769 waited = true;
3770 schedule();
3771 }
fac42688 3772
18869f26
ML
3773 preempt_disable();
3774 finish_rcuwait(wait);
fac42688 3775 kvm_arch_vcpu_unblocking(vcpu);
18869f26 3776 preempt_enable();
fac42688 3777
c3858335
JZ
3778 vcpu->stat.generic.blocking = 0;
3779
fac42688
SC
3780 return waited;
3781}
3782
29e72893
SC
3783static inline void update_halt_poll_stats(struct kvm_vcpu *vcpu, ktime_t start,
3784 ktime_t end, bool success)
cb953129 3785{
30c94347 3786 struct kvm_vcpu_stat_generic *stats = &vcpu->stat.generic;
29e72893
SC
3787 u64 poll_ns = ktime_to_ns(ktime_sub(end, start));
3788
30c94347
SC
3789 ++vcpu->stat.generic.halt_attempted_poll;
3790
3791 if (success) {
3792 ++vcpu->stat.generic.halt_successful_poll;
3793
3794 if (!vcpu_valid_wakeup(vcpu))
3795 ++vcpu->stat.generic.halt_poll_invalid;
3796
3797 stats->halt_poll_success_ns += poll_ns;
3798 KVM_STATS_LOG_HIST_UPDATE(stats->halt_poll_success_hist, poll_ns);
3799 } else {
3800 stats->halt_poll_fail_ns += poll_ns;
3801 KVM_STATS_LOG_HIST_UPDATE(stats->halt_poll_fail_hist, poll_ns);
3802 }
cb953129
DM
3803}
3804
175d5dc7
DM
3805static unsigned int kvm_vcpu_max_halt_poll_ns(struct kvm_vcpu *vcpu)
3806{
9eb8ca04
DM
3807 struct kvm *kvm = vcpu->kvm;
3808
3809 if (kvm->override_halt_poll_ns) {
3810 /*
3811 * Ensure kvm->max_halt_poll_ns is not read before
3812 * kvm->override_halt_poll_ns.
3813 *
3814 * Pairs with the smp_wmb() when enabling KVM_CAP_HALT_POLL.
3815 */
3816 smp_rmb();
3817 return READ_ONCE(kvm->max_halt_poll_ns);
3818 }
3819
3820 return READ_ONCE(halt_poll_ns);
175d5dc7
DM
3821}
3822
b6958ce4 3823/*
fac42688
SC
3824 * Emulate a vCPU halt condition, e.g. HLT on x86, WFI on arm, etc... If halt
3825 * polling is enabled, busy wait for a short time before blocking to avoid the
3826 * expensive block+unblock sequence if a wake event arrives soon after the vCPU
3827 * is halted.
b6958ce4 3828 */
91b99ea7 3829void kvm_vcpu_halt(struct kvm_vcpu *vcpu)
d3bef15f 3830{
175d5dc7 3831 unsigned int max_halt_poll_ns = kvm_vcpu_max_halt_poll_ns(vcpu);
6f390916 3832 bool halt_poll_allowed = !kvm_arch_no_poll(vcpu);
cb953129 3833 ktime_t start, cur, poll_end;
f7819512 3834 bool waited = false;
97b6847a 3835 bool do_halt_poll;
91b99ea7 3836 u64 halt_ns;
07ab0f8d 3837
175d5dc7
DM
3838 if (vcpu->halt_poll_ns > max_halt_poll_ns)
3839 vcpu->halt_poll_ns = max_halt_poll_ns;
97b6847a
DM
3840
3841 do_halt_poll = halt_poll_allowed && vcpu->halt_poll_ns;
3842
cb953129 3843 start = cur = poll_end = ktime_get();
8df6a61c 3844 if (do_halt_poll) {
109a9826 3845 ktime_t stop = ktime_add_ns(start, vcpu->halt_poll_ns);
f95ef0cd 3846
f7819512 3847 do {
30c94347 3848 if (kvm_vcpu_check_block(vcpu) < 0)
f7819512 3849 goto out;
74775654 3850 cpu_relax();
cb953129 3851 poll_end = cur = ktime_get();
6bd5b743 3852 } while (kvm_vcpu_can_poll(cur, stop));
f7819512 3853 }
e5c239cf 3854
fac42688 3855 waited = kvm_vcpu_block(vcpu);
8ccba534 3856
f7819512 3857 cur = ktime_get();
87bcc5fa
JZ
3858 if (waited) {
3859 vcpu->stat.generic.halt_wait_ns +=
3860 ktime_to_ns(cur) - ktime_to_ns(poll_end);
8ccba534
JZ
3861 KVM_STATS_LOG_HIST_UPDATE(vcpu->stat.generic.halt_wait_hist,
3862 ktime_to_ns(cur) - ktime_to_ns(poll_end));
87bcc5fa 3863 }
f7819512 3864out:
91b99ea7
SC
3865 /* The total time the vCPU was "halted", including polling time. */
3866 halt_ns = ktime_to_ns(cur) - ktime_to_ns(start);
aca6ff29 3867
29e72893
SC
3868 /*
3869 * Note, halt-polling is considered successful so long as the vCPU was
3870 * never actually scheduled out, i.e. even if the wake event arrived
3871 * after the end of the halt-polling loop itself, but before the full wait.
3872 */
8df6a61c 3873 if (do_halt_poll)
29e72893 3874 update_halt_poll_stats(vcpu, start, poll_end, !waited);
cb953129 3875
6f390916 3876 if (halt_poll_allowed) {
175d5dc7
DM
3877 /* Recompute the max halt poll time in case it changed. */
3878 max_halt_poll_ns = kvm_vcpu_max_halt_poll_ns(vcpu);
3879
44551b2f 3880 if (!vcpu_valid_wakeup(vcpu)) {
aca6ff29 3881 shrink_halt_poll_ns(vcpu);
175d5dc7 3882 } else if (max_halt_poll_ns) {
91b99ea7 3883 if (halt_ns <= vcpu->halt_poll_ns)
44551b2f
WL
3884 ;
3885 /* we had a long block, shrink polling */
acd05785 3886 else if (vcpu->halt_poll_ns &&
175d5dc7 3887 halt_ns > max_halt_poll_ns)
44551b2f
WL
3888 shrink_halt_poll_ns(vcpu);
3889 /* we had a short halt and our poll time is too small */
175d5dc7
DM
3890 else if (vcpu->halt_poll_ns < max_halt_poll_ns &&
3891 halt_ns < max_halt_poll_ns)
44551b2f
WL
3892 grow_halt_poll_ns(vcpu);
3893 } else {
3894 vcpu->halt_poll_ns = 0;
3895 }
3896 }
aca6ff29 3897
91b99ea7 3898 trace_kvm_vcpu_wakeup(halt_ns, waited, vcpu_valid_wakeup(vcpu));
b6958ce4 3899}
91b99ea7 3900EXPORT_SYMBOL_GPL(kvm_vcpu_halt);
b6958ce4 3901
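/*
 * Illustrative sketch, not part of the upstream file: an architecture's halt
 * emulation typically does its own bookkeeping and then defers to
 * kvm_vcpu_halt(), so the polling/blocking policy above is shared across
 * architectures. The function name is hypothetical.
 */
static void __maybe_unused example_emulate_halt(struct kvm_vcpu *vcpu)
{
	/*
	 * Returns once the vCPU is runnable again, a timer or other wake
	 * event is pending, or a signal interrupts the wait.
	 */
	kvm_vcpu_halt(vcpu);
}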
178f02ff 3902bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu)
b6d33834 3903{
d92a5d1c 3904 if (__kvm_vcpu_wake_up(vcpu)) {
d73eb57b 3905 WRITE_ONCE(vcpu->ready, true);
0193cc90 3906 ++vcpu->stat.generic.halt_wakeup;
178f02ff 3907 return true;
b6d33834
CD
3908 }
3909
178f02ff 3910 return false;
dd1a4cc1
RK
3911}
3912EXPORT_SYMBOL_GPL(kvm_vcpu_wake_up);
3913
0266c894 3914#ifndef CONFIG_S390
dd1a4cc1
RK
3915/*
3916 * Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode.
3917 */
3918void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
3919{
85b64045 3920 int me, cpu;
dd1a4cc1 3921
178f02ff
RK
3922 if (kvm_vcpu_wake_up(vcpu))
3923 return;
3924
aefdc2ed
PB
3925 me = get_cpu();
3926 /*
3927 * The only state change done outside the vcpu mutex is IN_GUEST_MODE
3928 * to EXITING_GUEST_MODE. Therefore the moderately expensive "should
3929 * kick" check does not need atomic operations if kvm_vcpu_kick is used
3930 * within the vCPU thread itself.
3931 */
3932 if (vcpu == __this_cpu_read(kvm_running_vcpu)) {
3933 if (vcpu->mode == IN_GUEST_MODE)
3934 WRITE_ONCE(vcpu->mode, EXITING_GUEST_MODE);
3935 goto out;
3936 }
3937
85b64045
SC
3938 /*
3939 * Note, the vCPU could get migrated to a different pCPU at any point
3940 * after kvm_arch_vcpu_should_kick(), which could result in sending an
3941 * IPI to the previous pCPU. But, that's ok because the purpose of the
3942 * IPI is to force the vCPU to leave IN_GUEST_MODE, and migrating the
3943 * vCPU also requires it to leave IN_GUEST_MODE.
3944 */
85b64045
SC
3945 if (kvm_arch_vcpu_should_kick(vcpu)) {
3946 cpu = READ_ONCE(vcpu->cpu);
3947 if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
b6d33834 3948 smp_send_reschedule(cpu);
85b64045 3949 }
aefdc2ed 3950out:
b6d33834
CD
3951 put_cpu();
3952}
a20ed54d 3953EXPORT_SYMBOL_GPL(kvm_vcpu_kick);
0266c894 3954#endif /* !CONFIG_S390 */
b6d33834 3955
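/*
 * Illustrative sketch, not part of the upstream file: the usual pattern for
 * delivering an event to a vCPU is to record it somewhere the vCPU will
 * notice (here, the generic KVM_REQ_UNBLOCK request checked in
 * kvm_vcpu_check_block()) and then kick, so that a blocked vCPU is woken and
 * a running vCPU is forced out of guest mode to re-evaluate pending work.
 */
static void __maybe_unused example_deliver_event(struct kvm_vcpu *vcpu)
{
	kvm_make_request(KVM_REQ_UNBLOCK, vcpu);
	kvm_vcpu_kick(vcpu);
}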
fa93384f 3956int kvm_vcpu_yield_to(struct kvm_vcpu *target)
41628d33
KW
3957{
3958 struct pid *pid;
3959 struct task_struct *task = NULL;
fa93384f 3960 int ret = 0;
41628d33
KW
3961
3962 rcu_read_lock();
3963 pid = rcu_dereference(target->pid);
3964 if (pid)
27fbe64b 3965 task = get_pid_task(pid, PIDTYPE_PID);
41628d33
KW
3966 rcu_read_unlock();
3967 if (!task)
c45c528e 3968 return ret;
c45c528e 3969 ret = yield_to(task, 1);
41628d33 3970 put_task_struct(task);
c45c528e
R
3971
3972 return ret;
41628d33
KW
3973}
3974EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to);
3975
06e48c51
R
3976/*
3977 * Helper that checks whether a VCPU is eligible for directed yield.
3978 * The most eligible candidate to yield to is chosen by the following heuristics:
3979 *
3980 * (a) VCPU which has not done a pl-exit or cpu-relax intercept recently
3981 * (a preempted lock holder), indicated by @in_spin_loop.
656012c7 3982 * Set at the beginning and cleared at the end of the interception/PLE handler.
06e48c51
R
3983 *
3984 * (b) VCPU which has done a pl-exit/cpu-relax intercept but did not get a
3985 * chance last time (it has most likely become eligible now, since we probably
3986 * yielded to the lock holder in the last iteration). This is done by toggling
3987 * @dy_eligible each time a VCPU is checked for eligibility.
3988 *
3989 * Yielding to a recently pl-exited/cpu relax intercepted VCPU before yielding
3990 * to preempted lock-holder could result in wrong VCPU selection and CPU
3991 * burning. Giving priority for a potential lock-holder increases lock
3992 * progress.
3993 *
3994 * Since the algorithm is based on heuristics, accessing another VCPU's data
3995 * without locking does no harm. It may result in trying to yield to the same
3996 * VCPU, failing, and continuing with the next VCPU, and so on.
3997 */
7940876e 3998static bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
06e48c51 3999{
4a55dd72 4000#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
06e48c51
R
4001 bool eligible;
4002
4003 eligible = !vcpu->spin_loop.in_spin_loop ||
34656113 4004 vcpu->spin_loop.dy_eligible;
06e48c51
R
4005
4006 if (vcpu->spin_loop.in_spin_loop)
4007 kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible);
4008
4009 return eligible;
4a55dd72
SW
4010#else
4011 return true;
06e48c51 4012#endif
4a55dd72 4013}
c45c528e 4014
17e433b5
WL
4015/*
4016 * Unlike kvm_arch_vcpu_runnable, this function is called outside
4017 * a vcpu_load/vcpu_put pair. However, for most architectures
4018 * kvm_arch_vcpu_runnable does not require vcpu_load.
4019 */
4020bool __weak kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
4021{
4022 return kvm_arch_vcpu_runnable(vcpu);
4023}
4024
4025static bool vcpu_dy_runnable(struct kvm_vcpu *vcpu)
4026{
4027 if (kvm_arch_dy_runnable(vcpu))
4028 return true;
4029
4030#ifdef CONFIG_KVM_ASYNC_PF
4031 if (!list_empty_careful(&vcpu->async_pf.done))
4032 return true;
4033#endif
4034
4035 return false;
4036}
4037
77bcd9e6
SC
4038/*
4039 * By default, simply query the target vCPU's current mode when checking if a
4040 * vCPU was preempted in kernel mode. All architectures except x86 (or more
4041 * specifically, except VMX) allow querying whether or not a vCPU is in kernel
4042 * mode even if the vCPU is NOT loaded, i.e. using kvm_arch_vcpu_in_kernel()
4043 * directly for cross-vCPU checks is functionally correct and accurate.
4044 */
4045bool __weak kvm_arch_vcpu_preempted_in_kernel(struct kvm_vcpu *vcpu)
4046{
4047 return kvm_arch_vcpu_in_kernel(vcpu);
4048}
4049
52acd22f
WL
4050bool __weak kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu)
4051{
4052 return false;
4053}
4054
199b5763 4055void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)
d255f4f2 4056{
217ece61
RR
4057 struct kvm *kvm = me->kvm;
4058 struct kvm_vcpu *vcpu;
4059 int last_boosted_vcpu = me->kvm->last_boosted_vcpu;
46808a4c 4060 unsigned long i;
217ece61 4061 int yielded = 0;
c45c528e 4062 int try = 3;
217ece61 4063 int pass;
d255f4f2 4064
4c088493 4065 kvm_vcpu_set_in_spin_loop(me, true);
217ece61
RR
4066 /*
4067 * We boost the priority of a VCPU that is runnable but not
4068 * currently running, because it got preempted by something
4069 * else and called schedule in __vcpu_run. Hopefully that
4070 * VCPU is holding the lock that we need and will release it.
4071 * We approximate round-robin by starting at the last boosted VCPU.
4072 */
c45c528e 4073 for (pass = 0; pass < 2 && !yielded && try; pass++) {
217ece61 4074 kvm_for_each_vcpu(i, vcpu, kvm) {
5cfc2aab 4075 if (!pass && i <= last_boosted_vcpu) {
217ece61
RR
4076 i = last_boosted_vcpu;
4077 continue;
4078 } else if (pass && i > last_boosted_vcpu)
4079 break;
d73eb57b 4080 if (!READ_ONCE(vcpu->ready))
7bc7ae25 4081 continue;
217ece61
RR
4082 if (vcpu == me)
4083 continue;
d92a5d1c 4084 if (kvm_vcpu_is_blocking(vcpu) && !vcpu_dy_runnable(vcpu))
217ece61 4085 continue;
dafc17dd
SC
4086
4087 /*
4088 * Treat the target vCPU as being in-kernel if it has a
4089 * pending interrupt, as the vCPU trying to yield may
4090 * be spinning waiting on IPI delivery, i.e. the target
4091 * vCPU is in-kernel for the purposes of directed yield.
4092 */
046ddeed 4093 if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode &&
52acd22f 4094 !kvm_arch_dy_has_pending_interrupt(vcpu) &&
77bcd9e6 4095 !kvm_arch_vcpu_preempted_in_kernel(vcpu))
199b5763 4096 continue;
06e48c51
R
4097 if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
4098 continue;
c45c528e
R
4099
4100 yielded = kvm_vcpu_yield_to(vcpu);
4101 if (yielded > 0) {
217ece61 4102 kvm->last_boosted_vcpu = i;
217ece61 4103 break;
c45c528e
R
4104 } else if (yielded < 0) {
4105 try--;
4106 if (!try)
4107 break;
217ece61 4108 }
217ece61
RR
4109 }
4110 }
4c088493 4111 kvm_vcpu_set_in_spin_loop(me, false);
06e48c51
R
4112
4113 /* Ensure vcpu is not eligible during next spinloop */
4114 kvm_vcpu_set_dy_eligible(me, false);
d255f4f2
ZE
4115}
4116EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);
4117
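/*
 * Illustrative sketch, not part of the upstream file: an architecture's
 * pause-loop/spin-intercept handler simply forwards to kvm_vcpu_on_spin(),
 * passing whether the spinning vCPU was in kernel mode so that the
 * directed-yield heuristics above can prefer in-kernel lock holders.
 */
static void __maybe_unused example_handle_pause_exit(struct kvm_vcpu *vcpu)
{
	kvm_vcpu_on_spin(vcpu, kvm_arch_vcpu_in_kernel(vcpu));
}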
fb04a1ed
PX
4118static bool kvm_page_in_dirty_ring(struct kvm *kvm, unsigned long pgoff)
4119{
dc70ec21 4120#ifdef CONFIG_HAVE_KVM_DIRTY_RING
fb04a1ed
PX
4121 return (pgoff >= KVM_DIRTY_LOG_PAGE_OFFSET) &&
4122 (pgoff < KVM_DIRTY_LOG_PAGE_OFFSET +
4123 kvm->dirty_ring_size / PAGE_SIZE);
4124#else
4125 return false;
4126#endif
4127}
4128
1499fa80 4129static vm_fault_t kvm_vcpu_fault(struct vm_fault *vmf)
9a2bb7f4 4130{
11bac800 4131 struct kvm_vcpu *vcpu = vmf->vma->vm_file->private_data;
9a2bb7f4
AK
4132 struct page *page;
4133
e4a533a4 4134 if (vmf->pgoff == 0)
039576c0 4135 page = virt_to_page(vcpu->run);
09566765 4136#ifdef CONFIG_X86
e4a533a4 4137 else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET)
ad312c7c 4138 page = virt_to_page(vcpu->arch.pio_data);
5f94c174 4139#endif
4b4357e0 4140#ifdef CONFIG_KVM_MMIO
5f94c174
LV
4141 else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET)
4142 page = virt_to_page(vcpu->kvm->coalesced_mmio_ring);
09566765 4143#endif
fb04a1ed
PX
4144 else if (kvm_page_in_dirty_ring(vcpu->kvm, vmf->pgoff))
4145 page = kvm_dirty_ring_get_page(
4146 &vcpu->dirty_ring,
4147 vmf->pgoff - KVM_DIRTY_LOG_PAGE_OFFSET);
039576c0 4148 else
5b1c1493 4149 return kvm_arch_vcpu_fault(vcpu, vmf);
9a2bb7f4 4150 get_page(page);
e4a533a4 4151 vmf->page = page;
4152 return 0;
9a2bb7f4
AK
4153}
4154
f0f37e2f 4155static const struct vm_operations_struct kvm_vcpu_vm_ops = {
e4a533a4 4156 .fault = kvm_vcpu_fault,
9a2bb7f4
AK
4157};
4158
4159static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
4160{
fb04a1ed 4161 struct kvm_vcpu *vcpu = file->private_data;
11476d27 4162 unsigned long pages = vma_pages(vma);
fb04a1ed
PX
4163
4164 if ((kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff) ||
4165 kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff + pages - 1)) &&
4166 ((vma->vm_flags & VM_EXEC) || !(vma->vm_flags & VM_SHARED)))
4167 return -EINVAL;
4168
9a2bb7f4
AK
4169 vma->vm_ops = &kvm_vcpu_vm_ops;
4170 return 0;
4171}
4172
bccf2150
AK
4173static int kvm_vcpu_release(struct inode *inode, struct file *filp)
4174{
4175 struct kvm_vcpu *vcpu = filp->private_data;
4176
66c0b394 4177 kvm_put_kvm(vcpu->kvm);
bccf2150
AK
4178 return 0;
4179}
4180
087e1520 4181static struct file_operations kvm_vcpu_fops = {
bccf2150
AK
4182 .release = kvm_vcpu_release,
4183 .unlocked_ioctl = kvm_vcpu_ioctl,
9a2bb7f4 4184 .mmap = kvm_vcpu_mmap,
6038f373 4185 .llseek = noop_llseek,
7ddfd3e0 4186 KVM_COMPAT(kvm_vcpu_compat_ioctl),
bccf2150
AK
4187};
4188
4189/*
4190 * Allocates an inode for the vcpu.
4191 */
4192static int create_vcpu_fd(struct kvm_vcpu *vcpu)
4193{
e46b4692
MY
4194 char name[8 + 1 + ITOA_MAX_LEN + 1];
4195
4196 snprintf(name, sizeof(name), "kvm-vcpu:%d", vcpu->vcpu_id);
4197 return anon_inode_getfd(name, &kvm_vcpu_fops, vcpu, O_RDWR | O_CLOEXEC);
bccf2150
AK
4198}
4199
e36de87d
VP
4200#ifdef __KVM_HAVE_ARCH_VCPU_DEBUGFS
4201static int vcpu_get_pid(void *data, u64 *val)
4202{
14aa40a1 4203 struct kvm_vcpu *vcpu = data;
76021e96
SC
4204
4205 rcu_read_lock();
4206 *val = pid_nr(rcu_dereference(vcpu->pid));
4207 rcu_read_unlock();
e36de87d
VP
4208 return 0;
4209}
4210
4211DEFINE_SIMPLE_ATTRIBUTE(vcpu_get_pid_fops, vcpu_get_pid, NULL, "%llu\n");
4212
3e7093d0 4213static void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
45b5939e 4214{
d56f5136 4215 struct dentry *debugfs_dentry;
45b5939e 4216 char dir_name[ITOA_MAX_LEN * 2];
45b5939e 4217
45b5939e 4218 if (!debugfs_initialized())
3e7093d0 4219 return;
45b5939e
LC
4220
4221 snprintf(dir_name, sizeof(dir_name), "vcpu%d", vcpu->vcpu_id);
d56f5136
PB
4222 debugfs_dentry = debugfs_create_dir(dir_name,
4223 vcpu->kvm->debugfs_dentry);
e36de87d
VP
4224 debugfs_create_file("pid", 0444, debugfs_dentry, vcpu,
4225 &vcpu_get_pid_fops);
45b5939e 4226
d56f5136 4227 kvm_arch_create_vcpu_debugfs(vcpu, debugfs_dentry);
45b5939e 4228}
e36de87d 4229#endif
45b5939e 4230
c5ea7660
AK
4231/*
4232 * Creates some virtual cpus. Good luck creating more than one.
4233 */
73880c80 4234static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
c5ea7660
AK
4235{
4236 int r;
e09fefde 4237 struct kvm_vcpu *vcpu;
8bd826d6 4238 struct page *page;
c5ea7660 4239
a1c42dde 4240 if (id >= KVM_MAX_VCPU_IDS)
338c7dba
AH
4241 return -EINVAL;
4242
6c7caebc 4243 mutex_lock(&kvm->lock);
f502cc56 4244 if (kvm->created_vcpus >= kvm->max_vcpus) {
6c7caebc
PB
4245 mutex_unlock(&kvm->lock);
4246 return -EINVAL;
4247 }
4248
1d5e740d
ZG
4249 r = kvm_arch_vcpu_precreate(kvm, id);
4250 if (r) {
4251 mutex_unlock(&kvm->lock);
4252 return r;
4253 }
4254
6c7caebc
PB
4255 kvm->created_vcpus++;
4256 mutex_unlock(&kvm->lock);
4257
85f47930 4258 vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT);
e529ef66
SC
4259 if (!vcpu) {
4260 r = -ENOMEM;
6c7caebc
PB
4261 goto vcpu_decrement;
4262 }
c5ea7660 4263
fcd97ad5 4264 BUILD_BUG_ON(sizeof(struct kvm_run) > PAGE_SIZE);
93bb59ca 4265 page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
8bd826d6
SC
4266 if (!page) {
4267 r = -ENOMEM;
e529ef66 4268 goto vcpu_free;
8bd826d6
SC
4269 }
4270 vcpu->run = page_address(page);
4271
4272 kvm_vcpu_init(vcpu, kvm, id);
e529ef66
SC
4273
4274 r = kvm_arch_vcpu_create(vcpu);
4275 if (r)
8bd826d6 4276 goto vcpu_free_run_page;
e529ef66 4277
fb04a1ed
PX
4278 if (kvm->dirty_ring_size) {
4279 r = kvm_dirty_ring_alloc(&vcpu->dirty_ring,
4280 id, kvm->dirty_ring_size);
4281 if (r)
4282 goto arch_vcpu_destroy;
4283 }
4284
11ec2804 4285 mutex_lock(&kvm->lock);
42a90008
DW
4286
4287#ifdef CONFIG_LOCKDEP
4288 /* Ensure that lockdep knows vcpu->mutex is taken *inside* kvm->lock */
4289 mutex_lock(&vcpu->mutex);
4290 mutex_unlock(&vcpu->mutex);
4291#endif
4292
e09fefde
DH
4293 if (kvm_get_vcpu_by_id(kvm, id)) {
4294 r = -EEXIST;
4295 goto unlock_vcpu_destroy;
4296 }
73880c80 4297
8750e72a 4298 vcpu->vcpu_idx = atomic_read(&kvm->online_vcpus);
afb2acb2 4299 r = xa_reserve(&kvm->vcpu_array, vcpu->vcpu_idx, GFP_KERNEL_ACCOUNT);
c5b07754
MZ
4300 if (r)
4301 goto unlock_vcpu_destroy;
c5ea7660 4302
fb3f0f51 4303 /* Now it's all set up, let userspace reach it */
66c0b394 4304 kvm_get_kvm(kvm);
bccf2150 4305 r = create_vcpu_fd(vcpu);
afb2acb2
ML
4306 if (r < 0)
4307 goto kvm_put_xa_release;
4308
5f643e46 4309 if (KVM_BUG_ON(xa_store(&kvm->vcpu_array, vcpu->vcpu_idx, vcpu, 0), kvm)) {
afb2acb2
ML
4310 r = -EINVAL;
4311 goto kvm_put_xa_release;
73880c80
GN
4312 }
4313
dd489240 4314 /*
c5b07754
MZ
4315 * Pairs with smp_rmb() in kvm_get_vcpu. Store the vcpu
4316 * pointer before incrementing kvm->online_vcpus.
dd489240 4317 */
73880c80
GN
4318 smp_wmb();
4319 atomic_inc(&kvm->online_vcpus);
4320
73880c80 4321 mutex_unlock(&kvm->lock);
42897d86 4322 kvm_arch_vcpu_postcreate(vcpu);
63d04348 4323 kvm_create_vcpu_debugfs(vcpu);
fb3f0f51 4324 return r;
39c3b86e 4325
afb2acb2
ML
4326kvm_put_xa_release:
4327 kvm_put_kvm_no_destroy(kvm);
4328 xa_release(&kvm->vcpu_array, vcpu->vcpu_idx);
d780592b 4329unlock_vcpu_destroy:
7d8fece6 4330 mutex_unlock(&kvm->lock);
fb04a1ed
PX
4331 kvm_dirty_ring_free(&vcpu->dirty_ring);
4332arch_vcpu_destroy:
d40ccc62 4333 kvm_arch_vcpu_destroy(vcpu);
8bd826d6
SC
4334vcpu_free_run_page:
4335 free_page((unsigned long)vcpu->run);
e529ef66
SC
4336vcpu_free:
4337 kmem_cache_free(kvm_vcpu_cache, vcpu);
6c7caebc
PB
4338vcpu_decrement:
4339 mutex_lock(&kvm->lock);
4340 kvm->created_vcpus--;
4341 mutex_unlock(&kvm->lock);
c5ea7660
AK
4342 return r;
4343}
4344
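/*
 * Illustrative userspace sketch, not part of the upstream file: the ioctl
 * handled above is reached through a VM fd, and the returned vCPU fd is then
 * mmap()ed to obtain the shared kvm_run structure served by kvm_vcpu_mmap().
 * Error handling is omitted for brevity.
 */
#if 0	/* userspace example, not kernel code */
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/kvm.h>

static struct kvm_run *create_and_map_vcpu(int kvm_fd, int vm_fd, int vcpu_id)
{
	int vcpu_fd = ioctl(vm_fd, KVM_CREATE_VCPU, vcpu_id);
	long run_size = ioctl(kvm_fd, KVM_GET_VCPU_MMAP_SIZE, 0);

	/* Page 0 of the vCPU fd is the kvm_run structure. */
	return mmap(NULL, run_size, PROT_READ | PROT_WRITE, MAP_SHARED,
		    vcpu_fd, 0);
}
#endif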
1961d276
AK
4345static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
4346{
4347 if (sigset) {
4348 sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
4349 vcpu->sigset_active = 1;
4350 vcpu->sigset = *sigset;
4351 } else
4352 vcpu->sigset_active = 0;
4353 return 0;
4354}
4355
ce55c049
JZ
4356static ssize_t kvm_vcpu_stats_read(struct file *file, char __user *user_buffer,
4357 size_t size, loff_t *offset)
4358{
4359 struct kvm_vcpu *vcpu = file->private_data;
4360
4361 return kvm_stats_read(vcpu->stats_id, &kvm_vcpu_stats_header,
4362 &kvm_vcpu_stats_desc[0], &vcpu->stat,
4363 sizeof(vcpu->stat), user_buffer, size, offset);
4364}
4365
eed3013f
SC
4366static int kvm_vcpu_stats_release(struct inode *inode, struct file *file)
4367{
4368 struct kvm_vcpu *vcpu = file->private_data;
4369
4370 kvm_put_kvm(vcpu->kvm);
4371 return 0;
4372}
4373
ce55c049 4374static const struct file_operations kvm_vcpu_stats_fops = {
087e1520 4375 .owner = THIS_MODULE,
ce55c049 4376 .read = kvm_vcpu_stats_read,
eed3013f 4377 .release = kvm_vcpu_stats_release,
ce55c049
JZ
4378 .llseek = noop_llseek,
4379};
4380
4381static int kvm_vcpu_ioctl_get_stats_fd(struct kvm_vcpu *vcpu)
4382{
4383 int fd;
4384 struct file *file;
4385 char name[15 + ITOA_MAX_LEN + 1];
4386
4387 snprintf(name, sizeof(name), "kvm-vcpu-stats:%d", vcpu->vcpu_id);
4388
4389 fd = get_unused_fd_flags(O_CLOEXEC);
4390 if (fd < 0)
4391 return fd;
4392
4393 file = anon_inode_getfile(name, &kvm_vcpu_stats_fops, vcpu, O_RDONLY);
4394 if (IS_ERR(file)) {
4395 put_unused_fd(fd);
4396 return PTR_ERR(file);
4397 }
eed3013f
SC
4398
4399 kvm_get_kvm(vcpu->kvm);
4400
ce55c049
JZ
4401 file->f_mode |= FMODE_PREAD;
4402 fd_install(fd, file);
4403
4404 return fd;
4405}
4406
bccf2150
AK
4407static long kvm_vcpu_ioctl(struct file *filp,
4408 unsigned int ioctl, unsigned long arg)
6aa8b732 4409{
bccf2150 4410 struct kvm_vcpu *vcpu = filp->private_data;
2f366987 4411 void __user *argp = (void __user *)arg;
313a3dc7 4412 int r;
fa3795a7
DH
4413 struct kvm_fpu *fpu = NULL;
4414 struct kvm_sregs *kvm_sregs = NULL;
6aa8b732 4415
f4d31653 4416 if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_dead)
6d4e4c4f 4417 return -EIO;
2122ff5e 4418
2ea75be3
DM
4419 if (unlikely(_IOC_TYPE(ioctl) != KVMIO))
4420 return -EINVAL;
4421
2122ff5e 4422 /*
5cb0944c
PB
4423 * Some architectures have vcpu ioctls that are asynchronous to vcpu
4424 * execution; mutex_lock() would break them.
2122ff5e 4425 */
5cb0944c
PB
4426 r = kvm_arch_vcpu_async_ioctl(filp, ioctl, arg);
4427 if (r != -ENOIOCTLCMD)
9fc77441 4428 return r;
2122ff5e 4429
ec7660cc
CD
4430 if (mutex_lock_killable(&vcpu->mutex))
4431 return -EINTR;
6aa8b732 4432 switch (ioctl) {
0e4524a5
CB
4433 case KVM_RUN: {
4434 struct pid *oldpid;
f0fe5108
AK
4435 r = -EINVAL;
4436 if (arg)
4437 goto out;
0e4524a5 4438 oldpid = rcu_access_pointer(vcpu->pid);
71dbc8a9 4439 if (unlikely(oldpid != task_pid(current))) {
7a72f7a1 4440 /* The thread running this VCPU changed. */
bd2a6394 4441 struct pid *newpid;
f95ef0cd 4442
bd2a6394
CD
4443 r = kvm_arch_vcpu_run_pid_change(vcpu);
4444 if (r)
4445 break;
4446
4447 newpid = get_task_pid(current, PIDTYPE_PID);
7a72f7a1
CB
4448 rcu_assign_pointer(vcpu->pid, newpid);
4449 if (oldpid)
4450 synchronize_rcu();
4451 put_pid(oldpid);
4452 }
1b94f6f8 4453 r = kvm_arch_vcpu_ioctl_run(vcpu);
64be5007 4454 trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
6aa8b732 4455 break;
0e4524a5 4456 }
6aa8b732 4457 case KVM_GET_REGS: {
3e4bb3ac 4458 struct kvm_regs *kvm_regs;
6aa8b732 4459
3e4bb3ac 4460 r = -ENOMEM;
b12ce36a 4461 kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL_ACCOUNT);
3e4bb3ac 4462 if (!kvm_regs)
6aa8b732 4463 goto out;
3e4bb3ac
XZ
4464 r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs);
4465 if (r)
4466 goto out_free1;
6aa8b732 4467 r = -EFAULT;
3e4bb3ac
XZ
4468 if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs)))
4469 goto out_free1;
6aa8b732 4470 r = 0;
3e4bb3ac
XZ
4471out_free1:
4472 kfree(kvm_regs);
6aa8b732
AK
4473 break;
4474 }
4475 case KVM_SET_REGS: {
3e4bb3ac 4476 struct kvm_regs *kvm_regs;
6aa8b732 4477
ff5c2c03
SL
4478 kvm_regs = memdup_user(argp, sizeof(*kvm_regs));
4479 if (IS_ERR(kvm_regs)) {
4480 r = PTR_ERR(kvm_regs);
6aa8b732 4481 goto out;
ff5c2c03 4482 }
3e4bb3ac 4483 r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs);
3e4bb3ac 4484 kfree(kvm_regs);
6aa8b732
AK
4485 break;
4486 }
4487 case KVM_GET_SREGS: {
b12ce36a
BG
4488 kvm_sregs = kzalloc(sizeof(struct kvm_sregs),
4489 GFP_KERNEL_ACCOUNT);
fa3795a7
DH
4490 r = -ENOMEM;
4491 if (!kvm_sregs)
4492 goto out;
4493 r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs);
6aa8b732
AK
4494 if (r)
4495 goto out;
4496 r = -EFAULT;
fa3795a7 4497 if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs)))
6aa8b732
AK
4498 goto out;
4499 r = 0;
4500 break;
4501 }
4502 case KVM_SET_SREGS: {
ff5c2c03
SL
4503 kvm_sregs = memdup_user(argp, sizeof(*kvm_sregs));
4504 if (IS_ERR(kvm_sregs)) {
4505 r = PTR_ERR(kvm_sregs);
18595411 4506 kvm_sregs = NULL;
6aa8b732 4507 goto out;
ff5c2c03 4508 }
fa3795a7 4509 r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs);
6aa8b732
AK
4510 break;
4511 }
62d9f0db
MT
4512 case KVM_GET_MP_STATE: {
4513 struct kvm_mp_state mp_state;
4514
4515 r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state);
4516 if (r)
4517 goto out;
4518 r = -EFAULT;
893bdbf1 4519 if (copy_to_user(argp, &mp_state, sizeof(mp_state)))
62d9f0db
MT
4520 goto out;
4521 r = 0;
4522 break;
4523 }
4524 case KVM_SET_MP_STATE: {
4525 struct kvm_mp_state mp_state;
4526
4527 r = -EFAULT;
893bdbf1 4528 if (copy_from_user(&mp_state, argp, sizeof(mp_state)))
62d9f0db
MT
4529 goto out;
4530 r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state);
62d9f0db
MT
4531 break;
4532 }
6aa8b732
AK
4533 case KVM_TRANSLATE: {
4534 struct kvm_translation tr;
4535
4536 r = -EFAULT;
893bdbf1 4537 if (copy_from_user(&tr, argp, sizeof(tr)))
6aa8b732 4538 goto out;
8b006791 4539 r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
6aa8b732
AK
4540 if (r)
4541 goto out;
4542 r = -EFAULT;
893bdbf1 4543 if (copy_to_user(argp, &tr, sizeof(tr)))
6aa8b732
AK
4544 goto out;
4545 r = 0;
4546 break;
4547 }
d0bfb940
JK
4548 case KVM_SET_GUEST_DEBUG: {
4549 struct kvm_guest_debug dbg;
6aa8b732
AK
4550
4551 r = -EFAULT;
893bdbf1 4552 if (copy_from_user(&dbg, argp, sizeof(dbg)))
6aa8b732 4553 goto out;
d0bfb940 4554 r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg);
6aa8b732
AK
4555 break;
4556 }
1961d276
AK
4557 case KVM_SET_SIGNAL_MASK: {
4558 struct kvm_signal_mask __user *sigmask_arg = argp;
4559 struct kvm_signal_mask kvm_sigmask;
4560 sigset_t sigset, *p;
4561
4562 p = NULL;
4563 if (argp) {
4564 r = -EFAULT;
4565 if (copy_from_user(&kvm_sigmask, argp,
893bdbf1 4566 sizeof(kvm_sigmask)))
1961d276
AK
4567 goto out;
4568 r = -EINVAL;
893bdbf1 4569 if (kvm_sigmask.len != sizeof(sigset))
1961d276
AK
4570 goto out;
4571 r = -EFAULT;
4572 if (copy_from_user(&sigset, sigmask_arg->sigset,
893bdbf1 4573 sizeof(sigset)))
1961d276
AK
4574 goto out;
4575 p = &sigset;
4576 }
376d41ff 4577 r = kvm_vcpu_ioctl_set_sigmask(vcpu, p);
1961d276
AK
4578 break;
4579 }
b8836737 4580 case KVM_GET_FPU: {
b12ce36a 4581 fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL_ACCOUNT);
fa3795a7
DH
4582 r = -ENOMEM;
4583 if (!fpu)
4584 goto out;
4585 r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu);
b8836737
AK
4586 if (r)
4587 goto out;
4588 r = -EFAULT;
fa3795a7 4589 if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu)))
b8836737
AK
4590 goto out;
4591 r = 0;
4592 break;
4593 }
4594 case KVM_SET_FPU: {
ff5c2c03
SL
4595 fpu = memdup_user(argp, sizeof(*fpu));
4596 if (IS_ERR(fpu)) {
4597 r = PTR_ERR(fpu);
18595411 4598 fpu = NULL;
b8836737 4599 goto out;
ff5c2c03 4600 }
fa3795a7 4601 r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu);
b8836737
AK
4602 break;
4603 }
ce55c049
JZ
4604 case KVM_GET_STATS_FD: {
4605 r = kvm_vcpu_ioctl_get_stats_fd(vcpu);
4606 break;
4607 }
bccf2150 4608 default:
313a3dc7 4609 r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
bccf2150
AK
4610 }
4611out:
ec7660cc 4612 mutex_unlock(&vcpu->mutex);
fa3795a7
DH
4613 kfree(fpu);
4614 kfree(kvm_sregs);
bccf2150
AK
4615 return r;
4616}
4617
de8e5d74 4618#ifdef CONFIG_KVM_COMPAT
1dda606c
AG
4619static long kvm_vcpu_compat_ioctl(struct file *filp,
4620 unsigned int ioctl, unsigned long arg)
4621{
4622 struct kvm_vcpu *vcpu = filp->private_data;
4623 void __user *argp = compat_ptr(arg);
4624 int r;
4625
f4d31653 4626 if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_dead)
1dda606c
AG
4627 return -EIO;
4628
4629 switch (ioctl) {
4630 case KVM_SET_SIGNAL_MASK: {
4631 struct kvm_signal_mask __user *sigmask_arg = argp;
4632 struct kvm_signal_mask kvm_sigmask;
1dda606c
AG
4633 sigset_t sigset;
4634
4635 if (argp) {
4636 r = -EFAULT;
4637 if (copy_from_user(&kvm_sigmask, argp,
893bdbf1 4638 sizeof(kvm_sigmask)))
1dda606c
AG
4639 goto out;
4640 r = -EINVAL;
3968cf62 4641 if (kvm_sigmask.len != sizeof(compat_sigset_t))
1dda606c
AG
4642 goto out;
4643 r = -EFAULT;
1393b4aa
PB
4644 if (get_compat_sigset(&sigset,
4645 (compat_sigset_t __user *)sigmask_arg->sigset))
1dda606c 4646 goto out;
760a9a30
AC
4647 r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
4648 } else
4649 r = kvm_vcpu_ioctl_set_sigmask(vcpu, NULL);
1dda606c
AG
4650 break;
4651 }
4652 default:
4653 r = kvm_vcpu_ioctl(filp, ioctl, arg);
4654 }
4655
4656out:
4657 return r;
4658}
4659#endif
4660
a1cd3f08
CLG
4661static int kvm_device_mmap(struct file *filp, struct vm_area_struct *vma)
4662{
4663 struct kvm_device *dev = filp->private_data;
4664
4665 if (dev->ops->mmap)
4666 return dev->ops->mmap(dev, vma);
4667
4668 return -ENODEV;
4669}
4670
852b6d57
SW
4671static int kvm_device_ioctl_attr(struct kvm_device *dev,
4672 int (*accessor)(struct kvm_device *dev,
4673 struct kvm_device_attr *attr),
4674 unsigned long arg)
4675{
4676 struct kvm_device_attr attr;
4677
4678 if (!accessor)
4679 return -EPERM;
4680
4681 if (copy_from_user(&attr, (void __user *)arg, sizeof(attr)))
4682 return -EFAULT;
4683
4684 return accessor(dev, &attr);
4685}
4686
4687static long kvm_device_ioctl(struct file *filp, unsigned int ioctl,
4688 unsigned long arg)
4689{
4690 struct kvm_device *dev = filp->private_data;
4691
f4d31653 4692 if (dev->kvm->mm != current->mm || dev->kvm->vm_dead)
ddba9180
SC
4693 return -EIO;
4694
852b6d57
SW
4695 switch (ioctl) {
4696 case KVM_SET_DEVICE_ATTR:
4697 return kvm_device_ioctl_attr(dev, dev->ops->set_attr, arg);
4698 case KVM_GET_DEVICE_ATTR:
4699 return kvm_device_ioctl_attr(dev, dev->ops->get_attr, arg);
4700 case KVM_HAS_DEVICE_ATTR:
4701 return kvm_device_ioctl_attr(dev, dev->ops->has_attr, arg);
4702 default:
4703 if (dev->ops->ioctl)
4704 return dev->ops->ioctl(dev, ioctl, arg);
4705
4706 return -ENOTTY;
4707 }
4708}
4709
852b6d57
SW
4710static int kvm_device_release(struct inode *inode, struct file *filp)
4711{
4712 struct kvm_device *dev = filp->private_data;
4713 struct kvm *kvm = dev->kvm;
4714
2bde9b3e
CLG
4715 if (dev->ops->release) {
4716 mutex_lock(&kvm->lock);
4717 list_del(&dev->vm_node);
4718 dev->ops->release(dev);
4719 mutex_unlock(&kvm->lock);
4720 }
4721
852b6d57
SW
4722 kvm_put_kvm(kvm);
4723 return 0;
4724}
4725
087e1520 4726static struct file_operations kvm_device_fops = {
852b6d57
SW
4727 .unlocked_ioctl = kvm_device_ioctl,
4728 .release = kvm_device_release,
7ddfd3e0 4729 KVM_COMPAT(kvm_device_ioctl),
a1cd3f08 4730 .mmap = kvm_device_mmap,
852b6d57
SW
4731};
4732
4733struct kvm_device *kvm_device_from_filp(struct file *filp)
4734{
4735 if (filp->f_op != &kvm_device_fops)
4736 return NULL;
4737
4738 return filp->private_data;
4739}
4740
8538cb22 4741static const struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = {
5df554ad 4742#ifdef CONFIG_KVM_MPIC
d60eacb0
WD
4743 [KVM_DEV_TYPE_FSL_MPIC_20] = &kvm_mpic_ops,
4744 [KVM_DEV_TYPE_FSL_MPIC_42] = &kvm_mpic_ops,
5975a2e0 4745#endif
d60eacb0
WD
4746};
4747
8538cb22 4748int kvm_register_device_ops(const struct kvm_device_ops *ops, u32 type)
d60eacb0
WD
4749{
4750 if (type >= ARRAY_SIZE(kvm_device_ops_table))
4751 return -ENOSPC;
4752
4753 if (kvm_device_ops_table[type] != NULL)
4754 return -EEXIST;
4755
4756 kvm_device_ops_table[type] = ops;
4757 return 0;
4758}
4759
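/*
 * Illustrative sketch, not part of the upstream file: an in-kernel device
 * backend registers a kvm_device_ops table so that userspace can instantiate
 * it with KVM_CREATE_DEVICE. The ops, names and the commented-out type are
 * hypothetical; real backends usually also provide set_attr/get_attr/has_attr
 * handlers, and ->destroy is expected to free the device structure.
 */
static int example_dev_create(struct kvm_device *dev, u32 type)
{
	return 0;	/* allocate and attach per-device state here */
}

static void example_dev_destroy(struct kvm_device *dev)
{
	kfree(dev);
}

static struct kvm_device_ops example_dev_ops __maybe_unused = {
	.name = "example",
	.create = example_dev_create,
	.destroy = example_dev_destroy,
};
/* Registered at init time, e.g.: kvm_register_device_ops(&example_dev_ops, <type>); */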
571ee1b6
WL
4760void kvm_unregister_device_ops(u32 type)
4761{
4762 if (kvm_device_ops_table[type] != NULL)
4763 kvm_device_ops_table[type] = NULL;
4764}
4765
852b6d57
SW
4766static int kvm_ioctl_create_device(struct kvm *kvm,
4767 struct kvm_create_device *cd)
4768{
eceb6e1d 4769 const struct kvm_device_ops *ops;
852b6d57
SW
4770 struct kvm_device *dev;
4771 bool test = cd->flags & KVM_CREATE_DEVICE_TEST;
1d487e9b 4772 int type;
852b6d57
SW
4773 int ret;
4774
d60eacb0
WD
4775 if (cd->type >= ARRAY_SIZE(kvm_device_ops_table))
4776 return -ENODEV;
4777
1d487e9b
PB
4778 type = array_index_nospec(cd->type, ARRAY_SIZE(kvm_device_ops_table));
4779 ops = kvm_device_ops_table[type];
d60eacb0 4780 if (ops == NULL)
852b6d57 4781 return -ENODEV;
852b6d57
SW
4782
4783 if (test)
4784 return 0;
4785
b12ce36a 4786 dev = kzalloc(sizeof(*dev), GFP_KERNEL_ACCOUNT);
852b6d57
SW
4787 if (!dev)
4788 return -ENOMEM;
4789
4790 dev->ops = ops;
4791 dev->kvm = kvm;
852b6d57 4792
a28ebea2 4793 mutex_lock(&kvm->lock);
1d487e9b 4794 ret = ops->create(dev, type);
852b6d57 4795 if (ret < 0) {
a28ebea2 4796 mutex_unlock(&kvm->lock);
852b6d57
SW
4797 kfree(dev);
4798 return ret;
4799 }
a28ebea2
CD
4800 list_add(&dev->vm_node, &kvm->devices);
4801 mutex_unlock(&kvm->lock);
852b6d57 4802
023e9fdd
CD
4803 if (ops->init)
4804 ops->init(dev);
4805
cfa39381 4806 kvm_get_kvm(kvm);
24009b05 4807 ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR | O_CLOEXEC);
852b6d57 4808 if (ret < 0) {
149487bd 4809 kvm_put_kvm_no_destroy(kvm);
a28ebea2
CD
4810 mutex_lock(&kvm->lock);
4811 list_del(&dev->vm_node);
e8bc2427
AK
4812 if (ops->release)
4813 ops->release(dev);
a28ebea2 4814 mutex_unlock(&kvm->lock);
e8bc2427
AK
4815 if (ops->destroy)
4816 ops->destroy(dev);
852b6d57
SW
4817 return ret;
4818 }
4819
852b6d57
SW
4820 cd->fd = ret;
4821 return 0;
4822}
4823
f15ba52b 4824static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
92b591a4
AG
4825{
4826 switch (arg) {
4827 case KVM_CAP_USER_MEMORY:
bb58b90b 4828 case KVM_CAP_USER_MEMORY2:
92b591a4
AG
4829 case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
4830 case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS:
92b591a4
AG
4831 case KVM_CAP_INTERNAL_ERROR_DATA:
4832#ifdef CONFIG_HAVE_KVM_MSI
4833 case KVM_CAP_SIGNAL_MSI:
4834#endif
c5b31cc2 4835#ifdef CONFIG_HAVE_KVM_IRQCHIP
dc9be0fa 4836 case KVM_CAP_IRQFD:
92b591a4 4837#endif
e9ea5069 4838 case KVM_CAP_IOEVENTFD_ANY_LENGTH:
92b591a4 4839 case KVM_CAP_CHECK_EXTENSION_VM:
e5d83c74 4840 case KVM_CAP_ENABLE_CAP_VM:
acd05785 4841 case KVM_CAP_HALT_POLL:
92b591a4 4842 return 1;
4b4357e0 4843#ifdef CONFIG_KVM_MMIO
30422558
PB
4844 case KVM_CAP_COALESCED_MMIO:
4845 return KVM_COALESCED_MMIO_PAGE_OFFSET;
0804c849
PH
4846 case KVM_CAP_COALESCED_PIO:
4847 return 1;
30422558 4848#endif
3c9bd400
JZ
4849#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
4850 case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2:
4851 return KVM_DIRTY_LOG_MANUAL_CAPS;
4852#endif
92b591a4
AG
4853#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
4854 case KVM_CAP_IRQ_ROUTING:
4855 return KVM_MAX_IRQ_ROUTES;
f481b069 4856#endif
eed52e43 4857#if KVM_MAX_NR_ADDRESS_SPACES > 1
f481b069 4858 case KVM_CAP_MULTI_ADDRESS_SPACE:
eed52e43
SC
4859 if (kvm)
4860 return kvm_arch_nr_memslot_as_ids(kvm);
4861 return KVM_MAX_NR_ADDRESS_SPACES;
92b591a4 4862#endif
c110ae57
PB
4863 case KVM_CAP_NR_MEMSLOTS:
4864 return KVM_USER_MEM_SLOTS;
fb04a1ed 4865 case KVM_CAP_DIRTY_LOG_RING:
17601bfe
MZ
4866#ifdef CONFIG_HAVE_KVM_DIRTY_RING_TSO
4867 return KVM_DIRTY_RING_MAX_ENTRIES * sizeof(struct kvm_dirty_gfn);
4868#else
4869 return 0;
4870#endif
4871 case KVM_CAP_DIRTY_LOG_RING_ACQ_REL:
4872#ifdef CONFIG_HAVE_KVM_DIRTY_RING_ACQ_REL
fb04a1ed
PX
4873 return KVM_DIRTY_RING_MAX_ENTRIES * sizeof(struct kvm_dirty_gfn);
4874#else
4875 return 0;
86bdf3eb
GS
4876#endif
4877#ifdef CONFIG_NEED_KVM_DIRTY_RING_WITH_BITMAP
4878 case KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP:
fb04a1ed 4879#endif
ce55c049 4880 case KVM_CAP_BINARY_STATS_FD:
d495f942 4881 case KVM_CAP_SYSTEM_EVENT_DATA:
63912245 4882 case KVM_CAP_DEVICE_CTRL:
ce55c049 4883 return 1;
5a475554
CP
4884#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
4885 case KVM_CAP_MEMORY_ATTRIBUTES:
4886 return kvm_supported_mem_attributes(kvm);
a7800aa8
SC
4887#endif
4888#ifdef CONFIG_KVM_PRIVATE_MEM
4889 case KVM_CAP_GUEST_MEMFD:
4890 return !kvm || kvm_arch_has_private_mem(kvm);
5a475554 4891#endif
92b591a4
AG
4892 default:
4893 break;
4894 }
4895 return kvm_vm_ioctl_check_extension(kvm, arg);
4896}
4897
fb04a1ed
PX
4898static int kvm_vm_ioctl_enable_dirty_log_ring(struct kvm *kvm, u32 size)
4899{
4900 int r;
4901
4902 if (!KVM_DIRTY_LOG_PAGE_OFFSET)
4903 return -EINVAL;
4904
4905 /* the size should be a power of 2 */
4906 if (!size || (size & (size - 1)))
4907 return -EINVAL;
4908
4909 /* The size must be big enough to hold the reserved entries, and at least a page */
4910 if (size < kvm_dirty_ring_get_rsvd_entries() *
4911 sizeof(struct kvm_dirty_gfn) || size < PAGE_SIZE)
4912 return -EINVAL;
4913
4914 if (size > KVM_DIRTY_RING_MAX_ENTRIES *
4915 sizeof(struct kvm_dirty_gfn))
4916 return -E2BIG;
4917
4918 /* We only allow the size to be set once */
4919 if (kvm->dirty_ring_size)
4920 return -EINVAL;
4921
4922 mutex_lock(&kvm->lock);
4923
4924 if (kvm->created_vcpus) {
4925 /* We don't allow changing this value after vCPUs have been created */
4926 r = -EINVAL;
4927 } else {
4928 kvm->dirty_ring_size = size;
4929 r = 0;
4930 }
4931
4932 mutex_unlock(&kvm->lock);
4933 return r;
4934}
4935
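/*
 * Worked example, not part of the upstream file: each kvm_dirty_gfn entry is
 * 16 bytes, so a ring of 4096 entries is 64 KiB, which is a power of two,
 * at least a page, and comfortably above the reserved-entry minimum, and
 * therefore passes the checks above; userspace would pass 65536 as
 * cap->args[0] when enabling KVM_CAP_DIRTY_LOG_RING.
 */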
4936static int kvm_vm_ioctl_reset_dirty_pages(struct kvm *kvm)
4937{
46808a4c 4938 unsigned long i;
fb04a1ed
PX
4939 struct kvm_vcpu *vcpu;
4940 int cleared = 0;
4941
4942 if (!kvm->dirty_ring_size)
4943 return -EINVAL;
4944
4945 mutex_lock(&kvm->slots_lock);
4946
4947 kvm_for_each_vcpu(i, vcpu, kvm)
4948 cleared += kvm_dirty_ring_reset(vcpu->kvm, &vcpu->dirty_ring);
4949
4950 mutex_unlock(&kvm->slots_lock);
4951
4952 if (cleared)
4953 kvm_flush_remote_tlbs(kvm);
4954
4955 return cleared;
4956}
4957
e5d83c74
PB
4958int __attribute__((weak)) kvm_vm_ioctl_enable_cap(struct kvm *kvm,
4959 struct kvm_enable_cap *cap)
4960{
4961 return -EINVAL;
4962}
4963
26f45714 4964bool kvm_are_all_memslots_empty(struct kvm *kvm)
86bdf3eb
GS
4965{
4966 int i;
4967
4968 lockdep_assert_held(&kvm->slots_lock);
4969
eed52e43 4970 for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
86bdf3eb
GS
4971 if (!kvm_memslots_empty(__kvm_memslots(kvm, i)))
4972 return false;
4973 }
4974
4975 return true;
4976}
26f45714 4977EXPORT_SYMBOL_GPL(kvm_are_all_memslots_empty);
86bdf3eb 4978
e5d83c74
PB
4979static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm,
4980 struct kvm_enable_cap *cap)
4981{
4982 switch (cap->cap) {
2a31b9db 4983#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
3c9bd400
JZ
4984 case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2: {
4985 u64 allowed_options = KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE;
4986
4987 if (cap->args[0] & KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE)
4988 allowed_options = KVM_DIRTY_LOG_MANUAL_CAPS;
4989
4990 if (cap->flags || (cap->args[0] & ~allowed_options))
2a31b9db
PB
4991 return -EINVAL;
4992 kvm->manual_dirty_log_protect = cap->args[0];
4993 return 0;
3c9bd400 4994 }
2a31b9db 4995#endif
acd05785
DM
4996 case KVM_CAP_HALT_POLL: {
4997 if (cap->flags || cap->args[0] != (unsigned int)cap->args[0])
4998 return -EINVAL;
4999
5000 kvm->max_halt_poll_ns = cap->args[0];
9eb8ca04
DM
5001
5002 /*
5003 * Ensure kvm->override_halt_poll_ns does not become visible
5004 * before kvm->max_halt_poll_ns.
5005 *
5006 * Pairs with the smp_rmb() in kvm_vcpu_max_halt_poll_ns().
5007 */
5008 smp_wmb();
5009 kvm->override_halt_poll_ns = true;
5010
acd05785
DM
5011 return 0;
5012 }
fb04a1ed 5013 case KVM_CAP_DIRTY_LOG_RING:
17601bfe 5014 case KVM_CAP_DIRTY_LOG_RING_ACQ_REL:
7a2726ec
GS
5015 if (!kvm_vm_ioctl_check_extension_generic(kvm, cap->cap))
5016 return -EINVAL;
5017
fb04a1ed 5018 return kvm_vm_ioctl_enable_dirty_log_ring(kvm, cap->args[0]);
86bdf3eb
GS
5019 case KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP: {
5020 int r = -EINVAL;
5021
5022 if (!IS_ENABLED(CONFIG_NEED_KVM_DIRTY_RING_WITH_BITMAP) ||
5023 !kvm->dirty_ring_size || cap->flags)
5024 return r;
5025
5026 mutex_lock(&kvm->slots_lock);
5027
5028 /*
5029 * For simplicity, allow enabling ring+bitmap if and only if
5030 * there are no memslots, so that every memslot created afterwards
5031 * allocates a bitmap once the capability is enabled.
5032 */
5033 if (kvm_are_all_memslots_empty(kvm)) {
5034 kvm->dirty_ring_with_bitmap = true;
5035 r = 0;
5036 }
5037
5038 mutex_unlock(&kvm->slots_lock);
5039
5040 return r;
5041 }
e5d83c74
PB
5042 default:
5043 return kvm_vm_ioctl_enable_cap(kvm, cap);
5044 }
5045}
5046
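/*
 * Illustrative userspace sketch (not part of kvm_main.c): capping this
 * VM's halt-polling window through the KVM_CAP_HALT_POLL case handled in
 * kvm_vm_ioctl_enable_cap_generic() above.  args[0] must fit in an
 * unsigned int; "vm_fd" is assumed.
 */
#include <linux/kvm.h>
#include <sys/ioctl.h>

static int set_max_halt_poll_ns(int vm_fd, unsigned int ns)
{
	struct kvm_enable_cap cap = {
		.cap = KVM_CAP_HALT_POLL,
		.args = { ns },	/* 0 disables halt polling for this VM */
	};

	return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
}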
fcfe1bae
JZ
5047static ssize_t kvm_vm_stats_read(struct file *file, char __user *user_buffer,
5048 size_t size, loff_t *offset)
5049{
5050 struct kvm *kvm = file->private_data;
5051
5052 return kvm_stats_read(kvm->stats_id, &kvm_vm_stats_header,
5053 &kvm_vm_stats_desc[0], &kvm->stat,
5054 sizeof(kvm->stat), user_buffer, size, offset);
5055}
5056
eed3013f
SC
5057static int kvm_vm_stats_release(struct inode *inode, struct file *file)
5058{
5059 struct kvm *kvm = file->private_data;
5060
5061 kvm_put_kvm(kvm);
5062 return 0;
5063}
5064
fcfe1bae 5065static const struct file_operations kvm_vm_stats_fops = {
087e1520 5066 .owner = THIS_MODULE,
fcfe1bae 5067 .read = kvm_vm_stats_read,
eed3013f 5068 .release = kvm_vm_stats_release,
fcfe1bae
JZ
5069 .llseek = noop_llseek,
5070};
5071
5072static int kvm_vm_ioctl_get_stats_fd(struct kvm *kvm)
5073{
5074 int fd;
5075 struct file *file;
5076
5077 fd = get_unused_fd_flags(O_CLOEXEC);
5078 if (fd < 0)
5079 return fd;
5080
5081 file = anon_inode_getfile("kvm-vm-stats",
5082 &kvm_vm_stats_fops, kvm, O_RDONLY);
5083 if (IS_ERR(file)) {
5084 put_unused_fd(fd);
5085 return PTR_ERR(file);
5086 }
eed3013f
SC
5087
5088 kvm_get_kvm(kvm);
5089
fcfe1bae
JZ
5090 file->f_mode |= FMODE_PREAD;
5091 fd_install(fd, file);
5092
5093 return fd;
5094}
5095
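/*
 * Illustrative userspace sketch (not part of kvm_main.c): grabbing the
 * binary-stats fd produced by kvm_vm_ioctl_get_stats_fd() above and
 * reading its header.  struct kvm_stats_header is quoted from the uapi
 * headers as the editor recalls it; "vm_fd" is assumed.  Note the fd is
 * read-only and pread()-only (FMODE_PREAD, no llseek).
 */
#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <stdio.h>

static void dump_vm_stats_header(int vm_fd)
{
	struct kvm_stats_header hdr;
	int stats_fd = ioctl(vm_fd, KVM_GET_STATS_FD, NULL);

	if (stats_fd < 0)
		return;

	if (pread(stats_fd, &hdr, sizeof(hdr), 0) == sizeof(hdr))
		printf("%u descriptors, data at offset %u\n",
		       hdr.num_desc, hdr.data_offset);

	close(stats_fd);
}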
bb58b90b
SC
5096#define SANITY_CHECK_MEM_REGION_FIELD(field) \
5097do { \
5098 BUILD_BUG_ON(offsetof(struct kvm_userspace_memory_region, field) != \
5099 offsetof(struct kvm_userspace_memory_region2, field)); \
5100 BUILD_BUG_ON(sizeof_field(struct kvm_userspace_memory_region, field) != \
5101 sizeof_field(struct kvm_userspace_memory_region2, field)); \
5102} while (0)
5103
bccf2150
AK
5104static long kvm_vm_ioctl(struct file *filp,
5105 unsigned int ioctl, unsigned long arg)
5106{
5107 struct kvm *kvm = filp->private_data;
5108 void __user *argp = (void __user *)arg;
1fe779f8 5109 int r;
bccf2150 5110
f4d31653 5111 if (kvm->mm != current->mm || kvm->vm_dead)
6d4e4c4f 5112 return -EIO;
bccf2150
AK
5113 switch (ioctl) {
5114 case KVM_CREATE_VCPU:
5115 r = kvm_vm_ioctl_create_vcpu(kvm, arg);
bccf2150 5116 break;
e5d83c74
PB
5117 case KVM_ENABLE_CAP: {
5118 struct kvm_enable_cap cap;
5119
5120 r = -EFAULT;
5121 if (copy_from_user(&cap, argp, sizeof(cap)))
5122 goto out;
5123 r = kvm_vm_ioctl_enable_cap_generic(kvm, &cap);
5124 break;
5125 }
bb58b90b 5126 case KVM_SET_USER_MEMORY_REGION2:
6fc138d2 5127 case KVM_SET_USER_MEMORY_REGION: {
bb58b90b
SC
5128 struct kvm_userspace_memory_region2 mem;
5129 unsigned long size;
5130
5131 if (ioctl == KVM_SET_USER_MEMORY_REGION) {
5132 /*
5133 * Fields beyond struct kvm_userspace_memory_region shouldn't be
5134 * accessed, but avoid leaking kernel memory in case of a bug.
5135 */
5136 memset(&mem, 0, sizeof(mem));
5137 size = sizeof(struct kvm_userspace_memory_region);
5138 } else {
5139 size = sizeof(struct kvm_userspace_memory_region2);
5140 }
5141
5142 /* Ensure the common parts of the two structs are identical. */
5143 SANITY_CHECK_MEM_REGION_FIELD(slot);
5144 SANITY_CHECK_MEM_REGION_FIELD(flags);
5145 SANITY_CHECK_MEM_REGION_FIELD(guest_phys_addr);
5146 SANITY_CHECK_MEM_REGION_FIELD(memory_size);
5147 SANITY_CHECK_MEM_REGION_FIELD(userspace_addr);
6fc138d2
IE
5148
5149 r = -EFAULT;
bb58b90b
SC
5150 if (copy_from_user(&mem, argp, size))
5151 goto out;
5152
5153 r = -EINVAL;
5154 if (ioctl == KVM_SET_USER_MEMORY_REGION &&
5155 (mem.flags & ~KVM_SET_USER_MEMORY_REGION_V1_FLAGS))
6fc138d2
IE
5156 goto out;
5157
bb58b90b 5158 r = kvm_vm_ioctl_set_memory_region(kvm, &mem);
6aa8b732
AK
5159 break;
5160 }
5161 case KVM_GET_DIRTY_LOG: {
5162 struct kvm_dirty_log log;
5163
5164 r = -EFAULT;
893bdbf1 5165 if (copy_from_user(&log, argp, sizeof(log)))
6aa8b732 5166 goto out;
2c6f5df9 5167 r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
6aa8b732
AK
5168 break;
5169 }
2a31b9db
PB
5170#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
5171 case KVM_CLEAR_DIRTY_LOG: {
5172 struct kvm_clear_dirty_log log;
5173
5174 r = -EFAULT;
5175 if (copy_from_user(&log, argp, sizeof(log)))
5176 goto out;
5177 r = kvm_vm_ioctl_clear_dirty_log(kvm, &log);
5178 break;
5179 }
5180#endif
4b4357e0 5181#ifdef CONFIG_KVM_MMIO
5f94c174
LV
5182 case KVM_REGISTER_COALESCED_MMIO: {
5183 struct kvm_coalesced_mmio_zone zone;
f95ef0cd 5184
5f94c174 5185 r = -EFAULT;
893bdbf1 5186 if (copy_from_user(&zone, argp, sizeof(zone)))
5f94c174 5187 goto out;
5f94c174 5188 r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone);
5f94c174
LV
5189 break;
5190 }
5191 case KVM_UNREGISTER_COALESCED_MMIO: {
5192 struct kvm_coalesced_mmio_zone zone;
f95ef0cd 5193
5f94c174 5194 r = -EFAULT;
893bdbf1 5195 if (copy_from_user(&zone, argp, sizeof(zone)))
5f94c174 5196 goto out;
5f94c174 5197 r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone);
5f94c174
LV
5198 break;
5199 }
5200#endif
721eecbf
GH
5201 case KVM_IRQFD: {
5202 struct kvm_irqfd data;
5203
5204 r = -EFAULT;
893bdbf1 5205 if (copy_from_user(&data, argp, sizeof(data)))
721eecbf 5206 goto out;
d4db2935 5207 r = kvm_irqfd(kvm, &data);
721eecbf
GH
5208 break;
5209 }
d34e6b17
GH
5210 case KVM_IOEVENTFD: {
5211 struct kvm_ioeventfd data;
5212
5213 r = -EFAULT;
893bdbf1 5214 if (copy_from_user(&data, argp, sizeof(data)))
d34e6b17
GH
5215 goto out;
5216 r = kvm_ioeventfd(kvm, &data);
5217 break;
5218 }
07975ad3
JK
5219#ifdef CONFIG_HAVE_KVM_MSI
5220 case KVM_SIGNAL_MSI: {
5221 struct kvm_msi msi;
5222
5223 r = -EFAULT;
893bdbf1 5224 if (copy_from_user(&msi, argp, sizeof(msi)))
07975ad3
JK
5225 goto out;
5226 r = kvm_send_userspace_msi(kvm, &msi);
5227 break;
5228 }
23d43cf9
CD
5229#endif
5230#ifdef __KVM_HAVE_IRQ_LINE
5231 case KVM_IRQ_LINE_STATUS:
5232 case KVM_IRQ_LINE: {
5233 struct kvm_irq_level irq_event;
5234
5235 r = -EFAULT;
893bdbf1 5236 if (copy_from_user(&irq_event, argp, sizeof(irq_event)))
23d43cf9
CD
5237 goto out;
5238
aa2fbe6d
YZ
5239 r = kvm_vm_ioctl_irq_line(kvm, &irq_event,
5240 ioctl == KVM_IRQ_LINE_STATUS);
23d43cf9
CD
5241 if (r)
5242 goto out;
5243
5244 r = -EFAULT;
5245 if (ioctl == KVM_IRQ_LINE_STATUS) {
893bdbf1 5246 if (copy_to_user(argp, &irq_event, sizeof(irq_event)))
23d43cf9
CD
5247 goto out;
5248 }
5249
5250 r = 0;
5251 break;
5252 }
73880c80 5253#endif
aa8d5944
AG
5254#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
5255 case KVM_SET_GSI_ROUTING: {
5256 struct kvm_irq_routing routing;
5257 struct kvm_irq_routing __user *urouting;
f8c1b85b 5258 struct kvm_irq_routing_entry *entries = NULL;
aa8d5944
AG
5259
5260 r = -EFAULT;
5261 if (copy_from_user(&routing, argp, sizeof(routing)))
5262 goto out;
5263 r = -EINVAL;
5c0aea0e
DH
5264 if (!kvm_arch_can_set_irq_routing(kvm))
5265 goto out;
caf1ff26 5266 if (routing.nr > KVM_MAX_IRQ_ROUTES)
aa8d5944
AG
5267 goto out;
5268 if (routing.flags)
5269 goto out;
f8c1b85b 5270 if (routing.nr) {
f8c1b85b 5271 urouting = argp;
1f829359
PS
5272 entries = vmemdup_array_user(urouting->entries,
5273 routing.nr, sizeof(*entries));
7ec28e26
DE
5274 if (IS_ERR(entries)) {
5275 r = PTR_ERR(entries);
5276 goto out;
5277 }
f8c1b85b 5278 }
aa8d5944
AG
5279 r = kvm_set_irq_routing(kvm, entries, routing.nr,
5280 routing.flags);
7ec28e26 5281 kvfree(entries);
aa8d5944
AG
5282 break;
5283 }
5284#endif /* CONFIG_HAVE_KVM_IRQ_ROUTING */
5a475554
CP
5285#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
5286 case KVM_SET_MEMORY_ATTRIBUTES: {
5287 struct kvm_memory_attributes attrs;
5288
5289 r = -EFAULT;
5290 if (copy_from_user(&attrs, argp, sizeof(attrs)))
5291 goto out;
5292
5293 r = kvm_vm_ioctl_set_mem_attributes(kvm, &attrs);
5294 break;
5295 }
5296#endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */
852b6d57
SW
5297 case KVM_CREATE_DEVICE: {
5298 struct kvm_create_device cd;
5299
5300 r = -EFAULT;
5301 if (copy_from_user(&cd, argp, sizeof(cd)))
5302 goto out;
5303
5304 r = kvm_ioctl_create_device(kvm, &cd);
5305 if (r)
5306 goto out;
5307
5308 r = -EFAULT;
5309 if (copy_to_user(argp, &cd, sizeof(cd)))
5310 goto out;
5311
5312 r = 0;
5313 break;
5314 }
92b591a4
AG
5315 case KVM_CHECK_EXTENSION:
5316 r = kvm_vm_ioctl_check_extension_generic(kvm, arg);
5317 break;
fb04a1ed
PX
5318 case KVM_RESET_DIRTY_RINGS:
5319 r = kvm_vm_ioctl_reset_dirty_pages(kvm);
5320 break;
fcfe1bae
JZ
5321 case KVM_GET_STATS_FD:
5322 r = kvm_vm_ioctl_get_stats_fd(kvm);
5323 break;
a7800aa8
SC
5324#ifdef CONFIG_KVM_PRIVATE_MEM
5325 case KVM_CREATE_GUEST_MEMFD: {
5326 struct kvm_create_guest_memfd guest_memfd;
5327
5328 r = -EFAULT;
5329 if (copy_from_user(&guest_memfd, argp, sizeof(guest_memfd)))
5330 goto out;
5331
5332 r = kvm_gmem_create(kvm, &guest_memfd);
5333 break;
5334 }
5335#endif
f17abe9a 5336 default:
1fe779f8 5337 r = kvm_arch_vm_ioctl(filp, ioctl, arg);
f17abe9a
AK
5338 }
5339out:
5340 return r;
5341}
5342
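/*
 * Illustrative userspace sketch (not part of kvm_main.c): installing a
 * memory slot via the KVM_SET_USER_MEMORY_REGION case dispatched by
 * kvm_vm_ioctl() above.  "vm_fd" is assumed; "host_mem" should be
 * page-aligned memory, e.g. from mmap(MAP_ANONYMOUS).
 */
#include <linux/kvm.h>
#include <sys/ioctl.h>

static int add_memslot(int vm_fd, __u32 slot, __u64 gpa,
		       void *host_mem, __u64 size)
{
	struct kvm_userspace_memory_region region = {
		.slot = slot,
		.flags = 0,			/* or KVM_MEM_LOG_DIRTY_PAGES */
		.guest_phys_addr = gpa,
		.memory_size = size,		/* 0 deletes the slot */
		.userspace_addr = (__u64)(unsigned long)host_mem,
	};

	return ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region);
}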
de8e5d74 5343#ifdef CONFIG_KVM_COMPAT
6ff5894c
AB
5344struct compat_kvm_dirty_log {
5345 __u32 slot;
5346 __u32 padding1;
5347 union {
5348 compat_uptr_t dirty_bitmap; /* one bit per page */
5349 __u64 padding2;
5350 };
5351};
5352
8750f9bb
PB
5353struct compat_kvm_clear_dirty_log {
5354 __u32 slot;
5355 __u32 num_pages;
5356 __u64 first_page;
5357 union {
5358 compat_uptr_t dirty_bitmap; /* one bit per page */
5359 __u64 padding2;
5360 };
5361};
5362
ed51862f
AG
5363long __weak kvm_arch_vm_compat_ioctl(struct file *filp, unsigned int ioctl,
5364 unsigned long arg)
5365{
5366 return -ENOTTY;
5367}
5368
6ff5894c
AB
5369static long kvm_vm_compat_ioctl(struct file *filp,
5370 unsigned int ioctl, unsigned long arg)
5371{
5372 struct kvm *kvm = filp->private_data;
5373 int r;
5374
f4d31653 5375 if (kvm->mm != current->mm || kvm->vm_dead)
6ff5894c 5376 return -EIO;
ed51862f
AG
5377
5378 r = kvm_arch_vm_compat_ioctl(filp, ioctl, arg);
5379 if (r != -ENOTTY)
5380 return r;
5381
6ff5894c 5382 switch (ioctl) {
8750f9bb
PB
5383#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
5384 case KVM_CLEAR_DIRTY_LOG: {
5385 struct compat_kvm_clear_dirty_log compat_log;
5386 struct kvm_clear_dirty_log log;
5387
5388 if (copy_from_user(&compat_log, (void __user *)arg,
5389 sizeof(compat_log)))
5390 return -EFAULT;
5391 log.slot = compat_log.slot;
5392 log.num_pages = compat_log.num_pages;
5393 log.first_page = compat_log.first_page;
5394 log.padding2 = compat_log.padding2;
5395 log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
5396
5397 r = kvm_vm_ioctl_clear_dirty_log(kvm, &log);
5398 break;
5399 }
5400#endif
6ff5894c
AB
5401 case KVM_GET_DIRTY_LOG: {
5402 struct compat_kvm_dirty_log compat_log;
5403 struct kvm_dirty_log log;
5404
6ff5894c
AB
5405 if (copy_from_user(&compat_log, (void __user *)arg,
5406 sizeof(compat_log)))
f6a3b168 5407 return -EFAULT;
6ff5894c
AB
5408 log.slot = compat_log.slot;
5409 log.padding1 = compat_log.padding1;
5410 log.padding2 = compat_log.padding2;
5411 log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
5412
5413 r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
6ff5894c
AB
5414 break;
5415 }
5416 default:
5417 r = kvm_vm_ioctl(filp, ioctl, arg);
5418 }
6ff5894c
AB
5419 return r;
5420}
5421#endif
5422
087e1520 5423static struct file_operations kvm_vm_fops = {
f17abe9a
AK
5424 .release = kvm_vm_release,
5425 .unlocked_ioctl = kvm_vm_ioctl,
6038f373 5426 .llseek = noop_llseek,
7ddfd3e0 5427 KVM_COMPAT(kvm_vm_compat_ioctl),
f17abe9a
AK
5428};
5429
54526d1f
NT
5430bool file_is_kvm(struct file *file)
5431{
5432 return file && file->f_op == &kvm_vm_fops;
5433}
5434EXPORT_SYMBOL_GPL(file_is_kvm);
5435
e08b9637 5436static int kvm_dev_ioctl_create_vm(unsigned long type)
f17abe9a 5437{
59f82aad 5438 char fdname[ITOA_MAX_LEN + 1];
20020f4c 5439 int r, fd;
f17abe9a 5440 struct kvm *kvm;
506cfba9 5441 struct file *file;
f17abe9a 5442
20020f4c
OU
5443 fd = get_unused_fd_flags(O_CLOEXEC);
5444 if (fd < 0)
5445 return fd;
5446
59f82aad
OU
5447 snprintf(fdname, sizeof(fdname), "%d", fd);
5448
b74ed7a6 5449 kvm = kvm_create_vm(type, fdname);
20020f4c
OU
5450 if (IS_ERR(kvm)) {
5451 r = PTR_ERR(kvm);
5452 goto put_fd;
5453 }
5454
506cfba9
AV
5455 file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
5456 if (IS_ERR(file)) {
78588335
ME
5457 r = PTR_ERR(file);
5458 goto put_kvm;
506cfba9 5459 }
536a6f88 5460
525df861
PB
5461 /*
5462 * Don't call kvm_put_kvm anymore at this point; file->f_op is
5463 * already set, with ->release() being kvm_vm_release(). In error
5464 * cases it will be called by the final fput(file) and will take
5465 * care of doing kvm_put_kvm(kvm).
5466 */
286de8f6 5467 kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, kvm);
f17abe9a 5468
20020f4c
OU
5469 fd_install(fd, file);
5470 return fd;
78588335
ME
5471
5472put_kvm:
5473 kvm_put_kvm(kvm);
20020f4c
OU
5474put_fd:
5475 put_unused_fd(fd);
78588335 5476 return r;
f17abe9a
AK
5477}
5478
5479static long kvm_dev_ioctl(struct file *filp,
5480 unsigned int ioctl, unsigned long arg)
5481{
f15ba52b 5482 int r = -EINVAL;
f17abe9a
AK
5483
5484 switch (ioctl) {
5485 case KVM_GET_API_VERSION:
f0fe5108
AK
5486 if (arg)
5487 goto out;
f17abe9a
AK
5488 r = KVM_API_VERSION;
5489 break;
5490 case KVM_CREATE_VM:
e08b9637 5491 r = kvm_dev_ioctl_create_vm(arg);
f17abe9a 5492 break;
018d00d2 5493 case KVM_CHECK_EXTENSION:
784aa3d7 5494 r = kvm_vm_ioctl_check_extension_generic(NULL, arg);
5d308f45 5495 break;
07c45a36 5496 case KVM_GET_VCPU_MMAP_SIZE:
07c45a36
AK
5497 if (arg)
5498 goto out;
adb1ff46
AK
5499 r = PAGE_SIZE; /* struct kvm_run */
5500#ifdef CONFIG_X86
5501 r += PAGE_SIZE; /* pio data page */
5f94c174 5502#endif
4b4357e0 5503#ifdef CONFIG_KVM_MMIO
5f94c174 5504 r += PAGE_SIZE; /* coalesced mmio ring page */
adb1ff46 5505#endif
07c45a36 5506 break;
6aa8b732 5507 default:
043405e1 5508 return kvm_arch_dev_ioctl(filp, ioctl, arg);
6aa8b732
AK
5509 }
5510out:
5511 return r;
5512}
5513
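/*
 * Illustrative userspace sketch (not part of kvm_main.c): the minimal
 * /dev/kvm bring-up that exercises kvm_dev_ioctl() above -- verify the
 * API version, create a VM and a vCPU, then map the shared kvm_run area
 * whose size KVM_GET_VCPU_MMAP_SIZE reports.  Error-path cleanup is
 * omitted for brevity; the caller checks the result against MAP_FAILED.
 */
#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <stddef.h>

static struct kvm_run *create_vcpu0(int *vm_fd, int *vcpu_fd)
{
	int kvm_fd = open("/dev/kvm", O_RDWR | O_CLOEXEC);
	int run_size;

	if (kvm_fd < 0 || ioctl(kvm_fd, KVM_GET_API_VERSION, 0) != KVM_API_VERSION)
		return NULL;

	*vm_fd = ioctl(kvm_fd, KVM_CREATE_VM, 0);	/* type 0: default VM */
	*vcpu_fd = ioctl(*vm_fd, KVM_CREATE_VCPU, 0);	/* vCPU id 0 */
	run_size = ioctl(kvm_fd, KVM_GET_VCPU_MMAP_SIZE, 0);
	if (*vm_fd < 0 || *vcpu_fd < 0 || run_size < 0)
		return NULL;

	return mmap(NULL, run_size, PROT_READ | PROT_WRITE, MAP_SHARED,
		    *vcpu_fd, 0);
}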
6aa8b732 5514static struct file_operations kvm_chardev_ops = {
6aa8b732 5515 .unlocked_ioctl = kvm_dev_ioctl,
6038f373 5516 .llseek = noop_llseek,
7ddfd3e0 5517 KVM_COMPAT(kvm_dev_ioctl),
6aa8b732
AK
5518};
5519
5520static struct miscdevice kvm_dev = {
bbe4432e 5521 KVM_MINOR,
6aa8b732
AK
5522 "kvm",
5523 &kvm_chardev_ops,
5524};
5525
441f7bfa
SC
5526#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
5527__visible bool kvm_rebooting;
5528EXPORT_SYMBOL_GPL(kvm_rebooting);
5529
5530static DEFINE_PER_CPU(bool, hardware_enabled);
5531static int kvm_usage_count;
5532
e6fb7d6e 5533static int __hardware_enable_nolock(void)
1b6c0168 5534{
37d25881 5535 if (__this_cpu_read(hardware_enabled))
e6fb7d6e 5536 return 0;
10474ae8 5537
37d25881 5538 if (kvm_arch_hardware_enable()) {
37d25881
SC
5539 pr_info("kvm: enabling virtualization on CPU%d failed\n",
5540 raw_smp_processor_id());
e6fb7d6e 5541 return -EIO;
10474ae8 5542 }
37d25881
SC
5543
5544 __this_cpu_write(hardware_enabled, true);
e6fb7d6e
IY
5545 return 0;
5546}
5547
5548static void hardware_enable_nolock(void *failed)
5549{
5550 if (__hardware_enable_nolock())
5551 atomic_inc(failed);
1b6c0168
AK
5552}
5553
aaf12a7b 5554static int kvm_online_cpu(unsigned int cpu)
75b7127c 5555{
aaf12a7b
CG
5556 int ret = 0;
5557
5558 /*
5559 * Abort the CPU online process if hardware virtualization cannot
5560 * be enabled. Otherwise running VMs would encounter unrecoverable
5561 * errors when scheduled to this CPU.
5562 */
0bf50497 5563 mutex_lock(&kvm_lock);
e6fb7d6e
IY
5564 if (kvm_usage_count)
5565 ret = __hardware_enable_nolock();
0bf50497 5566 mutex_unlock(&kvm_lock);
aaf12a7b 5567 return ret;
75b7127c
TY
5568}
5569
5570static void hardware_disable_nolock(void *junk)
1b6c0168 5571{
37d25881
SC
5572 /*
5573 * Note, hardware_disable_all_nolock() tells all online CPUs to disable
5574 * hardware, not just CPUs that successfully enabled hardware!
5575 */
5576 if (!__this_cpu_read(hardware_enabled))
1b6c0168 5577 return;
37d25881 5578
13a34e06 5579 kvm_arch_hardware_disable();
37d25881
SC
5580
5581 __this_cpu_write(hardware_enabled, false);
1b6c0168
AK
5582}
5583
aaf12a7b 5584static int kvm_offline_cpu(unsigned int cpu)
75b7127c 5585{
0bf50497 5586 mutex_lock(&kvm_lock);
4fa92fb2
PB
5587 if (kvm_usage_count)
5588 hardware_disable_nolock(NULL);
0bf50497 5589 mutex_unlock(&kvm_lock);
8c18b2d2 5590 return 0;
75b7127c
TY
5591}
5592
10474ae8
AG
5593static void hardware_disable_all_nolock(void)
5594{
5595 BUG_ON(!kvm_usage_count);
5596
5597 kvm_usage_count--;
5598 if (!kvm_usage_count)
75b7127c 5599 on_each_cpu(hardware_disable_nolock, NULL, 1);
10474ae8
AG
5600}
5601
5602static void hardware_disable_all(void)
5603{
e4aa7f88 5604 cpus_read_lock();
0bf50497 5605 mutex_lock(&kvm_lock);
10474ae8 5606 hardware_disable_all_nolock();
0bf50497 5607 mutex_unlock(&kvm_lock);
e4aa7f88 5608 cpus_read_unlock();
10474ae8
AG
5609}
5610
5611static int hardware_enable_all(void)
5612{
e6fb7d6e 5613 atomic_t failed = ATOMIC_INIT(0);
e0ceec22
SC
5614 int r;
5615
5616 /*
5617 * Do not enable hardware virtualization if the system is going down.
5618 * If userspace initiated a forced reboot, e.g. reboot -f, then it's
5619 * possible for an in-flight KVM_CREATE_VM to trigger hardware enabling
5620 * after kvm_reboot() is called. Note, this relies on system_state
5621 * being set _before_ kvm_reboot(), which is why KVM uses a syscore ops
5622 * hook instead of registering a dedicated reboot notifier (the latter
5623 * runs before system_state is updated).
5624 */
5625 if (system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF ||
5626 system_state == SYSTEM_RESTART)
5627 return -EBUSY;
10474ae8 5628
e4aa7f88
CG
5629 /*
5630 * When onlining a CPU, cpu_online_mask is set before kvm_online_cpu()
5631 * is called, and so on_each_cpu() between them includes the CPU that
5632 * is being onlined. As a result, hardware_enable_nolock() may get
5633 * invoked before kvm_online_cpu(), which also enables hardware if the
5634 * usage count is non-zero. Disable CPU hotplug to avoid attempting to
5635 * enable hardware multiple times.
5636 */
5637 cpus_read_lock();
0bf50497 5638 mutex_lock(&kvm_lock);
10474ae8 5639
e0ceec22
SC
5640 r = 0;
5641
10474ae8
AG
5642 kvm_usage_count++;
5643 if (kvm_usage_count == 1) {
e6fb7d6e 5644 on_each_cpu(hardware_enable_nolock, &failed, 1);
10474ae8 5645
e6fb7d6e 5646 if (atomic_read(&failed)) {
10474ae8
AG
5647 hardware_disable_all_nolock();
5648 r = -EBUSY;
5649 }
5650 }
5651
0bf50497 5652 mutex_unlock(&kvm_lock);
e4aa7f88 5653 cpus_read_unlock();
10474ae8
AG
5654
5655 return r;
5656}
5657
6735150b 5658static void kvm_shutdown(void)
9a2b85c6 5659{
8e1c1815 5660 /*
6735150b
SC
5661 * Disable hardware virtualization and set kvm_rebooting to indicate
5662 * that KVM has asynchronously disabled hardware virtualization, i.e.
5663 * that relevant errors and exceptions aren't entirely unexpected.
5664 * Some flavors of hardware virtualization need to be disabled before
5665 * transferring control to firmware (to perform shutdown/reboot), e.g.
5666 * on x86, virtualization can block INIT interrupts, which are used by
5667 * firmware to pull APs back under firmware control. Note, this path
5668 * is used for both shutdown and reboot scenarios, i.e. neither name is
5669 * 100% comprehensive.
8e1c1815 5670 */
1170adc6 5671 pr_info("kvm: exiting hardware virtualization\n");
8e1c1815 5672 kvm_rebooting = true;
75b7127c 5673 on_each_cpu(hardware_disable_nolock, NULL, 1);
9a2b85c6
RR
5674}
5675
35774a9f
SC
5676static int kvm_suspend(void)
5677{
5678 /*
5679 * Secondary CPUs and CPU hotplug are disabled across the suspend/resume
5680 * callbacks, i.e. no need to acquire kvm_lock to ensure the usage count
5681 * is stable. Assert that kvm_lock is not held to ensure the system
5682 * isn't suspended while KVM is enabling hardware. Hardware enabling
5683 * can be preempted, but the task cannot be frozen until it has dropped
5684 * all locks (userspace tasks are frozen via a fake signal).
5685 */
5686 lockdep_assert_not_held(&kvm_lock);
5687 lockdep_assert_irqs_disabled();
5688
5689 if (kvm_usage_count)
5690 hardware_disable_nolock(NULL);
5691 return 0;
5692}
5693
5694static void kvm_resume(void)
5695{
5696 lockdep_assert_not_held(&kvm_lock);
5697 lockdep_assert_irqs_disabled();
5698
5699 if (kvm_usage_count)
5700 WARN_ON_ONCE(__hardware_enable_nolock());
5701}
5702
5703static struct syscore_ops kvm_syscore_ops = {
5704 .suspend = kvm_suspend,
5705 .resume = kvm_resume,
6735150b 5706 .shutdown = kvm_shutdown,
35774a9f 5707};
441f7bfa
SC
5708#else /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */
5709static int hardware_enable_all(void)
5710{
5711 return 0;
5712}
5713
5714static void hardware_disable_all(void)
5715{
5716
5717}
5718#endif /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */
35774a9f 5719
5ea5ca3c
WW
5720static void kvm_iodevice_destructor(struct kvm_io_device *dev)
5721{
5722 if (dev->ops->destructor)
5723 dev->ops->destructor(dev);
5724}
5725
e93f8a0f 5726static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
2eeb2e94
GH
5727{
5728 int i;
5729
5730 for (i = 0; i < bus->dev_count; i++) {
743eeb0b 5731 struct kvm_io_device *pos = bus->range[i].dev;
2eeb2e94
GH
5732
5733 kvm_iodevice_destructor(pos);
5734 }
e93f8a0f 5735 kfree(bus);
2eeb2e94
GH
5736}
5737
c21fbff1 5738static inline int kvm_io_bus_cmp(const struct kvm_io_range *r1,
20e87b72 5739 const struct kvm_io_range *r2)
743eeb0b 5740{
8f4216c7
JW
5741 gpa_t addr1 = r1->addr;
5742 gpa_t addr2 = r2->addr;
5743
5744 if (addr1 < addr2)
743eeb0b 5745 return -1;
8f4216c7
JW
5746
5747 /* If r2->len == 0, match the exact address. If r2->len != 0,
5748 * accept any overlapping write. Any order is acceptable for
5749 * overlapping ranges, because kvm_io_bus_get_first_dev ensures
5750 * we process all of them.
5751 */
5752 if (r2->len) {
5753 addr1 += r1->len;
5754 addr2 += r2->len;
5755 }
5756
5757 if (addr1 > addr2)
743eeb0b 5758 return 1;
8f4216c7 5759
743eeb0b
SL
5760 return 0;
5761}
5762
a343c9b7
PB
5763static int kvm_io_bus_sort_cmp(const void *p1, const void *p2)
5764{
c21fbff1 5765 return kvm_io_bus_cmp(p1, p2);
a343c9b7
PB
5766}
5767
39369f7a 5768static int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus,
743eeb0b
SL
5769 gpa_t addr, int len)
5770{
5771 struct kvm_io_range *range, key;
5772 int off;
5773
5774 key = (struct kvm_io_range) {
5775 .addr = addr,
5776 .len = len,
5777 };
5778
5779 range = bsearch(&key, bus->range, bus->dev_count,
5780 sizeof(struct kvm_io_range), kvm_io_bus_sort_cmp);
5781 if (range == NULL)
5782 return -ENOENT;
5783
5784 off = range - bus->range;
5785
c21fbff1 5786 while (off > 0 && kvm_io_bus_cmp(&key, &bus->range[off-1]) == 0)
743eeb0b
SL
5787 off--;
5788
5789 return off;
5790}
5791
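/*
 * Illustrative standalone sketch (not part of kvm_main.c) of the pattern
 * used by kvm_io_bus_get_first_dev() above: bsearch() only guarantees
 * finding *some* matching element, so the caller rewinds to the first
 * one.  Shown here on a plain sorted int array with duplicates.
 */
#include <stdlib.h>

static int cmp_int(const void *a, const void *b)
{
	int x = *(const int *)a, y = *(const int *)b;

	return (x > y) - (x < y);
}

static long first_match_idx(const int *arr, size_t n, int key)
{
	const int *hit = bsearch(&key, arr, n, sizeof(*arr), cmp_int);
	size_t off;

	if (!hit)
		return -1;

	off = hit - arr;
	while (off > 0 && arr[off - 1] == key)
		off--;			/* rewind to the first equal element */

	return (long)off;
}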
e32edf4f 5792static int __kvm_io_bus_write(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
126a5af5
CH
5793 struct kvm_io_range *range, const void *val)
5794{
5795 int idx;
5796
5797 idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
5798 if (idx < 0)
5799 return -EOPNOTSUPP;
5800
5801 while (idx < bus->dev_count &&
c21fbff1 5802 kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
e32edf4f 5803 if (!kvm_iodevice_write(vcpu, bus->range[idx].dev, range->addr,
126a5af5
CH
5804 range->len, val))
5805 return idx;
5806 idx++;
5807 }
5808
5809 return -EOPNOTSUPP;
5810}
5811
bda9020e 5812/* kvm_io_bus_write - called under kvm->slots_lock */
e32edf4f 5813int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
bda9020e 5814 int len, const void *val)
2eeb2e94 5815{
90d83dc3 5816 struct kvm_io_bus *bus;
743eeb0b 5817 struct kvm_io_range range;
126a5af5 5818 int r;
743eeb0b
SL
5819
5820 range = (struct kvm_io_range) {
5821 .addr = addr,
5822 .len = len,
5823 };
90d83dc3 5824
e32edf4f 5825 bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
90db1043
DH
5826 if (!bus)
5827 return -ENOMEM;
e32edf4f 5828 r = __kvm_io_bus_write(vcpu, bus, &range, val);
126a5af5
CH
5829 return r < 0 ? r : 0;
5830}
a2420107 5831EXPORT_SYMBOL_GPL(kvm_io_bus_write);
126a5af5
CH
5832
5833/* kvm_io_bus_write_cookie - called under kvm->slots_lock */
e32edf4f
NN
5834int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx,
5835 gpa_t addr, int len, const void *val, long cookie)
126a5af5
CH
5836{
5837 struct kvm_io_bus *bus;
5838 struct kvm_io_range range;
5839
5840 range = (struct kvm_io_range) {
5841 .addr = addr,
5842 .len = len,
5843 };
5844
e32edf4f 5845 bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
90db1043
DH
5846 if (!bus)
5847 return -ENOMEM;
126a5af5
CH
5848
5849 /* First try the device referenced by cookie. */
5850 if ((cookie >= 0) && (cookie < bus->dev_count) &&
c21fbff1 5851 (kvm_io_bus_cmp(&range, &bus->range[cookie]) == 0))
e32edf4f 5852 if (!kvm_iodevice_write(vcpu, bus->range[cookie].dev, addr, len,
126a5af5
CH
5853 val))
5854 return cookie;
5855
5856 /*
5857 * cookie contained garbage; fall back to search and return the
5858 * correct cookie value.
5859 */
e32edf4f 5860 return __kvm_io_bus_write(vcpu, bus, &range, val);
126a5af5
CH
5861}
5862
e32edf4f
NN
5863static int __kvm_io_bus_read(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
5864 struct kvm_io_range *range, void *val)
126a5af5
CH
5865{
5866 int idx;
5867
5868 idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
743eeb0b
SL
5869 if (idx < 0)
5870 return -EOPNOTSUPP;
5871
5872 while (idx < bus->dev_count &&
c21fbff1 5873 kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
e32edf4f 5874 if (!kvm_iodevice_read(vcpu, bus->range[idx].dev, range->addr,
126a5af5
CH
5875 range->len, val))
5876 return idx;
743eeb0b
SL
5877 idx++;
5878 }
5879
bda9020e
MT
5880 return -EOPNOTSUPP;
5881}
2eeb2e94 5882
bda9020e 5883/* kvm_io_bus_read - called under kvm->slots_lock */
e32edf4f 5884int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
e93f8a0f 5885 int len, void *val)
bda9020e 5886{
90d83dc3 5887 struct kvm_io_bus *bus;
743eeb0b 5888 struct kvm_io_range range;
126a5af5 5889 int r;
743eeb0b
SL
5890
5891 range = (struct kvm_io_range) {
5892 .addr = addr,
5893 .len = len,
5894 };
e93f8a0f 5895
e32edf4f 5896 bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
90db1043
DH
5897 if (!bus)
5898 return -ENOMEM;
e32edf4f 5899 r = __kvm_io_bus_read(vcpu, bus, &range, val);
126a5af5
CH
5900 return r < 0 ? r : 0;
5901}
743eeb0b 5902
743eeb0b
SL
5903int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
5904 int len, struct kvm_io_device *dev)
6c474694 5905{
d4c67a7a 5906 int i;
e93f8a0f 5907 struct kvm_io_bus *new_bus, *bus;
d4c67a7a 5908 struct kvm_io_range range;
090b7aff 5909
b1a39a71
MZ
5910 lockdep_assert_held(&kvm->slots_lock);
5911
4a12f951 5912 bus = kvm_get_bus(kvm, bus_idx);
90db1043
DH
5913 if (!bus)
5914 return -ENOMEM;
5915
6ea34c9b
AK
5916 /* exclude ioeventfds, which are already bounded by the fd limit */
5917 if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1)
090b7aff 5918 return -ENOSPC;
2eeb2e94 5919
90952cd3 5920 new_bus = kmalloc(struct_size(bus, range, bus->dev_count + 1),
b12ce36a 5921 GFP_KERNEL_ACCOUNT);
e93f8a0f
MT
5922 if (!new_bus)
5923 return -ENOMEM;
d4c67a7a
GH
5924
5925 range = (struct kvm_io_range) {
5926 .addr = addr,
5927 .len = len,
5928 .dev = dev,
5929 };
5930
5931 for (i = 0; i < bus->dev_count; i++)
5932 if (kvm_io_bus_cmp(&bus->range[i], &range) > 0)
5933 break;
5934
5935 memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range));
5936 new_bus->dev_count++;
5937 new_bus->range[i] = range;
5938 memcpy(new_bus->range + i + 1, bus->range + i,
5939 (bus->dev_count - i) * sizeof(struct kvm_io_range));
e93f8a0f
MT
5940 rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
5941 synchronize_srcu_expedited(&kvm->srcu);
5942 kfree(bus);
090b7aff
GH
5943
5944 return 0;
5945}
5946
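/*
 * Illustrative in-kernel sketch (not part of kvm_main.c): attaching a
 * trivial read-only MMIO device with kvm_io_bus_register_dev() above.
 * The kvm_io_device_ops callback signatures follow <kvm/iodev.h> as the
 * editor recalls them; the device itself is invented for the example.
 */
#include <kvm/iodev.h>
#include <linux/kvm_host.h>

struct demo_dev {
	struct kvm_io_device dev;
	u32 reg;
};

static int demo_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
		     gpa_t addr, int len, void *val)
{
	struct demo_dev *d = container_of(this, struct demo_dev, dev);

	if (len != 4)
		return -EOPNOTSUPP;	/* not handled, try other devices */

	*(u32 *)val = d->reg;
	return 0;			/* handled */
}

static const struct kvm_io_device_ops demo_ops = {
	.read = demo_read,
};

static int demo_attach(struct kvm *kvm, struct demo_dev *d, gpa_t addr)
{
	int ret;

	kvm_iodevice_init(&d->dev, &demo_ops);

	/* Registration and unregistration both require slots_lock. */
	mutex_lock(&kvm->slots_lock);
	ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, addr, 4, &d->dev);
	mutex_unlock(&kvm->slots_lock);

	return ret;
}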
5d3c4c79
SC
5947int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
5948 struct kvm_io_device *dev)
090b7aff 5949{
5ea5ca3c 5950 int i;
e93f8a0f 5951 struct kvm_io_bus *new_bus, *bus;
090b7aff 5952
7c896d37
SC
5953 lockdep_assert_held(&kvm->slots_lock);
5954
4a12f951 5955 bus = kvm_get_bus(kvm, bus_idx);
df630b8c 5956 if (!bus)
5d3c4c79 5957 return 0;
df630b8c 5958
7c896d37 5959 for (i = 0; i < bus->dev_count; i++) {
a1300716 5960 if (bus->range[i].dev == dev) {
090b7aff
GH
5961 break;
5962 }
7c896d37 5963 }
e93f8a0f 5964
90db1043 5965 if (i == bus->dev_count)
5d3c4c79 5966 return 0;
a1300716 5967
90952cd3 5968 new_bus = kmalloc(struct_size(bus, range, bus->dev_count - 1),
b12ce36a 5969 GFP_KERNEL_ACCOUNT);
f6588660 5970 if (new_bus) {
871c433b 5971 memcpy(new_bus, bus, struct_size(bus, range, i));
f6588660
RK
5972 new_bus->dev_count--;
5973 memcpy(new_bus->range + i, bus->range + i + 1,
871c433b 5974 flex_array_size(new_bus, range, new_bus->dev_count - i));
2ee37574
SC
5975 }
5976
5977 rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
5978 synchronize_srcu_expedited(&kvm->srcu);
5979
5ea5ca3c
WW
5980 /*
5981 * If a NULL bus was installed, destroy the old bus, including all the
5982 * attached devices. Otherwise, destroy only the caller's device.
5983 */
2ee37574 5984 if (!new_bus) {
90db1043 5985 pr_err("kvm: failed to shrink bus, removing it completely\n");
5ea5ca3c
WW
5986 kvm_io_bus_destroy(bus);
5987 return -ENOMEM;
90db1043 5988 }
a1300716 5989
5ea5ca3c 5990 kvm_iodevice_destructor(dev);
e93f8a0f 5991 kfree(bus);
5ea5ca3c 5992 return 0;
2eeb2e94
GH
5993}
5994
8a39d006
AP
5995struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx,
5996 gpa_t addr)
5997{
5998 struct kvm_io_bus *bus;
5999 int dev_idx, srcu_idx;
6000 struct kvm_io_device *iodev = NULL;
6001
6002 srcu_idx = srcu_read_lock(&kvm->srcu);
6003
6004 bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
90db1043
DH
6005 if (!bus)
6006 goto out_unlock;
8a39d006
AP
6007
6008 dev_idx = kvm_io_bus_get_first_dev(bus, addr, 1);
6009 if (dev_idx < 0)
6010 goto out_unlock;
6011
6012 iodev = bus->range[dev_idx].dev;
6013
6014out_unlock:
6015 srcu_read_unlock(&kvm->srcu, srcu_idx);
6016
6017 return iodev;
6018}
6019EXPORT_SYMBOL_GPL(kvm_io_bus_get_dev);
6020
536a6f88
JF
6021static int kvm_debugfs_open(struct inode *inode, struct file *file,
6022 int (*get)(void *, u64 *), int (*set)(void *, u64),
6023 const char *fmt)
6024{
180418e2 6025 int ret;
14aa40a1 6026 struct kvm_stat_data *stat_data = inode->i_private;
536a6f88 6027
605c7130
PX
6028 /*
6029 * The debugfs files refer to the kvm struct, which is still valid
6030 * when kvm_destroy_vm is called. kvm_get_kvm_safe()
6031 * avoids the race between open and the removal of the debugfs directory.
536a6f88 6032 */
605c7130 6033 if (!kvm_get_kvm_safe(stat_data->kvm))
536a6f88
JF
6034 return -ENOENT;
6035
180418e2
HW
6036 ret = simple_attr_open(inode, file, get,
6037 kvm_stats_debugfs_mode(stat_data->desc) & 0222
6038 ? set : NULL, fmt);
6039 if (ret)
536a6f88 6040 kvm_put_kvm(stat_data->kvm);
536a6f88 6041
180418e2 6042 return ret;
536a6f88
JF
6043}
6044
6045static int kvm_debugfs_release(struct inode *inode, struct file *file)
6046{
14aa40a1 6047 struct kvm_stat_data *stat_data = inode->i_private;
536a6f88
JF
6048
6049 simple_attr_release(inode, file);
6050 kvm_put_kvm(stat_data->kvm);
6051
6052 return 0;
6053}
6054
09cbcef6 6055static int kvm_get_stat_per_vm(struct kvm *kvm, size_t offset, u64 *val)
536a6f88 6056{
bc9e9e67 6057 *val = *(u64 *)((void *)(&kvm->stat) + offset);
536a6f88 6058
09cbcef6
MP
6059 return 0;
6060}
6061
6062static int kvm_clear_stat_per_vm(struct kvm *kvm, size_t offset)
6063{
bc9e9e67 6064 *(u64 *)((void *)(&kvm->stat) + offset) = 0;
536a6f88
JF
6065
6066 return 0;
6067}
6068
09cbcef6 6069static int kvm_get_stat_per_vcpu(struct kvm *kvm, size_t offset, u64 *val)
ce35ef27 6070{
46808a4c 6071 unsigned long i;
09cbcef6 6072 struct kvm_vcpu *vcpu;
ce35ef27 6073
09cbcef6 6074 *val = 0;
ce35ef27 6075
09cbcef6 6076 kvm_for_each_vcpu(i, vcpu, kvm)
bc9e9e67 6077 *val += *(u64 *)((void *)(&vcpu->stat) + offset);
ce35ef27
SJS
6078
6079 return 0;
6080}
6081
09cbcef6 6082static int kvm_clear_stat_per_vcpu(struct kvm *kvm, size_t offset)
536a6f88 6083{
46808a4c 6084 unsigned long i;
09cbcef6 6085 struct kvm_vcpu *vcpu;
536a6f88 6086
09cbcef6 6087 kvm_for_each_vcpu(i, vcpu, kvm)
bc9e9e67 6088 *(u64 *)((void *)(&vcpu->stat) + offset) = 0;
09cbcef6
MP
6089
6090 return 0;
6091}
536a6f88 6092
09cbcef6 6093static int kvm_stat_data_get(void *data, u64 *val)
536a6f88 6094{
09cbcef6 6095 int r = -EFAULT;
14aa40a1 6096 struct kvm_stat_data *stat_data = data;
536a6f88 6097
bc9e9e67 6098 switch (stat_data->kind) {
09cbcef6
MP
6099 case KVM_STAT_VM:
6100 r = kvm_get_stat_per_vm(stat_data->kvm,
bc9e9e67 6101 stat_data->desc->desc.offset, val);
09cbcef6
MP
6102 break;
6103 case KVM_STAT_VCPU:
6104 r = kvm_get_stat_per_vcpu(stat_data->kvm,
bc9e9e67 6105 stat_data->desc->desc.offset, val);
09cbcef6
MP
6106 break;
6107 }
536a6f88 6108
09cbcef6 6109 return r;
536a6f88
JF
6110}
6111
09cbcef6 6112static int kvm_stat_data_clear(void *data, u64 val)
ce35ef27 6113{
09cbcef6 6114 int r = -EFAULT;
14aa40a1 6115 struct kvm_stat_data *stat_data = data;
ce35ef27
SJS
6116
6117 if (val)
6118 return -EINVAL;
6119
bc9e9e67 6120 switch (stat_data->kind) {
09cbcef6
MP
6121 case KVM_STAT_VM:
6122 r = kvm_clear_stat_per_vm(stat_data->kvm,
bc9e9e67 6123 stat_data->desc->desc.offset);
09cbcef6
MP
6124 break;
6125 case KVM_STAT_VCPU:
6126 r = kvm_clear_stat_per_vcpu(stat_data->kvm,
bc9e9e67 6127 stat_data->desc->desc.offset);
09cbcef6
MP
6128 break;
6129 }
ce35ef27 6130
09cbcef6 6131 return r;
ce35ef27
SJS
6132}
6133
09cbcef6 6134static int kvm_stat_data_open(struct inode *inode, struct file *file)
536a6f88
JF
6135{
6136 __simple_attr_check_format("%llu\n", 0ull);
09cbcef6
MP
6137 return kvm_debugfs_open(inode, file, kvm_stat_data_get,
6138 kvm_stat_data_clear, "%llu\n");
536a6f88
JF
6139}
6140
09cbcef6
MP
6141static const struct file_operations stat_fops_per_vm = {
6142 .owner = THIS_MODULE,
6143 .open = kvm_stat_data_open,
536a6f88 6144 .release = kvm_debugfs_release,
09cbcef6
MP
6145 .read = simple_attr_read,
6146 .write = simple_attr_write,
6147 .llseek = no_llseek,
536a6f88
JF
6148};
6149
8b88b099 6150static int vm_stat_get(void *_offset, u64 *val)
ba1389b7
AK
6151{
6152 unsigned offset = (long)_offset;
ba1389b7 6153 struct kvm *kvm;
536a6f88 6154 u64 tmp_val;
ba1389b7 6155
8b88b099 6156 *val = 0;
0d9ce162 6157 mutex_lock(&kvm_lock);
536a6f88 6158 list_for_each_entry(kvm, &vm_list, vm_list) {
09cbcef6 6159 kvm_get_stat_per_vm(kvm, offset, &tmp_val);
536a6f88
JF
6160 *val += tmp_val;
6161 }
0d9ce162 6162 mutex_unlock(&kvm_lock);
8b88b099 6163 return 0;
ba1389b7
AK
6164}
6165
ce35ef27
SJS
6166static int vm_stat_clear(void *_offset, u64 val)
6167{
6168 unsigned offset = (long)_offset;
6169 struct kvm *kvm;
ce35ef27
SJS
6170
6171 if (val)
6172 return -EINVAL;
6173
0d9ce162 6174 mutex_lock(&kvm_lock);
ce35ef27 6175 list_for_each_entry(kvm, &vm_list, vm_list) {
09cbcef6 6176 kvm_clear_stat_per_vm(kvm, offset);
ce35ef27 6177 }
0d9ce162 6178 mutex_unlock(&kvm_lock);
ce35ef27
SJS
6179
6180 return 0;
6181}
6182
6183DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, vm_stat_clear, "%llu\n");
bc9e9e67 6184DEFINE_SIMPLE_ATTRIBUTE(vm_stat_readonly_fops, vm_stat_get, NULL, "%llu\n");
ba1389b7 6185
8b88b099 6186static int vcpu_stat_get(void *_offset, u64 *val)
1165f5fe
AK
6187{
6188 unsigned offset = (long)_offset;
1165f5fe 6189 struct kvm *kvm;
536a6f88 6190 u64 tmp_val;
1165f5fe 6191
8b88b099 6192 *val = 0;
0d9ce162 6193 mutex_lock(&kvm_lock);
536a6f88 6194 list_for_each_entry(kvm, &vm_list, vm_list) {
09cbcef6 6195 kvm_get_stat_per_vcpu(kvm, offset, &tmp_val);
536a6f88
JF
6196 *val += tmp_val;
6197 }
0d9ce162 6198 mutex_unlock(&kvm_lock);
8b88b099 6199 return 0;
1165f5fe
AK
6200}
6201
ce35ef27
SJS
6202static int vcpu_stat_clear(void *_offset, u64 val)
6203{
6204 unsigned offset = (long)_offset;
6205 struct kvm *kvm;
ce35ef27
SJS
6206
6207 if (val)
6208 return -EINVAL;
6209
0d9ce162 6210 mutex_lock(&kvm_lock);
ce35ef27 6211 list_for_each_entry(kvm, &vm_list, vm_list) {
09cbcef6 6212 kvm_clear_stat_per_vcpu(kvm, offset);
ce35ef27 6213 }
0d9ce162 6214 mutex_unlock(&kvm_lock);
ce35ef27
SJS
6215
6216 return 0;
6217}
6218
6219DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, vcpu_stat_clear,
6220 "%llu\n");
bc9e9e67 6221DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_readonly_fops, vcpu_stat_get, NULL, "%llu\n");
1165f5fe 6222
286de8f6
CI
6223static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm)
6224{
6225 struct kobj_uevent_env *env;
286de8f6
CI
6226 unsigned long long created, active;
6227
6228 if (!kvm_dev.this_device || !kvm)
6229 return;
6230
0d9ce162 6231 mutex_lock(&kvm_lock);
286de8f6
CI
6232 if (type == KVM_EVENT_CREATE_VM) {
6233 kvm_createvm_count++;
6234 kvm_active_vms++;
6235 } else if (type == KVM_EVENT_DESTROY_VM) {
6236 kvm_active_vms--;
6237 }
6238 created = kvm_createvm_count;
6239 active = kvm_active_vms;
0d9ce162 6240 mutex_unlock(&kvm_lock);
286de8f6 6241
b12ce36a 6242 env = kzalloc(sizeof(*env), GFP_KERNEL_ACCOUNT);
286de8f6
CI
6243 if (!env)
6244 return;
6245
6246 add_uevent_var(env, "CREATED=%llu", created);
6247 add_uevent_var(env, "COUNT=%llu", active);
6248
fdeaf7e3 6249 if (type == KVM_EVENT_CREATE_VM) {
286de8f6 6250 add_uevent_var(env, "EVENT=create");
fdeaf7e3
CI
6251 kvm->userspace_pid = task_pid_nr(current);
6252 } else if (type == KVM_EVENT_DESTROY_VM) {
286de8f6 6253 add_uevent_var(env, "EVENT=destroy");
fdeaf7e3
CI
6254 }
6255 add_uevent_var(env, "PID=%d", kvm->userspace_pid);
286de8f6 6256
a44a4cc1 6257 if (!IS_ERR(kvm->debugfs_dentry)) {
b12ce36a 6258 char *tmp, *p = kmalloc(PATH_MAX, GFP_KERNEL_ACCOUNT);
fdeaf7e3
CI
6259
6260 if (p) {
6261 tmp = dentry_path_raw(kvm->debugfs_dentry, p, PATH_MAX);
6262 if (!IS_ERR(tmp))
6263 add_uevent_var(env, "STATS_PATH=%s", tmp);
6264 kfree(p);
286de8f6
CI
6265 }
6266 }
6267 /* no need for checks, since we are adding at most only 5 keys */
6268 env->envp[env->envp_idx++] = NULL;
6269 kobject_uevent_env(&kvm_dev.this_device->kobj, KOBJ_CHANGE, env->envp);
6270 kfree(env);
286de8f6
CI
6271}
6272
929f45e3 6273static void kvm_init_debug(void)
6aa8b732 6274{
bc9e9e67
JZ
6275 const struct file_operations *fops;
6276 const struct _kvm_stats_desc *pdesc;
6277 int i;
6aa8b732 6278
76f7c879 6279 kvm_debugfs_dir = debugfs_create_dir("kvm", NULL);
4f69b680 6280
bc9e9e67
JZ
6281 for (i = 0; i < kvm_vm_stats_header.num_desc; ++i) {
6282 pdesc = &kvm_vm_stats_desc[i];
6283 if (kvm_stats_debugfs_mode(pdesc) & 0222)
6284 fops = &vm_stat_fops;
6285 else
6286 fops = &vm_stat_readonly_fops;
6287 debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
6288 kvm_debugfs_dir,
6289 (void *)(long)pdesc->desc.offset, fops);
6290 }
6291
6292 for (i = 0; i < kvm_vcpu_stats_header.num_desc; ++i) {
6293 pdesc = &kvm_vcpu_stats_desc[i];
6294 if (kvm_stats_debugfs_mode(pdesc) & 0222)
6295 fops = &vcpu_stat_fops;
6296 else
6297 fops = &vcpu_stat_readonly_fops;
6298 debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
6299 kvm_debugfs_dir,
6300 (void *)(long)pdesc->desc.offset, fops);
4f69b680 6301 }
6aa8b732
AK
6302}
6303
15ad7146
AK
6304static inline
6305struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
6306{
6307 return container_of(pn, struct kvm_vcpu, preempt_notifier);
6308}
6309
6310static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
6311{
6312 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
f95ef0cd 6313
046ddeed 6314 WRITE_ONCE(vcpu->preempted, false);
d73eb57b 6315 WRITE_ONCE(vcpu->ready, false);
15ad7146 6316
7495e22b 6317 __this_cpu_write(kvm_running_vcpu, vcpu);
e790d9ef 6318 kvm_arch_sched_in(vcpu, cpu);
e9b11c17 6319 kvm_arch_vcpu_load(vcpu, cpu);
15ad7146
AK
6320}
6321
6322static void kvm_sched_out(struct preempt_notifier *pn,
6323 struct task_struct *next)
6324{
6325 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
6326
3ba9f93b 6327 if (current->on_rq) {
046ddeed 6328 WRITE_ONCE(vcpu->preempted, true);
d73eb57b
WL
6329 WRITE_ONCE(vcpu->ready, true);
6330 }
e9b11c17 6331 kvm_arch_vcpu_put(vcpu);
7495e22b
PB
6332 __this_cpu_write(kvm_running_vcpu, NULL);
6333}
6334
6335/**
6336 * kvm_get_running_vcpu - get the vcpu running on the current CPU.
1f03b2bc
MZ
6337 *
6338 * We can disable preemption locally around accessing the per-CPU variable,
6339 * and use the resolved vcpu pointer after enabling preemption again,
6340 * because even if the current thread is migrated to another CPU, reading
6341 * the per-CPU value later will give us the same value, since the
6342 * per-CPU variable is updated in the preempt notifier handlers.
7495e22b
PB
6343 */
6344struct kvm_vcpu *kvm_get_running_vcpu(void)
6345{
1f03b2bc
MZ
6346 struct kvm_vcpu *vcpu;
6347
6348 preempt_disable();
6349 vcpu = __this_cpu_read(kvm_running_vcpu);
6350 preempt_enable();
6351
6352 return vcpu;
7495e22b 6353}
379a3c8e 6354EXPORT_SYMBOL_GPL(kvm_get_running_vcpu);
7495e22b
PB
6355
6356/**
6357 * kvm_get_running_vcpus - get the per-CPU array of currently running vcpus.
6358 */
6359struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void)
6360{
6361 return &kvm_running_vcpu;
15ad7146
AK
6362}
6363
e1bfc245
SC
6364#ifdef CONFIG_GUEST_PERF_EVENTS
6365static unsigned int kvm_guest_state(void)
6366{
6367 struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
6368 unsigned int state;
6369
6370 if (!kvm_arch_pmi_in_guest(vcpu))
6371 return 0;
6372
6373 state = PERF_GUEST_ACTIVE;
6374 if (!kvm_arch_vcpu_in_kernel(vcpu))
6375 state |= PERF_GUEST_USER;
6376
6377 return state;
6378}
6379
6380static unsigned long kvm_guest_get_ip(void)
6381{
6382 struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
6383
6384 /* Retrieving the IP must be guarded by a call to kvm_guest_state(). */
6385 if (WARN_ON_ONCE(!kvm_arch_pmi_in_guest(vcpu)))
6386 return 0;
6387
6388 return kvm_arch_vcpu_get_ip(vcpu);
6389}
6390
6391static struct perf_guest_info_callbacks kvm_guest_cbs = {
6392 .state = kvm_guest_state,
6393 .get_ip = kvm_guest_get_ip,
6394 .handle_intel_pt_intr = NULL,
6395};
6396
6397void kvm_register_perf_callbacks(unsigned int (*pt_intr_handler)(void))
6398{
6399 kvm_guest_cbs.handle_intel_pt_intr = pt_intr_handler;
6400 perf_register_guest_info_callbacks(&kvm_guest_cbs);
6401}
6402void kvm_unregister_perf_callbacks(void)
6403{
6404 perf_unregister_guest_info_callbacks(&kvm_guest_cbs);
6405}
6406#endif
6407
81a1cf9f 6408int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
f257d6dc 6409{
6aa8b732 6410 int r;
002c7f7c 6411 int cpu;
6aa8b732 6412
441f7bfa 6413#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
aaf12a7b
CG
6414 r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_ONLINE, "kvm/cpu:online",
6415 kvm_online_cpu, kvm_offline_cpu);
774c47f1 6416 if (r)
37d25881
SC
6417 return r;
6418
35774a9f 6419 register_syscore_ops(&kvm_syscore_ops);
441f7bfa 6420#endif
6aa8b732 6421
c16f862d 6422 /* A kmem cache lets us meet the alignment requirements of fx_save. */
0ee75bea
AK
6423 if (!vcpu_align)
6424 vcpu_align = __alignof__(struct kvm_vcpu);
46515736
PB
6425 kvm_vcpu_cache =
6426 kmem_cache_create_usercopy("kvm_vcpu", vcpu_size, vcpu_align,
6427 SLAB_ACCOUNT,
6428 offsetof(struct kvm_vcpu, arch),
ce55c049
JZ
6429 offsetofend(struct kvm_vcpu, stats_id)
6430 - offsetof(struct kvm_vcpu, arch),
46515736 6431 NULL);
c16f862d
RR
6432 if (!kvm_vcpu_cache) {
6433 r = -ENOMEM;
9f1a4c00 6434 goto err_vcpu_cache;
c16f862d
RR
6435 }
6436
baff59cc
VK
6437 for_each_possible_cpu(cpu) {
6438 if (!alloc_cpumask_var_node(&per_cpu(cpu_kick_mask, cpu),
6439 GFP_KERNEL, cpu_to_node(cpu))) {
6440 r = -ENOMEM;
9f1a4c00 6441 goto err_cpu_kick_mask;
baff59cc
VK
6442 }
6443 }
6444
5910ccf0
SC
6445 r = kvm_irqfd_init();
6446 if (r)
6447 goto err_irqfd;
6448
af585b92
GN
6449 r = kvm_async_pf_init();
6450 if (r)
5910ccf0 6451 goto err_async_pf;
af585b92 6452
6aa8b732 6453 kvm_chardev_ops.owner = module;
087e1520
SC
6454 kvm_vm_fops.owner = module;
6455 kvm_vcpu_fops.owner = module;
6456 kvm_device_fops.owner = module;
6aa8b732 6457
15ad7146
AK
6458 kvm_preempt_ops.sched_in = kvm_sched_in;
6459 kvm_preempt_ops.sched_out = kvm_sched_out;
6460
929f45e3 6461 kvm_init_debug();
0ea4ed8e 6462
3c3c29fd 6463 r = kvm_vfio_ops_init();
2b012812
SC
6464 if (WARN_ON_ONCE(r))
6465 goto err_vfio;
6466
a7800aa8
SC
6467 kvm_gmem_init(module);
6468
2b012812
SC
6469 /*
6470 * Registration _must_ be the very last thing done, as this exposes
6471 * /dev/kvm to userspace, i.e. all infrastructure must be setup!
6472 */
6473 r = misc_register(&kvm_dev);
6474 if (r) {
6475 pr_err("kvm: misc device register failed\n");
6476 goto err_register;
6477 }
3c3c29fd 6478
c7addb90 6479 return 0;
6aa8b732 6480
2b012812
SC
6481err_register:
6482 kvm_vfio_ops_exit();
6483err_vfio:
af585b92 6484 kvm_async_pf_deinit();
5910ccf0
SC
6485err_async_pf:
6486 kvm_irqfd_exit();
6487err_irqfd:
9f1a4c00 6488err_cpu_kick_mask:
baff59cc
VK
6489 for_each_possible_cpu(cpu)
6490 free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
c16f862d 6491 kmem_cache_destroy(kvm_vcpu_cache);
9f1a4c00 6492err_vcpu_cache:
441f7bfa 6493#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
35774a9f 6494 unregister_syscore_ops(&kvm_syscore_ops);
aaf12a7b 6495 cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE);
441f7bfa 6496#endif
6aa8b732
AK
6497 return r;
6498}
cb498ea2 6499EXPORT_SYMBOL_GPL(kvm_init);
6aa8b732 6500
cb498ea2 6501void kvm_exit(void)
6aa8b732 6502{
baff59cc
VK
6503 int cpu;
6504
2b012812
SC
6505 /*
6506 * Note, unregistering /dev/kvm doesn't strictly need to come first,
6507 * fops_get(), a.k.a. try_module_get(), prevents acquiring references
6508 * to KVM while the module is being stopped.
6509 */
6aa8b732 6510 misc_deregister(&kvm_dev);
2b012812
SC
6511
6512 debugfs_remove_recursive(kvm_debugfs_dir);
baff59cc
VK
6513 for_each_possible_cpu(cpu)
6514 free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
c16f862d 6515 kmem_cache_destroy(kvm_vcpu_cache);
73b8dc04 6516 kvm_vfio_ops_exit();
af585b92 6517 kvm_async_pf_deinit();
441f7bfa 6518#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
fb3600cc 6519 unregister_syscore_ops(&kvm_syscore_ops);
aaf12a7b 6520 cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE);
441f7bfa 6521#endif
5910ccf0 6522 kvm_irqfd_exit();
6aa8b732 6523}
cb498ea2 6524EXPORT_SYMBOL_GPL(kvm_exit);
c57c8046
JS
6525
6526struct kvm_vm_worker_thread_context {
6527 struct kvm *kvm;
6528 struct task_struct *parent;
6529 struct completion init_done;
6530 kvm_vm_thread_fn_t thread_fn;
6531 uintptr_t data;
6532 int err;
6533};
6534
6535static int kvm_vm_worker_thread(void *context)
6536{
6537 /*
6538 * The init_context is allocated on the stack of the parent thread, so
6539 * we have to locally copy anything that is needed beyond initialization.
6540 */
6541 struct kvm_vm_worker_thread_context *init_context = context;
e45cce30 6542 struct task_struct *parent;
c57c8046
JS
6543 struct kvm *kvm = init_context->kvm;
6544 kvm_vm_thread_fn_t thread_fn = init_context->thread_fn;
6545 uintptr_t data = init_context->data;
6546 int err;
6547
6548 err = kthread_park(current);
6549 /* kthread_park(current) is never supposed to return an error */
6550 WARN_ON(err != 0);
6551 if (err)
6552 goto init_complete;
6553
6554 err = cgroup_attach_task_all(init_context->parent, current);
6555 if (err) {
6556 kvm_err("%s: cgroup_attach_task_all failed with err %d\n",
6557 __func__, err);
6558 goto init_complete;
6559 }
6560
6561 set_user_nice(current, task_nice(init_context->parent));
6562
6563init_complete:
6564 init_context->err = err;
6565 complete(&init_context->init_done);
6566 init_context = NULL;
6567
6568 if (err)
e45cce30 6569 goto out;
c57c8046
JS
6570
6571 /* Wait to be woken up by the spawner before proceeding. */
6572 kthread_parkme();
6573
6574 if (!kthread_should_stop())
6575 err = thread_fn(kvm, data);
6576
e45cce30
VS
6577out:
6578 /*
6579 * Move kthread back to its original cgroup to prevent it lingering in
6580 * the cgroup of the VM process, after the latter finishes its
6581 * execution.
6582 *
6583 * kthread_stop() waits on the 'exited' completion condition which is
6584 * set in exit_mm(), via mm_release(), in do_exit(). However, the
6585 * kthread is removed from the cgroup in the cgroup_exit() which is
6586 * called after the exit_mm(). This causes the kthread_stop() to return
6587 * before the kthread actually quits the cgroup.
6588 */
6589 rcu_read_lock();
6590 parent = rcu_dereference(current->real_parent);
6591 get_task_struct(parent);
6592 rcu_read_unlock();
6593 cgroup_attach_task_all(parent, current);
6594 put_task_struct(parent);
6595
c57c8046
JS
6596 return err;
6597}
6598
6599int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn,
6600 uintptr_t data, const char *name,
6601 struct task_struct **thread_ptr)
6602{
6603 struct kvm_vm_worker_thread_context init_context = {};
6604 struct task_struct *thread;
6605
6606 *thread_ptr = NULL;
6607 init_context.kvm = kvm;
6608 init_context.parent = current;
6609 init_context.thread_fn = thread_fn;
6610 init_context.data = data;
6611 init_completion(&init_context.init_done);
6612
6613 thread = kthread_run(kvm_vm_worker_thread, &init_context,
6614 "%s-%d", name, task_pid_nr(current));
6615 if (IS_ERR(thread))
6616 return PTR_ERR(thread);
6617
6618 /* kthread_run is never supposed to return NULL */
6619 WARN_ON(thread == NULL);
6620
6621 wait_for_completion(&init_context.init_done);
6622
6623 if (!init_context.err)
6624 *thread_ptr = thread;
6625
6626 return init_context.err;
6627}
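/*
 * Illustrative in-kernel sketch (not part of kvm_main.c): how a caller
 * might use kvm_vm_create_worker_thread() above (the in-tree user is the
 * x86 NX huge-page recovery worker).  The worker body and names are
 * invented for the example; the thread starts parked and must be
 * unparked by its creator before it runs thread_fn.
 */
#include <linux/kthread.h>
#include <linux/sched.h>

static int demo_worker_fn(struct kvm *kvm, uintptr_t data)
{
	while (!kthread_should_stop()) {
		/* ... periodic VM-wide maintenance would go here ... */
		schedule_timeout_interruptible(HZ);
	}

	return 0;
}

static int demo_start_worker(struct kvm *kvm, struct task_struct **task)
{
	int err = kvm_vm_create_worker_thread(kvm, demo_worker_fn, 0,
					      "kvm-demo", task);

	if (!err)
		kthread_unpark(*task);	/* let it past kthread_parkme() */

	return err;
}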