// SPDX-License-Identifier: GPL-2.0-only
/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Avi Kivity   <avi@qumranet.com>
 *   Yaniv Kamay  <yaniv@qumranet.com>
 */

#include <kvm/iodev.h>

#include <linux/kvm_host.h>
#include <linux/kvm.h>
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/percpu.h>
#include <linux/mm.h>
#include <linux/miscdevice.h>
#include <linux/vmalloc.h>
#include <linux/reboot.h>
#include <linux/debugfs.h>
#include <linux/highmem.h>
#include <linux/file.h>
#include <linux/syscore_ops.h>
#include <linux/cpu.h>
#include <linux/sched/signal.h>
#include <linux/sched/mm.h>
#include <linux/sched/stat.h>
#include <linux/cpumask.h>
#include <linux/smp.h>
#include <linux/anon_inodes.h>
#include <linux/profile.h>
#include <linux/kvm_para.h>
#include <linux/pagemap.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/bitops.h>
#include <linux/spinlock.h>
#include <linux/compat.h>
#include <linux/srcu.h>
#include <linux/hugetlb.h>
#include <linux/slab.h>
#include <linux/sort.h>
#include <linux/bsearch.h>
#include <linux/io.h>
#include <linux/lockdep.h>
#include <linux/kthread.h>
#include <linux/suspend.h>

#include <asm/processor.h>
#include <asm/ioctl.h>
#include <linux/uaccess.h>

#include "coalesced_mmio.h"
#include "async_pf.h"
#include "kvm_mm.h"
#include "vfio.h"

#include <trace/events/ipi.h>

#define CREATE_TRACE_POINTS
#include <trace/events/kvm.h>

#include <linux/kvm_dirty_ring.h>

/* Worst case buffer size needed for holding an integer. */
#define ITOA_MAX_LEN 12

MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");

/* Architectures should define their poll value according to the halt latency */
unsigned int halt_poll_ns = KVM_HALT_POLL_NS_DEFAULT;
module_param(halt_poll_ns, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns);

/* Default doubles per-vcpu halt_poll_ns. */
unsigned int halt_poll_ns_grow = 2;
module_param(halt_poll_ns_grow, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns_grow);

/* The start value to grow halt_poll_ns from */
unsigned int halt_poll_ns_grow_start = 10000; /* 10us */
module_param(halt_poll_ns_grow_start, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns_grow_start);

/* Default resets per-vcpu halt_poll_ns. */
unsigned int halt_poll_ns_shrink;
module_param(halt_poll_ns_shrink, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns_shrink);

/*
 * Ordering of locks:
 *
 *	kvm->lock --> kvm->slots_lock --> kvm->irq_lock
 */

DEFINE_MUTEX(kvm_lock);
LIST_HEAD(vm_list);

static struct kmem_cache *kvm_vcpu_cache;

static __read_mostly struct preempt_ops kvm_preempt_ops;
static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_running_vcpu);

struct dentry *kvm_debugfs_dir;
EXPORT_SYMBOL_GPL(kvm_debugfs_dir);

static const struct file_operations stat_fops_per_vm;

static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
			   unsigned long arg);
#ifdef CONFIG_KVM_COMPAT
static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl,
				  unsigned long arg);
#define KVM_COMPAT(c)	.compat_ioctl	= (c)
#else
/*
 * For architectures that don't implement a compat infrastructure,
 * adopt a double line of defense:
 * - Prevent a compat task from opening /dev/kvm
 * - If the open has been done by a 64bit task, and the KVM fd
 *   passed to a compat task, let the ioctls fail.
 */
static long kvm_no_compat_ioctl(struct file *file, unsigned int ioctl,
				unsigned long arg) { return -EINVAL; }

static int kvm_no_compat_open(struct inode *inode, struct file *file)
{
	return is_compat_task() ? -ENODEV : 0;
}
#define KVM_COMPAT(c)	.compat_ioctl	= kvm_no_compat_ioctl,	\
			.open		= kvm_no_compat_open
#endif
static int hardware_enable_all(void);
static void hardware_disable_all(void);

static void kvm_io_bus_destroy(struct kvm_io_bus *bus);

#define KVM_EVENT_CREATE_VM 0
#define KVM_EVENT_DESTROY_VM 1
static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm);
static unsigned long long kvm_createvm_count;
static unsigned long long kvm_active_vms;

static DEFINE_PER_CPU(cpumask_var_t, cpu_kick_mask);

__weak void kvm_arch_guest_memory_reclaimed(struct kvm *kvm)
{
}

bool kvm_is_zone_device_page(struct page *page)
{
	/*
	 * The metadata used by is_zone_device_page() to determine whether or
	 * not a page is ZONE_DEVICE is guaranteed to be valid if and only if
	 * the device has been pinned, e.g. by get_user_pages().  WARN if the
	 * page_count() is zero to help detect bad usage of this helper.
	 */
	if (WARN_ON_ONCE(!page_count(page)))
		return false;

	return is_zone_device_page(page);
}

/*
 * Returns a 'struct page' if the pfn is "valid" and backed by a refcounted
 * page, NULL otherwise.  Note, the list of refcounted PG_reserved page types
 * is likely incomplete, it has been compiled purely through people wanting to
 * back guests with a certain type of memory and encountering issues.
 */
struct page *kvm_pfn_to_refcounted_page(kvm_pfn_t pfn)
{
	struct page *page;

	if (!pfn_valid(pfn))
		return NULL;

	page = pfn_to_page(pfn);
	if (!PageReserved(page))
		return page;

	/* The ZERO_PAGE(s) is marked PG_reserved, but is refcounted. */
	if (is_zero_pfn(pfn))
		return page;

	/*
	 * ZONE_DEVICE pages currently set PG_reserved, but from a refcounting
	 * perspective they are "normal" pages, albeit with slightly different
	 * usage rules.
	 */
	if (kvm_is_zone_device_page(page))
		return page;

	return NULL;
}

/*
 * Switches to specified vcpu, until a matching vcpu_put()
 */
void vcpu_load(struct kvm_vcpu *vcpu)
{
	int cpu = get_cpu();

	__this_cpu_write(kvm_running_vcpu, vcpu);
	preempt_notifier_register(&vcpu->preempt_notifier);
	kvm_arch_vcpu_load(vcpu, cpu);
	put_cpu();
}
EXPORT_SYMBOL_GPL(vcpu_load);

void vcpu_put(struct kvm_vcpu *vcpu)
{
	preempt_disable();
	kvm_arch_vcpu_put(vcpu);
	preempt_notifier_unregister(&vcpu->preempt_notifier);
	__this_cpu_write(kvm_running_vcpu, NULL);
	preempt_enable();
}
EXPORT_SYMBOL_GPL(vcpu_put);

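/*
 * Illustrative sketch, not part of the upstream file: callers bracket work
 * that must run with the vCPU "loaded" on the current pCPU roughly as:
 *
 *	vcpu_load(vcpu);
 *	...			// state access, arch ioctl handling, etc.
 *	vcpu_put(vcpu);
 *
 * The exact call sites (generic vs. arch ioctl paths) vary by architecture
 * and kernel version; this only shows the intended pairing.
 */
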
/* TODO: merge with kvm_arch_vcpu_should_kick */
static bool kvm_request_needs_ipi(struct kvm_vcpu *vcpu, unsigned req)
{
	int mode = kvm_vcpu_exiting_guest_mode(vcpu);

	/*
	 * We need to wait for the VCPU to reenable interrupts and get out of
	 * READING_SHADOW_PAGE_TABLES mode.
	 */
	if (req & KVM_REQUEST_WAIT)
		return mode != OUTSIDE_GUEST_MODE;

	/*
	 * Need to kick a running VCPU, but otherwise there is nothing to do.
	 */
	return mode == IN_GUEST_MODE;
}

static void ack_kick(void *_completed)
{
}

static inline bool kvm_kick_many_cpus(struct cpumask *cpus, bool wait)
{
	if (cpumask_empty(cpus))
		return false;

	smp_call_function_many(cpus, ack_kick, NULL, wait);
	return true;
}

static void kvm_make_vcpu_request(struct kvm_vcpu *vcpu, unsigned int req,
				  struct cpumask *tmp, int current_cpu)
{
	int cpu;

	if (likely(!(req & KVM_REQUEST_NO_ACTION)))
		__kvm_make_request(req, vcpu);

	if (!(req & KVM_REQUEST_NO_WAKEUP) && kvm_vcpu_wake_up(vcpu))
		return;

	/*
	 * Note, the vCPU could get migrated to a different pCPU at any point
	 * after kvm_request_needs_ipi(), which could result in sending an IPI
	 * to the previous pCPU.  But, that's OK because the purpose of the IPI
	 * is to ensure the vCPU returns to OUTSIDE_GUEST_MODE, which is
	 * satisfied if the vCPU migrates.  Entering READING_SHADOW_PAGE_TABLES
	 * after this point is also OK, as the requirement is only that KVM wait
	 * for vCPUs that were reading SPTEs _before_ any changes were
	 * finalized.  See kvm_vcpu_kick() for more details on handling requests.
	 */
	if (kvm_request_needs_ipi(vcpu, req)) {
		cpu = READ_ONCE(vcpu->cpu);
		if (cpu != -1 && cpu != current_cpu)
			__cpumask_set_cpu(cpu, tmp);
	}
}

bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req,
				 unsigned long *vcpu_bitmap)
{
	struct kvm_vcpu *vcpu;
	struct cpumask *cpus;
	int i, me;
	bool called;

	me = get_cpu();

	cpus = this_cpu_cpumask_var_ptr(cpu_kick_mask);
	cpumask_clear(cpus);

	for_each_set_bit(i, vcpu_bitmap, KVM_MAX_VCPUS) {
		vcpu = kvm_get_vcpu(kvm, i);
		if (!vcpu)
			continue;
		kvm_make_vcpu_request(vcpu, req, cpus, me);
	}

	called = kvm_kick_many_cpus(cpus, !!(req & KVM_REQUEST_WAIT));
	put_cpu();

	return called;
}

bool kvm_make_all_cpus_request_except(struct kvm *kvm, unsigned int req,
				      struct kvm_vcpu *except)
{
	struct kvm_vcpu *vcpu;
	struct cpumask *cpus;
	unsigned long i;
	bool called;
	int me;

	me = get_cpu();

	cpus = this_cpu_cpumask_var_ptr(cpu_kick_mask);
	cpumask_clear(cpus);

	kvm_for_each_vcpu(i, vcpu, kvm) {
		if (vcpu == except)
			continue;
		kvm_make_vcpu_request(vcpu, req, cpus, me);
	}

	called = kvm_kick_many_cpus(cpus, !!(req & KVM_REQUEST_WAIT));
	put_cpu();

	return called;
}

bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req)
{
	return kvm_make_all_cpus_request_except(kvm, req, NULL);
}
EXPORT_SYMBOL_GPL(kvm_make_all_cpus_request);

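/*
 * Illustrative sketch, not part of this file: the request producers above
 * pair with consumers in the arch vCPU run loops, which typically do
 * something like the following before (re)entering the guest:
 *
 *	if (kvm_request_pending(vcpu)) {
 *		if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
 *			arch_flush_guest_tlb(vcpu);	// name assumed, arch-specific
 *		...
 *	}
 *
 * kvm_check_request() clears the request bit, so each posted request is
 * consumed at most once.
 */
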
void kvm_flush_remote_tlbs(struct kvm *kvm)
{
	++kvm->stat.generic.remote_tlb_flush_requests;

	/*
	 * We want to publish modifications to the page tables before reading
	 * mode.  Pairs with a memory barrier in arch-specific code.
	 * - x86: smp_mb__after_srcu_read_unlock in vcpu_enter_guest
	 *   and smp_mb in walk_shadow_page_lockless_begin/end.
	 * - powerpc: smp_mb in kvmppc_prepare_to_enter.
	 *
	 * There is already an smp_mb__after_atomic() before
	 * kvm_make_all_cpus_request() reads vcpu->mode.  We reuse that
	 * barrier here.
	 */
	if (!kvm_arch_flush_remote_tlbs(kvm)
	    || kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
		++kvm->stat.generic.remote_tlb_flush;
}
EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs);

void kvm_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, u64 nr_pages)
{
	if (!kvm_arch_flush_remote_tlbs_range(kvm, gfn, nr_pages))
		return;

	/*
	 * Fall back to flushing the entire TLB if the architecture's
	 * range-based TLB invalidation is unsupported or can't be performed
	 * for whatever reason.
	 */
	kvm_flush_remote_tlbs(kvm);
}

void kvm_flush_remote_tlbs_memslot(struct kvm *kvm,
				   const struct kvm_memory_slot *memslot)
{
	/*
	 * All current use cases for flushing the TLBs for a specific memslot
	 * are related to dirty logging, and many do the TLB flush out of
	 * mmu_lock.  The interaction between the various operations on a
	 * memslot must be serialized by slots_lock to ensure the TLB flush
	 * from one operation is observed by any other operation on the same
	 * memslot.
	 */
	lockdep_assert_held(&kvm->slots_lock);
	kvm_flush_remote_tlbs_range(kvm, memslot->base_gfn, memslot->npages);
}

static void kvm_flush_shadow_all(struct kvm *kvm)
{
	kvm_arch_flush_shadow_all(kvm);
	kvm_arch_guest_memory_reclaimed(kvm);
}

#ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE
static inline void *mmu_memory_cache_alloc_obj(struct kvm_mmu_memory_cache *mc,
					       gfp_t gfp_flags)
{
	gfp_flags |= mc->gfp_zero;

	if (mc->kmem_cache)
		return kmem_cache_alloc(mc->kmem_cache, gfp_flags);
	else
		return (void *)__get_free_page(gfp_flags);
}

int __kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int capacity, int min)
{
	gfp_t gfp = mc->gfp_custom ? mc->gfp_custom : GFP_KERNEL_ACCOUNT;
	void *obj;

	if (mc->nobjs >= min)
		return 0;

	if (unlikely(!mc->objects)) {
		if (WARN_ON_ONCE(!capacity))
			return -EIO;

		mc->objects = kvmalloc_array(capacity, sizeof(void *), gfp);
		if (!mc->objects)
			return -ENOMEM;

		mc->capacity = capacity;
	}

	/* It is illegal to request a different capacity across topups. */
	if (WARN_ON_ONCE(mc->capacity != capacity))
		return -EIO;

	while (mc->nobjs < mc->capacity) {
		obj = mmu_memory_cache_alloc_obj(mc, gfp);
		if (!obj)
			return mc->nobjs >= min ? 0 : -ENOMEM;
		mc->objects[mc->nobjs++] = obj;
	}
	return 0;
}

int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min)
{
	return __kvm_mmu_topup_memory_cache(mc, KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE, min);
}

int kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache *mc)
{
	return mc->nobjs;
}

void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
{
	while (mc->nobjs) {
		if (mc->kmem_cache)
			kmem_cache_free(mc->kmem_cache, mc->objects[--mc->nobjs]);
		else
			free_page((unsigned long)mc->objects[--mc->nobjs]);
	}

	kvfree(mc->objects);

	mc->objects = NULL;
	mc->capacity = 0;
}

void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
{
	void *p;

	if (WARN_ON(!mc->nobjs))
		p = mmu_memory_cache_alloc_obj(mc, GFP_ATOMIC | __GFP_ACCOUNT);
	else
		p = mc->objects[--mc->nobjs];
	BUG_ON(!p);
	return p;
}
#endif

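/*
 * Illustrative sketch, not part of this file: arch page-fault paths top up a
 * memory cache while sleeping is still allowed, then allocate from it under
 * mmu_lock where GFP_KERNEL allocations would be illegal, roughly:
 *
 *	r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_page_cache, min);
 *	if (r)
 *		return r;
 *	...
 *	write_lock(&kvm->mmu_lock);
 *	pte = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
 *	...
 *	write_unlock(&kvm->mmu_lock);
 *
 * ("vcpu->arch.mmu_page_cache" and "min" are arch-specific details, shown
 * only for illustration.)
 */
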
static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
{
	mutex_init(&vcpu->mutex);
	vcpu->cpu = -1;
	vcpu->kvm = kvm;
	vcpu->vcpu_id = id;
	vcpu->pid = NULL;
#ifndef __KVM_HAVE_ARCH_WQP
	rcuwait_init(&vcpu->wait);
#endif
	kvm_async_pf_vcpu_init(vcpu);

	kvm_vcpu_set_in_spin_loop(vcpu, false);
	kvm_vcpu_set_dy_eligible(vcpu, false);
	vcpu->preempted = false;
	vcpu->ready = false;
	preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
	vcpu->last_used_slot = NULL;

	/* Fill the stats id string for the vcpu */
	snprintf(vcpu->stats_id, sizeof(vcpu->stats_id), "kvm-%d/vcpu-%d",
		 task_pid_nr(current), id);
}

static void kvm_vcpu_destroy(struct kvm_vcpu *vcpu)
{
	kvm_arch_vcpu_destroy(vcpu);
	kvm_dirty_ring_free(&vcpu->dirty_ring);

	/*
	 * No need for rcu_read_lock as VCPU_RUN is the only place that changes
	 * the vcpu->pid pointer, and at destruction time all file descriptors
	 * are already gone.
	 */
	put_pid(rcu_dereference_protected(vcpu->pid, 1));

	free_page((unsigned long)vcpu->run);
	kmem_cache_free(kvm_vcpu_cache, vcpu);
}

void kvm_destroy_vcpus(struct kvm *kvm)
{
	unsigned long i;
	struct kvm_vcpu *vcpu;

	kvm_for_each_vcpu(i, vcpu, kvm) {
		kvm_vcpu_destroy(vcpu);
		xa_erase(&kvm->vcpu_array, i);
	}

	atomic_set(&kvm->online_vcpus, 0);
}
EXPORT_SYMBOL_GPL(kvm_destroy_vcpus);

#ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER
static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
{
	return container_of(mn, struct kvm, mmu_notifier);
}

typedef bool (*gfn_handler_t)(struct kvm *kvm, struct kvm_gfn_range *range);

typedef void (*on_lock_fn_t)(struct kvm *kvm);

struct kvm_mmu_notifier_range {
	/*
	 * 64-bit addresses, as KVM notifiers can operate on host virtual
	 * addresses (unsigned long) and guest physical addresses (64-bit).
	 */
	u64 start;
	u64 end;
	union kvm_mmu_notifier_arg arg;
	gfn_handler_t handler;
	on_lock_fn_t on_lock;
	bool flush_on_ret;
	bool may_block;
};

/*
 * The inner-most helper returns a tuple containing the return value from the
 * arch- and action-specific handler, plus a flag indicating whether or not at
 * least one memslot was found, i.e. if the handler found guest memory.
 *
 * Note, most notifiers are averse to booleans, so even though KVM tracks the
 * return from arch code as a bool, outer helpers will cast it to an int. :-(
 */
typedef struct kvm_mmu_notifier_return {
	bool ret;
	bool found_memslot;
} kvm_mn_ret_t;

/*
 * Use a dedicated stub instead of NULL to indicate that there is no callback
 * function/handler.  The compiler technically can't guarantee that a real
 * function will have a non-zero address, and so it will generate code to
 * check for !NULL, whereas comparing against a stub will be elided at compile
 * time (unless the compiler is getting long in the tooth, e.g. gcc 4.9).
 */
static void kvm_null_fn(void)
{

}
#define IS_KVM_NULL_FN(fn) ((fn) == (void *)kvm_null_fn)

static const union kvm_mmu_notifier_arg KVM_MMU_NOTIFIER_NO_ARG;

/* Iterate over each memslot intersecting [start, last] (inclusive) range */
#define kvm_for_each_memslot_in_hva_range(node, slots, start, last)	     \
	for (node = interval_tree_iter_first(&slots->hva_tree, start, last); \
	     node;							     \
	     node = interval_tree_iter_next(node, start, last))	     \

static __always_inline kvm_mn_ret_t __kvm_handle_hva_range(struct kvm *kvm,
							   const struct kvm_mmu_notifier_range *range)
{
	struct kvm_mmu_notifier_return r = {
		.ret = false,
		.found_memslot = false,
	};
	struct kvm_gfn_range gfn_range;
	struct kvm_memory_slot *slot;
	struct kvm_memslots *slots;
	int i, idx;

	if (WARN_ON_ONCE(range->end <= range->start))
		return r;

	/* A null handler is allowed if and only if on_lock() is provided. */
	if (WARN_ON_ONCE(IS_KVM_NULL_FN(range->on_lock) &&
			 IS_KVM_NULL_FN(range->handler)))
		return r;

	idx = srcu_read_lock(&kvm->srcu);

	for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
		struct interval_tree_node *node;

		slots = __kvm_memslots(kvm, i);
		kvm_for_each_memslot_in_hva_range(node, slots,
						  range->start, range->end - 1) {
			unsigned long hva_start, hva_end;

			slot = container_of(node, struct kvm_memory_slot, hva_node[slots->node_idx]);
			hva_start = max_t(unsigned long, range->start, slot->userspace_addr);
			hva_end = min_t(unsigned long, range->end,
					slot->userspace_addr + (slot->npages << PAGE_SHIFT));

			/*
			 * To optimize for the likely case where the address
			 * range is covered by zero or one memslots, don't
			 * bother making these conditional (to avoid writes on
			 * the second or later invocation of the handler).
			 */
			gfn_range.arg = range->arg;
			gfn_range.may_block = range->may_block;

			/*
			 * {gfn(page) | page intersects with [hva_start, hva_end)} =
			 * {gfn_start, gfn_start+1, ..., gfn_end-1}.
			 */
			gfn_range.start = hva_to_gfn_memslot(hva_start, slot);
			gfn_range.end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, slot);
			gfn_range.slot = slot;

			if (!r.found_memslot) {
				r.found_memslot = true;
				KVM_MMU_LOCK(kvm);
				if (!IS_KVM_NULL_FN(range->on_lock))
					range->on_lock(kvm);

				if (IS_KVM_NULL_FN(range->handler))
					break;
			}
			r.ret |= range->handler(kvm, &gfn_range);
		}
	}

	if (range->flush_on_ret && r.ret)
		kvm_flush_remote_tlbs(kvm);

	if (r.found_memslot)
		KVM_MMU_UNLOCK(kvm);

	srcu_read_unlock(&kvm->srcu, idx);

	return r;
}

static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn,
						unsigned long start,
						unsigned long end,
						union kvm_mmu_notifier_arg arg,
						gfn_handler_t handler)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	const struct kvm_mmu_notifier_range range = {
		.start		= start,
		.end		= end,
		.arg		= arg,
		.handler	= handler,
		.on_lock	= (void *)kvm_null_fn,
		.flush_on_ret	= true,
		.may_block	= false,
	};

	return __kvm_handle_hva_range(kvm, &range).ret;
}

static __always_inline int kvm_handle_hva_range_no_flush(struct mmu_notifier *mn,
							 unsigned long start,
							 unsigned long end,
							 gfn_handler_t handler)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	const struct kvm_mmu_notifier_range range = {
		.start		= start,
		.end		= end,
		.handler	= handler,
		.on_lock	= (void *)kvm_null_fn,
		.flush_on_ret	= false,
		.may_block	= false,
	};

	return __kvm_handle_hva_range(kvm, &range).ret;
}

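/*
 * Illustrative sketch, not part of this file: the gfn_handler_t callbacks
 * passed to the helpers above (kvm_unmap_gfn_range, kvm_age_gfn, etc.) are
 * implemented by arch code and operate on a gfn range within a single
 * memslot, roughly:
 *
 *	bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 *	{
 *		// walk stage-2/shadow entries covering
 *		// [range->start, range->end) and return true if
 *		// a TLB flush is needed
 *	}
 */
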
static bool kvm_change_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	/*
	 * Skipping invalid memslots is correct if and only if change_pte() is
	 * surrounded by invalidate_range_{start,end}(), which is currently
	 * guaranteed by the primary MMU.  If that ever changes, KVM needs to
	 * unmap the memslot instead of skipping the memslot to ensure that KVM
	 * doesn't hold references to the old PFN.
	 */
	WARN_ON_ONCE(!READ_ONCE(kvm->mn_active_invalidate_count));

	if (range->slot->flags & KVM_MEMSLOT_INVALID)
		return false;

	return kvm_set_spte_gfn(kvm, range);
}

static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
					struct mm_struct *mm,
					unsigned long address,
					pte_t pte)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	const union kvm_mmu_notifier_arg arg = { .pte = pte };

	trace_kvm_set_spte_hva(address);

	/*
	 * .change_pte() must be surrounded by .invalidate_range_{start,end}().
	 * If mmu_invalidate_in_progress is zero, then no in-progress
	 * invalidations, including this one, found a relevant memslot at
	 * start(); rechecking memslots here is unnecessary.  Note, a false
	 * positive (count elevated by a different invalidation) is sub-optimal
	 * but functionally ok.
	 */
	WARN_ON_ONCE(!READ_ONCE(kvm->mn_active_invalidate_count));
	if (!READ_ONCE(kvm->mmu_invalidate_in_progress))
		return;

	kvm_handle_hva_range(mn, address, address + 1, arg, kvm_change_spte_gfn);
}

void kvm_mmu_invalidate_begin(struct kvm *kvm)
{
	lockdep_assert_held_write(&kvm->mmu_lock);
	/*
	 * The count increase must become visible at unlock time as no
	 * spte can be established without taking the mmu_lock and
	 * count is also read inside the mmu_lock critical section.
	 */
	kvm->mmu_invalidate_in_progress++;

	if (likely(kvm->mmu_invalidate_in_progress == 1)) {
		kvm->mmu_invalidate_range_start = INVALID_GPA;
		kvm->mmu_invalidate_range_end = INVALID_GPA;
	}
}

void kvm_mmu_invalidate_range_add(struct kvm *kvm, gfn_t start, gfn_t end)
{
	lockdep_assert_held_write(&kvm->mmu_lock);

	WARN_ON_ONCE(!kvm->mmu_invalidate_in_progress);

	if (likely(kvm->mmu_invalidate_range_start == INVALID_GPA)) {
		kvm->mmu_invalidate_range_start = start;
		kvm->mmu_invalidate_range_end = end;
	} else {
		/*
		 * Fully tracking multiple concurrent ranges has diminishing
		 * returns.  Keep things simple and just find the minimal range
		 * which includes the current and new ranges.  As there won't be
		 * enough information to subtract a range after its invalidate
		 * completes, any ranges invalidated concurrently will
		 * accumulate and persist until all outstanding invalidates
		 * complete.
		 */
		kvm->mmu_invalidate_range_start =
			min(kvm->mmu_invalidate_range_start, start);
		kvm->mmu_invalidate_range_end =
			max(kvm->mmu_invalidate_range_end, end);
	}
}

bool kvm_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
	kvm_mmu_invalidate_range_add(kvm, range->start, range->end);
	return kvm_unmap_gfn_range(kvm, range);
}

static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
					const struct mmu_notifier_range *range)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	const struct kvm_mmu_notifier_range hva_range = {
		.start		= range->start,
		.end		= range->end,
		.handler	= kvm_mmu_unmap_gfn_range,
		.on_lock	= kvm_mmu_invalidate_begin,
		.flush_on_ret	= true,
		.may_block	= mmu_notifier_range_blockable(range),
	};

	trace_kvm_unmap_hva_range(range->start, range->end);

	/*
	 * Prevent memslot modification between range_start() and range_end()
	 * so that conditionally locking provides the same result in both
	 * functions.  Without that guarantee, the mmu_invalidate_in_progress
	 * adjustments will be imbalanced.
	 *
	 * Pairs with the decrement in range_end().
	 */
	spin_lock(&kvm->mn_invalidate_lock);
	kvm->mn_active_invalidate_count++;
	spin_unlock(&kvm->mn_invalidate_lock);

	/*
	 * Invalidate pfn caches _before_ invalidating the secondary MMUs, i.e.
	 * before acquiring mmu_lock, to avoid holding mmu_lock while acquiring
	 * each cache's lock.  There are relatively few caches in existence at
	 * any given time, and the caches themselves can check for hva overlap,
	 * i.e. don't need to rely on memslot overlap checks for performance.
	 * Because this runs without holding mmu_lock, the pfn caches must use
	 * mn_active_invalidate_count (see above) instead of
	 * mmu_invalidate_in_progress.
	 */
	gfn_to_pfn_cache_invalidate_start(kvm, range->start, range->end,
					  hva_range.may_block);

	/*
	 * If one or more memslots were found and thus zapped, notify arch code
	 * that guest memory has been reclaimed.  This needs to be done *after*
	 * dropping mmu_lock, as x86's reclaim path is slooooow.
	 */
	if (__kvm_handle_hva_range(kvm, &hva_range).found_memslot)
		kvm_arch_guest_memory_reclaimed(kvm);

	return 0;
}

void kvm_mmu_invalidate_end(struct kvm *kvm)
{
	lockdep_assert_held_write(&kvm->mmu_lock);

	/*
	 * This sequence increase will notify the kvm page fault that
	 * the page that is going to be mapped in the spte could have
	 * been freed.
	 */
	kvm->mmu_invalidate_seq++;
	smp_wmb();
	/*
	 * The above sequence increase must be visible before the
	 * below count decrease, which is ensured by the smp_wmb above
	 * in conjunction with the smp_rmb in mmu_invalidate_retry().
	 */
	kvm->mmu_invalidate_in_progress--;
	KVM_BUG_ON(kvm->mmu_invalidate_in_progress < 0, kvm);

	/*
	 * Assert that at least one range was added between start() and end().
	 * Not adding a range isn't fatal, but it is a KVM bug.
	 */
	WARN_ON_ONCE(kvm->mmu_invalidate_range_start == INVALID_GPA);
}

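/*
 * Illustrative sketch, not part of this file: arch page-fault handlers pair
 * with the begin()/end() bookkeeping above via mmu_invalidate_retry(), e.g.:
 *
 *	mmu_seq = kvm->mmu_invalidate_seq;
 *	smp_rmb();
 *	pfn = ...;			// translate HVA -> PFN, may sleep
 *	write_lock(&kvm->mmu_lock);
 *	if (mmu_invalidate_retry(kvm, mmu_seq))
 *		goto out_unlock;	// the PFN may have been invalidated
 *	... install the mapping ...
 *
 * The surrounding arch code is paraphrased; the seq/retry helpers themselves
 * live in include/linux/kvm_host.h.
 */
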
static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
					const struct mmu_notifier_range *range)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	const struct kvm_mmu_notifier_range hva_range = {
		.start		= range->start,
		.end		= range->end,
		.handler	= (void *)kvm_null_fn,
		.on_lock	= kvm_mmu_invalidate_end,
		.flush_on_ret	= false,
		.may_block	= mmu_notifier_range_blockable(range),
	};
	bool wake;

	__kvm_handle_hva_range(kvm, &hva_range);

	/* Pairs with the increment in range_start(). */
	spin_lock(&kvm->mn_invalidate_lock);
	if (!WARN_ON_ONCE(!kvm->mn_active_invalidate_count))
		--kvm->mn_active_invalidate_count;
	wake = !kvm->mn_active_invalidate_count;
	spin_unlock(&kvm->mn_invalidate_lock);

	/*
	 * There can only be one waiter, since the wait happens under
	 * slots_lock.
	 */
	if (wake)
		rcuwait_wake_up(&kvm->mn_memslots_update_rcuwait);
}

static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
					      struct mm_struct *mm,
					      unsigned long start,
					      unsigned long end)
{
	trace_kvm_age_hva(start, end);

	return kvm_handle_hva_range(mn, start, end, KVM_MMU_NOTIFIER_NO_ARG,
				    kvm_age_gfn);
}

static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
					struct mm_struct *mm,
					unsigned long start,
					unsigned long end)
{
	trace_kvm_age_hva(start, end);

	/*
	 * Even though we do not flush TLB, this will still adversely
	 * affect performance on pre-Haswell Intel EPT, where there is
	 * no EPT Access Bit to clear so that we have to tear down EPT
	 * tables instead. If we find this unacceptable, we can always
	 * add a parameter to kvm_age_hva so that it effectively doesn't
	 * do anything on clear_young.
	 *
	 * Also note that currently we never issue secondary TLB flushes
	 * from clear_young, leaving this job up to the regular system
	 * cadence. If we find this inaccurate, we might come up with a
	 * more sophisticated heuristic later.
	 */
	return kvm_handle_hva_range_no_flush(mn, start, end, kvm_age_gfn);
}

static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
				       struct mm_struct *mm,
				       unsigned long address)
{
	trace_kvm_test_age_hva(address);

	return kvm_handle_hva_range_no_flush(mn, address, address + 1,
					     kvm_test_age_gfn);
}

static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
				     struct mm_struct *mm)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int idx;

	idx = srcu_read_lock(&kvm->srcu);
	kvm_flush_shadow_all(kvm);
	srcu_read_unlock(&kvm->srcu, idx);
}

static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
	.invalidate_range_start	= kvm_mmu_notifier_invalidate_range_start,
	.invalidate_range_end	= kvm_mmu_notifier_invalidate_range_end,
	.clear_flush_young	= kvm_mmu_notifier_clear_flush_young,
	.clear_young		= kvm_mmu_notifier_clear_young,
	.test_young		= kvm_mmu_notifier_test_young,
	.change_pte		= kvm_mmu_notifier_change_pte,
	.release		= kvm_mmu_notifier_release,
};

static int kvm_init_mmu_notifier(struct kvm *kvm)
{
	kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
	return mmu_notifier_register(&kvm->mmu_notifier, current->mm);
}

#else /* !CONFIG_KVM_GENERIC_MMU_NOTIFIER */

static int kvm_init_mmu_notifier(struct kvm *kvm)
{
	return 0;
}

#endif /* CONFIG_KVM_GENERIC_MMU_NOTIFIER */

#ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
static int kvm_pm_notifier_call(struct notifier_block *bl,
				unsigned long state,
				void *unused)
{
	struct kvm *kvm = container_of(bl, struct kvm, pm_notifier);

	return kvm_arch_pm_notifier(kvm, state);
}

static void kvm_init_pm_notifier(struct kvm *kvm)
{
	kvm->pm_notifier.notifier_call = kvm_pm_notifier_call;
	/* Suspend KVM before we suspend ftrace, RCU, etc. */
	kvm->pm_notifier.priority = INT_MAX;
	register_pm_notifier(&kvm->pm_notifier);
}

static void kvm_destroy_pm_notifier(struct kvm *kvm)
{
	unregister_pm_notifier(&kvm->pm_notifier);
}
#else /* !CONFIG_HAVE_KVM_PM_NOTIFIER */
static void kvm_init_pm_notifier(struct kvm *kvm)
{
}

static void kvm_destroy_pm_notifier(struct kvm *kvm)
{
}
#endif /* CONFIG_HAVE_KVM_PM_NOTIFIER */

static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
{
	if (!memslot->dirty_bitmap)
		return;

	kvfree(memslot->dirty_bitmap);
	memslot->dirty_bitmap = NULL;
}

/* This does not remove the slot from struct kvm_memslots data structures */
static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
{
	if (slot->flags & KVM_MEM_GUEST_MEMFD)
		kvm_gmem_unbind(slot);

	kvm_destroy_dirty_bitmap(slot);

	kvm_arch_free_memslot(kvm, slot);

	kfree(slot);
}

static void kvm_free_memslots(struct kvm *kvm, struct kvm_memslots *slots)
{
	struct hlist_node *idnode;
	struct kvm_memory_slot *memslot;
	int bkt;

	/*
	 * The same memslot objects live in both active and inactive sets,
	 * arbitrarily free using index '1' so the second invocation of this
	 * function isn't operating over a structure with dangling pointers
	 * (even though this function isn't actually touching them).
	 */
	if (!slots->node_idx)
		return;

	hash_for_each_safe(slots->id_hash, bkt, idnode, memslot, id_node[1])
		kvm_free_memslot(kvm, memslot);
}

static umode_t kvm_stats_debugfs_mode(const struct _kvm_stats_desc *pdesc)
{
	switch (pdesc->desc.flags & KVM_STATS_TYPE_MASK) {
	case KVM_STATS_TYPE_INSTANT:
		return 0444;
	case KVM_STATS_TYPE_CUMULATIVE:
	case KVM_STATS_TYPE_PEAK:
	default:
		return 0644;
	}
}

static void kvm_destroy_vm_debugfs(struct kvm *kvm)
{
	int i;
	int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc +
				      kvm_vcpu_stats_header.num_desc;

	if (IS_ERR(kvm->debugfs_dentry))
		return;

	debugfs_remove_recursive(kvm->debugfs_dentry);

	if (kvm->debugfs_stat_data) {
		for (i = 0; i < kvm_debugfs_num_entries; i++)
			kfree(kvm->debugfs_stat_data[i]);
		kfree(kvm->debugfs_stat_data);
	}
}

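/*
 * Illustrative note, not part of this file: with debugfs mounted in the usual
 * place, the per-VM directory created below ends up looking roughly like
 * /sys/kernel/debug/kvm/<pid>-<vm_fd>/, containing one file per VM and vCPU
 * stat descriptor, plus any arch-specific entries created by
 * kvm_arch_create_vm_debugfs().
 */
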
static int kvm_create_vm_debugfs(struct kvm *kvm, const char *fdname)
{
	static DEFINE_MUTEX(kvm_debugfs_lock);
	struct dentry *dent;
	char dir_name[ITOA_MAX_LEN * 2];
	struct kvm_stat_data *stat_data;
	const struct _kvm_stats_desc *pdesc;
	int i, ret = -ENOMEM;
	int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc +
				      kvm_vcpu_stats_header.num_desc;

	if (!debugfs_initialized())
		return 0;

	snprintf(dir_name, sizeof(dir_name), "%d-%s", task_pid_nr(current), fdname);
	mutex_lock(&kvm_debugfs_lock);
	dent = debugfs_lookup(dir_name, kvm_debugfs_dir);
	if (dent) {
		pr_warn_ratelimited("KVM: debugfs: duplicate directory %s\n", dir_name);
		dput(dent);
		mutex_unlock(&kvm_debugfs_lock);
		return 0;
	}
	dent = debugfs_create_dir(dir_name, kvm_debugfs_dir);
	mutex_unlock(&kvm_debugfs_lock);
	if (IS_ERR(dent))
		return 0;

	kvm->debugfs_dentry = dent;
	kvm->debugfs_stat_data = kcalloc(kvm_debugfs_num_entries,
					 sizeof(*kvm->debugfs_stat_data),
					 GFP_KERNEL_ACCOUNT);
	if (!kvm->debugfs_stat_data)
		goto out_err;

	for (i = 0; i < kvm_vm_stats_header.num_desc; ++i) {
		pdesc = &kvm_vm_stats_desc[i];
		stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT);
		if (!stat_data)
			goto out_err;

		stat_data->kvm = kvm;
		stat_data->desc = pdesc;
		stat_data->kind = KVM_STAT_VM;
		kvm->debugfs_stat_data[i] = stat_data;
		debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
				    kvm->debugfs_dentry, stat_data,
				    &stat_fops_per_vm);
	}

	for (i = 0; i < kvm_vcpu_stats_header.num_desc; ++i) {
		pdesc = &kvm_vcpu_stats_desc[i];
		stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT);
		if (!stat_data)
			goto out_err;

		stat_data->kvm = kvm;
		stat_data->desc = pdesc;
		stat_data->kind = KVM_STAT_VCPU;
		kvm->debugfs_stat_data[i + kvm_vm_stats_header.num_desc] = stat_data;
		debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
				    kvm->debugfs_dentry, stat_data,
				    &stat_fops_per_vm);
	}

	kvm_arch_create_vm_debugfs(kvm);
	return 0;
out_err:
	kvm_destroy_vm_debugfs(kvm);
	return ret;
}

/*
 * Called after the VM is otherwise initialized, but just before adding it to
 * the vm_list.
 */
int __weak kvm_arch_post_init_vm(struct kvm *kvm)
{
	return 0;
}

/*
 * Called just after removing the VM from the vm_list, but before doing any
 * other destruction.
 */
void __weak kvm_arch_pre_destroy_vm(struct kvm *kvm)
{
}

/*
 * Called after the per-VM debugfs directory is created.  When called,
 * kvm->debugfs_dentry is already set up, so arch-specific debugfs entries
 * can be created under it.  Cleanup is done automatically and recursively by
 * kvm_destroy_vm_debugfs(), so a per-arch destroy interface is not needed.
 */
void __weak kvm_arch_create_vm_debugfs(struct kvm *kvm)
{
}

static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
{
	struct kvm *kvm = kvm_arch_alloc_vm();
	struct kvm_memslots *slots;
	int r = -ENOMEM;
	int i, j;

	if (!kvm)
		return ERR_PTR(-ENOMEM);

	KVM_MMU_LOCK_INIT(kvm);
	mmgrab(current->mm);
	kvm->mm = current->mm;
	kvm_eventfd_init(kvm);
	mutex_init(&kvm->lock);
	mutex_init(&kvm->irq_lock);
	mutex_init(&kvm->slots_lock);
	mutex_init(&kvm->slots_arch_lock);
	spin_lock_init(&kvm->mn_invalidate_lock);
	rcuwait_init(&kvm->mn_memslots_update_rcuwait);
	xa_init(&kvm->vcpu_array);
#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
	xa_init(&kvm->mem_attr_array);
#endif

	INIT_LIST_HEAD(&kvm->gpc_list);
	spin_lock_init(&kvm->gpc_lock);

	INIT_LIST_HEAD(&kvm->devices);
	kvm->max_vcpus = KVM_MAX_VCPUS;

	BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);

	/*
	 * Force subsequent debugfs file creations to fail if the VM directory
	 * is not created (by kvm_create_vm_debugfs()).
	 */
	kvm->debugfs_dentry = ERR_PTR(-ENOENT);

	snprintf(kvm->stats_id, sizeof(kvm->stats_id), "kvm-%d",
		 task_pid_nr(current));

	if (init_srcu_struct(&kvm->srcu))
		goto out_err_no_srcu;
	if (init_srcu_struct(&kvm->irq_srcu))
		goto out_err_no_irq_srcu;

	refcount_set(&kvm->users_count, 1);
	for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
		for (j = 0; j < 2; j++) {
			slots = &kvm->__memslots[i][j];

			atomic_long_set(&slots->last_used_slot, (unsigned long)NULL);
			slots->hva_tree = RB_ROOT_CACHED;
			slots->gfn_tree = RB_ROOT;
			hash_init(slots->id_hash);
			slots->node_idx = j;

			/* Generations must be different for each address space. */
			slots->generation = i;
		}

		rcu_assign_pointer(kvm->memslots[i], &kvm->__memslots[i][0]);
	}

	for (i = 0; i < KVM_NR_BUSES; i++) {
		rcu_assign_pointer(kvm->buses[i],
			kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL_ACCOUNT));
		if (!kvm->buses[i])
			goto out_err_no_arch_destroy_vm;
	}

	r = kvm_arch_init_vm(kvm, type);
	if (r)
		goto out_err_no_arch_destroy_vm;

	r = hardware_enable_all();
	if (r)
		goto out_err_no_disable;

#ifdef CONFIG_HAVE_KVM_IRQCHIP
	INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
#endif

	r = kvm_init_mmu_notifier(kvm);
	if (r)
		goto out_err_no_mmu_notifier;

	r = kvm_coalesced_mmio_init(kvm);
	if (r < 0)
		goto out_no_coalesced_mmio;

	r = kvm_create_vm_debugfs(kvm, fdname);
	if (r)
		goto out_err_no_debugfs;

	r = kvm_arch_post_init_vm(kvm);
	if (r)
		goto out_err;

	mutex_lock(&kvm_lock);
	list_add(&kvm->vm_list, &vm_list);
	mutex_unlock(&kvm_lock);

	preempt_notifier_inc();
	kvm_init_pm_notifier(kvm);

	return kvm;

out_err:
	kvm_destroy_vm_debugfs(kvm);
out_err_no_debugfs:
	kvm_coalesced_mmio_free(kvm);
out_no_coalesced_mmio:
#ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER
	if (kvm->mmu_notifier.ops)
		mmu_notifier_unregister(&kvm->mmu_notifier, current->mm);
#endif
out_err_no_mmu_notifier:
	hardware_disable_all();
out_err_no_disable:
	kvm_arch_destroy_vm(kvm);
out_err_no_arch_destroy_vm:
	WARN_ON_ONCE(!refcount_dec_and_test(&kvm->users_count));
	for (i = 0; i < KVM_NR_BUSES; i++)
		kfree(kvm_get_bus(kvm, i));
	cleanup_srcu_struct(&kvm->irq_srcu);
out_err_no_irq_srcu:
	cleanup_srcu_struct(&kvm->srcu);
out_err_no_srcu:
	kvm_arch_free_vm(kvm);
	mmdrop(current->mm);
	return ERR_PTR(r);
}

static void kvm_destroy_devices(struct kvm *kvm)
{
	struct kvm_device *dev, *tmp;

	/*
	 * We do not need to take the kvm->lock here, because nobody else
	 * has a reference to the struct kvm at this point and therefore
	 * cannot access the devices list anyhow.
	 */
	list_for_each_entry_safe(dev, tmp, &kvm->devices, vm_node) {
		list_del(&dev->vm_node);
		dev->ops->destroy(dev);
	}
}

static void kvm_destroy_vm(struct kvm *kvm)
{
	int i;
	struct mm_struct *mm = kvm->mm;

	kvm_destroy_pm_notifier(kvm);
	kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm);
	kvm_destroy_vm_debugfs(kvm);
	kvm_arch_sync_events(kvm);
	mutex_lock(&kvm_lock);
	list_del(&kvm->vm_list);
	mutex_unlock(&kvm_lock);
	kvm_arch_pre_destroy_vm(kvm);

	kvm_free_irq_routing(kvm);
	for (i = 0; i < KVM_NR_BUSES; i++) {
		struct kvm_io_bus *bus = kvm_get_bus(kvm, i);

		if (bus)
			kvm_io_bus_destroy(bus);
		kvm->buses[i] = NULL;
	}
	kvm_coalesced_mmio_free(kvm);
#ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER
	mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
	/*
	 * At this point, pending calls to invalidate_range_start()
	 * have completed but no more MMU notifiers will run, so
	 * mn_active_invalidate_count may remain unbalanced.
	 * No threads can be waiting in kvm_swap_active_memslots() as the
	 * last reference on KVM has been dropped, but freeing
	 * memslots would deadlock without this manual intervention.
	 *
	 * If the count isn't unbalanced, i.e. KVM did NOT unregister its MMU
	 * notifier between a start() and end(), then there shouldn't be any
	 * in-progress invalidations.
	 */
	WARN_ON(rcuwait_active(&kvm->mn_memslots_update_rcuwait));
	if (kvm->mn_active_invalidate_count)
		kvm->mn_active_invalidate_count = 0;
	else
		WARN_ON(kvm->mmu_invalidate_in_progress);
#else
	kvm_flush_shadow_all(kvm);
#endif
	kvm_arch_destroy_vm(kvm);
	kvm_destroy_devices(kvm);
	for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
		kvm_free_memslots(kvm, &kvm->__memslots[i][0]);
		kvm_free_memslots(kvm, &kvm->__memslots[i][1]);
	}
	cleanup_srcu_struct(&kvm->irq_srcu);
	cleanup_srcu_struct(&kvm->srcu);
#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
	xa_destroy(&kvm->mem_attr_array);
#endif
	kvm_arch_free_vm(kvm);
	preempt_notifier_dec();
	hardware_disable_all();
	mmdrop(mm);
}

void kvm_get_kvm(struct kvm *kvm)
{
	refcount_inc(&kvm->users_count);
}
EXPORT_SYMBOL_GPL(kvm_get_kvm);

/*
 * Make sure the vm is not during destruction, which is a safe version of
 * kvm_get_kvm().  Return true if kvm referenced successfully, false otherwise.
 */
bool kvm_get_kvm_safe(struct kvm *kvm)
{
	return refcount_inc_not_zero(&kvm->users_count);
}
EXPORT_SYMBOL_GPL(kvm_get_kvm_safe);

void kvm_put_kvm(struct kvm *kvm)
{
	if (refcount_dec_and_test(&kvm->users_count))
		kvm_destroy_vm(kvm);
}
EXPORT_SYMBOL_GPL(kvm_put_kvm);

/*
 * Used to put a reference that was taken on behalf of an object associated
 * with a user-visible file descriptor, e.g. a vcpu or device, if installation
 * of the new file descriptor fails and the reference cannot be transferred to
 * its final owner.  In such cases, the caller is still actively using @kvm and
 * will fail miserably if the refcount unexpectedly hits zero.
 */
void kvm_put_kvm_no_destroy(struct kvm *kvm)
{
	WARN_ON(refcount_dec_and_test(&kvm->users_count));
}
EXPORT_SYMBOL_GPL(kvm_put_kvm_no_destroy);

static int kvm_vm_release(struct inode *inode, struct file *filp)
{
	struct kvm *kvm = filp->private_data;

	kvm_irqfd_release(kvm);

	kvm_put_kvm(kvm);
	return 0;
}

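/*
 * Illustrative sketch, not part of this file: long-lived side objects that
 * cache a kvm pointer are expected to pin it with kvm_get_kvm() (or
 * kvm_get_kvm_safe() if the VM may already be dying) and drop it with
 * kvm_put_kvm():
 *
 *	if (!kvm_get_kvm_safe(kvm))
 *		return -ENODEV;		// VM is already being destroyed
 *	...
 *	kvm_put_kvm(kvm);
 */
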
/*
 * Allocation size is twice as large as the actual dirty bitmap size.
 * See kvm_vm_ioctl_get_dirty_log() why this is needed.
 */
static int kvm_alloc_dirty_bitmap(struct kvm_memory_slot *memslot)
{
	unsigned long dirty_bytes = kvm_dirty_bitmap_bytes(memslot);

	memslot->dirty_bitmap = __vcalloc(2, dirty_bytes, GFP_KERNEL_ACCOUNT);
	if (!memslot->dirty_bitmap)
		return -ENOMEM;

	return 0;
}

static struct kvm_memslots *kvm_get_inactive_memslots(struct kvm *kvm, int as_id)
{
	struct kvm_memslots *active = __kvm_memslots(kvm, as_id);
	int node_idx_inactive = active->node_idx ^ 1;

	return &kvm->__memslots[as_id][node_idx_inactive];
}

/*
 * Helper to get the address space ID when one of the memslot pointers may be
 * NULL.  This also serves as a sanity check that at least one of the pointers
 * is non-NULL, and that their address space IDs don't diverge.
 */
static int kvm_memslots_get_as_id(struct kvm_memory_slot *a,
				  struct kvm_memory_slot *b)
{
	if (WARN_ON_ONCE(!a && !b))
		return 0;

	if (!a)
		return b->as_id;
	if (!b)
		return a->as_id;

	WARN_ON_ONCE(a->as_id != b->as_id);
	return a->as_id;
}

static void kvm_insert_gfn_node(struct kvm_memslots *slots,
				struct kvm_memory_slot *slot)
{
	struct rb_root *gfn_tree = &slots->gfn_tree;
	struct rb_node **node, *parent;
	int idx = slots->node_idx;

	parent = NULL;
	for (node = &gfn_tree->rb_node; *node; ) {
		struct kvm_memory_slot *tmp;

		tmp = container_of(*node, struct kvm_memory_slot, gfn_node[idx]);
		parent = *node;
		if (slot->base_gfn < tmp->base_gfn)
			node = &(*node)->rb_left;
		else if (slot->base_gfn > tmp->base_gfn)
			node = &(*node)->rb_right;
		else
			BUG();
	}

	rb_link_node(&slot->gfn_node[idx], parent, node);
	rb_insert_color(&slot->gfn_node[idx], gfn_tree);
}

static void kvm_erase_gfn_node(struct kvm_memslots *slots,
			       struct kvm_memory_slot *slot)
{
	rb_erase(&slot->gfn_node[slots->node_idx], &slots->gfn_tree);
}

static void kvm_replace_gfn_node(struct kvm_memslots *slots,
				 struct kvm_memory_slot *old,
				 struct kvm_memory_slot *new)
{
	int idx = slots->node_idx;

	WARN_ON_ONCE(old->base_gfn != new->base_gfn);

	rb_replace_node(&old->gfn_node[idx], &new->gfn_node[idx],
			&slots->gfn_tree);
}

/*
 * Replace @old with @new in the inactive memslots.
 *
 * With NULL @old this simply adds @new.
 * With NULL @new this simply removes @old.
 *
 * If @new is non-NULL its hva_node[slots_idx] range has to be set
 * appropriately.
 */
static void kvm_replace_memslot(struct kvm *kvm,
				struct kvm_memory_slot *old,
				struct kvm_memory_slot *new)
{
	int as_id = kvm_memslots_get_as_id(old, new);
	struct kvm_memslots *slots = kvm_get_inactive_memslots(kvm, as_id);
	int idx = slots->node_idx;

	if (old) {
		hash_del(&old->id_node[idx]);
		interval_tree_remove(&old->hva_node[idx], &slots->hva_tree);

		if ((long)old == atomic_long_read(&slots->last_used_slot))
			atomic_long_set(&slots->last_used_slot, (long)new);

		if (!new) {
			kvm_erase_gfn_node(slots, old);
			return;
		}
	}

	/*
	 * Initialize @new's hva range.  Do this even when replacing an @old
	 * slot, kvm_copy_memslot() deliberately does not touch node data.
	 */
	new->hva_node[idx].start = new->userspace_addr;
	new->hva_node[idx].last = new->userspace_addr +
				  (new->npages << PAGE_SHIFT) - 1;

	/*
	 * (Re)Add the new memslot.  There is no O(1) interval_tree_replace(),
	 * hva_node needs to be swapped with remove+insert even though hva can't
	 * change when replacing an existing slot.
	 */
	hash_add(slots->id_hash, &new->id_node[idx], new->id);
	interval_tree_insert(&new->hva_node[idx], &slots->hva_tree);

	/*
	 * If the memslot gfn is unchanged, rb_replace_node() can be used to
	 * switch the node in the gfn tree instead of removing the old and
	 * inserting the new as two separate operations.  Replacement is a
	 * single O(1) operation versus two O(log(n)) operations for
	 * remove+insert.
	 */
	if (old && old->base_gfn == new->base_gfn) {
		kvm_replace_gfn_node(slots, old, new);
	} else {
		if (old)
			kvm_erase_gfn_node(slots, old);
		kvm_insert_gfn_node(slots, new);
	}
}

1595/*
1596 * Flags that do not access any of the extra space of struct
1597 * kvm_userspace_memory_region2. KVM_SET_USER_MEMORY_REGION_V1_FLAGS
1598 * only allows these.
1599 */
1600#define KVM_SET_USER_MEMORY_REGION_V1_FLAGS \
1601 (KVM_MEM_LOG_DIRTY_PAGES | KVM_MEM_READONLY)
1602
a7800aa8
SC
1603static int check_memory_region_flags(struct kvm *kvm,
1604 const struct kvm_userspace_memory_region2 *mem)
a50d64d6 1605{
4d8b81ab
XG
1606 u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
1607
a7800aa8
SC
1608 if (kvm_arch_has_private_mem(kvm))
1609 valid_flags |= KVM_MEM_GUEST_MEMFD;
1610
1611 /* Dirty logging private memory is not currently supported. */
1612 if (mem->flags & KVM_MEM_GUEST_MEMFD)
1613 valid_flags &= ~KVM_MEM_LOG_DIRTY_PAGES;
1614
8886640d 1615#ifdef CONFIG_HAVE_KVM_READONLY_MEM
e5635922
SC
1616 /*
1617 * GUEST_MEMFD is incompatible with read-only memslots, as writes to
1618 * read-only memslots have emulated MMIO, not page fault, semantics,
1619 * and KVM doesn't allow emulated MMIO for private memory.
1620 */
1621 if (!(mem->flags & KVM_MEM_GUEST_MEMFD))
1622 valid_flags |= KVM_MEM_READONLY;
4d8b81ab
XG
1623#endif
1624
1625 if (mem->flags & ~valid_flags)
a50d64d6
XG
1626 return -EINVAL;
1627
1628 return 0;
1629}
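/*
 * A few illustrative outcomes of the flag check above, assuming an
 * architecture with CONFIG_HAVE_KVM_READONLY_MEM and with private memory
 * support enabled for the VM (the exact set of valid flags depends on the
 * arch and VM type, so treat this as a sketch, not a spec):
 *
 *	KVM_MEM_LOG_DIRTY_PAGES                         accepted
 *	KVM_MEM_READONLY                                accepted
 *	KVM_MEM_GUEST_MEMFD                             accepted
 *	KVM_MEM_GUEST_MEMFD | KVM_MEM_LOG_DIRTY_PAGES   rejected, -EINVAL
 *	KVM_MEM_GUEST_MEMFD | KVM_MEM_READONLY          rejected, -EINVAL
 */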
1630
a54d8066 1631static void kvm_swap_active_memslots(struct kvm *kvm, int as_id)
7ec4fb44 1632{
a54d8066
MS
1633 struct kvm_memslots *slots = kvm_get_inactive_memslots(kvm, as_id);
1634
 1635	/* Grab the generation from the active memslots. */
1636 u64 gen = __kvm_memslots(kvm, as_id)->generation;
7ec4fb44 1637
361209e0
SC
1638 WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
1639 slots->generation = gen | KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
ee3d1570 1640
52ac8b35
PB
1641 /*
1642 * Do not store the new memslots while there are invalidations in
071064f1
PB
1643 * progress, otherwise the locking in invalidate_range_start and
1644 * invalidate_range_end will be unbalanced.
52ac8b35
PB
1645 */
1646 spin_lock(&kvm->mn_invalidate_lock);
1647 prepare_to_rcuwait(&kvm->mn_memslots_update_rcuwait);
1648 while (kvm->mn_active_invalidate_count) {
1649 set_current_state(TASK_UNINTERRUPTIBLE);
1650 spin_unlock(&kvm->mn_invalidate_lock);
1651 schedule();
1652 spin_lock(&kvm->mn_invalidate_lock);
1653 }
1654 finish_rcuwait(&kvm->mn_memslots_update_rcuwait);
f481b069 1655 rcu_assign_pointer(kvm->memslots[as_id], slots);
52ac8b35 1656 spin_unlock(&kvm->mn_invalidate_lock);
b10a038e
BG
1657
1658 /*
 1659	 * Acquired in kvm_set_memslot. Must be released before the
 1660	 * synchronize_srcu() below in order to avoid deadlock with another thread
1661 * acquiring the slots_arch_lock in an srcu critical section.
1662 */
1663 mutex_unlock(&kvm->slots_arch_lock);
1664
7ec4fb44 1665 synchronize_srcu_expedited(&kvm->srcu);
e59dbe09 1666
ee3d1570 1667 /*
361209e0 1668 * Increment the new memslot generation a second time, dropping the
00116795 1669 * update in-progress flag and incrementing the generation based on
361209e0
SC
1670 * the number of address spaces. This provides a unique and easily
1671 * identifiable generation number while the memslots are in flux.
1672 */
1673 gen = slots->generation & ~KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
1674
1675 /*
4bd518f1
PB
1676 * Generations must be unique even across address spaces. We do not need
 1677	 * a global counter for that; instead, the generation space is evenly split
1678 * across address spaces. For example, with two address spaces, address
164bf7e5
SC
1679 * space 0 will use generations 0, 2, 4, ... while address space 1 will
1680 * use generations 1, 3, 5, ...
ee3d1570 1681 */
eed52e43 1682 gen += kvm_arch_nr_memslot_as_ids(kvm);
ee3d1570 1683
15248258 1684 kvm_arch_memslots_updated(kvm, gen);
ee3d1570 1685
15248258 1686 slots->generation = gen;
7ec4fb44
GN
1687}
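/*
 * A worked example of the generation handling above, assuming
 * kvm_arch_nr_memslot_as_ids() == 2 (e.g. x86 with SMM enabled):
 *
 *	AS 0 update:  gen 4  ->  (4 | KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS)
 *	              while the sets are swapped  ->  6 once the expedited
 *	              SRCU grace period has elapsed
 *	AS 1 update:  gen 5  ->  (5 | KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS)  ->  7
 *
 * The stride is kvm_arch_nr_memslot_as_ids(), which is what keeps address
 * space 0 on even numbers and address space 1 on odd numbers; a reader that
 * snapshots a generation with the in-progress flag set can tell the memslots
 * were in flux at that point.
 */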
1688
07921665
SC
1689static int kvm_prepare_memory_region(struct kvm *kvm,
1690 const struct kvm_memory_slot *old,
1691 struct kvm_memory_slot *new,
1692 enum kvm_mr_change change)
ddc12f2a 1693{
07921665
SC
1694 int r;
1695
1696 /*
1697 * If dirty logging is disabled, nullify the bitmap; the old bitmap
1698 * will be freed on "commit". If logging is enabled in both old and
1699 * new, reuse the existing bitmap. If logging is enabled only in the
1700 * new and KVM isn't using a ring buffer, allocate and initialize a
1701 * new bitmap.
1702 */
244893fa
SC
1703 if (change != KVM_MR_DELETE) {
1704 if (!(new->flags & KVM_MEM_LOG_DIRTY_PAGES))
1705 new->dirty_bitmap = NULL;
1706 else if (old && old->dirty_bitmap)
1707 new->dirty_bitmap = old->dirty_bitmap;
86bdf3eb 1708 else if (kvm_use_dirty_bitmap(kvm)) {
244893fa
SC
1709 r = kvm_alloc_dirty_bitmap(new);
1710 if (r)
1711 return r;
1712
1713 if (kvm_dirty_log_manual_protect_and_init_set(kvm))
1714 bitmap_set(new->dirty_bitmap, 0, new->npages);
1715 }
07921665
SC
1716 }
1717
1718 r = kvm_arch_prepare_memory_region(kvm, old, new, change);
1719
1720 /* Free the bitmap on failure if it was allocated above. */
c87661f8 1721 if (r && new && new->dirty_bitmap && (!old || !old->dirty_bitmap))
07921665
SC
1722 kvm_destroy_dirty_bitmap(new);
1723
1724 return r;
ddc12f2a
BG
1725}
1726
07921665
SC
1727static void kvm_commit_memory_region(struct kvm *kvm,
1728 struct kvm_memory_slot *old,
1729 const struct kvm_memory_slot *new,
1730 enum kvm_mr_change change)
ddc12f2a 1731{
6c7b2202
PB
1732 int old_flags = old ? old->flags : 0;
1733 int new_flags = new ? new->flags : 0;
07921665
SC
1734 /*
1735 * Update the total number of memslot pages before calling the arch
1736 * hook so that architectures can consume the result directly.
1737 */
1738 if (change == KVM_MR_DELETE)
1739 kvm->nr_memslot_pages -= old->npages;
1740 else if (change == KVM_MR_CREATE)
1741 kvm->nr_memslot_pages += new->npages;
1742
6c7b2202
PB
1743 if ((old_flags ^ new_flags) & KVM_MEM_LOG_DIRTY_PAGES) {
1744 int change = (new_flags & KVM_MEM_LOG_DIRTY_PAGES) ? 1 : -1;
1745 atomic_set(&kvm->nr_memslots_dirty_logging,
1746 atomic_read(&kvm->nr_memslots_dirty_logging) + change);
1747 }
1748
07921665
SC
1749 kvm_arch_commit_memory_region(kvm, old, new, change);
1750
a54d8066
MS
1751 switch (change) {
1752 case KVM_MR_CREATE:
1753 /* Nothing more to do. */
1754 break;
1755 case KVM_MR_DELETE:
1756 /* Free the old memslot and all its metadata. */
1757 kvm_free_memslot(kvm, old);
1758 break;
1759 case KVM_MR_MOVE:
1760 case KVM_MR_FLAGS_ONLY:
1761 /*
1762 * Free the dirty bitmap as needed; the below check encompasses
 1763		 * both the flags and whether a ring buffer is being used.
1764 */
1765 if (old->dirty_bitmap && !new->dirty_bitmap)
1766 kvm_destroy_dirty_bitmap(old);
1767
1768 /*
1769 * The final quirk. Free the detached, old slot, but only its
1770 * memory, not any metadata. Metadata, including arch specific
1771 * data, may be reused by @new.
1772 */
1773 kfree(old);
1774 break;
1775 default:
1776 BUG();
1777 }
ddc12f2a
BG
1778}
1779
36947254 1780/*
a54d8066
MS
1781 * Activate @new, which must be installed in the inactive slots by the caller,
1782 * by swapping the active slots and then propagating @new to @old once @old is
1783 * unreachable and can be safely modified.
1784 *
1785 * With NULL @old this simply adds @new to @active (while swapping the sets).
1786 * With NULL @new this simply removes @old from @active and frees it
1787 * (while also swapping the sets).
36947254 1788 */
a54d8066
MS
1789static void kvm_activate_memslot(struct kvm *kvm,
1790 struct kvm_memory_slot *old,
1791 struct kvm_memory_slot *new)
36947254 1792{
a54d8066 1793 int as_id = kvm_memslots_get_as_id(old, new);
36947254 1794
a54d8066
MS
1795 kvm_swap_active_memslots(kvm, as_id);
1796
1797 /* Propagate the new memslot to the now inactive memslots. */
1798 kvm_replace_memslot(kvm, old, new);
1799}
1800
1801static void kvm_copy_memslot(struct kvm_memory_slot *dest,
1802 const struct kvm_memory_slot *src)
1803{
1804 dest->base_gfn = src->base_gfn;
1805 dest->npages = src->npages;
1806 dest->dirty_bitmap = src->dirty_bitmap;
1807 dest->arch = src->arch;
1808 dest->userspace_addr = src->userspace_addr;
1809 dest->flags = src->flags;
1810 dest->id = src->id;
1811 dest->as_id = src->as_id;
1812}
1813
1814static void kvm_invalidate_memslot(struct kvm *kvm,
1815 struct kvm_memory_slot *old,
244893fa 1816 struct kvm_memory_slot *invalid_slot)
a54d8066 1817{
07921665 1818 /*
a54d8066
MS
1819 * Mark the current slot INVALID. As with all memslot modifications,
1820 * this must be done on an unreachable slot to avoid modifying the
1821 * current slot in the active tree.
07921665 1822 */
244893fa
SC
1823 kvm_copy_memslot(invalid_slot, old);
1824 invalid_slot->flags |= KVM_MEMSLOT_INVALID;
1825 kvm_replace_memslot(kvm, old, invalid_slot);
a54d8066
MS
1826
1827 /*
1828 * Activate the slot that is now marked INVALID, but don't propagate
1829 * the slot to the now inactive slots. The slot is either going to be
1830 * deleted or recreated as a new slot.
1831 */
1832 kvm_swap_active_memslots(kvm, old->as_id);
1833
1834 /*
1835 * From this point no new shadow pages pointing to a deleted, or moved,
1836 * memslot will be created. Validation of sp->gfn happens in:
1837 * - gfn_to_hva (kvm_read_guest, gfn_to_pfn)
1838 * - kvm_is_visible_gfn (mmu_check_root)
1839 */
bcb63dcd 1840 kvm_arch_flush_shadow_memslot(kvm, old);
683412cc 1841 kvm_arch_guest_memory_reclaimed(kvm);
a54d8066 1842
b0d23708 1843 /* Was released by kvm_swap_active_memslots(), reacquire. */
a54d8066
MS
1844 mutex_lock(&kvm->slots_arch_lock);
1845
1846 /*
1847 * Copy the arch-specific field of the newly-installed slot back to the
1848 * old slot as the arch data could have changed between releasing
b0d23708 1849 * slots_arch_lock in kvm_swap_active_memslots() and re-acquiring the lock
a54d8066
MS
1850 * above. Writers are required to retrieve memslots *after* acquiring
1851 * slots_arch_lock, thus the active slot's data is guaranteed to be fresh.
1852 */
244893fa 1853 old->arch = invalid_slot->arch;
a54d8066
MS
1854}
1855
1856static void kvm_create_memslot(struct kvm *kvm,
244893fa 1857 struct kvm_memory_slot *new)
a54d8066 1858{
244893fa
SC
1859 /* Add the new memslot to the inactive set and activate. */
1860 kvm_replace_memslot(kvm, NULL, new);
1861 kvm_activate_memslot(kvm, NULL, new);
a54d8066
MS
1862}
1863
1864static void kvm_delete_memslot(struct kvm *kvm,
1865 struct kvm_memory_slot *old,
1866 struct kvm_memory_slot *invalid_slot)
1867{
1868 /*
1869 * Remove the old memslot (in the inactive memslots) by passing NULL as
244893fa 1870 * the "new" slot, and for the invalid version in the active slots.
a54d8066
MS
1871 */
1872 kvm_replace_memslot(kvm, old, NULL);
a54d8066 1873 kvm_activate_memslot(kvm, invalid_slot, NULL);
a54d8066 1874}
36947254 1875
244893fa
SC
1876static void kvm_move_memslot(struct kvm *kvm,
1877 struct kvm_memory_slot *old,
1878 struct kvm_memory_slot *new,
1879 struct kvm_memory_slot *invalid_slot)
a54d8066 1880{
a54d8066 1881 /*
244893fa
SC
1882 * Replace the old memslot in the inactive slots, and then swap slots
1883 * and replace the current INVALID with the new as well.
a54d8066 1884 */
244893fa
SC
1885 kvm_replace_memslot(kvm, old, new);
1886 kvm_activate_memslot(kvm, invalid_slot, new);
a54d8066 1887}
36947254 1888
a54d8066
MS
1889static void kvm_update_flags_memslot(struct kvm *kvm,
1890 struct kvm_memory_slot *old,
244893fa 1891 struct kvm_memory_slot *new)
a54d8066
MS
1892{
1893 /*
1894 * Similar to the MOVE case, but the slot doesn't need to be zapped as
1895 * an intermediate step. Instead, the old memslot is simply replaced
1896 * with a new, updated copy in both memslot sets.
1897 */
244893fa
SC
1898 kvm_replace_memslot(kvm, old, new);
1899 kvm_activate_memslot(kvm, old, new);
36947254
SC
1900}
1901
cf47f50b 1902static int kvm_set_memslot(struct kvm *kvm,
a54d8066 1903 struct kvm_memory_slot *old,
ce5f0215 1904 struct kvm_memory_slot *new,
cf47f50b
SC
1905 enum kvm_mr_change change)
1906{
244893fa 1907 struct kvm_memory_slot *invalid_slot;
cf47f50b
SC
1908 int r;
1909
b10a038e 1910 /*
b0d23708 1911 * Released in kvm_swap_active_memslots().
b10a038e 1912 *
b0d23708
JM
1913 * Must be held from before the current memslots are copied until after
1914 * the new memslots are installed with rcu_assign_pointer, then
1915 * released before the synchronize srcu in kvm_swap_active_memslots().
b10a038e
BG
1916 *
1917 * When modifying memslots outside of the slots_lock, must be held
1918 * before reading the pointer to the current memslots until after all
1919 * changes to those memslots are complete.
1920 *
1921 * These rules ensure that installing new memslots does not lose
1922 * changes made to the previous memslots.
1923 */
1924 mutex_lock(&kvm->slots_arch_lock);
1925
a54d8066
MS
1926 /*
1927 * Invalidate the old slot if it's being deleted or moved. This is
1928 * done prior to actually deleting/moving the memslot to allow vCPUs to
1929 * continue running by ensuring there are no mappings or shadow pages
1930 * for the memslot when it is deleted/moved. Without pre-invalidation
1931 * (and without a lock), a window would exist between effecting the
1932 * delete/move and committing the changes in arch code where KVM or a
1933 * guest could access a non-existent memslot.
244893fa
SC
1934 *
1935 * Modifications are done on a temporary, unreachable slot. The old
1936 * slot needs to be preserved in case a later step fails and the
1937 * invalidation needs to be reverted.
a54d8066 1938 */
cf47f50b 1939 if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) {
244893fa
SC
1940 invalid_slot = kzalloc(sizeof(*invalid_slot), GFP_KERNEL_ACCOUNT);
1941 if (!invalid_slot) {
1942 mutex_unlock(&kvm->slots_arch_lock);
1943 return -ENOMEM;
1944 }
1945 kvm_invalidate_memslot(kvm, old, invalid_slot);
1946 }
b10a038e 1947
a54d8066
MS
1948 r = kvm_prepare_memory_region(kvm, old, new, change);
1949 if (r) {
b10a038e 1950 /*
a54d8066
MS
1951 * For DELETE/MOVE, revert the above INVALID change. No
1952 * modifications required since the original slot was preserved
1953 * in the inactive slots. Changing the active memslots also
 1954		 * releases slots_arch_lock.
b10a038e 1955 */
244893fa
SC
1956 if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) {
1957 kvm_activate_memslot(kvm, invalid_slot, old);
1958 kfree(invalid_slot);
1959 } else {
a54d8066 1960 mutex_unlock(&kvm->slots_arch_lock);
244893fa 1961 }
a54d8066 1962 return r;
cf47f50b
SC
1963 }
1964
bda44d84 1965 /*
a54d8066
MS
 1966	 * For DELETE and MOVE, the temporary slot is now active as the INVALID
 1967	 * version of the old slot. MOVE is particularly special as it reuses
 1968	 * the old slot and returns a copy of the old slot (in @invalid_slot).
1969 * For CREATE, there is no old slot. For DELETE and FLAGS_ONLY, the
1970 * old slot is detached but otherwise preserved.
bda44d84 1971 */
a54d8066 1972 if (change == KVM_MR_CREATE)
244893fa 1973 kvm_create_memslot(kvm, new);
a54d8066 1974 else if (change == KVM_MR_DELETE)
244893fa 1975 kvm_delete_memslot(kvm, old, invalid_slot);
a54d8066 1976 else if (change == KVM_MR_MOVE)
244893fa 1977 kvm_move_memslot(kvm, old, new, invalid_slot);
a54d8066 1978 else if (change == KVM_MR_FLAGS_ONLY)
244893fa 1979 kvm_update_flags_memslot(kvm, old, new);
a54d8066
MS
1980 else
1981 BUG();
cf47f50b 1982
244893fa
SC
1983 /* Free the temporary INVALID slot used for DELETE and MOVE. */
1984 if (change == KVM_MR_DELETE || change == KVM_MR_MOVE)
1985 kfree(invalid_slot);
bda44d84 1986
a54d8066
MS
1987 /*
 1988	 * No need to refresh new->arch; changes after dropping slots_arch_lock
a413a625 1989 * will directly hit the final, active memslot. Architectures are
a54d8066
MS
1990 * responsible for knowing that new->arch may be stale.
1991 */
1992 kvm_commit_memory_region(kvm, old, new, change);
cf47f50b 1993
cf47f50b 1994 return 0;
cf47f50b
SC
1995}
1996
44401a20
MS
1997static bool kvm_check_memslot_overlap(struct kvm_memslots *slots, int id,
1998 gfn_t start, gfn_t end)
5c0b4f3d 1999{
44401a20 2000 struct kvm_memslot_iter iter;
5c0b4f3d 2001
44401a20
MS
2002 kvm_for_each_memslot_in_gfn_range(&iter, slots, start, end) {
2003 if (iter.slot->id != id)
2004 return true;
2005 }
5c0b4f3d 2006
44401a20 2007 return false;
5c0b4f3d
SC
2008}
2009
6aa8b732
AK
2010/*
2011 * Allocate some memory and give it an address in the guest physical address
2012 * space.
2013 *
2014 * Discontiguous memory is allowed, mostly for framebuffers.
f78e0e2e 2015 *
02d5d55b 2016 * Must be called holding kvm->slots_lock for write.
6aa8b732 2017 */
f78e0e2e 2018int __kvm_set_memory_region(struct kvm *kvm,
bb58b90b 2019 const struct kvm_userspace_memory_region2 *mem)
6aa8b732 2020{
244893fa 2021 struct kvm_memory_slot *old, *new;
44401a20 2022 struct kvm_memslots *slots;
f64c0398 2023 enum kvm_mr_change change;
0f9bdef3
SC
2024 unsigned long npages;
2025 gfn_t base_gfn;
163da372
SC
2026 int as_id, id;
2027 int r;
6aa8b732 2028
a7800aa8 2029 r = check_memory_region_flags(kvm, mem);
a50d64d6 2030 if (r)
71a4c30b 2031 return r;
a50d64d6 2032
f481b069
PB
2033 as_id = mem->slot >> 16;
2034 id = (u16)mem->slot;
2035
6aa8b732 2036 /* General sanity checks */
6b285a55
SC
2037 if ((mem->memory_size & (PAGE_SIZE - 1)) ||
2038 (mem->memory_size != (unsigned long)mem->memory_size))
71a4c30b 2039 return -EINVAL;
6aa8b732 2040 if (mem->guest_phys_addr & (PAGE_SIZE - 1))
71a4c30b 2041 return -EINVAL;
fa3d315a 2042 /* We can read the guest memory with __xxx_user() later on. */
09d952c9 2043 if ((mem->userspace_addr & (PAGE_SIZE - 1)) ||
139bc8a6 2044 (mem->userspace_addr != untagged_addr(mem->userspace_addr)) ||
96d4f267 2045 !access_ok((void __user *)(unsigned long)mem->userspace_addr,
09d952c9 2046 mem->memory_size))
71a4c30b 2047 return -EINVAL;
a7800aa8
SC
2048 if (mem->flags & KVM_MEM_GUEST_MEMFD &&
2049 (mem->guest_memfd_offset & (PAGE_SIZE - 1) ||
2050 mem->guest_memfd_offset + mem->memory_size < mem->guest_memfd_offset))
2051 return -EINVAL;
eed52e43 2052 if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_MEM_SLOTS_NUM)
71a4c30b 2053 return -EINVAL;
6aa8b732 2054 if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
71a4c30b 2055 return -EINVAL;
0f9bdef3
SC
2056 if ((mem->memory_size >> PAGE_SHIFT) > KVM_MEM_MAX_NR_PAGES)
2057 return -EINVAL;
6aa8b732 2058
44401a20 2059 slots = __kvm_memslots(kvm, as_id);
6aa8b732 2060
5c0b4f3d 2061 /*
7cd08553
SC
2062 * Note, the old memslot (and the pointer itself!) may be invalidated
2063 * and/or destroyed by kvm_set_memslot().
5c0b4f3d 2064 */
44401a20 2065 old = id_to_memslot(slots, id);
163da372 2066
47ea7d90 2067 if (!mem->memory_size) {
7cd08553 2068 if (!old || !old->npages)
47ea7d90 2069 return -EINVAL;
5c0b4f3d 2070
7cd08553 2071 if (WARN_ON_ONCE(kvm->nr_memslot_pages < old->npages))
47ea7d90 2072 return -EIO;
6aa8b732 2073
244893fa 2074 return kvm_set_memslot(kvm, old, NULL, KVM_MR_DELETE);
47ea7d90 2075 }
5c0b4f3d 2076
0f9bdef3
SC
2077 base_gfn = (mem->guest_phys_addr >> PAGE_SHIFT);
2078 npages = (mem->memory_size >> PAGE_SHIFT);
163da372 2079
7cd08553 2080 if (!old || !old->npages) {
5c0b4f3d 2081 change = KVM_MR_CREATE;
afa319a5
SC
2082
2083 /*
2084 * To simplify KVM internals, the total number of pages across
2085 * all memslots must fit in an unsigned long.
2086 */
0f9bdef3 2087 if ((kvm->nr_memslot_pages + npages) < kvm->nr_memslot_pages)
afa319a5 2088 return -EINVAL;
5c0b4f3d 2089 } else { /* Modify an existing slot. */
a7800aa8
SC
2090 /* Private memslots are immutable, they can only be deleted. */
2091 if (mem->flags & KVM_MEM_GUEST_MEMFD)
2092 return -EINVAL;
0f9bdef3
SC
2093 if ((mem->userspace_addr != old->userspace_addr) ||
2094 (npages != old->npages) ||
2095 ((mem->flags ^ old->flags) & KVM_MEM_READONLY))
71a4c30b 2096 return -EINVAL;
09170a49 2097
0f9bdef3 2098 if (base_gfn != old->base_gfn)
5c0b4f3d 2099 change = KVM_MR_MOVE;
0f9bdef3 2100 else if (mem->flags != old->flags)
5c0b4f3d
SC
2101 change = KVM_MR_FLAGS_ONLY;
2102 else /* Nothing to change. */
2103 return 0;
09170a49 2104 }
6aa8b732 2105
44401a20 2106 if ((change == KVM_MR_CREATE || change == KVM_MR_MOVE) &&
0f9bdef3 2107 kvm_check_memslot_overlap(slots, id, base_gfn, base_gfn + npages))
44401a20 2108 return -EEXIST;
6aa8b732 2109
244893fa
SC
2110 /* Allocate a slot that will persist in the memslot. */
2111 new = kzalloc(sizeof(*new), GFP_KERNEL_ACCOUNT);
2112 if (!new)
2113 return -ENOMEM;
3c9bd400 2114
244893fa
SC
2115 new->as_id = as_id;
2116 new->id = id;
2117 new->base_gfn = base_gfn;
2118 new->npages = npages;
2119 new->flags = mem->flags;
2120 new->userspace_addr = mem->userspace_addr;
a7800aa8
SC
2121 if (mem->flags & KVM_MEM_GUEST_MEMFD) {
2122 r = kvm_gmem_bind(kvm, new, mem->guest_memfd, mem->guest_memfd_offset);
2123 if (r)
2124 goto out;
2125 }
6aa8b732 2126
244893fa 2127 r = kvm_set_memslot(kvm, old, new, change);
cf47f50b 2128 if (r)
a7800aa8
SC
2129 goto out_unbind;
2130
2131 return 0;
2132
2133out_unbind:
2134 if (mem->flags & KVM_MEM_GUEST_MEMFD)
2135 kvm_gmem_unbind(new);
2136out:
2137 kfree(new);
6aa8b732 2138 return r;
210c7c4d 2139}
f78e0e2e
SY
2140EXPORT_SYMBOL_GPL(__kvm_set_memory_region);
2141
2142int kvm_set_memory_region(struct kvm *kvm,
bb58b90b 2143 const struct kvm_userspace_memory_region2 *mem)
f78e0e2e
SY
2144{
2145 int r;
2146
79fac95e 2147 mutex_lock(&kvm->slots_lock);
47ae31e2 2148 r = __kvm_set_memory_region(kvm, mem);
79fac95e 2149 mutex_unlock(&kvm->slots_lock);
f78e0e2e
SY
2150 return r;
2151}
210c7c4d
IE
2152EXPORT_SYMBOL_GPL(kvm_set_memory_region);
2153
7940876e 2154static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
bb58b90b 2155 struct kvm_userspace_memory_region2 *mem)
210c7c4d 2156{
f481b069 2157 if ((u16)mem->slot >= KVM_USER_MEM_SLOTS)
e0d62c7f 2158 return -EINVAL;
09170a49 2159
47ae31e2 2160 return kvm_set_memory_region(kvm, mem);
6aa8b732
AK
2161}
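/*
 * A minimal userspace sketch of driving the path above via the
 * KVM_SET_USER_MEMORY_REGION2 ioctl on kernels that support it.  This is an
 * illustration only: vm_fd, add_slot() and the anonymous mmap() backing are
 * assumptions, error handling is reduced to the bare minimum, and gpa/size
 * must be page aligned as checked in __kvm_set_memory_region():
 *
 *	#include <string.h>
 *	#include <sys/ioctl.h>
 *	#include <sys/mman.h>
 *	#include <linux/kvm.h>
 *
 *	static int add_slot(int vm_fd, __u32 slot, __u64 gpa, __u64 size)
 *	{
 *		struct kvm_userspace_memory_region2 region;
 *		void *backing;
 *
 *		backing = mmap(NULL, size, PROT_READ | PROT_WRITE,
 *			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *		if (backing == MAP_FAILED)
 *			return -1;
 *
 *		memset(&region, 0, sizeof(region));
 *		region.slot = slot;	(or (as_id << 16) | slot)
 *		region.flags = KVM_MEM_LOG_DIRTY_PAGES;
 *		region.guest_phys_addr = gpa;
 *		region.memory_size = size;
 *		region.userspace_addr = (__u64)(unsigned long)backing;
 *
 *		return ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION2, &region);
 *	}
 *
 * Deleting the slot later is the same call with memory_size == 0, which is
 * the KVM_MR_DELETE case handled in __kvm_set_memory_region() above.
 */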
2162
0dff0846 2163#ifndef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
2a49f61d
SC
2164/**
2165 * kvm_get_dirty_log - get a snapshot of dirty pages
2166 * @kvm: pointer to kvm instance
2167 * @log: slot id and address to which we copy the log
2168 * @is_dirty: set to '1' if any dirty pages were found
2169 * @memslot: set to the associated memslot, always valid on success
2170 */
2171int kvm_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log,
2172 int *is_dirty, struct kvm_memory_slot **memslot)
6aa8b732 2173{
9f6b8029 2174 struct kvm_memslots *slots;
843574a3 2175 int i, as_id, id;
87bf6e7d 2176 unsigned long n;
6aa8b732
AK
2177 unsigned long any = 0;
2178
86bdf3eb
GS
2179 /* Dirty ring tracking may be exclusive to dirty log tracking */
2180 if (!kvm_use_dirty_bitmap(kvm))
b2cc64c4
PX
2181 return -ENXIO;
2182
2a49f61d
SC
2183 *memslot = NULL;
2184 *is_dirty = 0;
2185
f481b069
PB
2186 as_id = log->slot >> 16;
2187 id = (u16)log->slot;
eed52e43 2188 if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_USER_MEM_SLOTS)
843574a3 2189 return -EINVAL;
6aa8b732 2190
f481b069 2191 slots = __kvm_memslots(kvm, as_id);
2a49f61d 2192 *memslot = id_to_memslot(slots, id);
0577d1ab 2193 if (!(*memslot) || !(*memslot)->dirty_bitmap)
843574a3 2194 return -ENOENT;
6aa8b732 2195
2a49f61d
SC
2196 kvm_arch_sync_dirty_log(kvm, *memslot);
2197
2198 n = kvm_dirty_bitmap_bytes(*memslot);
6aa8b732 2199
cd1a4a98 2200 for (i = 0; !any && i < n/sizeof(long); ++i)
2a49f61d 2201 any = (*memslot)->dirty_bitmap[i];
6aa8b732 2202
2a49f61d 2203 if (copy_to_user(log->dirty_bitmap, (*memslot)->dirty_bitmap, n))
843574a3 2204 return -EFAULT;
6aa8b732 2205
5bb064dc
ZX
2206 if (any)
2207 *is_dirty = 1;
843574a3 2208 return 0;
6aa8b732 2209}
2ba9f0d8 2210EXPORT_SYMBOL_GPL(kvm_get_dirty_log);
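/*
 * A minimal userspace sketch of consuming this via the KVM_GET_DIRTY_LOG
 * ioctl, assuming the slot was created with KVM_MEM_LOG_DIRTY_PAGES; vm_fd,
 * slot_npages and fetch_dirty_log() are assumptions of this illustration.
 * Bit N in the returned bitmap corresponds to page N of the slot:
 *
 *	#include <stdlib.h>
 *	#include <sys/ioctl.h>
 *	#include <linux/kvm.h>
 *
 *	static unsigned long *fetch_dirty_log(int vm_fd, __u32 slot,
 *					      __u64 slot_npages)
 *	{
 *		struct kvm_dirty_log log = { .slot = slot };
 *		size_t len = ((slot_npages + 63) / 64) * 8;
 *		unsigned long *bitmap = calloc(1, len);
 *
 *		if (!bitmap)
 *			return NULL;
 *
 *		log.dirty_bitmap = bitmap;
 *		if (ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log)) {
 *			free(bitmap);
 *			return NULL;
 *		}
 *		return bitmap;
 *	}
 */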
6aa8b732 2211
0dff0846 2212#else /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
ba0513b5 2213/**
b8b00220 2214 * kvm_get_dirty_log_protect - get a snapshot of dirty pages
2a31b9db 2215 * and reenable dirty page tracking for the corresponding pages.
ba0513b5
MS
2216 * @kvm: pointer to kvm instance
2217 * @log: slot id and address to which we copy the log
ba0513b5
MS
2218 *
 2219 * We need to keep in mind that VCPU threads can write to the bitmap
2220 * concurrently. So, to avoid losing track of dirty pages we keep the
2221 * following order:
2222 *
2223 * 1. Take a snapshot of the bit and clear it if needed.
2224 * 2. Write protect the corresponding page.
2225 * 3. Copy the snapshot to the userspace.
 2226 *   4. Upon return, the caller flushes TLBs if needed.
2227 *
2228 * Between 2 and 4, the guest may write to the page using the remaining TLB
2229 * entry. This is not a problem because the page is reported dirty using
2230 * the snapshot taken before and step 4 ensures that writes done after
2231 * exiting to userspace will be logged for the next call.
2232 *
2233 */
0dff0846 2234static int kvm_get_dirty_log_protect(struct kvm *kvm, struct kvm_dirty_log *log)
ba0513b5 2235{
9f6b8029 2236 struct kvm_memslots *slots;
ba0513b5 2237 struct kvm_memory_slot *memslot;
58d6db34 2238 int i, as_id, id;
ba0513b5
MS
2239 unsigned long n;
2240 unsigned long *dirty_bitmap;
2241 unsigned long *dirty_bitmap_buffer;
0dff0846 2242 bool flush;
ba0513b5 2243
86bdf3eb
GS
2244 /* Dirty ring tracking may be exclusive to dirty log tracking */
2245 if (!kvm_use_dirty_bitmap(kvm))
b2cc64c4
PX
2246 return -ENXIO;
2247
f481b069
PB
2248 as_id = log->slot >> 16;
2249 id = (u16)log->slot;
eed52e43 2250 if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_USER_MEM_SLOTS)
58d6db34 2251 return -EINVAL;
ba0513b5 2252
f481b069
PB
2253 slots = __kvm_memslots(kvm, as_id);
2254 memslot = id_to_memslot(slots, id);
0577d1ab
SC
2255 if (!memslot || !memslot->dirty_bitmap)
2256 return -ENOENT;
ba0513b5
MS
2257
2258 dirty_bitmap = memslot->dirty_bitmap;
ba0513b5 2259
0dff0846
SC
2260 kvm_arch_sync_dirty_log(kvm, memslot);
2261
ba0513b5 2262 n = kvm_dirty_bitmap_bytes(memslot);
0dff0846 2263 flush = false;
2a31b9db
PB
2264 if (kvm->manual_dirty_log_protect) {
2265 /*
2266 * Unlike kvm_get_dirty_log, we always return false in *flush,
2267 * because no flush is needed until KVM_CLEAR_DIRTY_LOG. There
2268 * is some code duplication between this function and
 2270		 * kvm_get_dirty_log, but hopefully all architectures will
 2271		 * transition to kvm_get_dirty_log_protect so that kvm_get_dirty_log
 2272		 * can be eliminated.
2272 */
2273 dirty_bitmap_buffer = dirty_bitmap;
2274 } else {
2275 dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
2276 memset(dirty_bitmap_buffer, 0, n);
ba0513b5 2277
531810ca 2278 KVM_MMU_LOCK(kvm);
2a31b9db
PB
2279 for (i = 0; i < n / sizeof(long); i++) {
2280 unsigned long mask;
2281 gfn_t offset;
ba0513b5 2282
2a31b9db
PB
2283 if (!dirty_bitmap[i])
2284 continue;
2285
0dff0846 2286 flush = true;
2a31b9db
PB
2287 mask = xchg(&dirty_bitmap[i], 0);
2288 dirty_bitmap_buffer[i] = mask;
2289
a67794ca
LT
2290 offset = i * BITS_PER_LONG;
2291 kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
2292 offset, mask);
2a31b9db 2293 }
531810ca 2294 KVM_MMU_UNLOCK(kvm);
2a31b9db
PB
2295 }
2296
0dff0846 2297 if (flush)
619b5072 2298 kvm_flush_remote_tlbs_memslot(kvm, memslot);
0dff0846 2299
2a31b9db
PB
2300 if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
2301 return -EFAULT;
2302 return 0;
2303}
0dff0846
SC
2304
2305
2306/**
2307 * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot
2308 * @kvm: kvm instance
2309 * @log: slot id and address to which we copy the log
2310 *
 2311 * Steps 1-4 below provide a general overview of dirty page logging. See
2312 * kvm_get_dirty_log_protect() function description for additional details.
2313 *
 2314 * We call kvm_get_dirty_log_protect() to handle steps 1-3; upon return we
 2315 * always flush the TLB (step 4) even if a previous step failed and the dirty
 2316 * bitmap may be corrupt. Regardless of the previous outcome, the KVM logging
 2317 * API does not preclude a subsequent dirty log read by user space. Flushing
 2318 * the TLB ensures writes will be marked dirty for the next log read.
2319 *
2320 * 1. Take a snapshot of the bit and clear it if needed.
2321 * 2. Write protect the corresponding page.
2322 * 3. Copy the snapshot to the userspace.
2323 * 4. Flush TLB's if needed.
2324 */
2325static int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
2326 struct kvm_dirty_log *log)
2327{
2328 int r;
2329
2330 mutex_lock(&kvm->slots_lock);
2331
2332 r = kvm_get_dirty_log_protect(kvm, log);
2333
2334 mutex_unlock(&kvm->slots_lock);
2335 return r;
2336}
2a31b9db
PB
2337
2338/**
2339 * kvm_clear_dirty_log_protect - clear dirty bits in the bitmap
2340 * and reenable dirty page tracking for the corresponding pages.
2341 * @kvm: pointer to kvm instance
2342 * @log: slot id and address from which to fetch the bitmap of dirty pages
2343 */
0dff0846
SC
2344static int kvm_clear_dirty_log_protect(struct kvm *kvm,
2345 struct kvm_clear_dirty_log *log)
2a31b9db
PB
2346{
2347 struct kvm_memslots *slots;
2348 struct kvm_memory_slot *memslot;
98938aa8 2349 int as_id, id;
2a31b9db 2350 gfn_t offset;
98938aa8 2351 unsigned long i, n;
2a31b9db
PB
2352 unsigned long *dirty_bitmap;
2353 unsigned long *dirty_bitmap_buffer;
0dff0846 2354 bool flush;
2a31b9db 2355
86bdf3eb
GS
2356 /* Dirty ring tracking may be exclusive to dirty log tracking */
2357 if (!kvm_use_dirty_bitmap(kvm))
b2cc64c4
PX
2358 return -ENXIO;
2359
2a31b9db
PB
2360 as_id = log->slot >> 16;
2361 id = (u16)log->slot;
eed52e43 2362 if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_USER_MEM_SLOTS)
2a31b9db
PB
2363 return -EINVAL;
2364
76d58e0f 2365 if (log->first_page & 63)
2a31b9db
PB
2366 return -EINVAL;
2367
2368 slots = __kvm_memslots(kvm, as_id);
2369 memslot = id_to_memslot(slots, id);
0577d1ab
SC
2370 if (!memslot || !memslot->dirty_bitmap)
2371 return -ENOENT;
2a31b9db
PB
2372
2373 dirty_bitmap = memslot->dirty_bitmap;
2a31b9db 2374
4ddc9204 2375 n = ALIGN(log->num_pages, BITS_PER_LONG) / 8;
98938aa8
TB
2376
2377 if (log->first_page > memslot->npages ||
76d58e0f
PB
2378 log->num_pages > memslot->npages - log->first_page ||
2379 (log->num_pages < memslot->npages - log->first_page && (log->num_pages & 63)))
2380 return -EINVAL;
98938aa8 2381
0dff0846
SC
2382 kvm_arch_sync_dirty_log(kvm, memslot);
2383
2384 flush = false;
2a31b9db
PB
2385 dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
2386 if (copy_from_user(dirty_bitmap_buffer, log->dirty_bitmap, n))
2387 return -EFAULT;
ba0513b5 2388
531810ca 2389 KVM_MMU_LOCK(kvm);
53eac7a8
PX
2390 for (offset = log->first_page, i = offset / BITS_PER_LONG,
2391 n = DIV_ROUND_UP(log->num_pages, BITS_PER_LONG); n--;
2a31b9db
PB
2392 i++, offset += BITS_PER_LONG) {
2393 unsigned long mask = *dirty_bitmap_buffer++;
2394 atomic_long_t *p = (atomic_long_t *) &dirty_bitmap[i];
2395 if (!mask)
ba0513b5
MS
2396 continue;
2397
2a31b9db 2398 mask &= atomic_long_fetch_andnot(mask, p);
ba0513b5 2399
2a31b9db
PB
2400 /*
2401 * mask contains the bits that really have been cleared. This
2402 * never includes any bits beyond the length of the memslot (if
2403 * the length is not aligned to 64 pages), therefore it is not
2404 * a problem if userspace sets them in log->dirty_bitmap.
2405 */
58d2930f 2406 if (mask) {
0dff0846 2407 flush = true;
58d2930f
TY
2408 kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
2409 offset, mask);
2410 }
ba0513b5 2411 }
531810ca 2412 KVM_MMU_UNLOCK(kvm);
2a31b9db 2413
0dff0846 2414 if (flush)
619b5072 2415 kvm_flush_remote_tlbs_memslot(kvm, memslot);
0dff0846 2416
58d6db34 2417 return 0;
ba0513b5 2418}
0dff0846
SC
2419
2420static int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm,
2421 struct kvm_clear_dirty_log *log)
2422{
2423 int r;
2424
2425 mutex_lock(&kvm->slots_lock);
2426
2427 r = kvm_clear_dirty_log_protect(kvm, log);
2428
2429 mutex_unlock(&kvm->slots_lock);
2430 return r;
2431}
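/*
 * A minimal userspace sketch of the corresponding KVM_CLEAR_DIRTY_LOG call,
 * assuming KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 has been enabled on the VM and
 * that the bitmap was previously obtained with KVM_GET_DIRTY_LOG.  vm_fd,
 * slot and clear_dirty_range() are assumptions of this illustration:
 *
 *	#include <sys/ioctl.h>
 *	#include <linux/kvm.h>
 *
 *	static int clear_dirty_range(int vm_fd, __u32 slot, __u64 first_page,
 *				     __u32 num_pages, unsigned long *bitmap)
 *	{
 *		struct kvm_clear_dirty_log clear = {
 *			.slot = slot,
 *			.first_page = first_page,
 *			.num_pages = num_pages,
 *		};
 *
 *		clear.dirty_bitmap = bitmap;
 *		return ioctl(vm_fd, KVM_CLEAR_DIRTY_LOG, &clear);
 *	}
 *
 * As enforced above, first_page must be 64-page aligned and num_pages must
 * be a multiple of 64 unless the range runs to the end of the slot.
 */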
2432#endif /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
ba0513b5 2433
5a475554
CP
2434#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
2435/*
2436 * Returns true if _all_ gfns in the range [@start, @end) have attributes
2437 * matching @attrs.
2438 */
2439bool kvm_range_has_memory_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
2440 unsigned long attrs)
2441{
2442 XA_STATE(xas, &kvm->mem_attr_array, start);
2443 unsigned long index;
2444 bool has_attrs;
2445 void *entry;
2446
2447 rcu_read_lock();
2448
2449 if (!attrs) {
2450 has_attrs = !xas_find(&xas, end - 1);
2451 goto out;
2452 }
2453
2454 has_attrs = true;
2455 for (index = start; index < end; index++) {
2456 do {
2457 entry = xas_next(&xas);
2458 } while (xas_retry(&xas, entry));
2459
2460 if (xas.xa_index != index || xa_to_value(entry) != attrs) {
2461 has_attrs = false;
2462 break;
2463 }
2464 }
2465
2466out:
2467 rcu_read_unlock();
2468 return has_attrs;
2469}
2470
2471static u64 kvm_supported_mem_attributes(struct kvm *kvm)
2472{
a7800aa8 2473 if (!kvm || kvm_arch_has_private_mem(kvm))
5a475554
CP
2474 return KVM_MEMORY_ATTRIBUTE_PRIVATE;
2475
2476 return 0;
2477}
2478
2479static __always_inline void kvm_handle_gfn_range(struct kvm *kvm,
2480 struct kvm_mmu_notifier_range *range)
2481{
2482 struct kvm_gfn_range gfn_range;
2483 struct kvm_memory_slot *slot;
2484 struct kvm_memslots *slots;
2485 struct kvm_memslot_iter iter;
2486 bool found_memslot = false;
2487 bool ret = false;
2488 int i;
2489
2490 gfn_range.arg = range->arg;
2491 gfn_range.may_block = range->may_block;
2492
eed52e43 2493 for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
5a475554
CP
2494 slots = __kvm_memslots(kvm, i);
2495
2496 kvm_for_each_memslot_in_gfn_range(&iter, slots, range->start, range->end) {
2497 slot = iter.slot;
2498 gfn_range.slot = slot;
2499
2500 gfn_range.start = max(range->start, slot->base_gfn);
2501 gfn_range.end = min(range->end, slot->base_gfn + slot->npages);
2502 if (gfn_range.start >= gfn_range.end)
2503 continue;
2504
2505 if (!found_memslot) {
2506 found_memslot = true;
2507 KVM_MMU_LOCK(kvm);
2508 if (!IS_KVM_NULL_FN(range->on_lock))
2509 range->on_lock(kvm);
2510 }
2511
2512 ret |= range->handler(kvm, &gfn_range);
2513 }
2514 }
2515
2516 if (range->flush_on_ret && ret)
2517 kvm_flush_remote_tlbs(kvm);
2518
2519 if (found_memslot)
2520 KVM_MMU_UNLOCK(kvm);
2521}
2522
2523static bool kvm_pre_set_memory_attributes(struct kvm *kvm,
2524 struct kvm_gfn_range *range)
2525{
2526 /*
2527 * Unconditionally add the range to the invalidation set, regardless of
2528 * whether or not the arch callback actually needs to zap SPTEs. E.g.
2529 * if KVM supports RWX attributes in the future and the attributes are
2530 * going from R=>RW, zapping isn't strictly necessary. Unconditionally
2531 * adding the range allows KVM to require that MMU invalidations add at
2532 * least one range between begin() and end(), e.g. allows KVM to detect
2533 * bugs where the add() is missed. Relaxing the rule *might* be safe,
2534 * but it's not obvious that allowing new mappings while the attributes
2535 * are in flux is desirable or worth the complexity.
2536 */
2537 kvm_mmu_invalidate_range_add(kvm, range->start, range->end);
2538
2539 return kvm_arch_pre_set_memory_attributes(kvm, range);
2540}
2541
2542/* Set @attributes for the gfn range [@start, @end). */
2543static int kvm_vm_set_mem_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
2544 unsigned long attributes)
2545{
2546 struct kvm_mmu_notifier_range pre_set_range = {
2547 .start = start,
2548 .end = end,
2549 .handler = kvm_pre_set_memory_attributes,
2550 .on_lock = kvm_mmu_invalidate_begin,
2551 .flush_on_ret = true,
2552 .may_block = true,
2553 };
2554 struct kvm_mmu_notifier_range post_set_range = {
2555 .start = start,
2556 .end = end,
2557 .arg.attributes = attributes,
2558 .handler = kvm_arch_post_set_memory_attributes,
2559 .on_lock = kvm_mmu_invalidate_end,
2560 .may_block = true,
2561 };
2562 unsigned long i;
2563 void *entry;
2564 int r = 0;
2565
2566 entry = attributes ? xa_mk_value(attributes) : NULL;
2567
2568 mutex_lock(&kvm->slots_lock);
2569
 2570	/* Nothing to do if the entire range already has the desired attributes. */
2571 if (kvm_range_has_memory_attributes(kvm, start, end, attributes))
2572 goto out_unlock;
2573
2574 /*
2575 * Reserve memory ahead of time to avoid having to deal with failures
2576 * partway through setting the new attributes.
2577 */
2578 for (i = start; i < end; i++) {
2579 r = xa_reserve(&kvm->mem_attr_array, i, GFP_KERNEL_ACCOUNT);
2580 if (r)
2581 goto out_unlock;
2582 }
2583
2584 kvm_handle_gfn_range(kvm, &pre_set_range);
2585
2586 for (i = start; i < end; i++) {
2587 r = xa_err(xa_store(&kvm->mem_attr_array, i, entry,
2588 GFP_KERNEL_ACCOUNT));
2589 KVM_BUG_ON(r, kvm);
2590 }
2591
2592 kvm_handle_gfn_range(kvm, &post_set_range);
2593
2594out_unlock:
2595 mutex_unlock(&kvm->slots_lock);
2596
2597 return r;
2598}
2599static int kvm_vm_ioctl_set_mem_attributes(struct kvm *kvm,
2600 struct kvm_memory_attributes *attrs)
2601{
2602 gfn_t start, end;
2603
2604 /* flags is currently not used. */
2605 if (attrs->flags)
2606 return -EINVAL;
2607 if (attrs->attributes & ~kvm_supported_mem_attributes(kvm))
2608 return -EINVAL;
2609 if (attrs->size == 0 || attrs->address + attrs->size < attrs->address)
2610 return -EINVAL;
2611 if (!PAGE_ALIGNED(attrs->address) || !PAGE_ALIGNED(attrs->size))
2612 return -EINVAL;
2613
2614 start = attrs->address >> PAGE_SHIFT;
2615 end = (attrs->address + attrs->size) >> PAGE_SHIFT;
2616
2617 /*
2618 * xarray tracks data using "unsigned long", and as a result so does
 2619	 * KVM. For simplicity, generic attributes are only supported on 64-bit
 2620	 * architectures.
2621 */
2622 BUILD_BUG_ON(sizeof(attrs->attributes) != sizeof(unsigned long));
2623
2624 return kvm_vm_set_mem_attributes(kvm, start, end, attrs->attributes);
2625}
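/*
 * A minimal userspace sketch of the ioctl handled above, marking a range of
 * guest memory private.  It assumes a VM type with private memory support
 * (e.g. an x86 KVM_X86_SW_PROTECTED_VM) and a page-aligned gpa/size; vm_fd
 * and set_private() are assumptions of this illustration:
 *
 *	#include <sys/ioctl.h>
 *	#include <linux/kvm.h>
 *
 *	static int set_private(int vm_fd, __u64 gpa, __u64 size)
 *	{
 *		struct kvm_memory_attributes attrs = {
 *			.address = gpa,
 *			.size = size,
 *			.attributes = KVM_MEMORY_ATTRIBUTE_PRIVATE,
 *		};
 *
 *		return ioctl(vm_fd, KVM_SET_MEMORY_ATTRIBUTES, &attrs);
 *	}
 *
 * Passing .attributes = 0 for the same range flips it back to shared.
 */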
2626#endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */
2627
49c7754c
GN
2628struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
2629{
2630 return __gfn_to_memslot(kvm_memslots(kvm), gfn);
2631}
a1f4d395 2632EXPORT_SYMBOL_GPL(gfn_to_memslot);
6aa8b732 2633
8e73485c
PB
2634struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn)
2635{
fe22ed82 2636 struct kvm_memslots *slots = kvm_vcpu_memslots(vcpu);
a54d8066 2637 u64 gen = slots->generation;
fe22ed82 2638 struct kvm_memory_slot *slot;
fe22ed82 2639
a54d8066
MS
2640 /*
2641 * This also protects against using a memslot from a different address space,
2642 * since different address spaces have different generation numbers.
2643 */
2644 if (unlikely(gen != vcpu->last_used_slot_gen)) {
2645 vcpu->last_used_slot = NULL;
2646 vcpu->last_used_slot_gen = gen;
2647 }
2648
2649 slot = try_get_memslot(vcpu->last_used_slot, gfn);
fe22ed82
DM
2650 if (slot)
2651 return slot;
2652
2653 /*
2654 * Fall back to searching all memslots. We purposely use
2655 * search_memslots() instead of __gfn_to_memslot() to avoid
a54d8066 2656 * thrashing the VM-wide last_used_slot in kvm_memslots.
fe22ed82 2657 */
a54d8066 2658 slot = search_memslots(slots, gfn, false);
fe22ed82 2659 if (slot) {
a54d8066 2660 vcpu->last_used_slot = slot;
fe22ed82
DM
2661 return slot;
2662 }
2663
2664 return NULL;
8e73485c
PB
2665}
2666
33e94154 2667bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
e0d62c7f 2668{
bf3e05bc 2669 struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn);
e0d62c7f 2670
c36b7150 2671 return kvm_is_visible_memslot(memslot);
e0d62c7f
IE
2672}
2673EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
2674
995decb6
VK
2675bool kvm_vcpu_is_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
2676{
2677 struct kvm_memory_slot *memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2678
2679 return kvm_is_visible_memslot(memslot);
2680}
2681EXPORT_SYMBOL_GPL(kvm_vcpu_is_visible_gfn);
2682
f9b84e19 2683unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn)
8f0b1ab6
JR
2684{
2685 struct vm_area_struct *vma;
2686 unsigned long addr, size;
2687
2688 size = PAGE_SIZE;
2689
42cde48b 2690 addr = kvm_vcpu_gfn_to_hva_prot(vcpu, gfn, NULL);
8f0b1ab6
JR
2691 if (kvm_is_error_hva(addr))
2692 return PAGE_SIZE;
2693
d8ed45c5 2694 mmap_read_lock(current->mm);
8f0b1ab6
JR
2695 vma = find_vma(current->mm, addr);
2696 if (!vma)
2697 goto out;
2698
2699 size = vma_kernel_pagesize(vma);
2700
2701out:
d8ed45c5 2702 mmap_read_unlock(current->mm);
8f0b1ab6
JR
2703
2704 return size;
2705}
2706
8283e36a 2707static bool memslot_is_readonly(const struct kvm_memory_slot *slot)
4d8b81ab
XG
2708{
2709 return slot->flags & KVM_MEM_READONLY;
2710}
2711
8283e36a 2712static unsigned long __gfn_to_hva_many(const struct kvm_memory_slot *slot, gfn_t gfn,
4d8b81ab 2713 gfn_t *nr_pages, bool write)
539cb660 2714{
bc6678a3 2715 if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
ca3a490c 2716 return KVM_HVA_ERR_BAD;
48987781 2717
4d8b81ab
XG
2718 if (memslot_is_readonly(slot) && write)
2719 return KVM_HVA_ERR_RO_BAD;
48987781
XG
2720
2721 if (nr_pages)
2722 *nr_pages = slot->npages - (gfn - slot->base_gfn);
2723
4d8b81ab 2724 return __gfn_to_hva_memslot(slot, gfn);
539cb660 2725}
48987781 2726
4d8b81ab
XG
2727static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
2728 gfn_t *nr_pages)
2729{
2730 return __gfn_to_hva_many(slot, gfn, nr_pages, true);
539cb660 2731}
48987781 2732
4d8b81ab 2733unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
7940876e 2734 gfn_t gfn)
4d8b81ab
XG
2735{
2736 return gfn_to_hva_many(slot, gfn, NULL);
2737}
2738EXPORT_SYMBOL_GPL(gfn_to_hva_memslot);
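/*
 * The translation performed by gfn_to_hva_many() and friends is plain
 * arithmetic once the memslot is known.  A worked example, assuming 4 KiB
 * pages and a slot with base_gfn = 0x100, npages = 0x200 and
 * userspace_addr = 0x7f0000000000:
 *
 *	gfn 0x105  ->  hva = 0x7f0000000000 + (0x105 - 0x100) * 0x1000
 *	                   = 0x7f0000005000
 *	nr_pages   ->  0x200 - (0x105 - 0x100) = 0x1fb pages remain in the
 *	               slot starting at that gfn
 *
 * A gfn that no memslot covers never reaches this arithmetic; it resolves to
 * KVM_HVA_ERR_BAD in __gfn_to_hva_many().
 */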
2739
48987781
XG
2740unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
2741{
49c7754c 2742 return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL);
48987781 2743}
0d150298 2744EXPORT_SYMBOL_GPL(gfn_to_hva);
539cb660 2745
8e73485c
PB
2746unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn)
2747{
2748 return gfn_to_hva_many(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn, NULL);
2749}
2750EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_hva);
2751
86ab8cff 2752/*
970c0d4b
WY
2753 * Return the hva of a @gfn and the R/W attribute if possible.
2754 *
2755 * @slot: the kvm_memory_slot which contains @gfn
2756 * @gfn: the gfn to be translated
2757 * @writable: used to return the read/write attribute of the @slot if the hva
2758 * is valid and @writable is not NULL
86ab8cff 2759 */
64d83126
CD
2760unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot,
2761 gfn_t gfn, bool *writable)
86ab8cff 2762{
a2ac07fe
GN
2763 unsigned long hva = __gfn_to_hva_many(slot, gfn, NULL, false);
2764
2765 if (!kvm_is_error_hva(hva) && writable)
ba6a3541
PB
2766 *writable = !memslot_is_readonly(slot);
2767
a2ac07fe 2768 return hva;
86ab8cff
XG
2769}
2770
64d83126
CD
2771unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable)
2772{
2773 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
2774
2775 return gfn_to_hva_memslot_prot(slot, gfn, writable);
2776}
2777
8e73485c
PB
2778unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable)
2779{
2780 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2781
2782 return gfn_to_hva_memslot_prot(slot, gfn, writable);
2783}
2784
fafc3dba
HY
2785static inline int check_user_page_hwpoison(unsigned long addr)
2786{
0d731759 2787 int rc, flags = FOLL_HWPOISON | FOLL_WRITE;
fafc3dba 2788
54d02069 2789 rc = get_user_pages(addr, 1, flags, NULL);
fafc3dba
HY
2790 return rc == -EHWPOISON;
2791}
2792
2fc84311 2793/*
b9b33da2
PB
 2794 * The fast path to get the writable pfn which will be stored in @pfn;
 2795 * true indicates success, otherwise false is returned. It's also the
311497e0 2796 * only path that can run in atomic context.
2fc84311 2797 */
b9b33da2
PB
2798static bool hva_to_pfn_fast(unsigned long addr, bool write_fault,
2799 bool *writable, kvm_pfn_t *pfn)
954bbbc2 2800{
8d4e1288 2801 struct page *page[1];
954bbbc2 2802
12ce13fe
XG
2803 /*
2804 * Fast pin a writable pfn only if it is a write fault request
 2805	 * or the caller allows mapping a writable pfn for a read fault
2806 * request.
2807 */
2808 if (!(write_fault || writable))
2809 return false;
612819c3 2810
dadbb612 2811 if (get_user_page_fast_only(addr, FOLL_WRITE, page)) {
2fc84311 2812 *pfn = page_to_pfn(page[0]);
612819c3 2813
2fc84311
XG
2814 if (writable)
2815 *writable = true;
2816 return true;
2817 }
af585b92 2818
2fc84311
XG
2819 return false;
2820}
612819c3 2821
2fc84311
XG
2822/*
 2823 * The slow path to get the pfn of the specified host virtual address;
 2824 * 1 indicates success, -errno is returned if an error is detected.
2825 */
2826static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
c8b88b33 2827 bool interruptible, bool *writable, kvm_pfn_t *pfn)
2fc84311 2828{
b1e1296d
DH
2829 /*
2830 * When a VCPU accesses a page that is not mapped into the secondary
2831 * MMU, we lookup the page using GUP to map it, so the guest VCPU can
2832 * make progress. We always want to honor NUMA hinting faults in that
2833 * case, because GUP usage corresponds to memory accesses from the VCPU.
2834 * Otherwise, we'd not trigger NUMA hinting faults once a page is
2835 * mapped into the secondary MMU and gets accessed by a VCPU.
2836 *
2837 * Note that get_user_page_fast_only() and FOLL_WRITE for now
2838 * implicitly honor NUMA hinting faults and don't need this flag.
2839 */
2840 unsigned int flags = FOLL_HWPOISON | FOLL_HONOR_NUMA_FAULT;
ce53053c 2841 struct page *page;
28249139 2842 int npages;
612819c3 2843
2fc84311
XG
2844 might_sleep();
2845
2846 if (writable)
2847 *writable = write_fault;
2848
ce53053c
AV
2849 if (write_fault)
2850 flags |= FOLL_WRITE;
2851 if (async)
2852 flags |= FOLL_NOWAIT;
c8b88b33
PX
2853 if (interruptible)
2854 flags |= FOLL_INTERRUPTIBLE;
d4944b0e 2855
ce53053c 2856 npages = get_user_pages_unlocked(addr, 1, &page, flags);
2fc84311
XG
2857 if (npages != 1)
2858 return npages;
2859
2860 /* map read fault as writable if possible */
12ce13fe 2861 if (unlikely(!write_fault) && writable) {
ce53053c 2862 struct page *wpage;
2fc84311 2863
dadbb612 2864 if (get_user_page_fast_only(addr, FOLL_WRITE, &wpage)) {
2fc84311 2865 *writable = true;
ce53053c
AV
2866 put_page(page);
2867 page = wpage;
612819c3 2868 }
887c08ac 2869 }
ce53053c 2870 *pfn = page_to_pfn(page);
2fc84311
XG
2871 return npages;
2872}
539cb660 2873
4d8b81ab
XG
2874static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault)
2875{
2876 if (unlikely(!(vma->vm_flags & VM_READ)))
2877 return false;
2e2e3738 2878
4d8b81ab
XG
2879 if (write_fault && (unlikely(!(vma->vm_flags & VM_WRITE))))
2880 return false;
887c08ac 2881
4d8b81ab
XG
2882 return true;
2883}
bf998156 2884
f8be156b
NP
2885static int kvm_try_get_pfn(kvm_pfn_t pfn)
2886{
b14b2690
SC
2887 struct page *page = kvm_pfn_to_refcounted_page(pfn);
2888
2889 if (!page)
f8be156b 2890 return 1;
b14b2690
SC
2891
2892 return get_page_unless_zero(page);
f8be156b
NP
2893}
2894
92176a8e 2895static int hva_to_pfn_remapped(struct vm_area_struct *vma,
1625566e
XT
2896 unsigned long addr, bool write_fault,
2897 bool *writable, kvm_pfn_t *p_pfn)
92176a8e 2898{
a9545779 2899 kvm_pfn_t pfn;
bd2fae8d 2900 pte_t *ptep;
c33c7948 2901 pte_t pte;
bd2fae8d 2902 spinlock_t *ptl;
add6a0cd
PB
2903 int r;
2904
29ae7d96 2905 r = follow_pte(vma, addr, &ptep, &ptl);
add6a0cd
PB
2906 if (r) {
2907 /*
2908 * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does
2909 * not call the fault handler, so do it here.
2910 */
2911 bool unlocked = false;
64019a2e 2912 r = fixup_user_fault(current->mm, addr,
add6a0cd
PB
2913 (write_fault ? FAULT_FLAG_WRITE : 0),
2914 &unlocked);
a8387d0b
PB
2915 if (unlocked)
2916 return -EAGAIN;
add6a0cd
PB
2917 if (r)
2918 return r;
2919
29ae7d96 2920 r = follow_pte(vma, addr, &ptep, &ptl);
add6a0cd
PB
2921 if (r)
2922 return r;
bd2fae8d 2923 }
add6a0cd 2924
c33c7948
RR
2925 pte = ptep_get(ptep);
2926
2927 if (write_fault && !pte_write(pte)) {
bd2fae8d
PB
2928 pfn = KVM_PFN_ERR_RO_FAULT;
2929 goto out;
add6a0cd
PB
2930 }
2931
a340b3e2 2932 if (writable)
c33c7948
RR
2933 *writable = pte_write(pte);
2934 pfn = pte_pfn(pte);
add6a0cd
PB
2935
2936 /*
2937 * Get a reference here because callers of *hva_to_pfn* and
2938 * *gfn_to_pfn* ultimately call kvm_release_pfn_clean on the
2939 * returned pfn. This is only needed if the VMA has VM_MIXEDMAP
36c3ce6c 2940 * set, but the kvm_try_get_pfn/kvm_release_pfn_clean pair will
add6a0cd
PB
2941 * simply do nothing for reserved pfns.
2942 *
2943 * Whoever called remap_pfn_range is also going to call e.g.
2944 * unmap_mapping_range before the underlying pages are freed,
2945 * causing a call to our MMU notifier.
f8be156b
NP
2946 *
2947 * Certain IO or PFNMAP mappings can be backed with valid
2948 * struct pages, but be allocated without refcounting e.g.,
2949 * tail pages of non-compound higher order allocations, which
2950 * would then underflow the refcount when the caller does the
2951 * required put_page. Don't allow those pages here.
c33c7948 2952 */
f8be156b
NP
2953 if (!kvm_try_get_pfn(pfn))
2954 r = -EFAULT;
add6a0cd 2955
bd2fae8d
PB
2956out:
2957 pte_unmap_unlock(ptep, ptl);
add6a0cd 2958 *p_pfn = pfn;
f8be156b
NP
2959
2960 return r;
92176a8e
PB
2961}
2962
12ce13fe
XG
2963/*
2964 * Pin guest page in memory and return its pfn.
2965 * @addr: host virtual address which maps memory to the guest
 2966 * @atomic: whether this function must not sleep
c8b88b33 2967 * @interruptible: whether the process can be interrupted by non-fatal signals
12ce13fe
XG
 2968 * @async: whether this function needs to wait for IO to complete if the
 2969 *         host page is not in memory
 2970 * @write_fault: whether we should get a writable host page
 2971 * @writable: whether to allow mapping a writable host page for !@write_fault
2972 *
2973 * The function will map a writable host page for these two cases:
2974 * 1): @write_fault = true
 2975 * 2): @write_fault = false && @writable; in that case @writable will tell the caller
2976 * whether the mapping is writable.
2977 */
c8b88b33
PX
2978kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool interruptible,
2979 bool *async, bool write_fault, bool *writable)
2fc84311
XG
2980{
2981 struct vm_area_struct *vma;
943dfea8 2982 kvm_pfn_t pfn;
92176a8e 2983 int npages, r;
2e2e3738 2984
2fc84311
XG
2985 /* we can do it either atomically or asynchronously, not both */
2986 BUG_ON(atomic && async);
8d4e1288 2987
b9b33da2 2988 if (hva_to_pfn_fast(addr, write_fault, writable, &pfn))
2fc84311
XG
2989 return pfn;
2990
2991 if (atomic)
2992 return KVM_PFN_ERR_FAULT;
2993
c8b88b33
PX
2994 npages = hva_to_pfn_slow(addr, async, write_fault, interruptible,
2995 writable, &pfn);
2fc84311
XG
2996 if (npages == 1)
2997 return pfn;
fe5ed56c
PX
2998 if (npages == -EINTR)
2999 return KVM_PFN_ERR_SIGPENDING;
8d4e1288 3000
d8ed45c5 3001 mmap_read_lock(current->mm);
2fc84311
XG
3002 if (npages == -EHWPOISON ||
3003 (!async && check_user_page_hwpoison(addr))) {
3004 pfn = KVM_PFN_ERR_HWPOISON;
3005 goto exit;
3006 }
3007
a8387d0b 3008retry:
fc98c03b 3009 vma = vma_lookup(current->mm, addr);
2fc84311
XG
3010
3011 if (vma == NULL)
3012 pfn = KVM_PFN_ERR_FAULT;
92176a8e 3013 else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) {
1625566e 3014 r = hva_to_pfn_remapped(vma, addr, write_fault, writable, &pfn);
a8387d0b
PB
3015 if (r == -EAGAIN)
3016 goto retry;
92176a8e
PB
3017 if (r < 0)
3018 pfn = KVM_PFN_ERR_FAULT;
2fc84311 3019 } else {
4d8b81ab 3020 if (async && vma_is_valid(vma, write_fault))
2fc84311
XG
3021 *async = true;
3022 pfn = KVM_PFN_ERR_FAULT;
3023 }
3024exit:
d8ed45c5 3025 mmap_read_unlock(current->mm);
2e2e3738 3026 return pfn;
35149e21
AL
3027}
3028
8283e36a 3029kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn,
c8b88b33
PX
3030 bool atomic, bool interruptible, bool *async,
3031 bool write_fault, bool *writable, hva_t *hva)
887c08ac 3032{
4d8b81ab
XG
3033 unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);
3034
4a42d848
DS
3035 if (hva)
3036 *hva = addr;
3037
b2740d35
PB
3038 if (addr == KVM_HVA_ERR_RO_BAD) {
3039 if (writable)
3040 *writable = false;
4d8b81ab 3041 return KVM_PFN_ERR_RO_FAULT;
b2740d35 3042 }
4d8b81ab 3043
b2740d35
PB
3044 if (kvm_is_error_hva(addr)) {
3045 if (writable)
3046 *writable = false;
81c52c56 3047 return KVM_PFN_NOSLOT;
b2740d35 3048 }
4d8b81ab
XG
3049
3050 /* Do not map writable pfn in the readonly memslot. */
3051 if (writable && memslot_is_readonly(slot)) {
3052 *writable = false;
3053 writable = NULL;
3054 }
3055
c8b88b33 3056 return hva_to_pfn(addr, atomic, interruptible, async, write_fault,
4d8b81ab 3057 writable);
887c08ac 3058}
3520469d 3059EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot);
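/*
 * A minimal sketch of how an architecture's page-fault path might call the
 * helper above.  All of the handle_*()/map_*()/setup_*() helpers, as well as
 * write_fault, slot and gfn, are hypothetical names for this illustration,
 * and the locking an arch would normally hold around the mapping is elided:
 *
 *	bool async = false;
 *	bool writable;
 *	hva_t hva;
 *	kvm_pfn_t pfn;
 *
 *	pfn = __gfn_to_pfn_memslot(slot, gfn, false, false, &async,
 *				   write_fault, &writable, &hva);
 *	if (pfn == KVM_PFN_ERR_HWPOISON)
 *		return send_sigbus_to_user(hva);
 *	if (is_error_noslot_pfn(pfn))
 *		return handle_mmio_or_error(gfn);
 *	if (async)
 *		return setup_async_pf(gfn, hva);
 *
 *	map_into_stage2(gfn, pfn, writable);
 *	kvm_release_pfn_clean(pfn);
 */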
887c08ac 3060
ba049e93 3061kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
612819c3
MT
3062 bool *writable)
3063{
c8b88b33
PX
3064 return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, false,
3065 NULL, write_fault, writable, NULL);
612819c3
MT
3066}
3067EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
3068
8283e36a 3069kvm_pfn_t gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn)
506f0d6f 3070{
c8b88b33
PX
3071 return __gfn_to_pfn_memslot(slot, gfn, false, false, NULL, true,
3072 NULL, NULL);
506f0d6f 3073}
e37afc6e 3074EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot);
506f0d6f 3075
8283e36a 3076kvm_pfn_t gfn_to_pfn_memslot_atomic(const struct kvm_memory_slot *slot, gfn_t gfn)
506f0d6f 3077{
c8b88b33
PX
3078 return __gfn_to_pfn_memslot(slot, gfn, true, false, NULL, true,
3079 NULL, NULL);
506f0d6f 3080}
037d92dc 3081EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);
506f0d6f 3082
ba049e93 3083kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn)
8e73485c
PB
3084{
3085 return gfn_to_pfn_memslot_atomic(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn);
3086}
3087EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn_atomic);
3088
ba049e93 3089kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
e37afc6e
PB
3090{
3091 return gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn);
3092}
3093EXPORT_SYMBOL_GPL(gfn_to_pfn);
3094
ba049e93 3095kvm_pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn)
8e73485c
PB
3096{
3097 return gfn_to_pfn_memslot(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn);
3098}
3099EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn);
3100
d9ef13c2
PB
3101int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
3102 struct page **pages, int nr_pages)
48987781
XG
3103{
3104 unsigned long addr;
076b925d 3105 gfn_t entry = 0;
48987781 3106
d9ef13c2 3107 addr = gfn_to_hva_many(slot, gfn, &entry);
48987781
XG
3108 if (kvm_is_error_hva(addr))
3109 return -1;
3110
3111 if (entry < nr_pages)
3112 return 0;
3113
dadbb612 3114 return get_user_pages_fast_only(addr, nr_pages, FOLL_WRITE, pages);
48987781
XG
3115}
3116EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);
3117
b1624f99
SC
3118/*
3119 * Do not use this helper unless you are absolutely certain the gfn _must_ be
3120 * backed by 'struct page'. A valid example is if the backing memslot is
 3121 * controlled by KVM. Note, if the returned page is valid, its refcount has
3122 * been elevated by gfn_to_pfn().
3123 */
35149e21
AL
3124struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
3125{
b14b2690 3126 struct page *page;
ba049e93 3127 kvm_pfn_t pfn;
2e2e3738
AL
3128
3129 pfn = gfn_to_pfn(kvm, gfn);
2e2e3738 3130
81c52c56 3131 if (is_error_noslot_pfn(pfn))
cb9aaa30 3132 return KVM_ERR_PTR_BAD_PAGE;
a2766325 3133
b14b2690
SC
3134 page = kvm_pfn_to_refcounted_page(pfn);
3135 if (!page)
6cede2e6 3136 return KVM_ERR_PTR_BAD_PAGE;
a2766325 3137
b14b2690 3138 return page;
954bbbc2
AK
3139}
3140EXPORT_SYMBOL_GPL(gfn_to_page);
3141
357a18ad 3142void kvm_release_pfn(kvm_pfn_t pfn, bool dirty)
91724814 3143{
91724814
BO
3144 if (dirty)
3145 kvm_release_pfn_dirty(pfn);
3146 else
3147 kvm_release_pfn_clean(pfn);
3148}
3149
357a18ad 3150int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map)
e45adf66
KA
3151{
3152 kvm_pfn_t pfn;
3153 void *hva = NULL;
3154 struct page *page = KVM_UNMAPPED_PAGE;
3155
3156 if (!map)
3157 return -EINVAL;
3158
357a18ad 3159 pfn = gfn_to_pfn(vcpu->kvm, gfn);
e45adf66
KA
3160 if (is_error_noslot_pfn(pfn))
3161 return -EINVAL;
3162
3163 if (pfn_valid(pfn)) {
3164 page = pfn_to_page(pfn);
357a18ad 3165 hva = kmap(page);
d30b214d 3166#ifdef CONFIG_HAS_IOMEM
91724814 3167 } else {
357a18ad 3168 hva = memremap(pfn_to_hpa(pfn), PAGE_SIZE, MEMREMAP_WB);
d30b214d 3169#endif
e45adf66
KA
3170 }
3171
3172 if (!hva)
3173 return -EFAULT;
3174
3175 map->page = page;
3176 map->hva = hva;
3177 map->pfn = pfn;
3178 map->gfn = gfn;
3179
3180 return 0;
3181}
e45adf66
KA
3182EXPORT_SYMBOL_GPL(kvm_vcpu_map);
3183
357a18ad 3184void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty)
e45adf66
KA
3185{
3186 if (!map)
3187 return;
3188
3189 if (!map->hva)
3190 return;
3191
357a18ad
DW
3192 if (map->page != KVM_UNMAPPED_PAGE)
3193 kunmap(map->page);
eb1f2f38 3194#ifdef CONFIG_HAS_IOMEM
91724814 3195 else
357a18ad 3196 memunmap(map->hva);
eb1f2f38 3197#endif
e45adf66 3198
91724814 3199 if (dirty)
357a18ad 3200 kvm_vcpu_mark_page_dirty(vcpu, map->gfn);
91724814 3201
357a18ad 3202 kvm_release_pfn(map->pfn, dirty);
e45adf66
KA
3203
3204 map->hva = NULL;
3205 map->page = NULL;
3206}
3207EXPORT_SYMBOL_GPL(kvm_vcpu_unmap);
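/*
 * Typical usage pattern for the pair above, e.g. when a vCPU needs a
 * temporary kernel mapping of a guest page.  gpa, data and len are whatever
 * the caller is processing (assumptions of this sketch), and error handling
 * is minimal:
 *
 *	struct kvm_host_map map;
 *
 *	if (kvm_vcpu_map(vcpu, gpa_to_gfn(gpa), &map))
 *		return -EFAULT;
 *
 *	memcpy(map.hva + offset_in_page(gpa), data, len);
 *
 *	kvm_vcpu_unmap(vcpu, &map, true);
 *
 * Passing true for @dirty marks the gfn dirty for dirty logging and releases
 * the pfn as dirty; pass false for read-only accesses.
 */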
3208
8e1c6914 3209static bool kvm_is_ad_tracked_page(struct page *page)
8e73485c 3210{
8e1c6914
SC
3211 /*
3212 * Per page-flags.h, pages tagged PG_reserved "should in general not be
3213 * touched (e.g. set dirty) except by its owner".
3214 */
3215 return !PageReserved(page);
3216}
8e73485c 3217
8e1c6914
SC
3218static void kvm_set_page_dirty(struct page *page)
3219{
3220 if (kvm_is_ad_tracked_page(page))
3221 SetPageDirty(page);
3222}
8e73485c 3223
8e1c6914
SC
3224static void kvm_set_page_accessed(struct page *page)
3225{
3226 if (kvm_is_ad_tracked_page(page))
3227 mark_page_accessed(page);
8e73485c 3228}
8e73485c 3229
b4231d61
IE
3230void kvm_release_page_clean(struct page *page)
3231{
32cad84f
XG
3232 WARN_ON(is_error_page(page));
3233
8e1c6914
SC
3234 kvm_set_page_accessed(page);
3235 put_page(page);
b4231d61
IE
3236}
3237EXPORT_SYMBOL_GPL(kvm_release_page_clean);
3238
ba049e93 3239void kvm_release_pfn_clean(kvm_pfn_t pfn)
35149e21 3240{
b14b2690
SC
3241 struct page *page;
3242
3243 if (is_error_noslot_pfn(pfn))
3244 return;
3245
3246 page = kvm_pfn_to_refcounted_page(pfn);
3247 if (!page)
3248 return;
3249
3250 kvm_release_page_clean(page);
35149e21
AL
3251}
3252EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);
3253
b4231d61 3254void kvm_release_page_dirty(struct page *page)
8a7ae055 3255{
a2766325
XG
3256 WARN_ON(is_error_page(page));
3257
8e1c6914
SC
3258 kvm_set_page_dirty(page);
3259 kvm_release_page_clean(page);
35149e21
AL
3260}
3261EXPORT_SYMBOL_GPL(kvm_release_page_dirty);
3262
f7a6509f 3263void kvm_release_pfn_dirty(kvm_pfn_t pfn)
35149e21 3264{
b14b2690
SC
3265 struct page *page;
3266
3267 if (is_error_noslot_pfn(pfn))
3268 return;
3269
3270 page = kvm_pfn_to_refcounted_page(pfn);
3271 if (!page)
3272 return;
3273
3274 kvm_release_page_dirty(page);
35149e21 3275}
f7a6509f 3276EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty);
35149e21 3277
8e1c6914
SC
3278/*
3279 * Note, checking for an error/noslot pfn is the caller's responsibility when
3280 * directly marking a page dirty/accessed. Unlike the "release" helpers, the
3281 * "set" helpers are not to be used when the pfn might point at garbage.
3282 */
ba049e93 3283void kvm_set_pfn_dirty(kvm_pfn_t pfn)
35149e21 3284{
8e1c6914
SC
3285 if (WARN_ON(is_error_noslot_pfn(pfn)))
3286 return;
3287
3288 if (pfn_valid(pfn))
3289 kvm_set_page_dirty(pfn_to_page(pfn));
8a7ae055 3290}
35149e21
AL
3291EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);
3292
ba049e93 3293void kvm_set_pfn_accessed(kvm_pfn_t pfn)
35149e21 3294{
8e1c6914
SC
3295 if (WARN_ON(is_error_noslot_pfn(pfn)))
3296 return;
3297
3298 if (pfn_valid(pfn))
3299 kvm_set_page_accessed(pfn_to_page(pfn));
35149e21
AL
3300}
3301EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);
3302
195aefde
IE
3303static int next_segment(unsigned long len, int offset)
3304{
3305 if (len > PAGE_SIZE - offset)
3306 return PAGE_SIZE - offset;
3307 else
3308 return len;
3309}
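/*
 * The guest read/write loops below use next_segment() to split an access into
 * per-page chunks: e.g. with PAGE_SIZE == 4096, next_segment(5000, 3000)
 * returns 1096 (the bytes left in the current page), while next_segment(100, 0)
 * returns 100 because the access already fits in one page.
 */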
3310
8e73485c
PB
3311static int __kvm_read_guest_page(struct kvm_memory_slot *slot, gfn_t gfn,
3312 void *data, int offset, int len)
195aefde 3313{
e0506bcb
IE
3314 int r;
3315 unsigned long addr;
195aefde 3316
8e73485c 3317 addr = gfn_to_hva_memslot_prot(slot, gfn, NULL);
e0506bcb
IE
3318 if (kvm_is_error_hva(addr))
3319 return -EFAULT;
3180a7fc 3320 r = __copy_from_user(data, (void __user *)addr + offset, len);
e0506bcb 3321 if (r)
195aefde 3322 return -EFAULT;
195aefde
IE
3323 return 0;
3324}
8e73485c
PB
3325
3326int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
3327 int len)
3328{
3329 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
3330
3331 return __kvm_read_guest_page(slot, gfn, data, offset, len);
3332}
195aefde
IE
3333EXPORT_SYMBOL_GPL(kvm_read_guest_page);
3334
8e73485c
PB
3335int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data,
3336 int offset, int len)
3337{
3338 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
3339
3340 return __kvm_read_guest_page(slot, gfn, data, offset, len);
3341}
3342EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_page);
3343
195aefde
IE
3344int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len)
3345{
3346 gfn_t gfn = gpa >> PAGE_SHIFT;
3347 int seg;
3348 int offset = offset_in_page(gpa);
3349 int ret;
3350
3351 while ((seg = next_segment(len, offset)) != 0) {
3352 ret = kvm_read_guest_page(kvm, gfn, data, offset, seg);
3353 if (ret < 0)
3354 return ret;
3355 offset = 0;
3356 len -= seg;
3357 data += seg;
3358 ++gfn;
3359 }
3360 return 0;
3361}
3362EXPORT_SYMBOL_GPL(kvm_read_guest);
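/*
 * A minimal usage sketch (the structure name is illustrative): kvm_read_guest()
 * hides the per-page splitting, so a guest-physical structure that straddles a
 * page boundary can be read with a single call.
 *
 *	struct hyp_msg msg;	// hypothetical guest-provided structure
 *
 *	if (kvm_read_guest(kvm, gpa, &msg, sizeof(msg)))
 *		return -EFAULT;
 */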
3363
8e73485c 3364int kvm_vcpu_read_guest(struct kvm_vcpu *vcpu, gpa_t gpa, void *data, unsigned long len)
7ec54588 3365{
7ec54588 3366 gfn_t gfn = gpa >> PAGE_SHIFT;
8e73485c 3367 int seg;
7ec54588 3368 int offset = offset_in_page(gpa);
8e73485c
PB
3369 int ret;
3370
3371 while ((seg = next_segment(len, offset)) != 0) {
3372 ret = kvm_vcpu_read_guest_page(vcpu, gfn, data, offset, seg);
3373 if (ret < 0)
3374 return ret;
3375 offset = 0;
3376 len -= seg;
3377 data += seg;
3378 ++gfn;
3379 }
3380 return 0;
3381}
3382EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest);
7ec54588 3383
8e73485c
PB
3384static int __kvm_read_guest_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
3385 void *data, int offset, unsigned long len)
3386{
3387 int r;
3388 unsigned long addr;
3389
3390 addr = gfn_to_hva_memslot_prot(slot, gfn, NULL);
7ec54588
MT
3391 if (kvm_is_error_hva(addr))
3392 return -EFAULT;
0aac03f0 3393 pagefault_disable();
3180a7fc 3394 r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len);
0aac03f0 3395 pagefault_enable();
7ec54588
MT
3396 if (r)
3397 return -EFAULT;
3398 return 0;
3399}
7ec54588 3400
8e73485c
PB
3401int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa,
3402 void *data, unsigned long len)
3403{
3404 gfn_t gfn = gpa >> PAGE_SHIFT;
3405 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
3406 int offset = offset_in_page(gpa);
3407
3408 return __kvm_read_guest_atomic(slot, gfn, data, offset, len);
3409}
3410EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_atomic);
3411
28bd726a
PX
3412static int __kvm_write_guest_page(struct kvm *kvm,
3413 struct kvm_memory_slot *memslot, gfn_t gfn,
8e73485c 3414 const void *data, int offset, int len)
195aefde 3415{
e0506bcb
IE
3416 int r;
3417 unsigned long addr;
195aefde 3418
251eb841 3419 addr = gfn_to_hva_memslot(memslot, gfn);
e0506bcb
IE
3420 if (kvm_is_error_hva(addr))
3421 return -EFAULT;
8b0cedff 3422 r = __copy_to_user((void __user *)addr + offset, data, len);
e0506bcb 3423 if (r)
195aefde 3424 return -EFAULT;
28bd726a 3425 mark_page_dirty_in_slot(kvm, memslot, gfn);
195aefde
IE
3426 return 0;
3427}
8e73485c
PB
3428
3429int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn,
3430 const void *data, int offset, int len)
3431{
3432 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
3433
28bd726a 3434 return __kvm_write_guest_page(kvm, slot, gfn, data, offset, len);
8e73485c 3435}
195aefde
IE
3436EXPORT_SYMBOL_GPL(kvm_write_guest_page);
3437
8e73485c
PB
3438int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
3439 const void *data, int offset, int len)
3440{
3441 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
3442
28bd726a 3443 return __kvm_write_guest_page(vcpu->kvm, slot, gfn, data, offset, len);
8e73485c
PB
3444}
3445EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_page);
3446
195aefde
IE
3447int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
3448 unsigned long len)
3449{
3450 gfn_t gfn = gpa >> PAGE_SHIFT;
3451 int seg;
3452 int offset = offset_in_page(gpa);
3453 int ret;
3454
3455 while ((seg = next_segment(len, offset)) != 0) {
3456 ret = kvm_write_guest_page(kvm, gfn, data, offset, seg);
3457 if (ret < 0)
3458 return ret;
3459 offset = 0;
3460 len -= seg;
3461 data += seg;
3462 ++gfn;
3463 }
3464 return 0;
3465}
ff651cb6 3466EXPORT_SYMBOL_GPL(kvm_write_guest);
195aefde 3467
8e73485c
PB
3468int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data,
3469 unsigned long len)
3470{
3471 gfn_t gfn = gpa >> PAGE_SHIFT;
3472 int seg;
3473 int offset = offset_in_page(gpa);
3474 int ret;
3475
3476 while ((seg = next_segment(len, offset)) != 0) {
3477 ret = kvm_vcpu_write_guest_page(vcpu, gfn, data, offset, seg);
3478 if (ret < 0)
3479 return ret;
3480 offset = 0;
3481 len -= seg;
3482 data += seg;
3483 ++gfn;
3484 }
3485 return 0;
3486}
3487EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest);
3488
5a2d4365
PB
3489static int __kvm_gfn_to_hva_cache_init(struct kvm_memslots *slots,
3490 struct gfn_to_hva_cache *ghc,
3491 gpa_t gpa, unsigned long len)
49c7754c 3492{
49c7754c 3493 int offset = offset_in_page(gpa);
8f964525
AH
3494 gfn_t start_gfn = gpa >> PAGE_SHIFT;
3495 gfn_t end_gfn = (gpa + len - 1) >> PAGE_SHIFT;
3496 gfn_t nr_pages_needed = end_gfn - start_gfn + 1;
3497 gfn_t nr_pages_avail;
49c7754c 3498
6ad1e29f 3499 /* Update ghc->generation before performing any error checks. */
49c7754c 3500 ghc->generation = slots->generation;
6ad1e29f
SC
3501
3502 if (start_gfn > end_gfn) {
3503 ghc->hva = KVM_HVA_ERR_BAD;
3504 return -EINVAL;
3505 }
f1b9dd5e
JM
3506
3507 /*
3508 * If the requested region crosses two or more memslots, we still
3509 * verify that the entire region is valid here.
3510 */
6ad1e29f 3511 for ( ; start_gfn <= end_gfn; start_gfn += nr_pages_avail) {
f1b9dd5e
JM
3512 ghc->memslot = __gfn_to_memslot(slots, start_gfn);
3513 ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn,
3514 &nr_pages_avail);
3515 if (kvm_is_error_hva(ghc->hva))
6ad1e29f 3516 return -EFAULT;
f1b9dd5e
JM
3517 }
3518
3519 /* Use the slow path for cross page reads and writes. */
6ad1e29f 3520 if (nr_pages_needed == 1)
49c7754c 3521 ghc->hva += offset;
f1b9dd5e 3522 else
8f964525 3523 ghc->memslot = NULL;
f1b9dd5e 3524
6ad1e29f
SC
3525 ghc->gpa = gpa;
3526 ghc->len = len;
3527 return 0;
49c7754c 3528}
5a2d4365 3529
4e335d9e 3530int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
5a2d4365
PB
3531 gpa_t gpa, unsigned long len)
3532{
4e335d9e 3533 struct kvm_memslots *slots = kvm_memslots(kvm);
5a2d4365
PB
3534 return __kvm_gfn_to_hva_cache_init(slots, ghc, gpa, len);
3535}
4e335d9e 3536EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init);
49c7754c 3537
4e335d9e 3538int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
7a86dab8
JM
3539 void *data, unsigned int offset,
3540 unsigned long len)
49c7754c 3541{
4e335d9e 3542 struct kvm_memslots *slots = kvm_memslots(kvm);
49c7754c 3543 int r;
4ec6e863 3544 gpa_t gpa = ghc->gpa + offset;
49c7754c 3545
5f25e71e
PB
3546 if (WARN_ON_ONCE(len + offset > ghc->len))
3547 return -EINVAL;
8f964525 3548
dc9ce71e
SC
3549 if (slots->generation != ghc->generation) {
3550 if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len))
3551 return -EFAULT;
3552 }
8f964525 3553
49c7754c
GN
3554 if (kvm_is_error_hva(ghc->hva))
3555 return -EFAULT;
3556
fcfbc617
SC
3557 if (unlikely(!ghc->memslot))
3558 return kvm_write_guest(kvm, gpa, data, len);
3559
4ec6e863 3560 r = __copy_to_user((void __user *)ghc->hva + offset, data, len);
49c7754c
GN
3561 if (r)
3562 return -EFAULT;
28bd726a 3563 mark_page_dirty_in_slot(kvm, ghc->memslot, gpa >> PAGE_SHIFT);
49c7754c
GN
3564
3565 return 0;
3566}
4e335d9e 3567EXPORT_SYMBOL_GPL(kvm_write_guest_offset_cached);
4ec6e863 3568
4e335d9e
PB
3569int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
3570 void *data, unsigned long len)
4ec6e863 3571{
4e335d9e 3572 return kvm_write_guest_offset_cached(kvm, ghc, data, 0, len);
4ec6e863 3573}
4e335d9e 3574EXPORT_SYMBOL_GPL(kvm_write_guest_cached);
49c7754c 3575
0958f0ce
VK
3576int kvm_read_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
3577 void *data, unsigned int offset,
3578 unsigned long len)
e03b644f 3579{
4e335d9e 3580 struct kvm_memslots *slots = kvm_memslots(kvm);
e03b644f 3581 int r;
0958f0ce 3582 gpa_t gpa = ghc->gpa + offset;
e03b644f 3583
5f25e71e
PB
3584 if (WARN_ON_ONCE(len + offset > ghc->len))
3585 return -EINVAL;
8f964525 3586
dc9ce71e
SC
3587 if (slots->generation != ghc->generation) {
3588 if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len))
3589 return -EFAULT;
3590 }
8f964525 3591
e03b644f
GN
3592 if (kvm_is_error_hva(ghc->hva))
3593 return -EFAULT;
3594
fcfbc617 3595 if (unlikely(!ghc->memslot))
0958f0ce 3596 return kvm_read_guest(kvm, gpa, data, len);
fcfbc617 3597
0958f0ce 3598 r = __copy_from_user(data, (void __user *)ghc->hva + offset, len);
e03b644f
GN
3599 if (r)
3600 return -EFAULT;
3601
3602 return 0;
3603}
0958f0ce
VK
3604EXPORT_SYMBOL_GPL(kvm_read_guest_offset_cached);
3605
3606int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
3607 void *data, unsigned long len)
3608{
3609 return kvm_read_guest_offset_cached(kvm, ghc, data, 0, len);
3610}
4e335d9e 3611EXPORT_SYMBOL_GPL(kvm_read_guest_cached);
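/*
 * A minimal usage sketch, assuming @ghc lives in per-VM or per-vCPU state and
 * @gpa points at a guest buffer that is accessed repeatedly: initialise the
 * cache once, then use the *_cached() helpers so the memslot/hva lookup is
 * only redone when the memslot generation changes.
 *
 *	if (kvm_gfn_to_hva_cache_init(kvm, &ghc, gpa, sizeof(val)))
 *		return -EFAULT;
 *	...
 *	if (kvm_write_guest_cached(kvm, &ghc, &val, sizeof(val)))
 *		return -EFAULT;
 *	...
 *	if (kvm_read_guest_cached(kvm, &ghc, &val, sizeof(val)))
 *		return -EFAULT;
 */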
e03b644f 3612
195aefde
IE
3613int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
3614{
2f541442 3615 const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
195aefde
IE
3616 gfn_t gfn = gpa >> PAGE_SHIFT;
3617 int seg;
3618 int offset = offset_in_page(gpa);
3619 int ret;
3620
bfda0e84 3621 while ((seg = next_segment(len, offset)) != 0) {
2f541442 3622 ret = kvm_write_guest_page(kvm, gfn, zero_page, offset, seg);
195aefde
IE
3623 if (ret < 0)
3624 return ret;
3625 offset = 0;
3626 len -= seg;
3627 ++gfn;
3628 }
3629 return 0;
3630}
3631EXPORT_SYMBOL_GPL(kvm_clear_guest);
3632
28bd726a 3633void mark_page_dirty_in_slot(struct kvm *kvm,
8283e36a 3634 const struct kvm_memory_slot *memslot,
28bd726a 3635 gfn_t gfn)
6aa8b732 3636{
2efd61a6
DW
3637 struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
3638
e09fccb5 3639#ifdef CONFIG_HAVE_KVM_DIRTY_RING
86bdf3eb 3640 if (WARN_ON_ONCE(vcpu && vcpu->kvm != kvm))
2efd61a6 3641 return;
86bdf3eb 3642
c57351a7 3643 WARN_ON_ONCE(!vcpu && !kvm_arch_allow_write_without_running_vcpu(kvm));
e09fccb5 3644#endif
2efd61a6 3645
044c59c4 3646 if (memslot && kvm_slot_dirty_track_enabled(memslot)) {
7e9d619d 3647 unsigned long rel_gfn = gfn - memslot->base_gfn;
fb04a1ed 3648 u32 slot = (memslot->as_id << 16) | memslot->id;
6aa8b732 3649
86bdf3eb 3650 if (kvm->dirty_ring_size && vcpu)
cf87ac73 3651 kvm_dirty_ring_push(vcpu, slot, rel_gfn);
c57351a7 3652 else if (memslot->dirty_bitmap)
fb04a1ed 3653 set_bit_le(rel_gfn, memslot->dirty_bitmap);
6aa8b732
AK
3654 }
3655}
a6a0b05d 3656EXPORT_SYMBOL_GPL(mark_page_dirty_in_slot);
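/*
 * Note on the dirty-ring slot encoding above: the u32 pushed to the ring packs
 * the address-space id in the top 16 bits and the memslot id in the bottom 16,
 * e.g. as_id 1 and slot id 5 encode as 0x00010005.
 */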
6aa8b732 3657
49c7754c
GN
3658void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
3659{
3660 struct kvm_memory_slot *memslot;
3661
3662 memslot = gfn_to_memslot(kvm, gfn);
28bd726a 3663 mark_page_dirty_in_slot(kvm, memslot, gfn);
49c7754c 3664}
2ba9f0d8 3665EXPORT_SYMBOL_GPL(mark_page_dirty);
49c7754c 3666
8e73485c
PB
3667void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn)
3668{
3669 struct kvm_memory_slot *memslot;
3670
3671 memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
28bd726a 3672 mark_page_dirty_in_slot(vcpu->kvm, memslot, gfn);
8e73485c
PB
3673}
3674EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty);
3675
20b7035c
JS
3676void kvm_sigset_activate(struct kvm_vcpu *vcpu)
3677{
3678 if (!vcpu->sigset_active)
3679 return;
3680
3681 /*
3682 * This does a lockless modification of ->real_blocked, which is fine
3683 * because only current can change ->real_blocked and all readers of
3684 * ->real_blocked don't care as long as ->real_blocked is always a subset
3685 * of ->blocked.
3686 */
3687 sigprocmask(SIG_SETMASK, &vcpu->sigset, &current->real_blocked);
3688}
3689
3690void kvm_sigset_deactivate(struct kvm_vcpu *vcpu)
3691{
3692 if (!vcpu->sigset_active)
3693 return;
3694
3695 sigprocmask(SIG_SETMASK, &current->real_blocked, NULL);
3696 sigemptyset(&current->real_blocked);
3697}
3698
aca6ff29
WL
3699static void grow_halt_poll_ns(struct kvm_vcpu *vcpu)
3700{
dee339b5 3701 unsigned int old, val, grow, grow_start;
aca6ff29 3702
2cbd7824 3703 old = val = vcpu->halt_poll_ns;
dee339b5 3704 grow_start = READ_ONCE(halt_poll_ns_grow_start);
6b6de68c 3705 grow = READ_ONCE(halt_poll_ns_grow);
7fa08e71
NW
3706 if (!grow)
3707 goto out;
3708
dee339b5
NW
3709 val *= grow;
3710 if (val < grow_start)
3711 val = grow_start;
aca6ff29
WL
3712
3713 vcpu->halt_poll_ns = val;
7fa08e71 3714out:
2cbd7824 3715 trace_kvm_halt_poll_ns_grow(vcpu->vcpu_id, val, old);
aca6ff29
WL
3716}
3717
3718static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu)
3719{
ae232ea4 3720 unsigned int old, val, shrink, grow_start;
aca6ff29 3721
2cbd7824 3722 old = val = vcpu->halt_poll_ns;
6b6de68c 3723 shrink = READ_ONCE(halt_poll_ns_shrink);
ae232ea4 3724 grow_start = READ_ONCE(halt_poll_ns_grow_start);
6b6de68c 3725 if (shrink == 0)
aca6ff29
WL
3726 val = 0;
3727 else
6b6de68c 3728 val /= shrink;
aca6ff29 3729
ae232ea4
SS
3730 if (val < grow_start)
3731 val = 0;
3732
aca6ff29 3733 vcpu->halt_poll_ns = val;
2cbd7824 3734 trace_kvm_halt_poll_ns_shrink(vcpu->vcpu_id, val, old);
aca6ff29
WL
3735}
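/*
 * Worked example for the two helpers above, assuming halt_poll_ns_grow == 2,
 * halt_poll_ns_grow_start == 10000 and halt_poll_ns_shrink == 0: growing takes
 * vcpu->halt_poll_ns from 0 to 10000 ns (clamped up to grow_start), then to
 * 20000, 40000, ... on subsequent grows, while shrinking resets it straight to
 * 0 because a shrink divisor of 0 means "reset" rather than "divide".
 */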
3736
f7819512
PB
3737static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu)
3738{
50c28f21
JS
3739 int ret = -EINTR;
3740 int idx = srcu_read_lock(&vcpu->kvm->srcu);
3741
c59fb127 3742 if (kvm_arch_vcpu_runnable(vcpu))
50c28f21 3743 goto out;
f7819512 3744 if (kvm_cpu_has_pending_timer(vcpu))
50c28f21 3745 goto out;
f7819512 3746 if (signal_pending(current))
50c28f21 3747 goto out;
084071d5
MT
3748 if (kvm_check_request(KVM_REQ_UNBLOCK, vcpu))
3749 goto out;
f7819512 3750
50c28f21
JS
3751 ret = 0;
3752out:
3753 srcu_read_unlock(&vcpu->kvm->srcu, idx);
3754 return ret;
f7819512
PB
3755}
3756
fac42688
SC
3757/*
3758 * Block the vCPU until the vCPU is runnable, an event arrives, or a signal is
3759 * pending. This is mostly used when halting a vCPU, but may also be used
3760 * directly for other vCPU non-runnable states, e.g. x86's Wait-For-SIPI.
3761 */
3762bool kvm_vcpu_block(struct kvm_vcpu *vcpu)
cb953129 3763{
fac42688
SC
3764 struct rcuwait *wait = kvm_arch_vcpu_get_wait(vcpu);
3765 bool waited = false;
3766
c3858335
JZ
3767 vcpu->stat.generic.blocking = 1;
3768
18869f26 3769 preempt_disable();
fac42688 3770 kvm_arch_vcpu_blocking(vcpu);
fac42688 3771 prepare_to_rcuwait(wait);
18869f26
ML
3772 preempt_enable();
3773
fac42688
SC
3774 for (;;) {
3775 set_current_state(TASK_INTERRUPTIBLE);
3776
3777 if (kvm_vcpu_check_block(vcpu) < 0)
3778 break;
3779
3780 waited = true;
3781 schedule();
3782 }
fac42688 3783
18869f26
ML
3784 preempt_disable();
3785 finish_rcuwait(wait);
fac42688 3786 kvm_arch_vcpu_unblocking(vcpu);
18869f26 3787 preempt_enable();
fac42688 3788
c3858335
JZ
3789 vcpu->stat.generic.blocking = 0;
3790
fac42688
SC
3791 return waited;
3792}
3793
29e72893
SC
3794static inline void update_halt_poll_stats(struct kvm_vcpu *vcpu, ktime_t start,
3795 ktime_t end, bool success)
cb953129 3796{
30c94347 3797 struct kvm_vcpu_stat_generic *stats = &vcpu->stat.generic;
29e72893
SC
3798 u64 poll_ns = ktime_to_ns(ktime_sub(end, start));
3799
30c94347
SC
3800 ++vcpu->stat.generic.halt_attempted_poll;
3801
3802 if (success) {
3803 ++vcpu->stat.generic.halt_successful_poll;
3804
3805 if (!vcpu_valid_wakeup(vcpu))
3806 ++vcpu->stat.generic.halt_poll_invalid;
3807
3808 stats->halt_poll_success_ns += poll_ns;
3809 KVM_STATS_LOG_HIST_UPDATE(stats->halt_poll_success_hist, poll_ns);
3810 } else {
3811 stats->halt_poll_fail_ns += poll_ns;
3812 KVM_STATS_LOG_HIST_UPDATE(stats->halt_poll_fail_hist, poll_ns);
3813 }
cb953129
DM
3814}
3815
175d5dc7
DM
3816static unsigned int kvm_vcpu_max_halt_poll_ns(struct kvm_vcpu *vcpu)
3817{
9eb8ca04
DM
3818 struct kvm *kvm = vcpu->kvm;
3819
3820 if (kvm->override_halt_poll_ns) {
3821 /*
3822 * Ensure kvm->max_halt_poll_ns is not read before
3823 * kvm->override_halt_poll_ns.
3824 *
3825 * Pairs with the smp_wmb() when enabling KVM_CAP_HALT_POLL.
3826 */
3827 smp_rmb();
3828 return READ_ONCE(kvm->max_halt_poll_ns);
3829 }
3830
3831 return READ_ONCE(halt_poll_ns);
175d5dc7
DM
3832}
3833
b6958ce4 3834/*
fac42688
SC
3835 * Emulate a vCPU halt condition, e.g. HLT on x86, WFI on arm, etc... If halt
3836 * polling is enabled, busy wait for a short time before blocking to avoid the
3837 * expensive block+unblock sequence if a wake event arrives soon after the vCPU
3838 * is halted.
b6958ce4 3839 */
91b99ea7 3840void kvm_vcpu_halt(struct kvm_vcpu *vcpu)
d3bef15f 3841{
175d5dc7 3842 unsigned int max_halt_poll_ns = kvm_vcpu_max_halt_poll_ns(vcpu);
6f390916 3843 bool halt_poll_allowed = !kvm_arch_no_poll(vcpu);
cb953129 3844 ktime_t start, cur, poll_end;
f7819512 3845 bool waited = false;
97b6847a 3846 bool do_halt_poll;
91b99ea7 3847 u64 halt_ns;
07ab0f8d 3848
175d5dc7
DM
3849 if (vcpu->halt_poll_ns > max_halt_poll_ns)
3850 vcpu->halt_poll_ns = max_halt_poll_ns;
97b6847a
DM
3851
3852 do_halt_poll = halt_poll_allowed && vcpu->halt_poll_ns;
3853
cb953129 3854 start = cur = poll_end = ktime_get();
8df6a61c 3855 if (do_halt_poll) {
109a9826 3856 ktime_t stop = ktime_add_ns(start, vcpu->halt_poll_ns);
f95ef0cd 3857
f7819512 3858 do {
30c94347 3859 if (kvm_vcpu_check_block(vcpu) < 0)
f7819512 3860 goto out;
74775654 3861 cpu_relax();
cb953129 3862 poll_end = cur = ktime_get();
6bd5b743 3863 } while (kvm_vcpu_can_poll(cur, stop));
f7819512 3864 }
e5c239cf 3865
fac42688 3866 waited = kvm_vcpu_block(vcpu);
8ccba534 3867
f7819512 3868 cur = ktime_get();
87bcc5fa
JZ
3869 if (waited) {
3870 vcpu->stat.generic.halt_wait_ns +=
3871 ktime_to_ns(cur) - ktime_to_ns(poll_end);
8ccba534
JZ
3872 KVM_STATS_LOG_HIST_UPDATE(vcpu->stat.generic.halt_wait_hist,
3873 ktime_to_ns(cur) - ktime_to_ns(poll_end));
87bcc5fa 3874 }
f7819512 3875out:
91b99ea7
SC
3876 /* The total time the vCPU was "halted", including polling time. */
3877 halt_ns = ktime_to_ns(cur) - ktime_to_ns(start);
aca6ff29 3878
29e72893
SC
3879 /*
3880 * Note, halt-polling is considered successful so long as the vCPU was
3881 * never actually scheduled out, i.e. even if the wake event arrived
3882 * after the halt-polling loop itself, but before the full wait.
3883 */
8df6a61c 3884 if (do_halt_poll)
29e72893 3885 update_halt_poll_stats(vcpu, start, poll_end, !waited);
cb953129 3886
6f390916 3887 if (halt_poll_allowed) {
175d5dc7
DM
3888 /* Recompute the max halt poll time in case it changed. */
3889 max_halt_poll_ns = kvm_vcpu_max_halt_poll_ns(vcpu);
3890
44551b2f 3891 if (!vcpu_valid_wakeup(vcpu)) {
aca6ff29 3892 shrink_halt_poll_ns(vcpu);
175d5dc7 3893 } else if (max_halt_poll_ns) {
91b99ea7 3894 if (halt_ns <= vcpu->halt_poll_ns)
44551b2f
WL
3895 ;
3896 /* we had a long block, shrink polling */
acd05785 3897 else if (vcpu->halt_poll_ns &&
175d5dc7 3898 halt_ns > max_halt_poll_ns)
44551b2f
WL
3899 shrink_halt_poll_ns(vcpu);
3900 /* we had a short halt and our poll time is too small */
175d5dc7
DM
3901 else if (vcpu->halt_poll_ns < max_halt_poll_ns &&
3902 halt_ns < max_halt_poll_ns)
44551b2f
WL
3903 grow_halt_poll_ns(vcpu);
3904 } else {
3905 vcpu->halt_poll_ns = 0;
3906 }
3907 }
aca6ff29 3908
91b99ea7 3909 trace_kvm_vcpu_wakeup(halt_ns, waited, vcpu_valid_wakeup(vcpu));
b6958ce4 3910}
91b99ea7 3911EXPORT_SYMBOL_GPL(kvm_vcpu_halt);
b6958ce4 3912
178f02ff 3913bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu)
b6d33834 3914{
d92a5d1c 3915 if (__kvm_vcpu_wake_up(vcpu)) {
d73eb57b 3916 WRITE_ONCE(vcpu->ready, true);
0193cc90 3917 ++vcpu->stat.generic.halt_wakeup;
178f02ff 3918 return true;
b6d33834
CD
3919 }
3920
178f02ff 3921 return false;
dd1a4cc1
RK
3922}
3923EXPORT_SYMBOL_GPL(kvm_vcpu_wake_up);
3924
0266c894 3925#ifndef CONFIG_S390
dd1a4cc1
RK
3926/*
3927 * Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode.
3928 */
3929void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
3930{
85b64045 3931 int me, cpu;
dd1a4cc1 3932
178f02ff
RK
3933 if (kvm_vcpu_wake_up(vcpu))
3934 return;
3935
aefdc2ed
PB
3936 me = get_cpu();
3937 /*
3938 * The only state change done outside the vcpu mutex is IN_GUEST_MODE
3939 * to EXITING_GUEST_MODE. Therefore the moderately expensive "should
3940 * kick" check does not need atomic operations if kvm_vcpu_kick is used
3941 * within the vCPU thread itself.
3942 */
3943 if (vcpu == __this_cpu_read(kvm_running_vcpu)) {
3944 if (vcpu->mode == IN_GUEST_MODE)
3945 WRITE_ONCE(vcpu->mode, EXITING_GUEST_MODE);
3946 goto out;
3947 }
3948
85b64045
SC
3949 /*
3950 * Note, the vCPU could get migrated to a different pCPU at any point
3951 * after kvm_arch_vcpu_should_kick(), which could result in sending an
3952 * IPI to the previous pCPU. But, that's ok because the purpose of the
3953 * IPI is to force the vCPU to leave IN_GUEST_MODE, and migrating the
3954 * vCPU also requires it to leave IN_GUEST_MODE.
3955 */
85b64045
SC
3956 if (kvm_arch_vcpu_should_kick(vcpu)) {
3957 cpu = READ_ONCE(vcpu->cpu);
3958 if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
b6d33834 3959 smp_send_reschedule(cpu);
85b64045 3960 }
aefdc2ed 3961out:
b6d33834
CD
3962 put_cpu();
3963}
a20ed54d 3964EXPORT_SYMBOL_GPL(kvm_vcpu_kick);
0266c894 3965#endif /* !CONFIG_S390 */
b6d33834 3966
fa93384f 3967int kvm_vcpu_yield_to(struct kvm_vcpu *target)
41628d33
KW
3968{
3969 struct pid *pid;
3970 struct task_struct *task = NULL;
fa93384f 3971 int ret = 0;
41628d33
KW
3972
3973 rcu_read_lock();
3974 pid = rcu_dereference(target->pid);
3975 if (pid)
27fbe64b 3976 task = get_pid_task(pid, PIDTYPE_PID);
41628d33
KW
3977 rcu_read_unlock();
3978 if (!task)
c45c528e 3979 return ret;
c45c528e 3980 ret = yield_to(task, 1);
41628d33 3981 put_task_struct(task);
c45c528e
R
3982
3983 return ret;
41628d33
KW
3984}
3985EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to);
3986
06e48c51
R
3987/*
3988 * Helper that checks whether a VCPU is eligible for directed yield.
3989 * The most eligible candidate to yield to is decided by the following heuristics:
3990 *
3991 * (a) VCPU which has not taken a pl-exit or cpu-relax intercept recently
3992 * (a likely preempted lock holder), indicated by @in_spin_loop.
656012c7 3993 * Set at the beginning and cleared at the end of interception/PLE handler.
06e48c51
R
3994 *
3995 * (b) VCPU which has taken a pl-exit/cpu-relax intercept but did not get a
3996 * chance last time (it has most likely become eligible now since we probably
3997 * yielded to the lock holder in the last iteration. This is done by toggling
3998 * @dy_eligible each time a VCPU is checked for eligibility.)
3999 *
4000 * Yielding to a recently pl-exited/cpu relax intercepted VCPU before yielding
4001 * to preempted lock-holder could result in wrong VCPU selection and CPU
4002 * burning. Giving priority for a potential lock-holder increases lock
4003 * progress.
4004 *
4005 * Since the algorithm is based on heuristics, accessing another VCPU's data
4006 * without locking does no harm. It may result in trying to yield to the same
4007 * VCPU, failing, and continuing with the next VCPU, and so on.
4008 */
7940876e 4009static bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
06e48c51 4010{
4a55dd72 4011#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
06e48c51
R
4012 bool eligible;
4013
4014 eligible = !vcpu->spin_loop.in_spin_loop ||
34656113 4015 vcpu->spin_loop.dy_eligible;
06e48c51
R
4016
4017 if (vcpu->spin_loop.in_spin_loop)
4018 kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible);
4019
4020 return eligible;
4a55dd72
SW
4021#else
4022 return true;
06e48c51 4023#endif
4a55dd72 4024}
c45c528e 4025
17e433b5
WL
4026/*
4027 * Unlike kvm_arch_vcpu_runnable, this function is called outside
4028 * a vcpu_load/vcpu_put pair. However, for most architectures
4029 * kvm_arch_vcpu_runnable does not require vcpu_load.
4030 */
4031bool __weak kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
4032{
4033 return kvm_arch_vcpu_runnable(vcpu);
4034}
4035
4036static bool vcpu_dy_runnable(struct kvm_vcpu *vcpu)
4037{
4038 if (kvm_arch_dy_runnable(vcpu))
4039 return true;
4040
4041#ifdef CONFIG_KVM_ASYNC_PF
4042 if (!list_empty_careful(&vcpu->async_pf.done))
4043 return true;
4044#endif
4045
4046 return false;
4047}
4048
77bcd9e6
SC
4049/*
4050 * By default, simply query the target vCPU's current mode when checking if a
4051 * vCPU was preempted in kernel mode. All architectures except x86 (or more
4052 * specifical, except VMX) allow querying whether or not a vCPU is in kernel
4053 * mode even if the vCPU is NOT loaded, i.e. using kvm_arch_vcpu_in_kernel()
4054 * directly for cross-vCPU checks is functionally correct and accurate.
4055 */
4056bool __weak kvm_arch_vcpu_preempted_in_kernel(struct kvm_vcpu *vcpu)
4057{
4058 return kvm_arch_vcpu_in_kernel(vcpu);
4059}
4060
52acd22f
WL
4061bool __weak kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu)
4062{
4063 return false;
4064}
4065
199b5763 4066void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)
d255f4f2 4067{
217ece61
RR
4068 struct kvm *kvm = me->kvm;
4069 struct kvm_vcpu *vcpu;
4070 int last_boosted_vcpu = me->kvm->last_boosted_vcpu;
46808a4c 4071 unsigned long i;
217ece61 4072 int yielded = 0;
c45c528e 4073 int try = 3;
217ece61 4074 int pass;
d255f4f2 4075
4c088493 4076 kvm_vcpu_set_in_spin_loop(me, true);
217ece61
RR
4077 /*
4078 * We boost the priority of a VCPU that is runnable but not
4079 * currently running, because it got preempted by something
4080 * else and called schedule in __vcpu_run. Hopefully that
4081 * VCPU is holding the lock that we need and will release it.
4082 * We approximate round-robin by starting at the last boosted VCPU.
4083 */
c45c528e 4084 for (pass = 0; pass < 2 && !yielded && try; pass++) {
217ece61 4085 kvm_for_each_vcpu(i, vcpu, kvm) {
5cfc2aab 4086 if (!pass && i <= last_boosted_vcpu) {
217ece61
RR
4087 i = last_boosted_vcpu;
4088 continue;
4089 } else if (pass && i > last_boosted_vcpu)
4090 break;
d73eb57b 4091 if (!READ_ONCE(vcpu->ready))
7bc7ae25 4092 continue;
217ece61
RR
4093 if (vcpu == me)
4094 continue;
d92a5d1c 4095 if (kvm_vcpu_is_blocking(vcpu) && !vcpu_dy_runnable(vcpu))
217ece61 4096 continue;
dafc17dd
SC
4097
4098 /*
4099 * Treat the target vCPU as being in-kernel if it has a
4100 * pending interrupt, as the vCPU trying to yield may
4101 * be spinning waiting on IPI delivery, i.e. the target
4102 * vCPU is in-kernel for the purposes of directed yield.
4103 */
046ddeed 4104 if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode &&
52acd22f 4105 !kvm_arch_dy_has_pending_interrupt(vcpu) &&
77bcd9e6 4106 !kvm_arch_vcpu_preempted_in_kernel(vcpu))
199b5763 4107 continue;
06e48c51
R
4108 if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
4109 continue;
c45c528e
R
4110
4111 yielded = kvm_vcpu_yield_to(vcpu);
4112 if (yielded > 0) {
217ece61 4113 kvm->last_boosted_vcpu = i;
217ece61 4114 break;
c45c528e
R
4115 } else if (yielded < 0) {
4116 try--;
4117 if (!try)
4118 break;
217ece61 4119 }
217ece61
RR
4120 }
4121 }
4c088493 4122 kvm_vcpu_set_in_spin_loop(me, false);
06e48c51
R
4123
4124 /* Ensure vcpu is not eligible during next spinloop */
4125 kvm_vcpu_set_dy_eligible(me, false);
d255f4f2
ZE
4126}
4127EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);
4128
fb04a1ed
PX
4129static bool kvm_page_in_dirty_ring(struct kvm *kvm, unsigned long pgoff)
4130{
dc70ec21 4131#ifdef CONFIG_HAVE_KVM_DIRTY_RING
fb04a1ed
PX
4132 return (pgoff >= KVM_DIRTY_LOG_PAGE_OFFSET) &&
4133 (pgoff < KVM_DIRTY_LOG_PAGE_OFFSET +
4134 kvm->dirty_ring_size / PAGE_SIZE);
4135#else
4136 return false;
4137#endif
4138}
4139
1499fa80 4140static vm_fault_t kvm_vcpu_fault(struct vm_fault *vmf)
9a2bb7f4 4141{
11bac800 4142 struct kvm_vcpu *vcpu = vmf->vma->vm_file->private_data;
9a2bb7f4
AK
4143 struct page *page;
4144
e4a533a4 4145 if (vmf->pgoff == 0)
039576c0 4146 page = virt_to_page(vcpu->run);
09566765 4147#ifdef CONFIG_X86
e4a533a4 4148 else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET)
ad312c7c 4149 page = virt_to_page(vcpu->arch.pio_data);
5f94c174 4150#endif
4b4357e0 4151#ifdef CONFIG_KVM_MMIO
5f94c174
LV
4152 else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET)
4153 page = virt_to_page(vcpu->kvm->coalesced_mmio_ring);
09566765 4154#endif
fb04a1ed
PX
4155 else if (kvm_page_in_dirty_ring(vcpu->kvm, vmf->pgoff))
4156 page = kvm_dirty_ring_get_page(
4157 &vcpu->dirty_ring,
4158 vmf->pgoff - KVM_DIRTY_LOG_PAGE_OFFSET);
039576c0 4159 else
5b1c1493 4160 return kvm_arch_vcpu_fault(vcpu, vmf);
9a2bb7f4 4161 get_page(page);
e4a533a4 4162 vmf->page = page;
4163 return 0;
9a2bb7f4
AK
4164}
4165
f0f37e2f 4166static const struct vm_operations_struct kvm_vcpu_vm_ops = {
e4a533a4 4167 .fault = kvm_vcpu_fault,
9a2bb7f4
AK
4168};
4169
4170static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
4171{
fb04a1ed 4172 struct kvm_vcpu *vcpu = file->private_data;
11476d27 4173 unsigned long pages = vma_pages(vma);
fb04a1ed
PX
4174
4175 if ((kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff) ||
4176 kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff + pages - 1)) &&
4177 ((vma->vm_flags & VM_EXEC) || !(vma->vm_flags & VM_SHARED)))
4178 return -EINVAL;
4179
9a2bb7f4
AK
4180 vma->vm_ops = &kvm_vcpu_vm_ops;
4181 return 0;
4182}
4183
bccf2150
AK
4184static int kvm_vcpu_release(struct inode *inode, struct file *filp)
4185{
4186 struct kvm_vcpu *vcpu = filp->private_data;
4187
66c0b394 4188 kvm_put_kvm(vcpu->kvm);
bccf2150
AK
4189 return 0;
4190}
4191
087e1520 4192static struct file_operations kvm_vcpu_fops = {
bccf2150
AK
4193 .release = kvm_vcpu_release,
4194 .unlocked_ioctl = kvm_vcpu_ioctl,
9a2bb7f4 4195 .mmap = kvm_vcpu_mmap,
6038f373 4196 .llseek = noop_llseek,
7ddfd3e0 4197 KVM_COMPAT(kvm_vcpu_compat_ioctl),
bccf2150
AK
4198};
4199
4200/*
4201 * Allocates an inode for the vcpu.
4202 */
4203static int create_vcpu_fd(struct kvm_vcpu *vcpu)
4204{
e46b4692
MY
4205 char name[8 + 1 + ITOA_MAX_LEN + 1];
4206
4207 snprintf(name, sizeof(name), "kvm-vcpu:%d", vcpu->vcpu_id);
4208 return anon_inode_getfd(name, &kvm_vcpu_fops, vcpu, O_RDWR | O_CLOEXEC);
bccf2150
AK
4209}
4210
e36de87d
VP
4211#ifdef __KVM_HAVE_ARCH_VCPU_DEBUGFS
4212static int vcpu_get_pid(void *data, u64 *val)
4213{
14aa40a1 4214 struct kvm_vcpu *vcpu = data;
76021e96
SC
4215
4216 rcu_read_lock();
4217 *val = pid_nr(rcu_dereference(vcpu->pid));
4218 rcu_read_unlock();
e36de87d
VP
4219 return 0;
4220}
4221
4222DEFINE_SIMPLE_ATTRIBUTE(vcpu_get_pid_fops, vcpu_get_pid, NULL, "%llu\n");
4223
3e7093d0 4224static void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
45b5939e 4225{
d56f5136 4226 struct dentry *debugfs_dentry;
45b5939e 4227 char dir_name[ITOA_MAX_LEN * 2];
45b5939e 4228
45b5939e 4229 if (!debugfs_initialized())
3e7093d0 4230 return;
45b5939e
LC
4231
4232 snprintf(dir_name, sizeof(dir_name), "vcpu%d", vcpu->vcpu_id);
d56f5136
PB
4233 debugfs_dentry = debugfs_create_dir(dir_name,
4234 vcpu->kvm->debugfs_dentry);
e36de87d
VP
4235 debugfs_create_file("pid", 0444, debugfs_dentry, vcpu,
4236 &vcpu_get_pid_fops);
45b5939e 4237
d56f5136 4238 kvm_arch_create_vcpu_debugfs(vcpu, debugfs_dentry);
45b5939e 4239}
e36de87d 4240#endif
45b5939e 4241
c5ea7660
AK
4242/*
4243 * Creates some virtual cpus. Good luck creating more than one.
4244 */
73880c80 4245static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
c5ea7660
AK
4246{
4247 int r;
e09fefde 4248 struct kvm_vcpu *vcpu;
8bd826d6 4249 struct page *page;
c5ea7660 4250
a1c42dde 4251 if (id >= KVM_MAX_VCPU_IDS)
338c7dba
AH
4252 return -EINVAL;
4253
6c7caebc 4254 mutex_lock(&kvm->lock);
f502cc56 4255 if (kvm->created_vcpus >= kvm->max_vcpus) {
6c7caebc
PB
4256 mutex_unlock(&kvm->lock);
4257 return -EINVAL;
4258 }
4259
1d5e740d
ZG
4260 r = kvm_arch_vcpu_precreate(kvm, id);
4261 if (r) {
4262 mutex_unlock(&kvm->lock);
4263 return r;
4264 }
4265
6c7caebc
PB
4266 kvm->created_vcpus++;
4267 mutex_unlock(&kvm->lock);
4268
85f47930 4269 vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT);
e529ef66
SC
4270 if (!vcpu) {
4271 r = -ENOMEM;
6c7caebc
PB
4272 goto vcpu_decrement;
4273 }
c5ea7660 4274
fcd97ad5 4275 BUILD_BUG_ON(sizeof(struct kvm_run) > PAGE_SIZE);
93bb59ca 4276 page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
8bd826d6
SC
4277 if (!page) {
4278 r = -ENOMEM;
e529ef66 4279 goto vcpu_free;
8bd826d6
SC
4280 }
4281 vcpu->run = page_address(page);
4282
4283 kvm_vcpu_init(vcpu, kvm, id);
e529ef66
SC
4284
4285 r = kvm_arch_vcpu_create(vcpu);
4286 if (r)
8bd826d6 4287 goto vcpu_free_run_page;
e529ef66 4288
fb04a1ed
PX
4289 if (kvm->dirty_ring_size) {
4290 r = kvm_dirty_ring_alloc(&vcpu->dirty_ring,
4291 id, kvm->dirty_ring_size);
4292 if (r)
4293 goto arch_vcpu_destroy;
4294 }
4295
11ec2804 4296 mutex_lock(&kvm->lock);
42a90008
DW
4297
4298#ifdef CONFIG_LOCKDEP
4299 /* Ensure that lockdep knows vcpu->mutex is taken *inside* kvm->lock */
4300 mutex_lock(&vcpu->mutex);
4301 mutex_unlock(&vcpu->mutex);
4302#endif
4303
e09fefde
DH
4304 if (kvm_get_vcpu_by_id(kvm, id)) {
4305 r = -EEXIST;
4306 goto unlock_vcpu_destroy;
4307 }
73880c80 4308
8750e72a 4309 vcpu->vcpu_idx = atomic_read(&kvm->online_vcpus);
afb2acb2 4310 r = xa_reserve(&kvm->vcpu_array, vcpu->vcpu_idx, GFP_KERNEL_ACCOUNT);
c5b07754
MZ
4311 if (r)
4312 goto unlock_vcpu_destroy;
c5ea7660 4313
fb3f0f51 4314 /* Now it's all set up, let userspace reach it */
66c0b394 4315 kvm_get_kvm(kvm);
bccf2150 4316 r = create_vcpu_fd(vcpu);
afb2acb2
ML
4317 if (r < 0)
4318 goto kvm_put_xa_release;
4319
5f643e46 4320 if (KVM_BUG_ON(xa_store(&kvm->vcpu_array, vcpu->vcpu_idx, vcpu, 0), kvm)) {
afb2acb2
ML
4321 r = -EINVAL;
4322 goto kvm_put_xa_release;
73880c80
GN
4323 }
4324
dd489240 4325 /*
c5b07754
MZ
4326 * Pairs with smp_rmb() in kvm_get_vcpu. Store the vcpu
4327 * pointer before the incremented value of kvm->online_vcpus.
dd489240 4328 */
73880c80
GN
4329 smp_wmb();
4330 atomic_inc(&kvm->online_vcpus);
4331
73880c80 4332 mutex_unlock(&kvm->lock);
42897d86 4333 kvm_arch_vcpu_postcreate(vcpu);
63d04348 4334 kvm_create_vcpu_debugfs(vcpu);
fb3f0f51 4335 return r;
39c3b86e 4336
afb2acb2
ML
4337kvm_put_xa_release:
4338 kvm_put_kvm_no_destroy(kvm);
4339 xa_release(&kvm->vcpu_array, vcpu->vcpu_idx);
d780592b 4340unlock_vcpu_destroy:
7d8fece6 4341 mutex_unlock(&kvm->lock);
fb04a1ed
PX
4342 kvm_dirty_ring_free(&vcpu->dirty_ring);
4343arch_vcpu_destroy:
d40ccc62 4344 kvm_arch_vcpu_destroy(vcpu);
8bd826d6
SC
4345vcpu_free_run_page:
4346 free_page((unsigned long)vcpu->run);
e529ef66
SC
4347vcpu_free:
4348 kmem_cache_free(kvm_vcpu_cache, vcpu);
6c7caebc
PB
4349vcpu_decrement:
4350 mutex_lock(&kvm->lock);
4351 kvm->created_vcpus--;
4352 mutex_unlock(&kvm->lock);
c5ea7660
AK
4353 return r;
4354}
4355
1961d276
AK
4356static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
4357{
4358 if (sigset) {
4359 sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
4360 vcpu->sigset_active = 1;
4361 vcpu->sigset = *sigset;
4362 } else
4363 vcpu->sigset_active = 0;
4364 return 0;
4365}
4366
ce55c049
JZ
4367static ssize_t kvm_vcpu_stats_read(struct file *file, char __user *user_buffer,
4368 size_t size, loff_t *offset)
4369{
4370 struct kvm_vcpu *vcpu = file->private_data;
4371
4372 return kvm_stats_read(vcpu->stats_id, &kvm_vcpu_stats_header,
4373 &kvm_vcpu_stats_desc[0], &vcpu->stat,
4374 sizeof(vcpu->stat), user_buffer, size, offset);
4375}
4376
eed3013f
SC
4377static int kvm_vcpu_stats_release(struct inode *inode, struct file *file)
4378{
4379 struct kvm_vcpu *vcpu = file->private_data;
4380
4381 kvm_put_kvm(vcpu->kvm);
4382 return 0;
4383}
4384
ce55c049 4385static const struct file_operations kvm_vcpu_stats_fops = {
087e1520 4386 .owner = THIS_MODULE,
ce55c049 4387 .read = kvm_vcpu_stats_read,
eed3013f 4388 .release = kvm_vcpu_stats_release,
ce55c049
JZ
4389 .llseek = noop_llseek,
4390};
4391
4392static int kvm_vcpu_ioctl_get_stats_fd(struct kvm_vcpu *vcpu)
4393{
4394 int fd;
4395 struct file *file;
4396 char name[15 + ITOA_MAX_LEN + 1];
4397
4398 snprintf(name, sizeof(name), "kvm-vcpu-stats:%d", vcpu->vcpu_id);
4399
4400 fd = get_unused_fd_flags(O_CLOEXEC);
4401 if (fd < 0)
4402 return fd;
4403
4404 file = anon_inode_getfile(name, &kvm_vcpu_stats_fops, vcpu, O_RDONLY);
4405 if (IS_ERR(file)) {
4406 put_unused_fd(fd);
4407 return PTR_ERR(file);
4408 }
eed3013f
SC
4409
4410 kvm_get_kvm(vcpu->kvm);
4411
ce55c049
JZ
4412 file->f_mode |= FMODE_PREAD;
4413 fd_install(fd, file);
4414
4415 return fd;
4416}
4417
bccf2150
AK
4418static long kvm_vcpu_ioctl(struct file *filp,
4419 unsigned int ioctl, unsigned long arg)
6aa8b732 4420{
bccf2150 4421 struct kvm_vcpu *vcpu = filp->private_data;
2f366987 4422 void __user *argp = (void __user *)arg;
313a3dc7 4423 int r;
fa3795a7
DH
4424 struct kvm_fpu *fpu = NULL;
4425 struct kvm_sregs *kvm_sregs = NULL;
6aa8b732 4426
f4d31653 4427 if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_dead)
6d4e4c4f 4428 return -EIO;
2122ff5e 4429
2ea75be3
DM
4430 if (unlikely(_IOC_TYPE(ioctl) != KVMIO))
4431 return -EINVAL;
4432
2122ff5e 4433 /*
5cb0944c
PB
4434 * Some architectures have vcpu ioctls that are asynchronous to vcpu
4435 * execution; mutex_lock() would break them.
2122ff5e 4436 */
5cb0944c
PB
4437 r = kvm_arch_vcpu_async_ioctl(filp, ioctl, arg);
4438 if (r != -ENOIOCTLCMD)
9fc77441 4439 return r;
2122ff5e 4440
ec7660cc
CD
4441 if (mutex_lock_killable(&vcpu->mutex))
4442 return -EINTR;
6aa8b732 4443 switch (ioctl) {
0e4524a5
CB
4444 case KVM_RUN: {
4445 struct pid *oldpid;
f0fe5108
AK
4446 r = -EINVAL;
4447 if (arg)
4448 goto out;
0e4524a5 4449 oldpid = rcu_access_pointer(vcpu->pid);
71dbc8a9 4450 if (unlikely(oldpid != task_pid(current))) {
7a72f7a1 4451 /* The thread running this VCPU changed. */
bd2a6394 4452 struct pid *newpid;
f95ef0cd 4453
bd2a6394
CD
4454 r = kvm_arch_vcpu_run_pid_change(vcpu);
4455 if (r)
4456 break;
4457
4458 newpid = get_task_pid(current, PIDTYPE_PID);
7a72f7a1
CB
4459 rcu_assign_pointer(vcpu->pid, newpid);
4460 if (oldpid)
4461 synchronize_rcu();
4462 put_pid(oldpid);
4463 }
1b94f6f8 4464 r = kvm_arch_vcpu_ioctl_run(vcpu);
64be5007 4465 trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
6aa8b732 4466 break;
0e4524a5 4467 }
6aa8b732 4468 case KVM_GET_REGS: {
3e4bb3ac 4469 struct kvm_regs *kvm_regs;
6aa8b732 4470
3e4bb3ac 4471 r = -ENOMEM;
b12ce36a 4472 kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL_ACCOUNT);
3e4bb3ac 4473 if (!kvm_regs)
6aa8b732 4474 goto out;
3e4bb3ac
XZ
4475 r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs);
4476 if (r)
4477 goto out_free1;
6aa8b732 4478 r = -EFAULT;
3e4bb3ac
XZ
4479 if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs)))
4480 goto out_free1;
6aa8b732 4481 r = 0;
3e4bb3ac
XZ
4482out_free1:
4483 kfree(kvm_regs);
6aa8b732
AK
4484 break;
4485 }
4486 case KVM_SET_REGS: {
3e4bb3ac 4487 struct kvm_regs *kvm_regs;
6aa8b732 4488
ff5c2c03
SL
4489 kvm_regs = memdup_user(argp, sizeof(*kvm_regs));
4490 if (IS_ERR(kvm_regs)) {
4491 r = PTR_ERR(kvm_regs);
6aa8b732 4492 goto out;
ff5c2c03 4493 }
3e4bb3ac 4494 r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs);
3e4bb3ac 4495 kfree(kvm_regs);
6aa8b732
AK
4496 break;
4497 }
4498 case KVM_GET_SREGS: {
b12ce36a
BG
4499 kvm_sregs = kzalloc(sizeof(struct kvm_sregs),
4500 GFP_KERNEL_ACCOUNT);
fa3795a7
DH
4501 r = -ENOMEM;
4502 if (!kvm_sregs)
4503 goto out;
4504 r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs);
6aa8b732
AK
4505 if (r)
4506 goto out;
4507 r = -EFAULT;
fa3795a7 4508 if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs)))
6aa8b732
AK
4509 goto out;
4510 r = 0;
4511 break;
4512 }
4513 case KVM_SET_SREGS: {
ff5c2c03
SL
4514 kvm_sregs = memdup_user(argp, sizeof(*kvm_sregs));
4515 if (IS_ERR(kvm_sregs)) {
4516 r = PTR_ERR(kvm_sregs);
18595411 4517 kvm_sregs = NULL;
6aa8b732 4518 goto out;
ff5c2c03 4519 }
fa3795a7 4520 r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs);
6aa8b732
AK
4521 break;
4522 }
62d9f0db
MT
4523 case KVM_GET_MP_STATE: {
4524 struct kvm_mp_state mp_state;
4525
4526 r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state);
4527 if (r)
4528 goto out;
4529 r = -EFAULT;
893bdbf1 4530 if (copy_to_user(argp, &mp_state, sizeof(mp_state)))
62d9f0db
MT
4531 goto out;
4532 r = 0;
4533 break;
4534 }
4535 case KVM_SET_MP_STATE: {
4536 struct kvm_mp_state mp_state;
4537
4538 r = -EFAULT;
893bdbf1 4539 if (copy_from_user(&mp_state, argp, sizeof(mp_state)))
62d9f0db
MT
4540 goto out;
4541 r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state);
62d9f0db
MT
4542 break;
4543 }
6aa8b732
AK
4544 case KVM_TRANSLATE: {
4545 struct kvm_translation tr;
4546
4547 r = -EFAULT;
893bdbf1 4548 if (copy_from_user(&tr, argp, sizeof(tr)))
6aa8b732 4549 goto out;
8b006791 4550 r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
6aa8b732
AK
4551 if (r)
4552 goto out;
4553 r = -EFAULT;
893bdbf1 4554 if (copy_to_user(argp, &tr, sizeof(tr)))
6aa8b732
AK
4555 goto out;
4556 r = 0;
4557 break;
4558 }
d0bfb940
JK
4559 case KVM_SET_GUEST_DEBUG: {
4560 struct kvm_guest_debug dbg;
6aa8b732
AK
4561
4562 r = -EFAULT;
893bdbf1 4563 if (copy_from_user(&dbg, argp, sizeof(dbg)))
6aa8b732 4564 goto out;
d0bfb940 4565 r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg);
6aa8b732
AK
4566 break;
4567 }
1961d276
AK
4568 case KVM_SET_SIGNAL_MASK: {
4569 struct kvm_signal_mask __user *sigmask_arg = argp;
4570 struct kvm_signal_mask kvm_sigmask;
4571 sigset_t sigset, *p;
4572
4573 p = NULL;
4574 if (argp) {
4575 r = -EFAULT;
4576 if (copy_from_user(&kvm_sigmask, argp,
893bdbf1 4577 sizeof(kvm_sigmask)))
1961d276
AK
4578 goto out;
4579 r = -EINVAL;
893bdbf1 4580 if (kvm_sigmask.len != sizeof(sigset))
1961d276
AK
4581 goto out;
4582 r = -EFAULT;
4583 if (copy_from_user(&sigset, sigmask_arg->sigset,
893bdbf1 4584 sizeof(sigset)))
1961d276
AK
4585 goto out;
4586 p = &sigset;
4587 }
376d41ff 4588 r = kvm_vcpu_ioctl_set_sigmask(vcpu, p);
1961d276
AK
4589 break;
4590 }
b8836737 4591 case KVM_GET_FPU: {
b12ce36a 4592 fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL_ACCOUNT);
fa3795a7
DH
4593 r = -ENOMEM;
4594 if (!fpu)
4595 goto out;
4596 r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu);
b8836737
AK
4597 if (r)
4598 goto out;
4599 r = -EFAULT;
fa3795a7 4600 if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu)))
b8836737
AK
4601 goto out;
4602 r = 0;
4603 break;
4604 }
4605 case KVM_SET_FPU: {
ff5c2c03
SL
4606 fpu = memdup_user(argp, sizeof(*fpu));
4607 if (IS_ERR(fpu)) {
4608 r = PTR_ERR(fpu);
18595411 4609 fpu = NULL;
b8836737 4610 goto out;
ff5c2c03 4611 }
fa3795a7 4612 r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu);
b8836737
AK
4613 break;
4614 }
ce55c049
JZ
4615 case KVM_GET_STATS_FD: {
4616 r = kvm_vcpu_ioctl_get_stats_fd(vcpu);
4617 break;
4618 }
bccf2150 4619 default:
313a3dc7 4620 r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
bccf2150
AK
4621 }
4622out:
ec7660cc 4623 mutex_unlock(&vcpu->mutex);
fa3795a7
DH
4624 kfree(fpu);
4625 kfree(kvm_sregs);
bccf2150
AK
4626 return r;
4627}
4628
de8e5d74 4629#ifdef CONFIG_KVM_COMPAT
1dda606c
AG
4630static long kvm_vcpu_compat_ioctl(struct file *filp,
4631 unsigned int ioctl, unsigned long arg)
4632{
4633 struct kvm_vcpu *vcpu = filp->private_data;
4634 void __user *argp = compat_ptr(arg);
4635 int r;
4636
f4d31653 4637 if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_dead)
1dda606c
AG
4638 return -EIO;
4639
4640 switch (ioctl) {
4641 case KVM_SET_SIGNAL_MASK: {
4642 struct kvm_signal_mask __user *sigmask_arg = argp;
4643 struct kvm_signal_mask kvm_sigmask;
1dda606c
AG
4644 sigset_t sigset;
4645
4646 if (argp) {
4647 r = -EFAULT;
4648 if (copy_from_user(&kvm_sigmask, argp,
893bdbf1 4649 sizeof(kvm_sigmask)))
1dda606c
AG
4650 goto out;
4651 r = -EINVAL;
3968cf62 4652 if (kvm_sigmask.len != sizeof(compat_sigset_t))
1dda606c
AG
4653 goto out;
4654 r = -EFAULT;
1393b4aa
PB
4655 if (get_compat_sigset(&sigset,
4656 (compat_sigset_t __user *)sigmask_arg->sigset))
1dda606c 4657 goto out;
760a9a30
AC
4658 r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
4659 } else
4660 r = kvm_vcpu_ioctl_set_sigmask(vcpu, NULL);
1dda606c
AG
4661 break;
4662 }
4663 default:
4664 r = kvm_vcpu_ioctl(filp, ioctl, arg);
4665 }
4666
4667out:
4668 return r;
4669}
4670#endif
4671
a1cd3f08
CLG
4672static int kvm_device_mmap(struct file *filp, struct vm_area_struct *vma)
4673{
4674 struct kvm_device *dev = filp->private_data;
4675
4676 if (dev->ops->mmap)
4677 return dev->ops->mmap(dev, vma);
4678
4679 return -ENODEV;
4680}
4681
852b6d57
SW
4682static int kvm_device_ioctl_attr(struct kvm_device *dev,
4683 int (*accessor)(struct kvm_device *dev,
4684 struct kvm_device_attr *attr),
4685 unsigned long arg)
4686{
4687 struct kvm_device_attr attr;
4688
4689 if (!accessor)
4690 return -EPERM;
4691
4692 if (copy_from_user(&attr, (void __user *)arg, sizeof(attr)))
4693 return -EFAULT;
4694
4695 return accessor(dev, &attr);
4696}
4697
4698static long kvm_device_ioctl(struct file *filp, unsigned int ioctl,
4699 unsigned long arg)
4700{
4701 struct kvm_device *dev = filp->private_data;
4702
f4d31653 4703 if (dev->kvm->mm != current->mm || dev->kvm->vm_dead)
ddba9180
SC
4704 return -EIO;
4705
852b6d57
SW
4706 switch (ioctl) {
4707 case KVM_SET_DEVICE_ATTR:
4708 return kvm_device_ioctl_attr(dev, dev->ops->set_attr, arg);
4709 case KVM_GET_DEVICE_ATTR:
4710 return kvm_device_ioctl_attr(dev, dev->ops->get_attr, arg);
4711 case KVM_HAS_DEVICE_ATTR:
4712 return kvm_device_ioctl_attr(dev, dev->ops->has_attr, arg);
4713 default:
4714 if (dev->ops->ioctl)
4715 return dev->ops->ioctl(dev, ioctl, arg);
4716
4717 return -ENOTTY;
4718 }
4719}
4720
852b6d57
SW
4721static int kvm_device_release(struct inode *inode, struct file *filp)
4722{
4723 struct kvm_device *dev = filp->private_data;
4724 struct kvm *kvm = dev->kvm;
4725
2bde9b3e
CLG
4726 if (dev->ops->release) {
4727 mutex_lock(&kvm->lock);
4728 list_del(&dev->vm_node);
4729 dev->ops->release(dev);
4730 mutex_unlock(&kvm->lock);
4731 }
4732
852b6d57
SW
4733 kvm_put_kvm(kvm);
4734 return 0;
4735}
4736
087e1520 4737static struct file_operations kvm_device_fops = {
852b6d57
SW
4738 .unlocked_ioctl = kvm_device_ioctl,
4739 .release = kvm_device_release,
7ddfd3e0 4740 KVM_COMPAT(kvm_device_ioctl),
a1cd3f08 4741 .mmap = kvm_device_mmap,
852b6d57
SW
4742};
4743
4744struct kvm_device *kvm_device_from_filp(struct file *filp)
4745{
4746 if (filp->f_op != &kvm_device_fops)
4747 return NULL;
4748
4749 return filp->private_data;
4750}
4751
8538cb22 4752static const struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = {
5df554ad 4753#ifdef CONFIG_KVM_MPIC
d60eacb0
WD
4754 [KVM_DEV_TYPE_FSL_MPIC_20] = &kvm_mpic_ops,
4755 [KVM_DEV_TYPE_FSL_MPIC_42] = &kvm_mpic_ops,
5975a2e0 4756#endif
d60eacb0
WD
4757};
4758
8538cb22 4759int kvm_register_device_ops(const struct kvm_device_ops *ops, u32 type)
d60eacb0
WD
4760{
4761 if (type >= ARRAY_SIZE(kvm_device_ops_table))
4762 return -ENOSPC;
4763
4764 if (kvm_device_ops_table[type] != NULL)
4765 return -EEXIST;
4766
4767 kvm_device_ops_table[type] = ops;
4768 return 0;
4769}
4770
571ee1b6
WL
4771void kvm_unregister_device_ops(u32 type)
4772{
4773 if (kvm_device_ops_table[type] != NULL)
4774 kvm_device_ops_table[type] = NULL;
4775}
4776
852b6d57
SW
4777static int kvm_ioctl_create_device(struct kvm *kvm,
4778 struct kvm_create_device *cd)
4779{
eceb6e1d 4780 const struct kvm_device_ops *ops;
852b6d57
SW
4781 struct kvm_device *dev;
4782 bool test = cd->flags & KVM_CREATE_DEVICE_TEST;
1d487e9b 4783 int type;
852b6d57
SW
4784 int ret;
4785
d60eacb0
WD
4786 if (cd->type >= ARRAY_SIZE(kvm_device_ops_table))
4787 return -ENODEV;
4788
1d487e9b
PB
4789 type = array_index_nospec(cd->type, ARRAY_SIZE(kvm_device_ops_table));
4790 ops = kvm_device_ops_table[type];
d60eacb0 4791 if (ops == NULL)
852b6d57 4792 return -ENODEV;
852b6d57
SW
4793
4794 if (test)
4795 return 0;
4796
b12ce36a 4797 dev = kzalloc(sizeof(*dev), GFP_KERNEL_ACCOUNT);
852b6d57
SW
4798 if (!dev)
4799 return -ENOMEM;
4800
4801 dev->ops = ops;
4802 dev->kvm = kvm;
852b6d57 4803
a28ebea2 4804 mutex_lock(&kvm->lock);
1d487e9b 4805 ret = ops->create(dev, type);
852b6d57 4806 if (ret < 0) {
a28ebea2 4807 mutex_unlock(&kvm->lock);
852b6d57
SW
4808 kfree(dev);
4809 return ret;
4810 }
a28ebea2
CD
4811 list_add(&dev->vm_node, &kvm->devices);
4812 mutex_unlock(&kvm->lock);
852b6d57 4813
023e9fdd
CD
4814 if (ops->init)
4815 ops->init(dev);
4816
cfa39381 4817 kvm_get_kvm(kvm);
24009b05 4818 ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR | O_CLOEXEC);
852b6d57 4819 if (ret < 0) {
149487bd 4820 kvm_put_kvm_no_destroy(kvm);
a28ebea2
CD
4821 mutex_lock(&kvm->lock);
4822 list_del(&dev->vm_node);
e8bc2427
AK
4823 if (ops->release)
4824 ops->release(dev);
a28ebea2 4825 mutex_unlock(&kvm->lock);
e8bc2427
AK
4826 if (ops->destroy)
4827 ops->destroy(dev);
852b6d57
SW
4828 return ret;
4829 }
4830
852b6d57
SW
4831 cd->fd = ret;
4832 return 0;
4833}
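/*
 * A minimal userspace sketch (vm_fd is assumed to be an open VM file
 * descriptor): probe for a device type with KVM_CREATE_DEVICE_TEST, then
 * create it for real and use the returned cd.fd for the device ioctls.
 *
 *	struct kvm_create_device cd = {
 *		.type = KVM_DEV_TYPE_VFIO,		// any registered type
 *		.flags = KVM_CREATE_DEVICE_TEST,	// validate only
 *	};
 *
 *	if (!ioctl(vm_fd, KVM_CREATE_DEVICE, &cd)) {
 *		cd.flags = 0;
 *		ioctl(vm_fd, KVM_CREATE_DEVICE, &cd);	// cd.fd now holds the device fd
 *	}
 */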
4834
f15ba52b 4835static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
92b591a4
AG
4836{
4837 switch (arg) {
4838 case KVM_CAP_USER_MEMORY:
bb58b90b 4839 case KVM_CAP_USER_MEMORY2:
92b591a4
AG
4840 case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
4841 case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS:
92b591a4
AG
4842 case KVM_CAP_INTERNAL_ERROR_DATA:
4843#ifdef CONFIG_HAVE_KVM_MSI
4844 case KVM_CAP_SIGNAL_MSI:
4845#endif
c5b31cc2 4846#ifdef CONFIG_HAVE_KVM_IRQCHIP
dc9be0fa 4847 case KVM_CAP_IRQFD:
92b591a4 4848#endif
e9ea5069 4849 case KVM_CAP_IOEVENTFD_ANY_LENGTH:
92b591a4 4850 case KVM_CAP_CHECK_EXTENSION_VM:
e5d83c74 4851 case KVM_CAP_ENABLE_CAP_VM:
acd05785 4852 case KVM_CAP_HALT_POLL:
92b591a4 4853 return 1;
4b4357e0 4854#ifdef CONFIG_KVM_MMIO
30422558
PB
4855 case KVM_CAP_COALESCED_MMIO:
4856 return KVM_COALESCED_MMIO_PAGE_OFFSET;
0804c849
PH
4857 case KVM_CAP_COALESCED_PIO:
4858 return 1;
30422558 4859#endif
3c9bd400
JZ
4860#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
4861 case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2:
4862 return KVM_DIRTY_LOG_MANUAL_CAPS;
4863#endif
92b591a4
AG
4864#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
4865 case KVM_CAP_IRQ_ROUTING:
4866 return KVM_MAX_IRQ_ROUTES;
f481b069 4867#endif
eed52e43 4868#if KVM_MAX_NR_ADDRESS_SPACES > 1
f481b069 4869 case KVM_CAP_MULTI_ADDRESS_SPACE:
eed52e43
SC
4870 if (kvm)
4871 return kvm_arch_nr_memslot_as_ids(kvm);
4872 return KVM_MAX_NR_ADDRESS_SPACES;
92b591a4 4873#endif
c110ae57
PB
4874 case KVM_CAP_NR_MEMSLOTS:
4875 return KVM_USER_MEM_SLOTS;
fb04a1ed 4876 case KVM_CAP_DIRTY_LOG_RING:
17601bfe
MZ
4877#ifdef CONFIG_HAVE_KVM_DIRTY_RING_TSO
4878 return KVM_DIRTY_RING_MAX_ENTRIES * sizeof(struct kvm_dirty_gfn);
4879#else
4880 return 0;
4881#endif
4882 case KVM_CAP_DIRTY_LOG_RING_ACQ_REL:
4883#ifdef CONFIG_HAVE_KVM_DIRTY_RING_ACQ_REL
fb04a1ed
PX
4884 return KVM_DIRTY_RING_MAX_ENTRIES * sizeof(struct kvm_dirty_gfn);
4885#else
4886 return 0;
86bdf3eb
GS
4887#endif
4888#ifdef CONFIG_NEED_KVM_DIRTY_RING_WITH_BITMAP
4889 case KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP:
fb04a1ed 4890#endif
ce55c049 4891 case KVM_CAP_BINARY_STATS_FD:
d495f942 4892 case KVM_CAP_SYSTEM_EVENT_DATA:
63912245 4893 case KVM_CAP_DEVICE_CTRL:
ce55c049 4894 return 1;
5a475554
CP
4895#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
4896 case KVM_CAP_MEMORY_ATTRIBUTES:
4897 return kvm_supported_mem_attributes(kvm);
a7800aa8
SC
4898#endif
4899#ifdef CONFIG_KVM_PRIVATE_MEM
4900 case KVM_CAP_GUEST_MEMFD:
4901 return !kvm || kvm_arch_has_private_mem(kvm);
5a475554 4902#endif
92b591a4
AG
4903 default:
4904 break;
4905 }
4906 return kvm_vm_ioctl_check_extension(kvm, arg);
4907}
4908
fb04a1ed
PX
4909static int kvm_vm_ioctl_enable_dirty_log_ring(struct kvm *kvm, u32 size)
4910{
4911 int r;
4912
4913 if (!KVM_DIRTY_LOG_PAGE_OFFSET)
4914 return -EINVAL;
4915
4916 /* the size should be a power of 2 */
4917 if (!size || (size & (size - 1)))
4918 return -EINVAL;
4919
4920 /* Must be large enough to hold the reserved entries, and at least a page */
4921 if (size < kvm_dirty_ring_get_rsvd_entries() *
4922 sizeof(struct kvm_dirty_gfn) || size < PAGE_SIZE)
4923 return -EINVAL;
4924
4925 if (size > KVM_DIRTY_RING_MAX_ENTRIES *
4926 sizeof(struct kvm_dirty_gfn))
4927 return -E2BIG;
4928
4929 /* We only allow it to be set once */
4930 if (kvm->dirty_ring_size)
4931 return -EINVAL;
4932
4933 mutex_lock(&kvm->lock);
4934
4935 if (kvm->created_vcpus) {
4936 /* We don't allow changing this value after vCPUs are created */
4937 r = -EINVAL;
4938 } else {
4939 kvm->dirty_ring_size = size;
4940 r = 0;
4941 }
4942
4943 mutex_unlock(&kvm->lock);
4944 return r;
4945}
4946
4947static int kvm_vm_ioctl_reset_dirty_pages(struct kvm *kvm)
4948{
46808a4c 4949 unsigned long i;
fb04a1ed
PX
4950 struct kvm_vcpu *vcpu;
4951 int cleared = 0;
4952
4953 if (!kvm->dirty_ring_size)
4954 return -EINVAL;
4955
4956 mutex_lock(&kvm->slots_lock);
4957
4958 kvm_for_each_vcpu(i, vcpu, kvm)
4959 cleared += kvm_dirty_ring_reset(vcpu->kvm, &vcpu->dirty_ring);
4960
4961 mutex_unlock(&kvm->slots_lock);
4962
4963 if (cleared)
4964 kvm_flush_remote_tlbs(kvm);
4965
4966 return cleared;
4967}
4968
e5d83c74
PB
4969int __attribute__((weak)) kvm_vm_ioctl_enable_cap(struct kvm *kvm,
4970 struct kvm_enable_cap *cap)
4971{
4972 return -EINVAL;
4973}
4974
26f45714 4975bool kvm_are_all_memslots_empty(struct kvm *kvm)
86bdf3eb
GS
4976{
4977 int i;
4978
4979 lockdep_assert_held(&kvm->slots_lock);
4980
eed52e43 4981 for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
86bdf3eb
GS
4982 if (!kvm_memslots_empty(__kvm_memslots(kvm, i)))
4983 return false;
4984 }
4985
4986 return true;
4987}
26f45714 4988EXPORT_SYMBOL_GPL(kvm_are_all_memslots_empty);
86bdf3eb 4989
e5d83c74
PB
4990static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm,
4991 struct kvm_enable_cap *cap)
4992{
4993 switch (cap->cap) {
2a31b9db 4994#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
3c9bd400
JZ
4995 case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2: {
4996 u64 allowed_options = KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE;
4997
4998 if (cap->args[0] & KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE)
4999 allowed_options = KVM_DIRTY_LOG_MANUAL_CAPS;
5000
5001 if (cap->flags || (cap->args[0] & ~allowed_options))
2a31b9db
PB
5002 return -EINVAL;
5003 kvm->manual_dirty_log_protect = cap->args[0];
5004 return 0;
3c9bd400 5005 }
2a31b9db 5006#endif
acd05785
DM
5007 case KVM_CAP_HALT_POLL: {
5008 if (cap->flags || cap->args[0] != (unsigned int)cap->args[0])
5009 return -EINVAL;
5010
5011 kvm->max_halt_poll_ns = cap->args[0];
9eb8ca04
DM
5012
5013 /*
5014 * Ensure kvm->override_halt_poll_ns does not become visible
5015 * before kvm->max_halt_poll_ns.
5016 *
5017 * Pairs with the smp_rmb() in kvm_vcpu_max_halt_poll_ns().
5018 */
5019 smp_wmb();
5020 kvm->override_halt_poll_ns = true;
5021
acd05785
DM
5022 return 0;
5023 }
fb04a1ed 5024 case KVM_CAP_DIRTY_LOG_RING:
17601bfe 5025 case KVM_CAP_DIRTY_LOG_RING_ACQ_REL:
7a2726ec
GS
5026 if (!kvm_vm_ioctl_check_extension_generic(kvm, cap->cap))
5027 return -EINVAL;
5028
fb04a1ed 5029 return kvm_vm_ioctl_enable_dirty_log_ring(kvm, cap->args[0]);
86bdf3eb
GS
5030 case KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP: {
5031 int r = -EINVAL;
5032
5033 if (!IS_ENABLED(CONFIG_NEED_KVM_DIRTY_RING_WITH_BITMAP) ||
5034 !kvm->dirty_ring_size || cap->flags)
5035 return r;
5036
5037 mutex_lock(&kvm->slots_lock);
5038
5039 /*
5040 * For simplicity, allow enabling ring+bitmap if and only if
5041 * there are no memslots, e.g. to ensure all memslots allocate
5042 * a bitmap after the capability is enabled.
5043 */
5044 if (kvm_are_all_memslots_empty(kvm)) {
5045 kvm->dirty_ring_with_bitmap = true;
5046 r = 0;
5047 }
5048
5049 mutex_unlock(&kvm->slots_lock);
5050
5051 return r;
5052 }
e5d83c74
PB
5053 default:
5054 return kvm_vm_ioctl_enable_cap(kvm, cap);
5055 }
5056}
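/*
 * Illustrative userspace sketch (not part of this file): overriding the
 * per-VM halt polling window with KVM_CAP_HALT_POLL, handled by
 * kvm_vm_ioctl_enable_cap_generic() above. args[0] is the maximum halt
 * polling time in nanoseconds and must fit in an unsigned int; 0
 * disables polling for this VM. The 200us value is only an example.
 */
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int set_halt_poll_example(int vm_fd)
{
	struct kvm_enable_cap cap;

	memset(&cap, 0, sizeof(cap));
	cap.cap = KVM_CAP_HALT_POLL;
	cap.args[0] = 200000;	/* 200us */

	return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
}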
5057
fcfe1bae
JZ
5058static ssize_t kvm_vm_stats_read(struct file *file, char __user *user_buffer,
5059 size_t size, loff_t *offset)
5060{
5061 struct kvm *kvm = file->private_data;
5062
5063 return kvm_stats_read(kvm->stats_id, &kvm_vm_stats_header,
5064 &kvm_vm_stats_desc[0], &kvm->stat,
5065 sizeof(kvm->stat), user_buffer, size, offset);
5066}
5067
eed3013f
SC
5068static int kvm_vm_stats_release(struct inode *inode, struct file *file)
5069{
5070 struct kvm *kvm = file->private_data;
5071
5072 kvm_put_kvm(kvm);
5073 return 0;
5074}
5075
fcfe1bae 5076static const struct file_operations kvm_vm_stats_fops = {
087e1520 5077 .owner = THIS_MODULE,
fcfe1bae 5078 .read = kvm_vm_stats_read,
eed3013f 5079 .release = kvm_vm_stats_release,
fcfe1bae
JZ
5080 .llseek = noop_llseek,
5081};
5082
5083static int kvm_vm_ioctl_get_stats_fd(struct kvm *kvm)
5084{
5085 int fd;
5086 struct file *file;
5087
5088 fd = get_unused_fd_flags(O_CLOEXEC);
5089 if (fd < 0)
5090 return fd;
5091
5092 file = anon_inode_getfile("kvm-vm-stats",
5093 &kvm_vm_stats_fops, kvm, O_RDONLY);
5094 if (IS_ERR(file)) {
5095 put_unused_fd(fd);
5096 return PTR_ERR(file);
5097 }
eed3013f
SC
5098
5099 kvm_get_kvm(kvm);
5100
fcfe1bae
JZ
5101 file->f_mode |= FMODE_PREAD;
5102 fd_install(fd, file);
5103
5104 return fd;
5105}
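/*
 * Illustrative userspace sketch (not part of this file): fetching the
 * binary stats fd for a VM via KVM_GET_STATS_FD, created by
 * kvm_vm_ioctl_get_stats_fd() above. The returned fd is read-only and
 * only supports pread() (FMODE_PREAD); the header layout comes from the
 * binary stats ABI in <linux/kvm.h>.
 */
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int read_vm_stats_header_example(int vm_fd)
{
	struct kvm_stats_header header;
	int stats_fd;

	stats_fd = ioctl(vm_fd, KVM_GET_STATS_FD, NULL);
	if (stats_fd < 0)
		return stats_fd;

	/* The header sits at offset 0; descriptors and data follow it. */
	if (pread(stats_fd, &header, sizeof(header), 0) != sizeof(header)) {
		close(stats_fd);
		return -1;
	}

	printf("%u stats, data at offset %u\n",
	       header.num_desc, header.data_offset);
	close(stats_fd);
	return 0;
}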
5106
bb58b90b
SC
5107#define SANITY_CHECK_MEM_REGION_FIELD(field) \
5108do { \
5109 BUILD_BUG_ON(offsetof(struct kvm_userspace_memory_region, field) != \
5110 offsetof(struct kvm_userspace_memory_region2, field)); \
5111 BUILD_BUG_ON(sizeof_field(struct kvm_userspace_memory_region, field) != \
5112 sizeof_field(struct kvm_userspace_memory_region2, field)); \
5113} while (0)
5114
bccf2150
AK
5115static long kvm_vm_ioctl(struct file *filp,
5116 unsigned int ioctl, unsigned long arg)
5117{
5118 struct kvm *kvm = filp->private_data;
5119 void __user *argp = (void __user *)arg;
1fe779f8 5120 int r;
bccf2150 5121
f4d31653 5122 if (kvm->mm != current->mm || kvm->vm_dead)
6d4e4c4f 5123 return -EIO;
bccf2150
AK
5124 switch (ioctl) {
5125 case KVM_CREATE_VCPU:
5126 r = kvm_vm_ioctl_create_vcpu(kvm, arg);
bccf2150 5127 break;
e5d83c74
PB
5128 case KVM_ENABLE_CAP: {
5129 struct kvm_enable_cap cap;
5130
5131 r = -EFAULT;
5132 if (copy_from_user(&cap, argp, sizeof(cap)))
5133 goto out;
5134 r = kvm_vm_ioctl_enable_cap_generic(kvm, &cap);
5135 break;
5136 }
bb58b90b 5137 case KVM_SET_USER_MEMORY_REGION2:
6fc138d2 5138 case KVM_SET_USER_MEMORY_REGION: {
bb58b90b
SC
5139 struct kvm_userspace_memory_region2 mem;
5140 unsigned long size;
5141
5142 if (ioctl == KVM_SET_USER_MEMORY_REGION) {
5143 /*
5144 * Fields beyond struct kvm_userspace_memory_region shouldn't be
5145 * accessed, but avoid leaking kernel memory in case of a bug.
5146 */
5147 memset(&mem, 0, sizeof(mem));
5148 size = sizeof(struct kvm_userspace_memory_region);
5149 } else {
5150 size = sizeof(struct kvm_userspace_memory_region2);
5151 }
5152
5153 /* Ensure the common parts of the two structs are identical. */
5154 SANITY_CHECK_MEM_REGION_FIELD(slot);
5155 SANITY_CHECK_MEM_REGION_FIELD(flags);
5156 SANITY_CHECK_MEM_REGION_FIELD(guest_phys_addr);
5157 SANITY_CHECK_MEM_REGION_FIELD(memory_size);
5158 SANITY_CHECK_MEM_REGION_FIELD(userspace_addr);
6fc138d2
IE
5159
5160 r = -EFAULT;
bb58b90b
SC
5161 if (copy_from_user(&mem, argp, size))
5162 goto out;
5163
5164 r = -EINVAL;
5165 if (ioctl == KVM_SET_USER_MEMORY_REGION &&
5166 (mem.flags & ~KVM_SET_USER_MEMORY_REGION_V1_FLAGS))
6fc138d2
IE
5167 goto out;
5168
bb58b90b 5169 r = kvm_vm_ioctl_set_memory_region(kvm, &mem);
6aa8b732
AK
5170 break;
5171 }
5172 case KVM_GET_DIRTY_LOG: {
5173 struct kvm_dirty_log log;
5174
5175 r = -EFAULT;
893bdbf1 5176 if (copy_from_user(&log, argp, sizeof(log)))
6aa8b732 5177 goto out;
2c6f5df9 5178 r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
6aa8b732
AK
5179 break;
5180 }
2a31b9db
PB
5181#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
5182 case KVM_CLEAR_DIRTY_LOG: {
5183 struct kvm_clear_dirty_log log;
5184
5185 r = -EFAULT;
5186 if (copy_from_user(&log, argp, sizeof(log)))
5187 goto out;
5188 r = kvm_vm_ioctl_clear_dirty_log(kvm, &log);
5189 break;
5190 }
5191#endif
4b4357e0 5192#ifdef CONFIG_KVM_MMIO
5f94c174
LV
5193 case KVM_REGISTER_COALESCED_MMIO: {
5194 struct kvm_coalesced_mmio_zone zone;
f95ef0cd 5195
5f94c174 5196 r = -EFAULT;
893bdbf1 5197 if (copy_from_user(&zone, argp, sizeof(zone)))
5f94c174 5198 goto out;
5f94c174 5199 r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone);
5f94c174
LV
5200 break;
5201 }
5202 case KVM_UNREGISTER_COALESCED_MMIO: {
5203 struct kvm_coalesced_mmio_zone zone;
f95ef0cd 5204
5f94c174 5205 r = -EFAULT;
893bdbf1 5206 if (copy_from_user(&zone, argp, sizeof(zone)))
5f94c174 5207 goto out;
5f94c174 5208 r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone);
5f94c174
LV
5209 break;
5210 }
5211#endif
721eecbf
GH
5212 case KVM_IRQFD: {
5213 struct kvm_irqfd data;
5214
5215 r = -EFAULT;
893bdbf1 5216 if (copy_from_user(&data, argp, sizeof(data)))
721eecbf 5217 goto out;
d4db2935 5218 r = kvm_irqfd(kvm, &data);
721eecbf
GH
5219 break;
5220 }
d34e6b17
GH
5221 case KVM_IOEVENTFD: {
5222 struct kvm_ioeventfd data;
5223
5224 r = -EFAULT;
893bdbf1 5225 if (copy_from_user(&data, argp, sizeof(data)))
d34e6b17
GH
5226 goto out;
5227 r = kvm_ioeventfd(kvm, &data);
5228 break;
5229 }
07975ad3
JK
5230#ifdef CONFIG_HAVE_KVM_MSI
5231 case KVM_SIGNAL_MSI: {
5232 struct kvm_msi msi;
5233
5234 r = -EFAULT;
893bdbf1 5235 if (copy_from_user(&msi, argp, sizeof(msi)))
07975ad3
JK
5236 goto out;
5237 r = kvm_send_userspace_msi(kvm, &msi);
5238 break;
5239 }
23d43cf9
CD
5240#endif
5241#ifdef __KVM_HAVE_IRQ_LINE
5242 case KVM_IRQ_LINE_STATUS:
5243 case KVM_IRQ_LINE: {
5244 struct kvm_irq_level irq_event;
5245
5246 r = -EFAULT;
893bdbf1 5247 if (copy_from_user(&irq_event, argp, sizeof(irq_event)))
23d43cf9
CD
5248 goto out;
5249
aa2fbe6d
YZ
5250 r = kvm_vm_ioctl_irq_line(kvm, &irq_event,
5251 ioctl == KVM_IRQ_LINE_STATUS);
23d43cf9
CD
5252 if (r)
5253 goto out;
5254
5255 r = -EFAULT;
5256 if (ioctl == KVM_IRQ_LINE_STATUS) {
893bdbf1 5257 if (copy_to_user(argp, &irq_event, sizeof(irq_event)))
23d43cf9
CD
5258 goto out;
5259 }
5260
5261 r = 0;
5262 break;
5263 }
73880c80 5264#endif
aa8d5944
AG
5265#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
5266 case KVM_SET_GSI_ROUTING: {
5267 struct kvm_irq_routing routing;
5268 struct kvm_irq_routing __user *urouting;
f8c1b85b 5269 struct kvm_irq_routing_entry *entries = NULL;
aa8d5944
AG
5270
5271 r = -EFAULT;
5272 if (copy_from_user(&routing, argp, sizeof(routing)))
5273 goto out;
5274 r = -EINVAL;
5c0aea0e
DH
5275 if (!kvm_arch_can_set_irq_routing(kvm))
5276 goto out;
caf1ff26 5277 if (routing.nr > KVM_MAX_IRQ_ROUTES)
aa8d5944
AG
5278 goto out;
5279 if (routing.flags)
5280 goto out;
f8c1b85b 5281 if (routing.nr) {
f8c1b85b 5282 urouting = argp;
1f829359
PS
5283 entries = vmemdup_array_user(urouting->entries,
5284 routing.nr, sizeof(*entries));
7ec28e26
DE
5285 if (IS_ERR(entries)) {
5286 r = PTR_ERR(entries);
5287 goto out;
5288 }
f8c1b85b 5289 }
aa8d5944
AG
5290 r = kvm_set_irq_routing(kvm, entries, routing.nr,
5291 routing.flags);
7ec28e26 5292 kvfree(entries);
aa8d5944
AG
5293 break;
5294 }
5295#endif /* CONFIG_HAVE_KVM_IRQ_ROUTING */
5a475554
CP
5296#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
5297 case KVM_SET_MEMORY_ATTRIBUTES: {
5298 struct kvm_memory_attributes attrs;
5299
5300 r = -EFAULT;
5301 if (copy_from_user(&attrs, argp, sizeof(attrs)))
5302 goto out;
5303
5304 r = kvm_vm_ioctl_set_mem_attributes(kvm, &attrs);
5305 break;
5306 }
5307#endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */
852b6d57
SW
5308 case KVM_CREATE_DEVICE: {
5309 struct kvm_create_device cd;
5310
5311 r = -EFAULT;
5312 if (copy_from_user(&cd, argp, sizeof(cd)))
5313 goto out;
5314
5315 r = kvm_ioctl_create_device(kvm, &cd);
5316 if (r)
5317 goto out;
5318
5319 r = -EFAULT;
5320 if (copy_to_user(argp, &cd, sizeof(cd)))
5321 goto out;
5322
5323 r = 0;
5324 break;
5325 }
92b591a4
AG
5326 case KVM_CHECK_EXTENSION:
5327 r = kvm_vm_ioctl_check_extension_generic(kvm, arg);
5328 break;
fb04a1ed
PX
5329 case KVM_RESET_DIRTY_RINGS:
5330 r = kvm_vm_ioctl_reset_dirty_pages(kvm);
5331 break;
fcfe1bae
JZ
5332 case KVM_GET_STATS_FD:
5333 r = kvm_vm_ioctl_get_stats_fd(kvm);
5334 break;
a7800aa8
SC
5335#ifdef CONFIG_KVM_PRIVATE_MEM
5336 case KVM_CREATE_GUEST_MEMFD: {
5337 struct kvm_create_guest_memfd guest_memfd;
5338
5339 r = -EFAULT;
5340 if (copy_from_user(&guest_memfd, argp, sizeof(guest_memfd)))
5341 goto out;
5342
5343 r = kvm_gmem_create(kvm, &guest_memfd);
5344 break;
5345 }
5346#endif
f17abe9a 5347 default:
1fe779f8 5348 r = kvm_arch_vm_ioctl(filp, ioctl, arg);
f17abe9a
AK
5349 }
5350out:
5351 return r;
5352}
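/*
 * Illustrative userspace sketch (not part of this file): backing guest
 * physical address 0 with anonymous host memory through
 * KVM_SET_USER_MEMORY_REGION, the legacy path handled in the switch
 * above. Slot number, flags, size and GPA are arbitrary example values.
 */
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/kvm.h>

int map_guest_ram_example(int vm_fd, size_t size)
{
	struct kvm_userspace_memory_region region;
	void *host_mem;

	host_mem = mmap(NULL, size, PROT_READ | PROT_WRITE,
			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (host_mem == MAP_FAILED)
		return -1;

	memset(&region, 0, sizeof(region));
	region.slot = 0;
	region.flags = 0;			/* or KVM_MEM_LOG_DIRTY_PAGES */
	region.guest_phys_addr = 0;
	region.memory_size = size;
	region.userspace_addr = (unsigned long)host_mem;

	return ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region);
}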
5353
de8e5d74 5354#ifdef CONFIG_KVM_COMPAT
6ff5894c
AB
5355struct compat_kvm_dirty_log {
5356 __u32 slot;
5357 __u32 padding1;
5358 union {
5359 compat_uptr_t dirty_bitmap; /* one bit per page */
5360 __u64 padding2;
5361 };
5362};
5363
8750f9bb
PB
5364struct compat_kvm_clear_dirty_log {
5365 __u32 slot;
5366 __u32 num_pages;
5367 __u64 first_page;
5368 union {
5369 compat_uptr_t dirty_bitmap; /* one bit per page */
5370 __u64 padding2;
5371 };
5372};
5373
ed51862f
AG
5374long __weak kvm_arch_vm_compat_ioctl(struct file *filp, unsigned int ioctl,
5375 unsigned long arg)
5376{
5377 return -ENOTTY;
5378}
5379
6ff5894c
AB
5380static long kvm_vm_compat_ioctl(struct file *filp,
5381 unsigned int ioctl, unsigned long arg)
5382{
5383 struct kvm *kvm = filp->private_data;
5384 int r;
5385
f4d31653 5386 if (kvm->mm != current->mm || kvm->vm_dead)
6ff5894c 5387 return -EIO;
ed51862f
AG
5388
5389 r = kvm_arch_vm_compat_ioctl(filp, ioctl, arg);
5390 if (r != -ENOTTY)
5391 return r;
5392
6ff5894c 5393 switch (ioctl) {
8750f9bb
PB
5394#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
5395 case KVM_CLEAR_DIRTY_LOG: {
5396 struct compat_kvm_clear_dirty_log compat_log;
5397 struct kvm_clear_dirty_log log;
5398
5399 if (copy_from_user(&compat_log, (void __user *)arg,
5400 sizeof(compat_log)))
5401 return -EFAULT;
5402 log.slot = compat_log.slot;
5403 log.num_pages = compat_log.num_pages;
5404 log.first_page = compat_log.first_page;
5405 log.padding2 = compat_log.padding2;
5406 log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
5407
5408 r = kvm_vm_ioctl_clear_dirty_log(kvm, &log);
5409 break;
5410 }
5411#endif
6ff5894c
AB
5412 case KVM_GET_DIRTY_LOG: {
5413 struct compat_kvm_dirty_log compat_log;
5414 struct kvm_dirty_log log;
5415
6ff5894c
AB
5416 if (copy_from_user(&compat_log, (void __user *)arg,
5417 sizeof(compat_log)))
f6a3b168 5418 return -EFAULT;
6ff5894c
AB
5419 log.slot = compat_log.slot;
5420 log.padding1 = compat_log.padding1;
5421 log.padding2 = compat_log.padding2;
5422 log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
5423
5424 r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
6ff5894c
AB
5425 break;
5426 }
5427 default:
5428 r = kvm_vm_ioctl(filp, ioctl, arg);
5429 }
6ff5894c
AB
5430 return r;
5431}
5432#endif
5433
087e1520 5434static struct file_operations kvm_vm_fops = {
f17abe9a
AK
5435 .release = kvm_vm_release,
5436 .unlocked_ioctl = kvm_vm_ioctl,
6038f373 5437 .llseek = noop_llseek,
7ddfd3e0 5438 KVM_COMPAT(kvm_vm_compat_ioctl),
f17abe9a
AK
5439};
5440
54526d1f
NT
5441bool file_is_kvm(struct file *file)
5442{
5443 return file && file->f_op == &kvm_vm_fops;
5444}
5445EXPORT_SYMBOL_GPL(file_is_kvm);
5446
e08b9637 5447static int kvm_dev_ioctl_create_vm(unsigned long type)
f17abe9a 5448{
59f82aad 5449 char fdname[ITOA_MAX_LEN + 1];
20020f4c 5450 int r, fd;
f17abe9a 5451 struct kvm *kvm;
506cfba9 5452 struct file *file;
f17abe9a 5453
20020f4c
OU
5454 fd = get_unused_fd_flags(O_CLOEXEC);
5455 if (fd < 0)
5456 return fd;
5457
59f82aad
OU
5458 snprintf(fdname, sizeof(fdname), "%d", fd);
5459
b74ed7a6 5460 kvm = kvm_create_vm(type, fdname);
20020f4c
OU
5461 if (IS_ERR(kvm)) {
5462 r = PTR_ERR(kvm);
5463 goto put_fd;
5464 }
5465
506cfba9
AV
5466 file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
5467 if (IS_ERR(file)) {
78588335
ME
5468 r = PTR_ERR(file);
5469 goto put_kvm;
506cfba9 5470 }
536a6f88 5471
525df861
PB
5472 /*
5473 * Don't call kvm_put_kvm anymore at this point; file->f_op is
5474 * already set, with ->release() being kvm_vm_release(). In error
5475 * cases it will be called by the final fput(file) and will take
5476 * care of doing kvm_put_kvm(kvm).
5477 */
286de8f6 5478 kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, kvm);
f17abe9a 5479
20020f4c
OU
5480 fd_install(fd, file);
5481 return fd;
78588335
ME
5482
5483put_kvm:
5484 kvm_put_kvm(kvm);
20020f4c
OU
5485put_fd:
5486 put_unused_fd(fd);
78588335 5487 return r;
f17abe9a
AK
5488}
5489
5490static long kvm_dev_ioctl(struct file *filp,
5491 unsigned int ioctl, unsigned long arg)
5492{
f15ba52b 5493 int r = -EINVAL;
f17abe9a
AK
5494
5495 switch (ioctl) {
5496 case KVM_GET_API_VERSION:
f0fe5108
AK
5497 if (arg)
5498 goto out;
f17abe9a
AK
5499 r = KVM_API_VERSION;
5500 break;
5501 case KVM_CREATE_VM:
e08b9637 5502 r = kvm_dev_ioctl_create_vm(arg);
f17abe9a 5503 break;
018d00d2 5504 case KVM_CHECK_EXTENSION:
784aa3d7 5505 r = kvm_vm_ioctl_check_extension_generic(NULL, arg);
5d308f45 5506 break;
07c45a36 5507 case KVM_GET_VCPU_MMAP_SIZE:
07c45a36
AK
5508 if (arg)
5509 goto out;
adb1ff46
AK
5510 r = PAGE_SIZE; /* struct kvm_run */
5511#ifdef CONFIG_X86
5512 r += PAGE_SIZE; /* pio data page */
5f94c174 5513#endif
4b4357e0 5514#ifdef CONFIG_KVM_MMIO
5f94c174 5515 r += PAGE_SIZE; /* coalesced mmio ring page */
adb1ff46 5516#endif
07c45a36 5517 break;
6aa8b732 5518 default:
043405e1 5519 return kvm_arch_dev_ioctl(filp, ioctl, arg);
6aa8b732
AK
5520 }
5521out:
5522 return r;
5523}
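/*
 * Illustrative userspace sketch (not part of this file): the usual
 * /dev/kvm bring-up that exercises kvm_dev_ioctl() above, i.e. checking
 * KVM_GET_API_VERSION, creating a VM, and querying the size of the
 * per-vCPU mmap area.
 */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int create_vm_example(void)
{
	int kvm_fd, vm_fd, mmap_size;

	kvm_fd = open("/dev/kvm", O_RDWR | O_CLOEXEC);
	if (kvm_fd < 0)
		return -1;

	/* Only the stable API version (12) is supported. */
	if (ioctl(kvm_fd, KVM_GET_API_VERSION, 0) != KVM_API_VERSION)
		return -1;

	vm_fd = ioctl(kvm_fd, KVM_CREATE_VM, 0);	/* 0 = default VM type */
	if (vm_fd < 0)
		return -1;

	/* Size of the kvm_run mapping, plus arch-specific extra pages. */
	mmap_size = ioctl(kvm_fd, KVM_GET_VCPU_MMAP_SIZE, 0);
	printf("vcpu mmap size: %d bytes\n", mmap_size);

	return vm_fd;
}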
5524
6aa8b732 5525static struct file_operations kvm_chardev_ops = {
6aa8b732 5526 .unlocked_ioctl = kvm_dev_ioctl,
6038f373 5527 .llseek = noop_llseek,
7ddfd3e0 5528 KVM_COMPAT(kvm_dev_ioctl),
6aa8b732
AK
5529};
5530
5531static struct miscdevice kvm_dev = {
bbe4432e 5532 KVM_MINOR,
6aa8b732
AK
5533 "kvm",
5534 &kvm_chardev_ops,
5535};
5536
441f7bfa
SC
5537#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
5538__visible bool kvm_rebooting;
5539EXPORT_SYMBOL_GPL(kvm_rebooting);
5540
5541static DEFINE_PER_CPU(bool, hardware_enabled);
5542static int kvm_usage_count;
5543
e6fb7d6e 5544static int __hardware_enable_nolock(void)
1b6c0168 5545{
37d25881 5546 if (__this_cpu_read(hardware_enabled))
e6fb7d6e 5547 return 0;
10474ae8 5548
37d25881 5549 if (kvm_arch_hardware_enable()) {
37d25881
SC
5550 pr_info("kvm: enabling virtualization on CPU%d failed\n",
5551 raw_smp_processor_id());
e6fb7d6e 5552 return -EIO;
10474ae8 5553 }
37d25881
SC
5554
5555 __this_cpu_write(hardware_enabled, true);
e6fb7d6e
IY
5556 return 0;
5557}
5558
5559static void hardware_enable_nolock(void *failed)
5560{
5561 if (__hardware_enable_nolock())
5562 atomic_inc(failed);
1b6c0168
AK
5563}
5564
aaf12a7b 5565static int kvm_online_cpu(unsigned int cpu)
75b7127c 5566{
aaf12a7b
CG
5567 int ret = 0;
5568
5569 /*
5570 * Abort the CPU online process if hardware virtualization cannot
5571 * be enabled. Otherwise running VMs would encounter unrecoverable
5572 * errors when scheduled to this CPU.
5573 */
0bf50497 5574 mutex_lock(&kvm_lock);
e6fb7d6e
IY
5575 if (kvm_usage_count)
5576 ret = __hardware_enable_nolock();
0bf50497 5577 mutex_unlock(&kvm_lock);
aaf12a7b 5578 return ret;
75b7127c
TY
5579}
5580
5581static void hardware_disable_nolock(void *junk)
1b6c0168 5582{
37d25881
SC
5583 /*
5584 * Note, hardware_disable_all_nolock() tells all online CPUs to disable
5585 * hardware, not just CPUs that successfully enabled hardware!
5586 */
5587 if (!__this_cpu_read(hardware_enabled))
1b6c0168 5588 return;
37d25881 5589
13a34e06 5590 kvm_arch_hardware_disable();
37d25881
SC
5591
5592 __this_cpu_write(hardware_enabled, false);
1b6c0168
AK
5593}
5594
aaf12a7b 5595static int kvm_offline_cpu(unsigned int cpu)
75b7127c 5596{
0bf50497 5597 mutex_lock(&kvm_lock);
4fa92fb2
PB
5598 if (kvm_usage_count)
5599 hardware_disable_nolock(NULL);
0bf50497 5600 mutex_unlock(&kvm_lock);
8c18b2d2 5601 return 0;
75b7127c
TY
5602}
5603
10474ae8
AG
5604static void hardware_disable_all_nolock(void)
5605{
5606 BUG_ON(!kvm_usage_count);
5607
5608 kvm_usage_count--;
5609 if (!kvm_usage_count)
75b7127c 5610 on_each_cpu(hardware_disable_nolock, NULL, 1);
10474ae8
AG
5611}
5612
5613static void hardware_disable_all(void)
5614{
e4aa7f88 5615 cpus_read_lock();
0bf50497 5616 mutex_lock(&kvm_lock);
10474ae8 5617 hardware_disable_all_nolock();
0bf50497 5618 mutex_unlock(&kvm_lock);
e4aa7f88 5619 cpus_read_unlock();
10474ae8
AG
5620}
5621
5622static int hardware_enable_all(void)
5623{
e6fb7d6e 5624 atomic_t failed = ATOMIC_INIT(0);
e0ceec22
SC
5625 int r;
5626
5627 /*
5628 * Do not enable hardware virtualization if the system is going down.
5629 * If userspace initiated a forced reboot, e.g. reboot -f, then it's
5630 * possible for an in-flight KVM_CREATE_VM to trigger hardware enabling
5631 * after kvm_reboot() is called. Note, this relies on system_state
5632 * being set _before_ kvm_reboot(), which is why KVM uses a syscore ops
5633 * hook instead of registering a dedicated reboot notifier (the latter
5634 * runs before system_state is updated).
5635 */
5636 if (system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF ||
5637 system_state == SYSTEM_RESTART)
5638 return -EBUSY;
10474ae8 5639
e4aa7f88
CG
5640 /*
5641 * When onlining a CPU, cpu_online_mask is set before kvm_online_cpu()
5642 * is called, and so on_each_cpu() between them includes the CPU that
5643 * is being onlined. As a result, hardware_enable_nolock() may get
5644 * invoked before kvm_online_cpu(), which also enables hardware if the
5645 * usage count is non-zero. Disable CPU hotplug to avoid attempting to
5646 * enable hardware multiple times.
5647 */
5648 cpus_read_lock();
0bf50497 5649 mutex_lock(&kvm_lock);
10474ae8 5650
e0ceec22
SC
5651 r = 0;
5652
10474ae8
AG
5653 kvm_usage_count++;
5654 if (kvm_usage_count == 1) {
e6fb7d6e 5655 on_each_cpu(hardware_enable_nolock, &failed, 1);
10474ae8 5656
e6fb7d6e 5657 if (atomic_read(&failed)) {
10474ae8
AG
5658 hardware_disable_all_nolock();
5659 r = -EBUSY;
5660 }
5661 }
5662
0bf50497 5663 mutex_unlock(&kvm_lock);
e4aa7f88 5664 cpus_read_unlock();
10474ae8
AG
5665
5666 return r;
5667}
5668
6735150b 5669static void kvm_shutdown(void)
9a2b85c6 5670{
8e1c1815 5671 /*
6735150b
SC
5672 * Disable hardware virtualization and set kvm_rebooting to indicate
5673 * that KVM has asynchronously disabled hardware virtualization, i.e.
5674 * that relevant errors and exceptions aren't entirely unexpected.
5675 * Some flavors of hardware virtualization need to be disabled before
5676 * transferring control to firmware (to perform shutdown/reboot), e.g.
5677 * on x86, virtualization can block INIT interrupts, which are used by
5678 * firmware to pull APs back under firmware control. Note, this path
5679 * is used for both shutdown and reboot scenarios, i.e. neither name is
5680 * 100% comprehensive.
8e1c1815 5681 */
1170adc6 5682 pr_info("kvm: exiting hardware virtualization\n");
8e1c1815 5683 kvm_rebooting = true;
75b7127c 5684 on_each_cpu(hardware_disable_nolock, NULL, 1);
9a2b85c6
RR
5685}
5686
35774a9f
SC
5687static int kvm_suspend(void)
5688{
5689 /*
5690 * Secondary CPUs and CPU hotplug are disabled across the suspend/resume
5691 * callbacks, i.e. no need to acquire kvm_lock to ensure the usage count
5692 * is stable. Assert that kvm_lock is not held to ensure the system
5693 * isn't suspended while KVM is enabling hardware. Hardware enabling
5694 * can be preempted, but the task cannot be frozen until it has dropped
5695 * all locks (userspace tasks are frozen via a fake signal).
5696 */
5697 lockdep_assert_not_held(&kvm_lock);
5698 lockdep_assert_irqs_disabled();
5699
5700 if (kvm_usage_count)
5701 hardware_disable_nolock(NULL);
5702 return 0;
5703}
5704
5705static void kvm_resume(void)
5706{
5707 lockdep_assert_not_held(&kvm_lock);
5708 lockdep_assert_irqs_disabled();
5709
5710 if (kvm_usage_count)
5711 WARN_ON_ONCE(__hardware_enable_nolock());
5712}
5713
5714static struct syscore_ops kvm_syscore_ops = {
5715 .suspend = kvm_suspend,
5716 .resume = kvm_resume,
6735150b 5717 .shutdown = kvm_shutdown,
35774a9f 5718};
441f7bfa
SC
5719#else /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */
5720static int hardware_enable_all(void)
5721{
5722 return 0;
5723}
5724
5725static void hardware_disable_all(void)
5726{
5727
5728}
5729#endif /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */
35774a9f 5730
5ea5ca3c
WW
5731static void kvm_iodevice_destructor(struct kvm_io_device *dev)
5732{
5733 if (dev->ops->destructor)
5734 dev->ops->destructor(dev);
5735}
5736
e93f8a0f 5737static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
2eeb2e94
GH
5738{
5739 int i;
5740
5741 for (i = 0; i < bus->dev_count; i++) {
743eeb0b 5742 struct kvm_io_device *pos = bus->range[i].dev;
2eeb2e94
GH
5743
5744 kvm_iodevice_destructor(pos);
5745 }
e93f8a0f 5746 kfree(bus);
2eeb2e94
GH
5747}
5748
c21fbff1 5749static inline int kvm_io_bus_cmp(const struct kvm_io_range *r1,
20e87b72 5750 const struct kvm_io_range *r2)
743eeb0b 5751{
8f4216c7
JW
5752 gpa_t addr1 = r1->addr;
5753 gpa_t addr2 = r2->addr;
5754
5755 if (addr1 < addr2)
743eeb0b 5756 return -1;
8f4216c7
JW
5757
5758 /* If r2->len == 0, match the exact address. If r2->len != 0,
5759 * accept any overlapping write. Any order is acceptable for
5760 * overlapping ranges, because kvm_io_bus_get_first_dev ensures
5761 * we process all of them.
5762 */
5763 if (r2->len) {
5764 addr1 += r1->len;
5765 addr2 += r2->len;
5766 }
5767
5768 if (addr1 > addr2)
743eeb0b 5769 return 1;
8f4216c7 5770
743eeb0b
SL
5771 return 0;
5772}
5773
a343c9b7
PB
5774static int kvm_io_bus_sort_cmp(const void *p1, const void *p2)
5775{
c21fbff1 5776 return kvm_io_bus_cmp(p1, p2);
a343c9b7
PB
5777}
5778
39369f7a 5779static int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus,
743eeb0b
SL
5780 gpa_t addr, int len)
5781{
5782 struct kvm_io_range *range, key;
5783 int off;
5784
5785 key = (struct kvm_io_range) {
5786 .addr = addr,
5787 .len = len,
5788 };
5789
5790 range = bsearch(&key, bus->range, bus->dev_count,
5791 sizeof(struct kvm_io_range), kvm_io_bus_sort_cmp);
5792 if (range == NULL)
5793 return -ENOENT;
5794
5795 off = range - bus->range;
5796
c21fbff1 5797 while (off > 0 && kvm_io_bus_cmp(&key, &bus->range[off-1]) == 0)
743eeb0b
SL
5798 off--;
5799
5800 return off;
5801}
5802
e32edf4f 5803static int __kvm_io_bus_write(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
126a5af5
CH
5804 struct kvm_io_range *range, const void *val)
5805{
5806 int idx;
5807
5808 idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
5809 if (idx < 0)
5810 return -EOPNOTSUPP;
5811
5812 while (idx < bus->dev_count &&
c21fbff1 5813 kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
e32edf4f 5814 if (!kvm_iodevice_write(vcpu, bus->range[idx].dev, range->addr,
126a5af5
CH
5815 range->len, val))
5816 return idx;
5817 idx++;
5818 }
5819
5820 return -EOPNOTSUPP;
5821}
5822
bda9020e 5823/* kvm_io_bus_write - called under kvm->slots_lock */
e32edf4f 5824int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
bda9020e 5825 int len, const void *val)
2eeb2e94 5826{
90d83dc3 5827 struct kvm_io_bus *bus;
743eeb0b 5828 struct kvm_io_range range;
126a5af5 5829 int r;
743eeb0b
SL
5830
5831 range = (struct kvm_io_range) {
5832 .addr = addr,
5833 .len = len,
5834 };
90d83dc3 5835
e32edf4f 5836 bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
90db1043
DH
5837 if (!bus)
5838 return -ENOMEM;
e32edf4f 5839 r = __kvm_io_bus_write(vcpu, bus, &range, val);
126a5af5
CH
5840 return r < 0 ? r : 0;
5841}
a2420107 5842EXPORT_SYMBOL_GPL(kvm_io_bus_write);
126a5af5
CH
5843
5844/* kvm_io_bus_write_cookie - called under kvm->slots_lock */
e32edf4f
NN
5845int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx,
5846 gpa_t addr, int len, const void *val, long cookie)
126a5af5
CH
5847{
5848 struct kvm_io_bus *bus;
5849 struct kvm_io_range range;
5850
5851 range = (struct kvm_io_range) {
5852 .addr = addr,
5853 .len = len,
5854 };
5855
e32edf4f 5856 bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
90db1043
DH
5857 if (!bus)
5858 return -ENOMEM;
126a5af5
CH
5859
5860 /* First try the device referenced by cookie. */
5861 if ((cookie >= 0) && (cookie < bus->dev_count) &&
c21fbff1 5862 (kvm_io_bus_cmp(&range, &bus->range[cookie]) == 0))
e32edf4f 5863 if (!kvm_iodevice_write(vcpu, bus->range[cookie].dev, addr, len,
126a5af5
CH
5864 val))
5865 return cookie;
5866
5867 /*
5868 * cookie contained garbage; fall back to search and return the
5869 * correct cookie value.
5870 */
e32edf4f 5871 return __kvm_io_bus_write(vcpu, bus, &range, val);
126a5af5
CH
5872}
5873
e32edf4f
NN
5874static int __kvm_io_bus_read(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
5875 struct kvm_io_range *range, void *val)
126a5af5
CH
5876{
5877 int idx;
5878
5879 idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
743eeb0b
SL
5880 if (idx < 0)
5881 return -EOPNOTSUPP;
5882
5883 while (idx < bus->dev_count &&
c21fbff1 5884 kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
e32edf4f 5885 if (!kvm_iodevice_read(vcpu, bus->range[idx].dev, range->addr,
126a5af5
CH
5886 range->len, val))
5887 return idx;
743eeb0b
SL
5888 idx++;
5889 }
5890
bda9020e
MT
5891 return -EOPNOTSUPP;
5892}
2eeb2e94 5893
bda9020e 5894/* kvm_io_bus_read - called under kvm->slots_lock */
e32edf4f 5895int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
e93f8a0f 5896 int len, void *val)
bda9020e 5897{
90d83dc3 5898 struct kvm_io_bus *bus;
743eeb0b 5899 struct kvm_io_range range;
126a5af5 5900 int r;
743eeb0b
SL
5901
5902 range = (struct kvm_io_range) {
5903 .addr = addr,
5904 .len = len,
5905 };
e93f8a0f 5906
e32edf4f 5907 bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
90db1043
DH
5908 if (!bus)
5909 return -ENOMEM;
e32edf4f 5910 r = __kvm_io_bus_read(vcpu, bus, &range, val);
126a5af5
CH
5911 return r < 0 ? r : 0;
5912}
743eeb0b 5913
743eeb0b
SL
5914int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
5915 int len, struct kvm_io_device *dev)
6c474694 5916{
d4c67a7a 5917 int i;
e93f8a0f 5918 struct kvm_io_bus *new_bus, *bus;
d4c67a7a 5919 struct kvm_io_range range;
090b7aff 5920
b1a39a71
MZ
5921 lockdep_assert_held(&kvm->slots_lock);
5922
4a12f951 5923 bus = kvm_get_bus(kvm, bus_idx);
90db1043
DH
5924 if (!bus)
5925 return -ENOMEM;
5926
6ea34c9b
AK
5927 /* exclude ioeventfd which is limited by maximum fd */
5928 if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1)
090b7aff 5929 return -ENOSPC;
2eeb2e94 5930
90952cd3 5931 new_bus = kmalloc(struct_size(bus, range, bus->dev_count + 1),
b12ce36a 5932 GFP_KERNEL_ACCOUNT);
e93f8a0f
MT
5933 if (!new_bus)
5934 return -ENOMEM;
d4c67a7a
GH
5935
5936 range = (struct kvm_io_range) {
5937 .addr = addr,
5938 .len = len,
5939 .dev = dev,
5940 };
5941
5942 for (i = 0; i < bus->dev_count; i++)
5943 if (kvm_io_bus_cmp(&bus->range[i], &range) > 0)
5944 break;
5945
5946 memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range));
5947 new_bus->dev_count++;
5948 new_bus->range[i] = range;
5949 memcpy(new_bus->range + i + 1, bus->range + i,
5950 (bus->dev_count - i) * sizeof(struct kvm_io_range));
e93f8a0f
MT
5951 rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
5952 synchronize_srcu_expedited(&kvm->srcu);
5953 kfree(bus);
090b7aff
GH
5954
5955 return 0;
5956}
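/*
 * Illustrative in-kernel sketch (not part of this file): how a KVM
 * device emulation might hook a small MMIO window into the bus with
 * kvm_io_bus_register_dev() above. The device name, register layout and
 * base address are hypothetical; registration must run with
 * kvm->slots_lock held, as asserted by the function.
 */
#include <kvm/iodev.h>
#include <linux/kvm_host.h>
#include <linux/string.h>

struct demo_dev {
	struct kvm_io_device dev;
	u32 reg;
};

static int demo_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
			  gpa_t addr, int len, void *val)
{
	struct demo_dev *d = container_of(this, struct demo_dev, dev);

	if (len > 4)
		len = 4;
	memcpy(val, &d->reg, len);
	return 0;	/* 0 means "handled" on the I/O bus */
}

static const struct kvm_io_device_ops demo_ops = {
	.read = demo_mmio_read,
};

static int demo_register(struct kvm *kvm, struct demo_dev *d, gpa_t base)
{
	int ret;

	kvm_iodevice_init(&d->dev, &demo_ops);

	mutex_lock(&kvm->slots_lock);
	ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, base, 4, &d->dev);
	mutex_unlock(&kvm->slots_lock);
	return ret;
}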
5957
5d3c4c79
SC
5958int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
5959 struct kvm_io_device *dev)
090b7aff 5960{
5ea5ca3c 5961 int i;
e93f8a0f 5962 struct kvm_io_bus *new_bus, *bus;
090b7aff 5963
7c896d37
SC
5964 lockdep_assert_held(&kvm->slots_lock);
5965
4a12f951 5966 bus = kvm_get_bus(kvm, bus_idx);
df630b8c 5967 if (!bus)
5d3c4c79 5968 return 0;
df630b8c 5969
7c896d37 5970 for (i = 0; i < bus->dev_count; i++) {
a1300716 5971 if (bus->range[i].dev == dev) {
090b7aff
GH
5972 break;
5973 }
7c896d37 5974 }
e93f8a0f 5975
90db1043 5976 if (i == bus->dev_count)
5d3c4c79 5977 return 0;
a1300716 5978
90952cd3 5979 new_bus = kmalloc(struct_size(bus, range, bus->dev_count - 1),
b12ce36a 5980 GFP_KERNEL_ACCOUNT);
f6588660 5981 if (new_bus) {
871c433b 5982 memcpy(new_bus, bus, struct_size(bus, range, i));
f6588660
RK
5983 new_bus->dev_count--;
5984 memcpy(new_bus->range + i, bus->range + i + 1,
871c433b 5985 flex_array_size(new_bus, range, new_bus->dev_count - i));
2ee37574
SC
5986 }
5987
5988 rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
5989 synchronize_srcu_expedited(&kvm->srcu);
5990
5ea5ca3c
WW
5991 /*
5992 * If a NULL bus is installed, destroy the old bus, including all the
5993 * attached devices. Otherwise, destroy the caller's device only.
5994 */
2ee37574 5995 if (!new_bus) {
90db1043 5996 pr_err("kvm: failed to shrink bus, removing it completely\n");
5ea5ca3c
WW
5997 kvm_io_bus_destroy(bus);
5998 return -ENOMEM;
90db1043 5999 }
a1300716 6000
5ea5ca3c 6001 kvm_iodevice_destructor(dev);
e93f8a0f 6002 kfree(bus);
5ea5ca3c 6003 return 0;
2eeb2e94
GH
6004}
6005
8a39d006
AP
6006struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx,
6007 gpa_t addr)
6008{
6009 struct kvm_io_bus *bus;
6010 int dev_idx, srcu_idx;
6011 struct kvm_io_device *iodev = NULL;
6012
6013 srcu_idx = srcu_read_lock(&kvm->srcu);
6014
6015 bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
90db1043
DH
6016 if (!bus)
6017 goto out_unlock;
8a39d006
AP
6018
6019 dev_idx = kvm_io_bus_get_first_dev(bus, addr, 1);
6020 if (dev_idx < 0)
6021 goto out_unlock;
6022
6023 iodev = bus->range[dev_idx].dev;
6024
6025out_unlock:
6026 srcu_read_unlock(&kvm->srcu, srcu_idx);
6027
6028 return iodev;
6029}
6030EXPORT_SYMBOL_GPL(kvm_io_bus_get_dev);
6031
536a6f88
JF
6032static int kvm_debugfs_open(struct inode *inode, struct file *file,
6033 int (*get)(void *, u64 *), int (*set)(void *, u64),
6034 const char *fmt)
6035{
180418e2 6036 int ret;
14aa40a1 6037 struct kvm_stat_data *stat_data = inode->i_private;
536a6f88 6038
605c7130
PX
6039 /*
6040 * The debugfs files are a reference to the kvm struct which
6041 * is still valid when kvm_destroy_vm is called. kvm_get_kvm_safe
6042 * avoids the race between open and the removal of the debugfs directory.
536a6f88 6043 */
605c7130 6044 if (!kvm_get_kvm_safe(stat_data->kvm))
536a6f88
JF
6045 return -ENOENT;
6046
180418e2
HW
6047 ret = simple_attr_open(inode, file, get,
6048 kvm_stats_debugfs_mode(stat_data->desc) & 0222
6049 ? set : NULL, fmt);
6050 if (ret)
536a6f88 6051 kvm_put_kvm(stat_data->kvm);
536a6f88 6052
180418e2 6053 return ret;
536a6f88
JF
6054}
6055
6056static int kvm_debugfs_release(struct inode *inode, struct file *file)
6057{
14aa40a1 6058 struct kvm_stat_data *stat_data = inode->i_private;
536a6f88
JF
6059
6060 simple_attr_release(inode, file);
6061 kvm_put_kvm(stat_data->kvm);
6062
6063 return 0;
6064}
6065
09cbcef6 6066static int kvm_get_stat_per_vm(struct kvm *kvm, size_t offset, u64 *val)
536a6f88 6067{
bc9e9e67 6068 *val = *(u64 *)((void *)(&kvm->stat) + offset);
536a6f88 6069
09cbcef6
MP
6070 return 0;
6071}
6072
6073static int kvm_clear_stat_per_vm(struct kvm *kvm, size_t offset)
6074{
bc9e9e67 6075 *(u64 *)((void *)(&kvm->stat) + offset) = 0;
536a6f88
JF
6076
6077 return 0;
6078}
6079
09cbcef6 6080static int kvm_get_stat_per_vcpu(struct kvm *kvm, size_t offset, u64 *val)
ce35ef27 6081{
46808a4c 6082 unsigned long i;
09cbcef6 6083 struct kvm_vcpu *vcpu;
ce35ef27 6084
09cbcef6 6085 *val = 0;
ce35ef27 6086
09cbcef6 6087 kvm_for_each_vcpu(i, vcpu, kvm)
bc9e9e67 6088 *val += *(u64 *)((void *)(&vcpu->stat) + offset);
ce35ef27
SJS
6089
6090 return 0;
6091}
6092
09cbcef6 6093static int kvm_clear_stat_per_vcpu(struct kvm *kvm, size_t offset)
536a6f88 6094{
46808a4c 6095 unsigned long i;
09cbcef6 6096 struct kvm_vcpu *vcpu;
536a6f88 6097
09cbcef6 6098 kvm_for_each_vcpu(i, vcpu, kvm)
bc9e9e67 6099 *(u64 *)((void *)(&vcpu->stat) + offset) = 0;
09cbcef6
MP
6100
6101 return 0;
6102}
536a6f88 6103
09cbcef6 6104static int kvm_stat_data_get(void *data, u64 *val)
536a6f88 6105{
09cbcef6 6106 int r = -EFAULT;
14aa40a1 6107 struct kvm_stat_data *stat_data = data;
536a6f88 6108
bc9e9e67 6109 switch (stat_data->kind) {
09cbcef6
MP
6110 case KVM_STAT_VM:
6111 r = kvm_get_stat_per_vm(stat_data->kvm,
bc9e9e67 6112 stat_data->desc->desc.offset, val);
09cbcef6
MP
6113 break;
6114 case KVM_STAT_VCPU:
6115 r = kvm_get_stat_per_vcpu(stat_data->kvm,
bc9e9e67 6116 stat_data->desc->desc.offset, val);
09cbcef6
MP
6117 break;
6118 }
536a6f88 6119
09cbcef6 6120 return r;
536a6f88
JF
6121}
6122
09cbcef6 6123static int kvm_stat_data_clear(void *data, u64 val)
ce35ef27 6124{
09cbcef6 6125 int r = -EFAULT;
14aa40a1 6126 struct kvm_stat_data *stat_data = data;
ce35ef27
SJS
6127
6128 if (val)
6129 return -EINVAL;
6130
bc9e9e67 6131 switch (stat_data->kind) {
09cbcef6
MP
6132 case KVM_STAT_VM:
6133 r = kvm_clear_stat_per_vm(stat_data->kvm,
bc9e9e67 6134 stat_data->desc->desc.offset);
09cbcef6
MP
6135 break;
6136 case KVM_STAT_VCPU:
6137 r = kvm_clear_stat_per_vcpu(stat_data->kvm,
bc9e9e67 6138 stat_data->desc->desc.offset);
09cbcef6
MP
6139 break;
6140 }
ce35ef27 6141
09cbcef6 6142 return r;
ce35ef27
SJS
6143}
6144
09cbcef6 6145static int kvm_stat_data_open(struct inode *inode, struct file *file)
536a6f88
JF
6146{
6147 __simple_attr_check_format("%llu\n", 0ull);
09cbcef6
MP
6148 return kvm_debugfs_open(inode, file, kvm_stat_data_get,
6149 kvm_stat_data_clear, "%llu\n");
536a6f88
JF
6150}
6151
09cbcef6
MP
6152static const struct file_operations stat_fops_per_vm = {
6153 .owner = THIS_MODULE,
6154 .open = kvm_stat_data_open,
536a6f88 6155 .release = kvm_debugfs_release,
09cbcef6
MP
6156 .read = simple_attr_read,
6157 .write = simple_attr_write,
6158 .llseek = no_llseek,
536a6f88
JF
6159};
6160
8b88b099 6161static int vm_stat_get(void *_offset, u64 *val)
ba1389b7
AK
6162{
6163 unsigned offset = (long)_offset;
ba1389b7 6164 struct kvm *kvm;
536a6f88 6165 u64 tmp_val;
ba1389b7 6166
8b88b099 6167 *val = 0;
0d9ce162 6168 mutex_lock(&kvm_lock);
536a6f88 6169 list_for_each_entry(kvm, &vm_list, vm_list) {
09cbcef6 6170 kvm_get_stat_per_vm(kvm, offset, &tmp_val);
536a6f88
JF
6171 *val += tmp_val;
6172 }
0d9ce162 6173 mutex_unlock(&kvm_lock);
8b88b099 6174 return 0;
ba1389b7
AK
6175}
6176
ce35ef27
SJS
6177static int vm_stat_clear(void *_offset, u64 val)
6178{
6179 unsigned offset = (long)_offset;
6180 struct kvm *kvm;
ce35ef27
SJS
6181
6182 if (val)
6183 return -EINVAL;
6184
0d9ce162 6185 mutex_lock(&kvm_lock);
ce35ef27 6186 list_for_each_entry(kvm, &vm_list, vm_list) {
09cbcef6 6187 kvm_clear_stat_per_vm(kvm, offset);
ce35ef27 6188 }
0d9ce162 6189 mutex_unlock(&kvm_lock);
ce35ef27
SJS
6190
6191 return 0;
6192}
6193
6194DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, vm_stat_clear, "%llu\n");
bc9e9e67 6195DEFINE_SIMPLE_ATTRIBUTE(vm_stat_readonly_fops, vm_stat_get, NULL, "%llu\n");
ba1389b7 6196
8b88b099 6197static int vcpu_stat_get(void *_offset, u64 *val)
1165f5fe
AK
6198{
6199 unsigned offset = (long)_offset;
1165f5fe 6200 struct kvm *kvm;
536a6f88 6201 u64 tmp_val;
1165f5fe 6202
8b88b099 6203 *val = 0;
0d9ce162 6204 mutex_lock(&kvm_lock);
536a6f88 6205 list_for_each_entry(kvm, &vm_list, vm_list) {
09cbcef6 6206 kvm_get_stat_per_vcpu(kvm, offset, &tmp_val);
536a6f88
JF
6207 *val += tmp_val;
6208 }
0d9ce162 6209 mutex_unlock(&kvm_lock);
8b88b099 6210 return 0;
1165f5fe
AK
6211}
6212
ce35ef27
SJS
6213static int vcpu_stat_clear(void *_offset, u64 val)
6214{
6215 unsigned offset = (long)_offset;
6216 struct kvm *kvm;
ce35ef27
SJS
6217
6218 if (val)
6219 return -EINVAL;
6220
0d9ce162 6221 mutex_lock(&kvm_lock);
ce35ef27 6222 list_for_each_entry(kvm, &vm_list, vm_list) {
09cbcef6 6223 kvm_clear_stat_per_vcpu(kvm, offset);
ce35ef27 6224 }
0d9ce162 6225 mutex_unlock(&kvm_lock);
ce35ef27
SJS
6226
6227 return 0;
6228}
6229
6230DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, vcpu_stat_clear,
6231 "%llu\n");
bc9e9e67 6232DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_readonly_fops, vcpu_stat_get, NULL, "%llu\n");
1165f5fe 6233
286de8f6
CI
6234static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm)
6235{
6236 struct kobj_uevent_env *env;
286de8f6
CI
6237 unsigned long long created, active;
6238
6239 if (!kvm_dev.this_device || !kvm)
6240 return;
6241
0d9ce162 6242 mutex_lock(&kvm_lock);
286de8f6
CI
6243 if (type == KVM_EVENT_CREATE_VM) {
6244 kvm_createvm_count++;
6245 kvm_active_vms++;
6246 } else if (type == KVM_EVENT_DESTROY_VM) {
6247 kvm_active_vms--;
6248 }
6249 created = kvm_createvm_count;
6250 active = kvm_active_vms;
0d9ce162 6251 mutex_unlock(&kvm_lock);
286de8f6 6252
b12ce36a 6253 env = kzalloc(sizeof(*env), GFP_KERNEL_ACCOUNT);
286de8f6
CI
6254 if (!env)
6255 return;
6256
6257 add_uevent_var(env, "CREATED=%llu", created);
6258 add_uevent_var(env, "COUNT=%llu", active);
6259
fdeaf7e3 6260 if (type == KVM_EVENT_CREATE_VM) {
286de8f6 6261 add_uevent_var(env, "EVENT=create");
fdeaf7e3
CI
6262 kvm->userspace_pid = task_pid_nr(current);
6263 } else if (type == KVM_EVENT_DESTROY_VM) {
286de8f6 6264 add_uevent_var(env, "EVENT=destroy");
fdeaf7e3
CI
6265 }
6266 add_uevent_var(env, "PID=%d", kvm->userspace_pid);
286de8f6 6267
a44a4cc1 6268 if (!IS_ERR(kvm->debugfs_dentry)) {
b12ce36a 6269 char *tmp, *p = kmalloc(PATH_MAX, GFP_KERNEL_ACCOUNT);
fdeaf7e3
CI
6270
6271 if (p) {
6272 tmp = dentry_path_raw(kvm->debugfs_dentry, p, PATH_MAX);
6273 if (!IS_ERR(tmp))
6274 add_uevent_var(env, "STATS_PATH=%s", tmp);
6275 kfree(p);
286de8f6
CI
6276 }
6277 }
6278 /* no need for checks, since we add at most 5 keys */
6279 env->envp[env->envp_idx++] = NULL;
6280 kobject_uevent_env(&kvm_dev.this_device->kobj, KOBJ_CHANGE, env->envp);
6281 kfree(env);
286de8f6
CI
6282}
6283
929f45e3 6284static void kvm_init_debug(void)
6aa8b732 6285{
bc9e9e67
JZ
6286 const struct file_operations *fops;
6287 const struct _kvm_stats_desc *pdesc;
6288 int i;
6aa8b732 6289
76f7c879 6290 kvm_debugfs_dir = debugfs_create_dir("kvm", NULL);
4f69b680 6291
bc9e9e67
JZ
6292 for (i = 0; i < kvm_vm_stats_header.num_desc; ++i) {
6293 pdesc = &kvm_vm_stats_desc[i];
6294 if (kvm_stats_debugfs_mode(pdesc) & 0222)
6295 fops = &vm_stat_fops;
6296 else
6297 fops = &vm_stat_readonly_fops;
6298 debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
6299 kvm_debugfs_dir,
6300 (void *)(long)pdesc->desc.offset, fops);
6301 }
6302
6303 for (i = 0; i < kvm_vcpu_stats_header.num_desc; ++i) {
6304 pdesc = &kvm_vcpu_stats_desc[i];
6305 if (kvm_stats_debugfs_mode(pdesc) & 0222)
6306 fops = &vcpu_stat_fops;
6307 else
6308 fops = &vcpu_stat_readonly_fops;
6309 debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
6310 kvm_debugfs_dir,
6311 (void *)(long)pdesc->desc.offset, fops);
4f69b680 6312 }
6aa8b732
AK
6313}
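/*
 * Illustrative userspace sketch (not part of this file): the files
 * created by kvm_init_debug() above appear under /sys/kernel/debug/kvm/
 * and each hold a single decimal value. The stat name used here
 * ("halt_exits") is arch-dependent and only an example; writable stats
 * can be cleared by writing 0 to them.
 */
#include <stdio.h>

int read_kvm_debugfs_stat_example(void)
{
	unsigned long long val;
	FILE *f = fopen("/sys/kernel/debug/kvm/halt_exits", "r");

	if (!f)
		return -1;
	if (fscanf(f, "%llu", &val) == 1)
		printf("halt_exits: %llu\n", val);
	fclose(f);
	return 0;
}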
6314
15ad7146
AK
6315static inline
6316struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
6317{
6318 return container_of(pn, struct kvm_vcpu, preempt_notifier);
6319}
6320
6321static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
6322{
6323 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
f95ef0cd 6324
046ddeed 6325 WRITE_ONCE(vcpu->preempted, false);
d73eb57b 6326 WRITE_ONCE(vcpu->ready, false);
15ad7146 6327
7495e22b 6328 __this_cpu_write(kvm_running_vcpu, vcpu);
e790d9ef 6329 kvm_arch_sched_in(vcpu, cpu);
e9b11c17 6330 kvm_arch_vcpu_load(vcpu, cpu);
15ad7146
AK
6331}
6332
6333static void kvm_sched_out(struct preempt_notifier *pn,
6334 struct task_struct *next)
6335{
6336 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
6337
3ba9f93b 6338 if (current->on_rq) {
046ddeed 6339 WRITE_ONCE(vcpu->preempted, true);
d73eb57b
WL
6340 WRITE_ONCE(vcpu->ready, true);
6341 }
e9b11c17 6342 kvm_arch_vcpu_put(vcpu);
7495e22b
PB
6343 __this_cpu_write(kvm_running_vcpu, NULL);
6344}
6345
6346/**
6347 * kvm_get_running_vcpu - get the vcpu running on the current CPU.
1f03b2bc
MZ
6348 *
6349 * We can disable preemption locally around accessing the per-CPU variable,
6350 * and use the resolved vcpu pointer after enabling preemption again,
6351 * because even if the current thread is migrated to another CPU, reading
6352 * the per-CPU value later will give us the same value as we update the
6353 * per-CPU variable in the preempt notifier handlers.
7495e22b
PB
6354 */
6355struct kvm_vcpu *kvm_get_running_vcpu(void)
6356{
1f03b2bc
MZ
6357 struct kvm_vcpu *vcpu;
6358
6359 preempt_disable();
6360 vcpu = __this_cpu_read(kvm_running_vcpu);
6361 preempt_enable();
6362
6363 return vcpu;
7495e22b 6364}
379a3c8e 6365EXPORT_SYMBOL_GPL(kvm_get_running_vcpu);
7495e22b
PB
6366
6367/**
6368 * kvm_get_running_vcpus - get the per-CPU array of currently running vcpus.
6369 */
6370struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void)
6371{
6372 return &kvm_running_vcpu;
15ad7146
AK
6373}
6374
e1bfc245
SC
6375#ifdef CONFIG_GUEST_PERF_EVENTS
6376static unsigned int kvm_guest_state(void)
6377{
6378 struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
6379 unsigned int state;
6380
6381 if (!kvm_arch_pmi_in_guest(vcpu))
6382 return 0;
6383
6384 state = PERF_GUEST_ACTIVE;
6385 if (!kvm_arch_vcpu_in_kernel(vcpu))
6386 state |= PERF_GUEST_USER;
6387
6388 return state;
6389}
6390
6391static unsigned long kvm_guest_get_ip(void)
6392{
6393 struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
6394
6395 /* Retrieving the IP must be guarded by a call to kvm_guest_state(). */
6396 if (WARN_ON_ONCE(!kvm_arch_pmi_in_guest(vcpu)))
6397 return 0;
6398
6399 return kvm_arch_vcpu_get_ip(vcpu);
6400}
6401
6402static struct perf_guest_info_callbacks kvm_guest_cbs = {
6403 .state = kvm_guest_state,
6404 .get_ip = kvm_guest_get_ip,
6405 .handle_intel_pt_intr = NULL,
6406};
6407
6408void kvm_register_perf_callbacks(unsigned int (*pt_intr_handler)(void))
6409{
6410 kvm_guest_cbs.handle_intel_pt_intr = pt_intr_handler;
6411 perf_register_guest_info_callbacks(&kvm_guest_cbs);
6412}
6413void kvm_unregister_perf_callbacks(void)
6414{
6415 perf_unregister_guest_info_callbacks(&kvm_guest_cbs);
6416}
6417#endif
6418
81a1cf9f 6419int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
f257d6dc 6420{
6aa8b732 6421 int r;
002c7f7c 6422 int cpu;
6aa8b732 6423
441f7bfa 6424#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
aaf12a7b
CG
6425 r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_ONLINE, "kvm/cpu:online",
6426 kvm_online_cpu, kvm_offline_cpu);
774c47f1 6427 if (r)
37d25881
SC
6428 return r;
6429
35774a9f 6430 register_syscore_ops(&kvm_syscore_ops);
441f7bfa 6431#endif
6aa8b732 6432
c16f862d 6433 /* A kmem cache lets us meet the alignment requirements of fx_save. */
0ee75bea
AK
6434 if (!vcpu_align)
6435 vcpu_align = __alignof__(struct kvm_vcpu);
46515736
PB
6436 kvm_vcpu_cache =
6437 kmem_cache_create_usercopy("kvm_vcpu", vcpu_size, vcpu_align,
6438 SLAB_ACCOUNT,
6439 offsetof(struct kvm_vcpu, arch),
ce55c049
JZ
6440 offsetofend(struct kvm_vcpu, stats_id)
6441 - offsetof(struct kvm_vcpu, arch),
46515736 6442 NULL);
c16f862d
RR
6443 if (!kvm_vcpu_cache) {
6444 r = -ENOMEM;
9f1a4c00 6445 goto err_vcpu_cache;
c16f862d
RR
6446 }
6447
baff59cc
VK
6448 for_each_possible_cpu(cpu) {
6449 if (!alloc_cpumask_var_node(&per_cpu(cpu_kick_mask, cpu),
6450 GFP_KERNEL, cpu_to_node(cpu))) {
6451 r = -ENOMEM;
9f1a4c00 6452 goto err_cpu_kick_mask;
baff59cc
VK
6453 }
6454 }
6455
5910ccf0
SC
6456 r = kvm_irqfd_init();
6457 if (r)
6458 goto err_irqfd;
6459
af585b92
GN
6460 r = kvm_async_pf_init();
6461 if (r)
5910ccf0 6462 goto err_async_pf;
af585b92 6463
6aa8b732 6464 kvm_chardev_ops.owner = module;
087e1520
SC
6465 kvm_vm_fops.owner = module;
6466 kvm_vcpu_fops.owner = module;
6467 kvm_device_fops.owner = module;
6aa8b732 6468
15ad7146
AK
6469 kvm_preempt_ops.sched_in = kvm_sched_in;
6470 kvm_preempt_ops.sched_out = kvm_sched_out;
6471
929f45e3 6472 kvm_init_debug();
0ea4ed8e 6473
3c3c29fd 6474 r = kvm_vfio_ops_init();
2b012812
SC
6475 if (WARN_ON_ONCE(r))
6476 goto err_vfio;
6477
a7800aa8
SC
6478 kvm_gmem_init(module);
6479
2b012812
SC
6480 /*
6481 * Registration _must_ be the very last thing done, as this exposes
6482 * /dev/kvm to userspace, i.e. all infrastructure must be setup!
6483 */
6484 r = misc_register(&kvm_dev);
6485 if (r) {
6486 pr_err("kvm: misc device register failed\n");
6487 goto err_register;
6488 }
3c3c29fd 6489
c7addb90 6490 return 0;
6aa8b732 6491
2b012812
SC
6492err_register:
6493 kvm_vfio_ops_exit();
6494err_vfio:
af585b92 6495 kvm_async_pf_deinit();
5910ccf0
SC
6496err_async_pf:
6497 kvm_irqfd_exit();
6498err_irqfd:
9f1a4c00 6499err_cpu_kick_mask:
baff59cc
VK
6500 for_each_possible_cpu(cpu)
6501 free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
c16f862d 6502 kmem_cache_destroy(kvm_vcpu_cache);
9f1a4c00 6503err_vcpu_cache:
441f7bfa 6504#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
35774a9f 6505 unregister_syscore_ops(&kvm_syscore_ops);
aaf12a7b 6506 cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE);
441f7bfa 6507#endif
6aa8b732
AK
6508 return r;
6509}
cb498ea2 6510EXPORT_SYMBOL_GPL(kvm_init);
6aa8b732 6511
cb498ea2 6512void kvm_exit(void)
6aa8b732 6513{
baff59cc
VK
6514 int cpu;
6515
2b012812
SC
6516 /*
6517 * Note, unregistering /dev/kvm doesn't strictly need to come first,
6518 * fops_get(), a.k.a. try_module_get(), prevents acquiring references
6519 * to KVM while the module is being stopped.
6520 */
6aa8b732 6521 misc_deregister(&kvm_dev);
2b012812
SC
6522
6523 debugfs_remove_recursive(kvm_debugfs_dir);
baff59cc
VK
6524 for_each_possible_cpu(cpu)
6525 free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
c16f862d 6526 kmem_cache_destroy(kvm_vcpu_cache);
73b8dc04 6527 kvm_vfio_ops_exit();
af585b92 6528 kvm_async_pf_deinit();
441f7bfa 6529#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
fb3600cc 6530 unregister_syscore_ops(&kvm_syscore_ops);
aaf12a7b 6531 cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE);
441f7bfa 6532#endif
5910ccf0 6533 kvm_irqfd_exit();
6aa8b732 6534}
cb498ea2 6535EXPORT_SYMBOL_GPL(kvm_exit);
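/*
 * Illustrative sketch (not part of this file): how an architecture
 * module typically pairs kvm_init() and kvm_exit(). "struct my_vcpu"
 * and the setup/teardown placeholders are hypothetical stand-ins for
 * real arch code (e.g. vmx_init()/svm_init() on x86), which must finish
 * its own setup before calling kvm_init(), since that call exposes
 * /dev/kvm to userspace.
 */
#include <linux/module.h>
#include <linux/kvm_host.h>

struct my_vcpu {
	struct kvm_vcpu vcpu;	/* embedded first, as arch vCPU structs do */
	/* arch-private state ... */
};

static int __init my_kvm_init(void)
{
	/* arch-specific hardware setup would run here */
	return kvm_init(sizeof(struct my_vcpu), __alignof__(struct my_vcpu),
			THIS_MODULE);
}

static void __exit my_kvm_exit(void)
{
	kvm_exit();
	/* arch-specific teardown would run here */
}

module_init(my_kvm_init);
module_exit(my_kvm_exit);
MODULE_LICENSE("GPL");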
c57c8046
JS
6536
6537struct kvm_vm_worker_thread_context {
6538 struct kvm *kvm;
6539 struct task_struct *parent;
6540 struct completion init_done;
6541 kvm_vm_thread_fn_t thread_fn;
6542 uintptr_t data;
6543 int err;
6544};
6545
6546static int kvm_vm_worker_thread(void *context)
6547{
6548 /*
6549 * The init_context is allocated on the stack of the parent thread, so
6550 * we have to locally copy anything that is needed beyond initialization.
6551 */
6552 struct kvm_vm_worker_thread_context *init_context = context;
e45cce30 6553 struct task_struct *parent;
c57c8046
JS
6554 struct kvm *kvm = init_context->kvm;
6555 kvm_vm_thread_fn_t thread_fn = init_context->thread_fn;
6556 uintptr_t data = init_context->data;
6557 int err;
6558
6559 err = kthread_park(current);
6560 /* kthread_park(current) is never supposed to return an error */
6561 WARN_ON(err != 0);
6562 if (err)
6563 goto init_complete;
6564
6565 err = cgroup_attach_task_all(init_context->parent, current);
6566 if (err) {
6567 kvm_err("%s: cgroup_attach_task_all failed with err %d\n",
6568 __func__, err);
6569 goto init_complete;
6570 }
6571
6572 set_user_nice(current, task_nice(init_context->parent));
6573
6574init_complete:
6575 init_context->err = err;
6576 complete(&init_context->init_done);
6577 init_context = NULL;
6578
6579 if (err)
e45cce30 6580 goto out;
c57c8046
JS
6581
6582 /* Wait to be woken up by the spawner before proceeding. */
6583 kthread_parkme();
6584
6585 if (!kthread_should_stop())
6586 err = thread_fn(kvm, data);
6587
e45cce30
VS
6588out:
6589 /*
6590 * Move kthread back to its original cgroup to prevent it lingering in
6591 * the cgroup of the VM process, after the latter finishes its
6592 * execution.
6593 *
6594 * kthread_stop() waits on the 'exited' completion condition which is
6595 * set in exit_mm(), via mm_release(), in do_exit(). However, the
6596 * kthread is removed from the cgroup in the cgroup_exit() which is
6597 * called after the exit_mm(). This causes the kthread_stop() to return
6598 * before the kthread actually quits the cgroup.
6599 */
6600 rcu_read_lock();
6601 parent = rcu_dereference(current->real_parent);
6602 get_task_struct(parent);
6603 rcu_read_unlock();
6604 cgroup_attach_task_all(parent, current);
6605 put_task_struct(parent);
6606
c57c8046
JS
6607 return err;
6608}
6609
6610int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn,
6611 uintptr_t data, const char *name,
6612 struct task_struct **thread_ptr)
6613{
6614 struct kvm_vm_worker_thread_context init_context = {};
6615 struct task_struct *thread;
6616
6617 *thread_ptr = NULL;
6618 init_context.kvm = kvm;
6619 init_context.parent = current;
6620 init_context.thread_fn = thread_fn;
6621 init_context.data = data;
6622 init_completion(&init_context.init_done);
6623
6624 thread = kthread_run(kvm_vm_worker_thread, &init_context,
6625 "%s-%d", name, task_pid_nr(current));
6626 if (IS_ERR(thread))
6627 return PTR_ERR(thread);
6628
6629 /* kthread_run is never supposed to return NULL */
6630 WARN_ON(thread == NULL);
6631
6632 wait_for_completion(&init_context.init_done);
6633
6634 if (!init_context.err)
6635 *thread_ptr = thread;
6636
6637 return init_context.err;
6638}
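/*
 * Illustrative sketch (not part of this file): spawning a per-VM
 * housekeeping thread with kvm_vm_create_worker_thread(). The worker
 * parks itself after initialization, so the caller unparks it when the
 * thread function should start running (this mirrors how x86 drives its
 * NX huge page recovery worker). The names used here are hypothetical.
 */
#include <linux/kthread.h>
#include <linux/kvm_host.h>

static int demo_worker_fn(struct kvm *kvm, uintptr_t data)
{
	/* runs until the thread is stopped when the VM goes away */
	return 0;
}

static int demo_start_worker(struct kvm *kvm, struct task_struct **out)
{
	int err;

	err = kvm_vm_create_worker_thread(kvm, demo_worker_fn, 0,
					  "kvm-demo-worker", out);
	if (err)
		return err;

	kthread_unpark(*out);	/* let the parked worker start running */
	return 0;
}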