/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Avi Kivity   <avi@qumranet.com>
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */

#include "iodev.h"

#include <linux/kvm_host.h>
#include <linux/kvm.h>
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/percpu.h>
#include <linux/mm.h>
#include <linux/miscdevice.h>
#include <linux/vmalloc.h>
#include <linux/reboot.h>
#include <linux/debugfs.h>
#include <linux/highmem.h>
#include <linux/file.h>
#include <linux/sysdev.h>
#include <linux/cpu.h>
#include <linux/sched.h>
#include <linux/cpumask.h>
#include <linux/smp.h>
#include <linux/anon_inodes.h>
#include <linux/profile.h>
#include <linux/kvm_para.h>
#include <linux/pagemap.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/bitops.h>
#include <linux/spinlock.h>
#include <linux/compat.h>
#include <linux/srcu.h>
#include <linux/hugetlb.h>
#include <linux/slab.h>

#include <asm/processor.h>
#include <asm/io.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm-generic/bitops/le.h>

#include "coalesced_mmio.h"
#include "async_pf.h"

#define CREATE_TRACE_POINTS
#include <trace/events/kvm.h>

MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");

/*
 * Ordering of locks:
 *
 *	kvm->lock --> kvm->slots_lock --> kvm->irq_lock
 */

DEFINE_SPINLOCK(kvm_lock);
LIST_HEAD(vm_list);

static cpumask_var_t cpus_hardware_enabled;
static int kvm_usage_count = 0;
static atomic_t hardware_enable_failed;

struct kmem_cache *kvm_vcpu_cache;
EXPORT_SYMBOL_GPL(kvm_vcpu_cache);

static __read_mostly struct preempt_ops kvm_preempt_ops;

struct dentry *kvm_debugfs_dir;

static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
			   unsigned long arg);
static int hardware_enable_all(void);
static void hardware_disable_all(void);

static void kvm_io_bus_destroy(struct kvm_io_bus *bus);

static bool kvm_rebooting;

static bool largepages_enabled = true;

static struct page *hwpoison_page;
static pfn_t hwpoison_pfn;

static struct page *fault_page;
static pfn_t fault_pfn;

inline int kvm_is_mmio_pfn(pfn_t pfn)
{
	if (pfn_valid(pfn)) {
		struct page *page = compound_head(pfn_to_page(pfn));
		return PageReserved(page);
	}

	return true;
}

/*
 * Switches to specified vcpu, until a matching vcpu_put()
 */
void vcpu_load(struct kvm_vcpu *vcpu)
{
	int cpu;

	mutex_lock(&vcpu->mutex);
	cpu = get_cpu();
	preempt_notifier_register(&vcpu->preempt_notifier);
	kvm_arch_vcpu_load(vcpu, cpu);
	put_cpu();
}

void vcpu_put(struct kvm_vcpu *vcpu)
{
	preempt_disable();
	kvm_arch_vcpu_put(vcpu);
	preempt_notifier_unregister(&vcpu->preempt_notifier);
	preempt_enable();
	mutex_unlock(&vcpu->mutex);
}

static void ack_flush(void *_completed)
{
}

static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
{
	int i, cpu, me;
	cpumask_var_t cpus;
	bool called = true;
	struct kvm_vcpu *vcpu;

	zalloc_cpumask_var(&cpus, GFP_ATOMIC);

	raw_spin_lock(&kvm->requests_lock);
	me = smp_processor_id();
	kvm_for_each_vcpu(i, vcpu, kvm) {
		if (kvm_make_check_request(req, vcpu))
			continue;
		cpu = vcpu->cpu;
		if (cpus != NULL && cpu != -1 && cpu != me)
			cpumask_set_cpu(cpu, cpus);
	}
	if (unlikely(cpus == NULL))
		smp_call_function_many(cpu_online_mask, ack_flush, NULL, 1);
	else if (!cpumask_empty(cpus))
		smp_call_function_many(cpus, ack_flush, NULL, 1);
	else
		called = false;
	raw_spin_unlock(&kvm->requests_lock);
	free_cpumask_var(cpus);
	return called;
}

void kvm_flush_remote_tlbs(struct kvm *kvm)
{
	int dirty_count = kvm->tlbs_dirty;

	smp_mb();
	if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
		++kvm->stat.remote_tlb_flush;
	cmpxchg(&kvm->tlbs_dirty, dirty_count, 0);
}

void kvm_reload_remote_mmus(struct kvm *kvm)
{
	make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);
}

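/*
 * Illustrative sketch, not part of the original file: make_all_cpus_request()
 * only sets a request bit and IPIs the remote CPUs; each vCPU consumes the
 * bit the next time it enters the guest.  On x86 the consumer sits in
 * vcpu_enter_guest() and looks roughly like the helper below -- the
 * kvm_x86_ops->tlb_flush hook and kvm_mmu_unload() are assumed x86-side
 * names; other architectures differ.
 */
static void example_service_vcpu_requests(struct kvm_vcpu *vcpu)
{
	/* kvm_check_request() clears the bit and says whether it was set. */
	if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
		kvm_x86_ops->tlb_flush(vcpu);	/* flush this vCPU's TLB */
	if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
		kvm_mmu_unload(vcpu);		/* drop the shadow page roots */
}
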
fb3f0f51
RR
184int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
185{
186 struct page *page;
187 int r;
188
189 mutex_init(&vcpu->mutex);
190 vcpu->cpu = -1;
fb3f0f51
RR
191 vcpu->kvm = kvm;
192 vcpu->vcpu_id = id;
b6958ce4 193 init_waitqueue_head(&vcpu->wq);
af585b92 194 kvm_async_pf_vcpu_init(vcpu);
fb3f0f51
RR
195
196 page = alloc_page(GFP_KERNEL | __GFP_ZERO);
197 if (!page) {
198 r = -ENOMEM;
199 goto fail;
200 }
201 vcpu->run = page_address(page);
202
e9b11c17 203 r = kvm_arch_vcpu_init(vcpu);
fb3f0f51 204 if (r < 0)
e9b11c17 205 goto fail_free_run;
fb3f0f51
RR
206 return 0;
207
fb3f0f51
RR
208fail_free_run:
209 free_page((unsigned long)vcpu->run);
210fail:
76fafa5e 211 return r;
fb3f0f51
RR
212}
213EXPORT_SYMBOL_GPL(kvm_vcpu_init);
214
215void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
216{
e9b11c17 217 kvm_arch_vcpu_uninit(vcpu);
fb3f0f51
RR
218 free_page((unsigned long)vcpu->run);
219}
220EXPORT_SYMBOL_GPL(kvm_vcpu_uninit);
221
#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
{
	return container_of(mn, struct kvm, mmu_notifier);
}

static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
					     struct mm_struct *mm,
					     unsigned long address)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int need_tlb_flush, idx;

	/*
	 * When ->invalidate_page runs, the linux pte has been zapped
	 * already but the page is still allocated until
	 * ->invalidate_page returns.  So if we increase the sequence
	 * here the kvm page fault will notice if the spte can't be
	 * established because the page is going to be freed.  If
	 * instead the kvm page fault establishes the spte before
	 * ->invalidate_page runs, kvm_unmap_hva will release it
	 * before returning.
	 *
	 * The sequence increase only needs to be seen at spin_unlock
	 * time, and not at spin_lock time.
	 *
	 * Increasing the sequence after the spin_unlock would be
	 * unsafe because the kvm page fault could then establish the
	 * pte after kvm_unmap_hva returned, without noticing the page
	 * is going to be freed.
	 */
	idx = srcu_read_lock(&kvm->srcu);
	spin_lock(&kvm->mmu_lock);
	kvm->mmu_notifier_seq++;
	need_tlb_flush = kvm_unmap_hva(kvm, address) | kvm->tlbs_dirty;
	spin_unlock(&kvm->mmu_lock);
	srcu_read_unlock(&kvm->srcu, idx);

	/* we have to flush the tlb before the pages can be freed */
	if (need_tlb_flush)
		kvm_flush_remote_tlbs(kvm);

}
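
/*
 * Illustrative sketch, not part of the original file: the page-fault side
 * pairs with the mmu_notifier_seq/mmu_notifier_count updates above.  It
 * samples mmu_notifier_seq before resolving hva->pfn, then re-checks under
 * mmu_lock before installing the spte, along the lines of the
 * mmu_notifier_retry() helper in kvm_host.h (the shape below is an
 * approximation of that helper, not a copy).
 */
static inline int example_mmu_notifier_retry(struct kvm *kvm,
					     unsigned long mmu_seq)
{
	/* Caller holds kvm->mmu_lock. */
	if (unlikely(kvm->mmu_notifier_count))
		return 1;	/* an invalidation is still in progress */
	if (kvm->mmu_notifier_seq != mmu_seq)
		return 1;	/* a page was invalidated since the sample */
	return 0;
}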
265
3da0dd43
IE
266static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
267 struct mm_struct *mm,
268 unsigned long address,
269 pte_t pte)
270{
271 struct kvm *kvm = mmu_notifier_to_kvm(mn);
bc6678a3 272 int idx;
3da0dd43 273
bc6678a3 274 idx = srcu_read_lock(&kvm->srcu);
3da0dd43
IE
275 spin_lock(&kvm->mmu_lock);
276 kvm->mmu_notifier_seq++;
277 kvm_set_spte_hva(kvm, address, pte);
278 spin_unlock(&kvm->mmu_lock);
bc6678a3 279 srcu_read_unlock(&kvm->srcu, idx);
3da0dd43
IE
280}
281
e930bffe
AA
282static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
283 struct mm_struct *mm,
284 unsigned long start,
285 unsigned long end)
286{
287 struct kvm *kvm = mmu_notifier_to_kvm(mn);
bc6678a3 288 int need_tlb_flush = 0, idx;
e930bffe 289
bc6678a3 290 idx = srcu_read_lock(&kvm->srcu);
e930bffe
AA
291 spin_lock(&kvm->mmu_lock);
292 /*
293 * The count increase must become visible at unlock time as no
294 * spte can be established without taking the mmu_lock and
295 * count is also read inside the mmu_lock critical section.
296 */
297 kvm->mmu_notifier_count++;
298 for (; start < end; start += PAGE_SIZE)
299 need_tlb_flush |= kvm_unmap_hva(kvm, start);
a4ee1ca4 300 need_tlb_flush |= kvm->tlbs_dirty;
e930bffe 301 spin_unlock(&kvm->mmu_lock);
bc6678a3 302 srcu_read_unlock(&kvm->srcu, idx);
e930bffe
AA
303
	/* we have to flush the tlb before the pages can be freed */
305 if (need_tlb_flush)
306 kvm_flush_remote_tlbs(kvm);
307}
308
static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
						  struct mm_struct *mm,
						  unsigned long start,
						  unsigned long end)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);

	spin_lock(&kvm->mmu_lock);
	/*
	 * This sequence increase will notify the kvm page fault that
	 * the page that is going to be mapped in the spte could have
	 * been freed.
	 */
	kvm->mmu_notifier_seq++;
	/*
	 * The above sequence increase must be visible before the
	 * below count decrease, but both values are read by the kvm
	 * page fault under the mmu_lock spinlock, so we don't need to
	 * add an smp_wmb() here in between the two.
	 */
	kvm->mmu_notifier_count--;
	spin_unlock(&kvm->mmu_lock);

	BUG_ON(kvm->mmu_notifier_count < 0);
}
334
335static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
336 struct mm_struct *mm,
337 unsigned long address)
338{
339 struct kvm *kvm = mmu_notifier_to_kvm(mn);
bc6678a3 340 int young, idx;
e930bffe 341
bc6678a3 342 idx = srcu_read_lock(&kvm->srcu);
e930bffe
AA
343 spin_lock(&kvm->mmu_lock);
344 young = kvm_age_hva(kvm, address);
345 spin_unlock(&kvm->mmu_lock);
bc6678a3 346 srcu_read_unlock(&kvm->srcu, idx);
e930bffe
AA
347
348 if (young)
349 kvm_flush_remote_tlbs(kvm);
350
351 return young;
352}
353
85db06e5
MT
354static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
355 struct mm_struct *mm)
356{
357 struct kvm *kvm = mmu_notifier_to_kvm(mn);
eda2beda
LJ
358 int idx;
359
360 idx = srcu_read_lock(&kvm->srcu);
85db06e5 361 kvm_arch_flush_shadow(kvm);
eda2beda 362 srcu_read_unlock(&kvm->srcu, idx);
85db06e5
MT
363}
364
e930bffe
AA
365static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
366 .invalidate_page = kvm_mmu_notifier_invalidate_page,
367 .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
368 .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end,
369 .clear_flush_young = kvm_mmu_notifier_clear_flush_young,
3da0dd43 370 .change_pte = kvm_mmu_notifier_change_pte,
85db06e5 371 .release = kvm_mmu_notifier_release,
e930bffe 372};
4c07b0a4
AK
373
374static int kvm_init_mmu_notifier(struct kvm *kvm)
375{
376 kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
377 return mmu_notifier_register(&kvm->mmu_notifier, current->mm);
378}
379
380#else /* !(CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER) */
381
382static int kvm_init_mmu_notifier(struct kvm *kvm)
383{
384 return 0;
385}
386
e930bffe
AA
387#endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */
388
f17abe9a 389static struct kvm *kvm_create_vm(void)
6aa8b732 390{
d89f5eff
JK
391 int r, i;
392 struct kvm *kvm = kvm_arch_alloc_vm();
6aa8b732 393
d89f5eff
JK
394 if (!kvm)
395 return ERR_PTR(-ENOMEM);
396
397 r = kvm_arch_init_vm(kvm);
398 if (r)
399 goto out_err_nodisable;
10474ae8
AG
400
401 r = hardware_enable_all();
402 if (r)
403 goto out_err_nodisable;
404
75858a84
AK
405#ifdef CONFIG_HAVE_KVM_IRQCHIP
406 INIT_HLIST_HEAD(&kvm->mask_notifier_list);
136bdfee 407 INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
75858a84 408#endif
6aa8b732 409
46a26bf5
MT
410 r = -ENOMEM;
411 kvm->memslots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
412 if (!kvm->memslots)
57e7fbee 413 goto out_err_nosrcu;
bc6678a3 414 if (init_srcu_struct(&kvm->srcu))
57e7fbee 415 goto out_err_nosrcu;
e93f8a0f
MT
416 for (i = 0; i < KVM_NR_BUSES; i++) {
417 kvm->buses[i] = kzalloc(sizeof(struct kvm_io_bus),
418 GFP_KERNEL);
57e7fbee 419 if (!kvm->buses[i])
e93f8a0f 420 goto out_err;
e93f8a0f 421 }
46a26bf5 422
4c07b0a4 423 r = kvm_init_mmu_notifier(kvm);
57e7fbee 424 if (r)
283d0c65 425 goto out_err;
e930bffe 426
6d4e4c4f
AK
427 kvm->mm = current->mm;
428 atomic_inc(&kvm->mm->mm_count);
aaee2c94 429 spin_lock_init(&kvm->mmu_lock);
70e335e1 430 raw_spin_lock_init(&kvm->requests_lock);
d34e6b17 431 kvm_eventfd_init(kvm);
11ec2804 432 mutex_init(&kvm->lock);
60eead79 433 mutex_init(&kvm->irq_lock);
79fac95e 434 mutex_init(&kvm->slots_lock);
d39f13b0 435 atomic_set(&kvm->users_count, 1);
5e58cfe4
RR
436 spin_lock(&kvm_lock);
437 list_add(&kvm->vm_list, &vm_list);
438 spin_unlock(&kvm_lock);
d89f5eff 439
f17abe9a 440 return kvm;
10474ae8
AG
441
442out_err:
57e7fbee
JK
443 cleanup_srcu_struct(&kvm->srcu);
444out_err_nosrcu:
10474ae8
AG
445 hardware_disable_all();
446out_err_nodisable:
e93f8a0f
MT
447 for (i = 0; i < KVM_NR_BUSES; i++)
448 kfree(kvm->buses[i]);
46a26bf5 449 kfree(kvm->memslots);
d89f5eff 450 kvm_arch_free_vm(kvm);
10474ae8 451 return ERR_PTR(r);
f17abe9a
AK
452}
453
a36a57b1
TY
454static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
455{
456 if (!memslot->dirty_bitmap)
457 return;
458
6f9e5c17
TY
459 if (2 * kvm_dirty_bitmap_bytes(memslot) > PAGE_SIZE)
460 vfree(memslot->dirty_bitmap_head);
461 else
462 kfree(memslot->dirty_bitmap_head);
463
a36a57b1 464 memslot->dirty_bitmap = NULL;
515a0127 465 memslot->dirty_bitmap_head = NULL;
a36a57b1
TY
466}
467
6aa8b732
AK
468/*
469 * Free any memory in @free but not in @dont.
470 */
471static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
472 struct kvm_memory_slot *dont)
473{
ec04b260
JR
474 int i;
475
290fc38d
IE
476 if (!dont || free->rmap != dont->rmap)
477 vfree(free->rmap);
6aa8b732
AK
478
479 if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
a36a57b1 480 kvm_destroy_dirty_bitmap(free);
6aa8b732 481
ec04b260
JR
482
483 for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
484 if (!dont || free->lpage_info[i] != dont->lpage_info[i]) {
485 vfree(free->lpage_info[i]);
486 free->lpage_info[i] = NULL;
487 }
488 }
05da4558 489
6aa8b732 490 free->npages = 0;
8d4e1288 491 free->rmap = NULL;
6aa8b732
AK
492}
493
d19a9cd2 494void kvm_free_physmem(struct kvm *kvm)
6aa8b732
AK
495{
496 int i;
46a26bf5
MT
497 struct kvm_memslots *slots = kvm->memslots;
498
499 for (i = 0; i < slots->nmemslots; ++i)
500 kvm_free_physmem_slot(&slots->memslots[i], NULL);
6aa8b732 501
46a26bf5 502 kfree(kvm->memslots);
6aa8b732
AK
503}
504
f17abe9a
AK
505static void kvm_destroy_vm(struct kvm *kvm)
506{
e93f8a0f 507 int i;
6d4e4c4f
AK
508 struct mm_struct *mm = kvm->mm;
509
ad8ba2cd 510 kvm_arch_sync_events(kvm);
133de902
AK
511 spin_lock(&kvm_lock);
512 list_del(&kvm->vm_list);
513 spin_unlock(&kvm_lock);
399ec807 514 kvm_free_irq_routing(kvm);
e93f8a0f
MT
515 for (i = 0; i < KVM_NR_BUSES; i++)
516 kvm_io_bus_destroy(kvm->buses[i]);
980da6ce 517 kvm_coalesced_mmio_free(kvm);
e930bffe
AA
518#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
519 mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
f00be0ca
GN
520#else
521 kvm_arch_flush_shadow(kvm);
5f94c174 522#endif
d19a9cd2 523 kvm_arch_destroy_vm(kvm);
d89f5eff
JK
524 kvm_free_physmem(kvm);
525 cleanup_srcu_struct(&kvm->srcu);
526 kvm_arch_free_vm(kvm);
10474ae8 527 hardware_disable_all();
6d4e4c4f 528 mmdrop(mm);
f17abe9a
AK
529}
530
d39f13b0
IE
531void kvm_get_kvm(struct kvm *kvm)
532{
533 atomic_inc(&kvm->users_count);
534}
535EXPORT_SYMBOL_GPL(kvm_get_kvm);
536
537void kvm_put_kvm(struct kvm *kvm)
538{
539 if (atomic_dec_and_test(&kvm->users_count))
540 kvm_destroy_vm(kvm);
541}
542EXPORT_SYMBOL_GPL(kvm_put_kvm);
543
544
f17abe9a
AK
545static int kvm_vm_release(struct inode *inode, struct file *filp)
546{
547 struct kvm *kvm = filp->private_data;
548
721eecbf
GH
549 kvm_irqfd_release(kvm);
550
d39f13b0 551 kvm_put_kvm(kvm);
6aa8b732
AK
552 return 0;
553}
554
515a0127
TY
555/*
556 * Allocation size is twice as large as the actual dirty bitmap size.
557 * This makes it possible to do double buffering: see x86's
558 * kvm_vm_ioctl_get_dirty_log().
559 */
a36a57b1
TY
560static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot)
561{
515a0127 562 unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot);
a36a57b1 563
6f9e5c17
TY
564 if (dirty_bytes > PAGE_SIZE)
565 memslot->dirty_bitmap = vzalloc(dirty_bytes);
566 else
567 memslot->dirty_bitmap = kzalloc(dirty_bytes, GFP_KERNEL);
568
a36a57b1
TY
569 if (!memslot->dirty_bitmap)
570 return -ENOMEM;
571
515a0127 572 memslot->dirty_bitmap_head = memslot->dirty_bitmap;
a36a57b1
TY
573 return 0;
574}
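
/*
 * Illustrative sketch, not part of the original file: the doubled
 * allocation above is what lets x86's kvm_vm_ioctl_get_dirty_log() flip
 * between the two halves of dirty_bitmap_head instead of clearing the
 * live bitmap while vcpus are still logging into it.  Roughly (heavily
 * simplified; the real code also republishes the memslot):
 */
static void example_swap_dirty_bitmap(struct kvm_memory_slot *memslot)
{
	unsigned long n = kvm_dirty_bitmap_bytes(memslot);
	unsigned long *next = memslot->dirty_bitmap_head;

	/* Pick the half that is not currently live and clear it. */
	if (next == memslot->dirty_bitmap)
		next += n / sizeof(long);
	memset(next, 0, n);

	/* Publish the clean half; the previous half goes to user space. */
	memslot->dirty_bitmap = next;
}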
575
6aa8b732
AK
576/*
577 * Allocate some memory and give it an address in the guest physical address
578 * space.
579 *
580 * Discontiguous memory is allowed, mostly for framebuffers.
f78e0e2e 581 *
 * Must be called holding kvm->slots_lock for write.
6aa8b732 583 */
f78e0e2e
SY
584int __kvm_set_memory_region(struct kvm *kvm,
585 struct kvm_userspace_memory_region *mem,
586 int user_alloc)
6aa8b732 587{
bc6678a3 588 int r, flush_shadow = 0;
6aa8b732 589 gfn_t base_gfn;
28bcb112
HC
590 unsigned long npages;
591 unsigned long i;
6aa8b732
AK
592 struct kvm_memory_slot *memslot;
593 struct kvm_memory_slot old, new;
bc6678a3 594 struct kvm_memslots *slots, *old_memslots;
6aa8b732
AK
595
596 r = -EINVAL;
597 /* General sanity checks */
598 if (mem->memory_size & (PAGE_SIZE - 1))
599 goto out;
600 if (mem->guest_phys_addr & (PAGE_SIZE - 1))
601 goto out;
e7cacd40 602 if (user_alloc && (mem->userspace_addr & (PAGE_SIZE - 1)))
78749809 603 goto out;
e0d62c7f 604 if (mem->slot >= KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS)
6aa8b732
AK
605 goto out;
606 if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
607 goto out;
608
46a26bf5 609 memslot = &kvm->memslots->memslots[mem->slot];
6aa8b732
AK
610 base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
611 npages = mem->memory_size >> PAGE_SHIFT;
612
660c22c4
TY
613 r = -EINVAL;
614 if (npages > KVM_MEM_MAX_NR_PAGES)
615 goto out;
616
6aa8b732
AK
617 if (!npages)
618 mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES;
619
6aa8b732
AK
620 new = old = *memslot;
621
e36d96f7 622 new.id = mem->slot;
6aa8b732
AK
623 new.base_gfn = base_gfn;
624 new.npages = npages;
625 new.flags = mem->flags;
626
627 /* Disallow changing a memory slot's size. */
628 r = -EINVAL;
629 if (npages && old.npages && npages != old.npages)
f78e0e2e 630 goto out_free;
6aa8b732
AK
631
632 /* Check for overlaps */
633 r = -EEXIST;
634 for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
46a26bf5 635 struct kvm_memory_slot *s = &kvm->memslots->memslots[i];
6aa8b732 636
4cd481f6 637 if (s == memslot || !s->npages)
6aa8b732
AK
638 continue;
639 if (!((base_gfn + npages <= s->base_gfn) ||
640 (base_gfn >= s->base_gfn + s->npages)))
f78e0e2e 641 goto out_free;
6aa8b732 642 }
6aa8b732 643
6aa8b732
AK
644 /* Free page dirty bitmap if unneeded */
645 if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
8b6d44c7 646 new.dirty_bitmap = NULL;
6aa8b732
AK
647
648 r = -ENOMEM;
649
650 /* Allocate if a slot is being created */
eff0114a 651#ifndef CONFIG_S390
8d4e1288 652 if (npages && !new.rmap) {
26535037 653 new.rmap = vzalloc(npages * sizeof(*new.rmap));
290fc38d
IE
654
655 if (!new.rmap)
f78e0e2e 656 goto out_free;
290fc38d 657
80b14b5b 658 new.user_alloc = user_alloc;
bc6678a3 659 new.userspace_addr = mem->userspace_addr;
6aa8b732 660 }
ec04b260
JR
661 if (!npages)
662 goto skip_lpage;
05da4558 663
ec04b260 664 for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
28bcb112
HC
665 unsigned long ugfn;
666 unsigned long j;
667 int lpages;
ec04b260 668 int level = i + 2;
05da4558 669
ec04b260
JR
670 /* Avoid unused variable warning if no large pages */
671 (void)level;
672
673 if (new.lpage_info[i])
674 continue;
675
82855413
JR
676 lpages = 1 + ((base_gfn + npages - 1)
677 >> KVM_HPAGE_GFN_SHIFT(level));
678 lpages -= base_gfn >> KVM_HPAGE_GFN_SHIFT(level);
ec04b260 679
26535037 680 new.lpage_info[i] = vzalloc(lpages * sizeof(*new.lpage_info[i]));
ec04b260
JR
681
682 if (!new.lpage_info[i])
05da4558
MT
683 goto out_free;
684
82855413 685 if (base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
ec04b260 686 new.lpage_info[i][0].write_count = 1;
82855413 687 if ((base_gfn+npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
ec04b260 688 new.lpage_info[i][lpages - 1].write_count = 1;
ac04527f
AK
689 ugfn = new.userspace_addr >> PAGE_SHIFT;
690 /*
691 * If the gfn and userspace address are not aligned wrt each
54dee993
MT
692 * other, or if explicitly asked to, disable large page
693 * support for this slot
ac04527f 694 */
ec04b260 695 if ((base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) ||
54dee993 696 !largepages_enabled)
ec04b260
JR
697 for (j = 0; j < lpages; ++j)
698 new.lpage_info[i][j].write_count = 1;
05da4558 699 }
6aa8b732 700
ec04b260
JR
701skip_lpage:
702
6aa8b732
AK
703 /* Allocate page dirty bitmap if needed */
704 if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
a36a57b1 705 if (kvm_create_dirty_bitmap(&new) < 0)
f78e0e2e 706 goto out_free;
bc6678a3 707 /* destroy any largepage mappings for dirty tracking */
e244584f 708 if (old.npages)
bc6678a3 709 flush_shadow = 1;
6aa8b732 710 }
3eea8437
CB
711#else /* not defined CONFIG_S390 */
712 new.user_alloc = user_alloc;
713 if (user_alloc)
714 new.userspace_addr = mem->userspace_addr;
eff0114a 715#endif /* not defined CONFIG_S390 */
6aa8b732 716
bc6678a3
MT
717 if (!npages) {
718 r = -ENOMEM;
719 slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
720 if (!slots)
721 goto out_free;
722 memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
723 if (mem->slot >= slots->nmemslots)
724 slots->nmemslots = mem->slot + 1;
49c7754c 725 slots->generation++;
bc6678a3
MT
726 slots->memslots[mem->slot].flags |= KVM_MEMSLOT_INVALID;
727
728 old_memslots = kvm->memslots;
729 rcu_assign_pointer(kvm->memslots, slots);
730 synchronize_srcu_expedited(&kvm->srcu);
731 /* From this point no new shadow pages pointing to a deleted
732 * memslot will be created.
733 *
734 * validation of sp->gfn happens in:
735 * - gfn_to_hva (kvm_read_guest, gfn_to_pfn)
736 * - kvm_is_visible_gfn (mmu_check_roots)
737 */
34d4cb8f 738 kvm_arch_flush_shadow(kvm);
bc6678a3
MT
739 kfree(old_memslots);
740 }
34d4cb8f 741
f7784b8e
MT
742 r = kvm_arch_prepare_memory_region(kvm, &new, old, mem, user_alloc);
743 if (r)
744 goto out_free;
745
bc6678a3
MT
746 /* map the pages in iommu page table */
747 if (npages) {
748 r = kvm_iommu_map_pages(kvm, &new);
749 if (r)
750 goto out_free;
751 }
604b38ac 752
bc6678a3
MT
753 r = -ENOMEM;
754 slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
755 if (!slots)
756 goto out_free;
757 memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
758 if (mem->slot >= slots->nmemslots)
759 slots->nmemslots = mem->slot + 1;
49c7754c 760 slots->generation++;
bc6678a3
MT
761
762 /* actual memory is freed via old in kvm_free_physmem_slot below */
763 if (!npages) {
764 new.rmap = NULL;
765 new.dirty_bitmap = NULL;
766 for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i)
767 new.lpage_info[i] = NULL;
768 }
769
770 slots->memslots[mem->slot] = new;
771 old_memslots = kvm->memslots;
772 rcu_assign_pointer(kvm->memslots, slots);
773 synchronize_srcu_expedited(&kvm->srcu);
3ad82a7e 774
f7784b8e 775 kvm_arch_commit_memory_region(kvm, mem, old, user_alloc);
82ce2c96 776
bc6678a3
MT
777 kvm_free_physmem_slot(&old, &new);
778 kfree(old_memslots);
779
780 if (flush_shadow)
781 kvm_arch_flush_shadow(kvm);
782
6aa8b732
AK
783 return 0;
784
f78e0e2e 785out_free:
6aa8b732
AK
786 kvm_free_physmem_slot(&new, &old);
787out:
788 return r;
210c7c4d
IE
789
790}
f78e0e2e
SY
791EXPORT_SYMBOL_GPL(__kvm_set_memory_region);
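
/*
 * Illustrative sketch, not part of the original file: the
 * rcu_assign_pointer()/synchronize_srcu_expedited() pairs in
 * __kvm_set_memory_region() above rely on every reader dereferencing
 * kvm->memslots inside an SRCU read-side section, e.g.:
 */
static unsigned long example_lookup_hva(struct kvm *kvm, gfn_t gfn)
{
	unsigned long hva;
	int idx;

	idx = srcu_read_lock(&kvm->srcu);
	/* gfn_to_hva() goes through kvm_memslots(), the srcu-protected
	 * dereference of kvm->memslots. */
	hva = gfn_to_hva(kvm, gfn);
	srcu_read_unlock(&kvm->srcu, idx);

	return hva;
}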
792
793int kvm_set_memory_region(struct kvm *kvm,
794 struct kvm_userspace_memory_region *mem,
795 int user_alloc)
796{
797 int r;
798
79fac95e 799 mutex_lock(&kvm->slots_lock);
f78e0e2e 800 r = __kvm_set_memory_region(kvm, mem, user_alloc);
79fac95e 801 mutex_unlock(&kvm->slots_lock);
f78e0e2e
SY
802 return r;
803}
210c7c4d
IE
804EXPORT_SYMBOL_GPL(kvm_set_memory_region);
805
1fe779f8
CO
806int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
807 struct
808 kvm_userspace_memory_region *mem,
809 int user_alloc)
210c7c4d 810{
e0d62c7f
IE
811 if (mem->slot >= KVM_MEMORY_SLOTS)
812 return -EINVAL;
210c7c4d 813 return kvm_set_memory_region(kvm, mem, user_alloc);
6aa8b732
AK
814}
815
5bb064dc
ZX
816int kvm_get_dirty_log(struct kvm *kvm,
817 struct kvm_dirty_log *log, int *is_dirty)
6aa8b732
AK
818{
819 struct kvm_memory_slot *memslot;
820 int r, i;
87bf6e7d 821 unsigned long n;
6aa8b732
AK
822 unsigned long any = 0;
823
6aa8b732
AK
824 r = -EINVAL;
825 if (log->slot >= KVM_MEMORY_SLOTS)
826 goto out;
827
46a26bf5 828 memslot = &kvm->memslots->memslots[log->slot];
6aa8b732
AK
829 r = -ENOENT;
830 if (!memslot->dirty_bitmap)
831 goto out;
832
87bf6e7d 833 n = kvm_dirty_bitmap_bytes(memslot);
6aa8b732 834
cd1a4a98 835 for (i = 0; !any && i < n/sizeof(long); ++i)
6aa8b732
AK
836 any = memslot->dirty_bitmap[i];
837
838 r = -EFAULT;
839 if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
840 goto out;
841
5bb064dc
ZX
842 if (any)
843 *is_dirty = 1;
6aa8b732
AK
844
845 r = 0;
6aa8b732 846out:
6aa8b732
AK
847 return r;
848}
849
54dee993
MT
850void kvm_disable_largepages(void)
851{
852 largepages_enabled = false;
853}
854EXPORT_SYMBOL_GPL(kvm_disable_largepages);
855
cea7bb21
IE
856int is_error_page(struct page *page)
857{
edba23e5 858 return page == bad_page || page == hwpoison_page || page == fault_page;
cea7bb21
IE
859}
860EXPORT_SYMBOL_GPL(is_error_page);
861
35149e21
AL
862int is_error_pfn(pfn_t pfn)
863{
edba23e5 864 return pfn == bad_pfn || pfn == hwpoison_pfn || pfn == fault_pfn;
35149e21
AL
865}
866EXPORT_SYMBOL_GPL(is_error_pfn);
867
bf998156
HY
868int is_hwpoison_pfn(pfn_t pfn)
869{
870 return pfn == hwpoison_pfn;
871}
872EXPORT_SYMBOL_GPL(is_hwpoison_pfn);
873
edba23e5
GN
874int is_fault_pfn(pfn_t pfn)
875{
876 return pfn == fault_pfn;
877}
878EXPORT_SYMBOL_GPL(is_fault_pfn);
879
f9d46eb0
IE
880static inline unsigned long bad_hva(void)
881{
882 return PAGE_OFFSET;
883}
884
885int kvm_is_error_hva(unsigned long addr)
886{
887 return addr == bad_hva();
888}
889EXPORT_SYMBOL_GPL(kvm_is_error_hva);
890
49c7754c
GN
891static struct kvm_memory_slot *__gfn_to_memslot(struct kvm_memslots *slots,
892 gfn_t gfn)
6aa8b732
AK
893{
894 int i;
895
46a26bf5
MT
896 for (i = 0; i < slots->nmemslots; ++i) {
897 struct kvm_memory_slot *memslot = &slots->memslots[i];
6aa8b732
AK
898
899 if (gfn >= memslot->base_gfn
900 && gfn < memslot->base_gfn + memslot->npages)
901 return memslot;
902 }
8b6d44c7 903 return NULL;
6aa8b732 904}
49c7754c
GN
905
906struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
907{
908 return __gfn_to_memslot(kvm_memslots(kvm), gfn);
909}
a1f4d395 910EXPORT_SYMBOL_GPL(gfn_to_memslot);
6aa8b732 911
e0d62c7f
IE
912int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
913{
914 int i;
90d83dc3 915 struct kvm_memslots *slots = kvm_memslots(kvm);
e0d62c7f 916
e0d62c7f 917 for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
46a26bf5 918 struct kvm_memory_slot *memslot = &slots->memslots[i];
e0d62c7f 919
bc6678a3
MT
920 if (memslot->flags & KVM_MEMSLOT_INVALID)
921 continue;
922
e0d62c7f
IE
923 if (gfn >= memslot->base_gfn
924 && gfn < memslot->base_gfn + memslot->npages)
925 return 1;
926 }
927 return 0;
928}
929EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
930
8f0b1ab6
JR
931unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn)
932{
933 struct vm_area_struct *vma;
934 unsigned long addr, size;
935
936 size = PAGE_SIZE;
937
938 addr = gfn_to_hva(kvm, gfn);
939 if (kvm_is_error_hva(addr))
940 return PAGE_SIZE;
941
942 down_read(&current->mm->mmap_sem);
943 vma = find_vma(current->mm, addr);
944 if (!vma)
945 goto out;
946
947 size = vma_kernel_pagesize(vma);
948
949out:
950 up_read(&current->mm->mmap_sem);
951
952 return size;
953}
954
bc6678a3
MT
955int memslot_id(struct kvm *kvm, gfn_t gfn)
956{
957 int i;
90d83dc3 958 struct kvm_memslots *slots = kvm_memslots(kvm);
bc6678a3
MT
959 struct kvm_memory_slot *memslot = NULL;
960
bc6678a3
MT
961 for (i = 0; i < slots->nmemslots; ++i) {
962 memslot = &slots->memslots[i];
963
964 if (gfn >= memslot->base_gfn
965 && gfn < memslot->base_gfn + memslot->npages)
966 break;
967 }
968
969 return memslot - slots->memslots;
970}
971
49c7754c 972static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
48987781 973 gfn_t *nr_pages)
539cb660 974{
bc6678a3 975 if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
539cb660 976 return bad_hva();
48987781
XG
977
978 if (nr_pages)
979 *nr_pages = slot->npages - (gfn - slot->base_gfn);
980
f5c98031 981 return gfn_to_hva_memslot(slot, gfn);
539cb660 982}
48987781
XG
983
984unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
985{
49c7754c 986 return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL);
48987781 987}
0d150298 988EXPORT_SYMBOL_GPL(gfn_to_hva);
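
/*
 * Illustrative note, not part of the original file: gfn_to_hva_memslot(),
 * used by gfn_to_hva_many() above and defined elsewhere (kvm_host.h in
 * this era), is just the linear slot mapping
 *
 *	hva = slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE;
 *
 * so gfn_to_hva() amounts to "find the memslot, then add the page offset
 * into the userspace mapping that backs it".
 */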
539cb660 989
8030089f
GN
990static pfn_t get_fault_pfn(void)
991{
992 get_page(fault_page);
993 return fault_pfn;
994}
995
af585b92 996static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic,
612819c3 997 bool *async, bool write_fault, bool *writable)
954bbbc2 998{
8d4e1288 999 struct page *page[1];
af585b92 1000 int npages = 0;
2e2e3738 1001 pfn_t pfn;
954bbbc2 1002
af585b92
GN
1003 /* we can do it either atomically or asynchronously, not both */
1004 BUG_ON(atomic && async);
1005
612819c3
MT
1006 BUG_ON(!write_fault && !writable);
1007
1008 if (writable)
1009 *writable = true;
1010
af585b92 1011 if (atomic || async)
887c08ac 1012 npages = __get_user_pages_fast(addr, 1, 1, page);
af585b92
GN
1013
1014 if (unlikely(npages != 1) && !atomic) {
887c08ac 1015 might_sleep();
612819c3
MT
1016
1017 if (writable)
1018 *writable = write_fault;
1019
1020 npages = get_user_pages_fast(addr, 1, write_fault, page);
1021
1022 /* map read fault as writable if possible */
1023 if (unlikely(!write_fault) && npages == 1) {
1024 struct page *wpage[1];
1025
1026 npages = __get_user_pages_fast(addr, 1, 1, wpage);
1027 if (npages == 1) {
1028 *writable = true;
1029 put_page(page[0]);
1030 page[0] = wpage[0];
1031 }
1032 npages = 1;
1033 }
887c08ac 1034 }
539cb660 1035
2e2e3738
AL
1036 if (unlikely(npages != 1)) {
1037 struct vm_area_struct *vma;
1038
887c08ac 1039 if (atomic)
8030089f 1040 return get_fault_pfn();
887c08ac 1041
bbeb3406 1042 down_read(&current->mm->mmap_sem);
bf998156 1043 if (is_hwpoison_address(addr)) {
bbeb3406 1044 up_read(&current->mm->mmap_sem);
bf998156
HY
1045 get_page(hwpoison_page);
1046 return page_to_pfn(hwpoison_page);
1047 }
1048
8030089f 1049 vma = find_vma_intersection(current->mm, addr, addr+1);
4c2155ce 1050
8030089f
GN
1051 if (vma == NULL)
1052 pfn = get_fault_pfn();
1053 else if ((vma->vm_flags & VM_PFNMAP)) {
1054 pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) +
1055 vma->vm_pgoff;
1056 BUG_ON(!kvm_is_mmio_pfn(pfn));
1057 } else {
1058 if (async && (vma->vm_flags & VM_WRITE))
af585b92 1059 *async = true;
8030089f 1060 pfn = get_fault_pfn();
2e2e3738 1061 }
4c2155ce 1062 up_read(&current->mm->mmap_sem);
2e2e3738
AL
1063 } else
1064 pfn = page_to_pfn(page[0]);
8d4e1288 1065
2e2e3738 1066 return pfn;
35149e21
AL
1067}
1068
887c08ac
XG
1069pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr)
1070{
612819c3 1071 return hva_to_pfn(kvm, addr, true, NULL, true, NULL);
887c08ac
XG
1072}
1073EXPORT_SYMBOL_GPL(hva_to_pfn_atomic);
1074
612819c3
MT
1075static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async,
1076 bool write_fault, bool *writable)
506f0d6f
MT
1077{
1078 unsigned long addr;
1079
af585b92
GN
1080 if (async)
1081 *async = false;
1082
506f0d6f
MT
1083 addr = gfn_to_hva(kvm, gfn);
1084 if (kvm_is_error_hva(addr)) {
1085 get_page(bad_page);
1086 return page_to_pfn(bad_page);
1087 }
1088
612819c3 1089 return hva_to_pfn(kvm, addr, atomic, async, write_fault, writable);
365fb3fd
XG
1090}
1091
1092pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn)
1093{
612819c3 1094 return __gfn_to_pfn(kvm, gfn, true, NULL, true, NULL);
365fb3fd
XG
1095}
1096EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic);
1097
612819c3
MT
1098pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async,
1099 bool write_fault, bool *writable)
af585b92 1100{
612819c3 1101 return __gfn_to_pfn(kvm, gfn, false, async, write_fault, writable);
af585b92
GN
1102}
1103EXPORT_SYMBOL_GPL(gfn_to_pfn_async);
1104
365fb3fd
XG
1105pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
1106{
612819c3 1107 return __gfn_to_pfn(kvm, gfn, false, NULL, true, NULL);
506f0d6f 1108}
35149e21
AL
1109EXPORT_SYMBOL_GPL(gfn_to_pfn);
1110
612819c3
MT
1111pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
1112 bool *writable)
1113{
1114 return __gfn_to_pfn(kvm, gfn, false, NULL, write_fault, writable);
1115}
1116EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
1117
506f0d6f
MT
1118pfn_t gfn_to_pfn_memslot(struct kvm *kvm,
1119 struct kvm_memory_slot *slot, gfn_t gfn)
1120{
1121 unsigned long addr = gfn_to_hva_memslot(slot, gfn);
612819c3 1122 return hva_to_pfn(kvm, addr, false, NULL, true, NULL);
506f0d6f
MT
1123}
1124
48987781
XG
1125int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages,
1126 int nr_pages)
1127{
1128 unsigned long addr;
1129 gfn_t entry;
1130
49c7754c 1131 addr = gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, &entry);
48987781
XG
1132 if (kvm_is_error_hva(addr))
1133 return -1;
1134
1135 if (entry < nr_pages)
1136 return 0;
1137
1138 return __get_user_pages_fast(addr, nr_pages, 1, pages);
1139}
1140EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);
1141
35149e21
AL
1142struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
1143{
2e2e3738
AL
1144 pfn_t pfn;
1145
1146 pfn = gfn_to_pfn(kvm, gfn);
c77fb9dc 1147 if (!kvm_is_mmio_pfn(pfn))
2e2e3738
AL
1148 return pfn_to_page(pfn);
1149
c77fb9dc 1150 WARN_ON(kvm_is_mmio_pfn(pfn));
2e2e3738
AL
1151
1152 get_page(bad_page);
1153 return bad_page;
954bbbc2 1154}
aab61cc0 1155
954bbbc2
AK
1156EXPORT_SYMBOL_GPL(gfn_to_page);
1157
b4231d61
IE
1158void kvm_release_page_clean(struct page *page)
1159{
35149e21 1160 kvm_release_pfn_clean(page_to_pfn(page));
b4231d61
IE
1161}
1162EXPORT_SYMBOL_GPL(kvm_release_page_clean);
1163
35149e21
AL
1164void kvm_release_pfn_clean(pfn_t pfn)
1165{
c77fb9dc 1166 if (!kvm_is_mmio_pfn(pfn))
2e2e3738 1167 put_page(pfn_to_page(pfn));
35149e21
AL
1168}
1169EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);
1170
b4231d61 1171void kvm_release_page_dirty(struct page *page)
8a7ae055 1172{
35149e21
AL
1173 kvm_release_pfn_dirty(page_to_pfn(page));
1174}
1175EXPORT_SYMBOL_GPL(kvm_release_page_dirty);
1176
1177void kvm_release_pfn_dirty(pfn_t pfn)
1178{
1179 kvm_set_pfn_dirty(pfn);
1180 kvm_release_pfn_clean(pfn);
1181}
1182EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty);
1183
1184void kvm_set_page_dirty(struct page *page)
1185{
1186 kvm_set_pfn_dirty(page_to_pfn(page));
1187}
1188EXPORT_SYMBOL_GPL(kvm_set_page_dirty);
1189
1190void kvm_set_pfn_dirty(pfn_t pfn)
1191{
c77fb9dc 1192 if (!kvm_is_mmio_pfn(pfn)) {
2e2e3738
AL
1193 struct page *page = pfn_to_page(pfn);
1194 if (!PageReserved(page))
1195 SetPageDirty(page);
1196 }
8a7ae055 1197}
35149e21
AL
1198EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);
1199
1200void kvm_set_pfn_accessed(pfn_t pfn)
1201{
c77fb9dc 1202 if (!kvm_is_mmio_pfn(pfn))
2e2e3738 1203 mark_page_accessed(pfn_to_page(pfn));
35149e21
AL
1204}
1205EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);
1206
1207void kvm_get_pfn(pfn_t pfn)
1208{
c77fb9dc 1209 if (!kvm_is_mmio_pfn(pfn))
2e2e3738 1210 get_page(pfn_to_page(pfn));
35149e21
AL
1211}
1212EXPORT_SYMBOL_GPL(kvm_get_pfn);
8a7ae055 1213
195aefde
IE
1214static int next_segment(unsigned long len, int offset)
1215{
1216 if (len > PAGE_SIZE - offset)
1217 return PAGE_SIZE - offset;
1218 else
1219 return len;
1220}
1221
1222int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
1223 int len)
1224{
e0506bcb
IE
1225 int r;
1226 unsigned long addr;
195aefde 1227
e0506bcb
IE
1228 addr = gfn_to_hva(kvm, gfn);
1229 if (kvm_is_error_hva(addr))
1230 return -EFAULT;
1231 r = copy_from_user(data, (void __user *)addr + offset, len);
1232 if (r)
195aefde 1233 return -EFAULT;
195aefde
IE
1234 return 0;
1235}
1236EXPORT_SYMBOL_GPL(kvm_read_guest_page);
1237
1238int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len)
1239{
1240 gfn_t gfn = gpa >> PAGE_SHIFT;
1241 int seg;
1242 int offset = offset_in_page(gpa);
1243 int ret;
1244
1245 while ((seg = next_segment(len, offset)) != 0) {
1246 ret = kvm_read_guest_page(kvm, gfn, data, offset, seg);
1247 if (ret < 0)
1248 return ret;
1249 offset = 0;
1250 len -= seg;
1251 data += seg;
1252 ++gfn;
1253 }
1254 return 0;
1255}
1256EXPORT_SYMBOL_GPL(kvm_read_guest);
1257
7ec54588
MT
1258int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
1259 unsigned long len)
1260{
1261 int r;
1262 unsigned long addr;
1263 gfn_t gfn = gpa >> PAGE_SHIFT;
1264 int offset = offset_in_page(gpa);
1265
1266 addr = gfn_to_hva(kvm, gfn);
1267 if (kvm_is_error_hva(addr))
1268 return -EFAULT;
0aac03f0 1269 pagefault_disable();
7ec54588 1270 r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len);
0aac03f0 1271 pagefault_enable();
7ec54588
MT
1272 if (r)
1273 return -EFAULT;
1274 return 0;
1275}
1276EXPORT_SYMBOL(kvm_read_guest_atomic);
1277
195aefde
IE
1278int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
1279 int offset, int len)
1280{
e0506bcb
IE
1281 int r;
1282 unsigned long addr;
195aefde 1283
e0506bcb
IE
1284 addr = gfn_to_hva(kvm, gfn);
1285 if (kvm_is_error_hva(addr))
1286 return -EFAULT;
1287 r = copy_to_user((void __user *)addr + offset, data, len);
1288 if (r)
195aefde 1289 return -EFAULT;
195aefde
IE
1290 mark_page_dirty(kvm, gfn);
1291 return 0;
1292}
1293EXPORT_SYMBOL_GPL(kvm_write_guest_page);
1294
1295int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
1296 unsigned long len)
1297{
1298 gfn_t gfn = gpa >> PAGE_SHIFT;
1299 int seg;
1300 int offset = offset_in_page(gpa);
1301 int ret;
1302
1303 while ((seg = next_segment(len, offset)) != 0) {
1304 ret = kvm_write_guest_page(kvm, gfn, data, offset, seg);
1305 if (ret < 0)
1306 return ret;
1307 offset = 0;
1308 len -= seg;
1309 data += seg;
1310 ++gfn;
1311 }
1312 return 0;
1313}
1314
49c7754c
GN
1315int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
1316 gpa_t gpa)
1317{
1318 struct kvm_memslots *slots = kvm_memslots(kvm);
1319 int offset = offset_in_page(gpa);
1320 gfn_t gfn = gpa >> PAGE_SHIFT;
1321
1322 ghc->gpa = gpa;
1323 ghc->generation = slots->generation;
1324 ghc->memslot = __gfn_to_memslot(slots, gfn);
1325 ghc->hva = gfn_to_hva_many(ghc->memslot, gfn, NULL);
1326 if (!kvm_is_error_hva(ghc->hva))
1327 ghc->hva += offset;
1328 else
1329 return -EFAULT;
1330
1331 return 0;
1332}
1333EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init);
1334
1335int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
1336 void *data, unsigned long len)
1337{
1338 struct kvm_memslots *slots = kvm_memslots(kvm);
1339 int r;
1340
1341 if (slots->generation != ghc->generation)
1342 kvm_gfn_to_hva_cache_init(kvm, ghc, ghc->gpa);
1343
1344 if (kvm_is_error_hva(ghc->hva))
1345 return -EFAULT;
1346
1347 r = copy_to_user((void __user *)ghc->hva, data, len);
1348 if (r)
1349 return -EFAULT;
1350 mark_page_dirty_in_slot(kvm, ghc->memslot, ghc->gpa >> PAGE_SHIFT);
1351
1352 return 0;
1353}
1354EXPORT_SYMBOL_GPL(kvm_write_guest_cached);
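
/*
 * Illustrative sketch, not part of the original file: a gfn_to_hva_cache
 * is set up once for a guest-provided gpa and then written repeatedly
 * without re-resolving the memslot on every access (the x86 steal-time
 * and async-pf shared areas use this pattern).  The helper and value
 * below are placeholders for whatever a caller actually tracks.
 */
static int example_publish_value(struct kvm *kvm,
				 struct gfn_to_hva_cache *ghc,
				 gpa_t gpa, u64 val)
{
	/* One-time (or rare) setup: resolve gpa -> hva and cache it. */
	if (kvm_gfn_to_hva_cache_init(kvm, ghc, gpa))
		return -EFAULT;

	/* Hot path: reuses the cached hva unless the memslot generation
	 * changed, in which case the cache is refreshed automatically. */
	return kvm_write_guest_cached(kvm, ghc, &val, sizeof(val));
}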
1355
195aefde
IE
1356int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len)
1357{
3bcc8a8c
HC
1358 return kvm_write_guest_page(kvm, gfn, (const void *) empty_zero_page,
1359 offset, len);
195aefde
IE
1360}
1361EXPORT_SYMBOL_GPL(kvm_clear_guest_page);
1362
1363int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
1364{
1365 gfn_t gfn = gpa >> PAGE_SHIFT;
1366 int seg;
1367 int offset = offset_in_page(gpa);
1368 int ret;
1369
1370 while ((seg = next_segment(len, offset)) != 0) {
1371 ret = kvm_clear_guest_page(kvm, gfn, offset, seg);
1372 if (ret < 0)
1373 return ret;
1374 offset = 0;
1375 len -= seg;
1376 ++gfn;
1377 }
1378 return 0;
1379}
1380EXPORT_SYMBOL_GPL(kvm_clear_guest);
1381
49c7754c
GN
1382void mark_page_dirty_in_slot(struct kvm *kvm, struct kvm_memory_slot *memslot,
1383 gfn_t gfn)
6aa8b732 1384{
7e9d619d
RR
1385 if (memslot && memslot->dirty_bitmap) {
1386 unsigned long rel_gfn = gfn - memslot->base_gfn;
6aa8b732 1387
d1476937 1388 generic___set_le_bit(rel_gfn, memslot->dirty_bitmap);
6aa8b732
AK
1389 }
1390}
1391
49c7754c
GN
1392void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
1393{
1394 struct kvm_memory_slot *memslot;
1395
1396 memslot = gfn_to_memslot(kvm, gfn);
1397 mark_page_dirty_in_slot(kvm, memslot, gfn);
1398}
1399
b6958ce4
ED
1400/*
1401 * The vCPU has executed a HLT instruction with in-kernel mode enabled.
1402 */
8776e519 1403void kvm_vcpu_block(struct kvm_vcpu *vcpu)
d3bef15f 1404{
e5c239cf
MT
1405 DEFINE_WAIT(wait);
1406
1407 for (;;) {
1408 prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
1409
a1b37100 1410 if (kvm_arch_vcpu_runnable(vcpu)) {
a8eeb04a 1411 kvm_make_request(KVM_REQ_UNHALT, vcpu);
e5c239cf 1412 break;
d7690175 1413 }
09cec754
GN
1414 if (kvm_cpu_has_pending_timer(vcpu))
1415 break;
e5c239cf
MT
1416 if (signal_pending(current))
1417 break;
1418
b6958ce4 1419 schedule();
b6958ce4 1420 }
d3bef15f 1421
e5c239cf 1422 finish_wait(&vcpu->wq, &wait);
b6958ce4
ED
1423}
1424
6aa8b732
AK
1425void kvm_resched(struct kvm_vcpu *vcpu)
1426{
3fca0365
YD
1427 if (!need_resched())
1428 return;
6aa8b732 1429 cond_resched();
6aa8b732
AK
1430}
1431EXPORT_SYMBOL_GPL(kvm_resched);
1432
d255f4f2
ZE
1433void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu)
1434{
1435 ktime_t expires;
1436 DEFINE_WAIT(wait);
1437
1438 prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
1439
1440 /* Sleep for 100 us, and hope lock-holder got scheduled */
1441 expires = ktime_add_ns(ktime_get(), 100000UL);
1442 schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
1443
1444 finish_wait(&vcpu->wq, &wait);
1445}
1446EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);
1447
e4a533a4 1448static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
9a2bb7f4
AK
1449{
1450 struct kvm_vcpu *vcpu = vma->vm_file->private_data;
9a2bb7f4
AK
1451 struct page *page;
1452
e4a533a4 1453 if (vmf->pgoff == 0)
039576c0 1454 page = virt_to_page(vcpu->run);
09566765 1455#ifdef CONFIG_X86
e4a533a4 1456 else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET)
ad312c7c 1457 page = virt_to_page(vcpu->arch.pio_data);
5f94c174
LV
1458#endif
1459#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
1460 else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET)
1461 page = virt_to_page(vcpu->kvm->coalesced_mmio_ring);
09566765 1462#endif
039576c0 1463 else
e4a533a4 1464 return VM_FAULT_SIGBUS;
9a2bb7f4 1465 get_page(page);
e4a533a4 1466 vmf->page = page;
1467 return 0;
9a2bb7f4
AK
1468}
1469
f0f37e2f 1470static const struct vm_operations_struct kvm_vcpu_vm_ops = {
e4a533a4 1471 .fault = kvm_vcpu_fault,
9a2bb7f4
AK
1472};
1473
1474static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
1475{
1476 vma->vm_ops = &kvm_vcpu_vm_ops;
1477 return 0;
1478}
1479
bccf2150
AK
1480static int kvm_vcpu_release(struct inode *inode, struct file *filp)
1481{
1482 struct kvm_vcpu *vcpu = filp->private_data;
1483
66c0b394 1484 kvm_put_kvm(vcpu->kvm);
bccf2150
AK
1485 return 0;
1486}
1487
3d3aab1b 1488static struct file_operations kvm_vcpu_fops = {
bccf2150
AK
1489 .release = kvm_vcpu_release,
1490 .unlocked_ioctl = kvm_vcpu_ioctl,
1491 .compat_ioctl = kvm_vcpu_ioctl,
9a2bb7f4 1492 .mmap = kvm_vcpu_mmap,
6038f373 1493 .llseek = noop_llseek,
bccf2150
AK
1494};
1495
1496/*
1497 * Allocates an inode for the vcpu.
1498 */
1499static int create_vcpu_fd(struct kvm_vcpu *vcpu)
1500{
628ff7c1 1501 return anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, O_RDWR);
bccf2150
AK
1502}
1503
c5ea7660
AK
1504/*
1505 * Creates some virtual cpus. Good luck creating more than one.
1506 */
73880c80 1507static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
c5ea7660
AK
1508{
1509 int r;
988a2cae 1510 struct kvm_vcpu *vcpu, *v;
c5ea7660 1511
73880c80 1512 vcpu = kvm_arch_vcpu_create(kvm, id);
fb3f0f51
RR
1513 if (IS_ERR(vcpu))
1514 return PTR_ERR(vcpu);
c5ea7660 1515
15ad7146
AK
1516 preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
1517
26e5215f
AK
1518 r = kvm_arch_vcpu_setup(vcpu);
1519 if (r)
7d8fece6 1520 return r;
26e5215f 1521
11ec2804 1522 mutex_lock(&kvm->lock);
73880c80
GN
1523 if (atomic_read(&kvm->online_vcpus) == KVM_MAX_VCPUS) {
1524 r = -EINVAL;
e9b11c17 1525 goto vcpu_destroy;
fb3f0f51 1526 }
73880c80 1527
988a2cae
GN
1528 kvm_for_each_vcpu(r, v, kvm)
1529 if (v->vcpu_id == id) {
73880c80
GN
1530 r = -EEXIST;
1531 goto vcpu_destroy;
1532 }
1533
1534 BUG_ON(kvm->vcpus[atomic_read(&kvm->online_vcpus)]);
c5ea7660 1535
fb3f0f51 1536 /* Now it's all set up, let userspace reach it */
66c0b394 1537 kvm_get_kvm(kvm);
bccf2150 1538 r = create_vcpu_fd(vcpu);
73880c80
GN
1539 if (r < 0) {
1540 kvm_put_kvm(kvm);
1541 goto vcpu_destroy;
1542 }
1543
1544 kvm->vcpus[atomic_read(&kvm->online_vcpus)] = vcpu;
1545 smp_wmb();
1546 atomic_inc(&kvm->online_vcpus);
1547
1548#ifdef CONFIG_KVM_APIC_ARCHITECTURE
1549 if (kvm->bsp_vcpu_id == id)
1550 kvm->bsp_vcpu = vcpu;
1551#endif
1552 mutex_unlock(&kvm->lock);
fb3f0f51 1553 return r;
39c3b86e 1554
e9b11c17 1555vcpu_destroy:
7d8fece6 1556 mutex_unlock(&kvm->lock);
d40ccc62 1557 kvm_arch_vcpu_destroy(vcpu);
c5ea7660
AK
1558 return r;
1559}
1560
1961d276
AK
1561static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
1562{
1563 if (sigset) {
1564 sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
1565 vcpu->sigset_active = 1;
1566 vcpu->sigset = *sigset;
1567 } else
1568 vcpu->sigset_active = 0;
1569 return 0;
1570}
1571
bccf2150
AK
1572static long kvm_vcpu_ioctl(struct file *filp,
1573 unsigned int ioctl, unsigned long arg)
6aa8b732 1574{
bccf2150 1575 struct kvm_vcpu *vcpu = filp->private_data;
2f366987 1576 void __user *argp = (void __user *)arg;
313a3dc7 1577 int r;
fa3795a7
DH
1578 struct kvm_fpu *fpu = NULL;
1579 struct kvm_sregs *kvm_sregs = NULL;
6aa8b732 1580
6d4e4c4f
AK
1581 if (vcpu->kvm->mm != current->mm)
1582 return -EIO;
2122ff5e
AK
1583
1584#if defined(CONFIG_S390) || defined(CONFIG_PPC)
1585 /*
1586 * Special cases: vcpu ioctls that are asynchronous to vcpu execution,
1587 * so vcpu_load() would break it.
1588 */
1589 if (ioctl == KVM_S390_INTERRUPT || ioctl == KVM_INTERRUPT)
1590 return kvm_arch_vcpu_ioctl(filp, ioctl, arg);
1591#endif
1592
1593
1594 vcpu_load(vcpu);
6aa8b732 1595 switch (ioctl) {
9a2bb7f4 1596 case KVM_RUN:
f0fe5108
AK
1597 r = -EINVAL;
1598 if (arg)
1599 goto out;
b6c7a5dc 1600 r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run);
64be5007 1601 trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
6aa8b732 1602 break;
6aa8b732 1603 case KVM_GET_REGS: {
3e4bb3ac 1604 struct kvm_regs *kvm_regs;
6aa8b732 1605
3e4bb3ac
XZ
1606 r = -ENOMEM;
1607 kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL);
1608 if (!kvm_regs)
6aa8b732 1609 goto out;
3e4bb3ac
XZ
1610 r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs);
1611 if (r)
1612 goto out_free1;
6aa8b732 1613 r = -EFAULT;
3e4bb3ac
XZ
1614 if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs)))
1615 goto out_free1;
6aa8b732 1616 r = 0;
3e4bb3ac
XZ
1617out_free1:
1618 kfree(kvm_regs);
6aa8b732
AK
1619 break;
1620 }
1621 case KVM_SET_REGS: {
3e4bb3ac 1622 struct kvm_regs *kvm_regs;
6aa8b732 1623
3e4bb3ac
XZ
1624 r = -ENOMEM;
1625 kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL);
1626 if (!kvm_regs)
6aa8b732 1627 goto out;
3e4bb3ac
XZ
1628 r = -EFAULT;
1629 if (copy_from_user(kvm_regs, argp, sizeof(struct kvm_regs)))
1630 goto out_free2;
1631 r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs);
6aa8b732 1632 if (r)
3e4bb3ac 1633 goto out_free2;
6aa8b732 1634 r = 0;
3e4bb3ac
XZ
1635out_free2:
1636 kfree(kvm_regs);
6aa8b732
AK
1637 break;
1638 }
1639 case KVM_GET_SREGS: {
fa3795a7
DH
1640 kvm_sregs = kzalloc(sizeof(struct kvm_sregs), GFP_KERNEL);
1641 r = -ENOMEM;
1642 if (!kvm_sregs)
1643 goto out;
1644 r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs);
6aa8b732
AK
1645 if (r)
1646 goto out;
1647 r = -EFAULT;
fa3795a7 1648 if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs)))
6aa8b732
AK
1649 goto out;
1650 r = 0;
1651 break;
1652 }
1653 case KVM_SET_SREGS: {
fa3795a7
DH
1654 kvm_sregs = kmalloc(sizeof(struct kvm_sregs), GFP_KERNEL);
1655 r = -ENOMEM;
1656 if (!kvm_sregs)
1657 goto out;
6aa8b732 1658 r = -EFAULT;
fa3795a7 1659 if (copy_from_user(kvm_sregs, argp, sizeof(struct kvm_sregs)))
6aa8b732 1660 goto out;
fa3795a7 1661 r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs);
6aa8b732
AK
1662 if (r)
1663 goto out;
1664 r = 0;
1665 break;
1666 }
62d9f0db
MT
1667 case KVM_GET_MP_STATE: {
1668 struct kvm_mp_state mp_state;
1669
1670 r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state);
1671 if (r)
1672 goto out;
1673 r = -EFAULT;
1674 if (copy_to_user(argp, &mp_state, sizeof mp_state))
1675 goto out;
1676 r = 0;
1677 break;
1678 }
1679 case KVM_SET_MP_STATE: {
1680 struct kvm_mp_state mp_state;
1681
1682 r = -EFAULT;
1683 if (copy_from_user(&mp_state, argp, sizeof mp_state))
1684 goto out;
1685 r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state);
1686 if (r)
1687 goto out;
1688 r = 0;
1689 break;
1690 }
6aa8b732
AK
1691 case KVM_TRANSLATE: {
1692 struct kvm_translation tr;
1693
1694 r = -EFAULT;
2f366987 1695 if (copy_from_user(&tr, argp, sizeof tr))
6aa8b732 1696 goto out;
8b006791 1697 r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
6aa8b732
AK
1698 if (r)
1699 goto out;
1700 r = -EFAULT;
2f366987 1701 if (copy_to_user(argp, &tr, sizeof tr))
6aa8b732
AK
1702 goto out;
1703 r = 0;
1704 break;
1705 }
d0bfb940
JK
1706 case KVM_SET_GUEST_DEBUG: {
1707 struct kvm_guest_debug dbg;
6aa8b732
AK
1708
1709 r = -EFAULT;
2f366987 1710 if (copy_from_user(&dbg, argp, sizeof dbg))
6aa8b732 1711 goto out;
d0bfb940 1712 r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg);
6aa8b732
AK
1713 if (r)
1714 goto out;
1715 r = 0;
1716 break;
1717 }
1961d276
AK
1718 case KVM_SET_SIGNAL_MASK: {
1719 struct kvm_signal_mask __user *sigmask_arg = argp;
1720 struct kvm_signal_mask kvm_sigmask;
1721 sigset_t sigset, *p;
1722
1723 p = NULL;
1724 if (argp) {
1725 r = -EFAULT;
1726 if (copy_from_user(&kvm_sigmask, argp,
1727 sizeof kvm_sigmask))
1728 goto out;
1729 r = -EINVAL;
1730 if (kvm_sigmask.len != sizeof sigset)
1731 goto out;
1732 r = -EFAULT;
1733 if (copy_from_user(&sigset, sigmask_arg->sigset,
1734 sizeof sigset))
1735 goto out;
1736 p = &sigset;
1737 }
376d41ff 1738 r = kvm_vcpu_ioctl_set_sigmask(vcpu, p);
1961d276
AK
1739 break;
1740 }
b8836737 1741 case KVM_GET_FPU: {
fa3795a7
DH
1742 fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL);
1743 r = -ENOMEM;
1744 if (!fpu)
1745 goto out;
1746 r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu);
b8836737
AK
1747 if (r)
1748 goto out;
1749 r = -EFAULT;
fa3795a7 1750 if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu)))
b8836737
AK
1751 goto out;
1752 r = 0;
1753 break;
1754 }
1755 case KVM_SET_FPU: {
fa3795a7
DH
1756 fpu = kmalloc(sizeof(struct kvm_fpu), GFP_KERNEL);
1757 r = -ENOMEM;
1758 if (!fpu)
1759 goto out;
b8836737 1760 r = -EFAULT;
fa3795a7 1761 if (copy_from_user(fpu, argp, sizeof(struct kvm_fpu)))
b8836737 1762 goto out;
fa3795a7 1763 r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu);
b8836737
AK
1764 if (r)
1765 goto out;
1766 r = 0;
1767 break;
1768 }
bccf2150 1769 default:
313a3dc7 1770 r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
bccf2150
AK
1771 }
1772out:
2122ff5e 1773 vcpu_put(vcpu);
fa3795a7
DH
1774 kfree(fpu);
1775 kfree(kvm_sregs);
bccf2150
AK
1776 return r;
1777}
1778
1779static long kvm_vm_ioctl(struct file *filp,
1780 unsigned int ioctl, unsigned long arg)
1781{
1782 struct kvm *kvm = filp->private_data;
1783 void __user *argp = (void __user *)arg;
1fe779f8 1784 int r;
bccf2150 1785
6d4e4c4f
AK
1786 if (kvm->mm != current->mm)
1787 return -EIO;
bccf2150
AK
1788 switch (ioctl) {
1789 case KVM_CREATE_VCPU:
1790 r = kvm_vm_ioctl_create_vcpu(kvm, arg);
1791 if (r < 0)
1792 goto out;
1793 break;
6fc138d2
IE
1794 case KVM_SET_USER_MEMORY_REGION: {
1795 struct kvm_userspace_memory_region kvm_userspace_mem;
1796
1797 r = -EFAULT;
1798 if (copy_from_user(&kvm_userspace_mem, argp,
1799 sizeof kvm_userspace_mem))
1800 goto out;
1801
1802 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 1);
6aa8b732
AK
1803 if (r)
1804 goto out;
1805 break;
1806 }
1807 case KVM_GET_DIRTY_LOG: {
1808 struct kvm_dirty_log log;
1809
1810 r = -EFAULT;
2f366987 1811 if (copy_from_user(&log, argp, sizeof log))
6aa8b732 1812 goto out;
2c6f5df9 1813 r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
6aa8b732
AK
1814 if (r)
1815 goto out;
1816 break;
1817 }
5f94c174
LV
1818#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
1819 case KVM_REGISTER_COALESCED_MMIO: {
1820 struct kvm_coalesced_mmio_zone zone;
1821 r = -EFAULT;
1822 if (copy_from_user(&zone, argp, sizeof zone))
1823 goto out;
5f94c174
LV
1824 r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone);
1825 if (r)
1826 goto out;
1827 r = 0;
1828 break;
1829 }
1830 case KVM_UNREGISTER_COALESCED_MMIO: {
1831 struct kvm_coalesced_mmio_zone zone;
1832 r = -EFAULT;
1833 if (copy_from_user(&zone, argp, sizeof zone))
1834 goto out;
5f94c174
LV
1835 r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone);
1836 if (r)
1837 goto out;
1838 r = 0;
1839 break;
1840 }
1841#endif
721eecbf
GH
1842 case KVM_IRQFD: {
1843 struct kvm_irqfd data;
1844
1845 r = -EFAULT;
1846 if (copy_from_user(&data, argp, sizeof data))
1847 goto out;
1848 r = kvm_irqfd(kvm, data.fd, data.gsi, data.flags);
1849 break;
1850 }
d34e6b17
GH
1851 case KVM_IOEVENTFD: {
1852 struct kvm_ioeventfd data;
1853
1854 r = -EFAULT;
1855 if (copy_from_user(&data, argp, sizeof data))
1856 goto out;
1857 r = kvm_ioeventfd(kvm, &data);
1858 break;
1859 }
73880c80
GN
1860#ifdef CONFIG_KVM_APIC_ARCHITECTURE
1861 case KVM_SET_BOOT_CPU_ID:
1862 r = 0;
894a9c55 1863 mutex_lock(&kvm->lock);
73880c80
GN
1864 if (atomic_read(&kvm->online_vcpus) != 0)
1865 r = -EBUSY;
1866 else
1867 kvm->bsp_vcpu_id = arg;
894a9c55 1868 mutex_unlock(&kvm->lock);
73880c80
GN
1869 break;
1870#endif
f17abe9a 1871 default:
1fe779f8 1872 r = kvm_arch_vm_ioctl(filp, ioctl, arg);
bfd99ff5
AK
1873 if (r == -ENOTTY)
1874 r = kvm_vm_ioctl_assigned_device(kvm, ioctl, arg);
f17abe9a
AK
1875 }
1876out:
1877 return r;
1878}
1879
6ff5894c
AB
1880#ifdef CONFIG_COMPAT
1881struct compat_kvm_dirty_log {
1882 __u32 slot;
1883 __u32 padding1;
1884 union {
1885 compat_uptr_t dirty_bitmap; /* one bit per page */
1886 __u64 padding2;
1887 };
1888};
1889
1890static long kvm_vm_compat_ioctl(struct file *filp,
1891 unsigned int ioctl, unsigned long arg)
1892{
1893 struct kvm *kvm = filp->private_data;
1894 int r;
1895
1896 if (kvm->mm != current->mm)
1897 return -EIO;
1898 switch (ioctl) {
1899 case KVM_GET_DIRTY_LOG: {
1900 struct compat_kvm_dirty_log compat_log;
1901 struct kvm_dirty_log log;
1902
1903 r = -EFAULT;
1904 if (copy_from_user(&compat_log, (void __user *)arg,
1905 sizeof(compat_log)))
1906 goto out;
1907 log.slot = compat_log.slot;
1908 log.padding1 = compat_log.padding1;
1909 log.padding2 = compat_log.padding2;
1910 log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
1911
1912 r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
1913 if (r)
1914 goto out;
1915 break;
1916 }
1917 default:
1918 r = kvm_vm_ioctl(filp, ioctl, arg);
1919 }
1920
1921out:
1922 return r;
1923}
1924#endif
1925
1926static int kvm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1927{
1928 struct page *page[1];
1929 unsigned long addr;
1930 int npages;
1931 gfn_t gfn = vmf->pgoff;
1932	struct kvm *kvm = vma->vm_file->private_data;
1933
1934 addr = gfn_to_hva(kvm, gfn);
1935 if (kvm_is_error_hva(addr))
1936		return VM_FAULT_SIGBUS;
1937
1938 npages = get_user_pages(current, current->mm, addr, 1, 1, 0, page,
1939 NULL);
1940 if (unlikely(npages != 1))
1941		return VM_FAULT_SIGBUS;
1942
1943 vmf->page = page[0];
1944	return 0;
1945}
1946
1947static const struct vm_operations_struct kvm_vm_vm_ops = {
1948	.fault = kvm_vm_fault,
1949};
1950
1951static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma)
1952{
1953 vma->vm_ops = &kvm_vm_vm_ops;
1954 return 0;
1955}
1956
1957static struct file_operations kvm_vm_fops = {
1958 .release = kvm_vm_release,
1959 .unlocked_ioctl = kvm_vm_ioctl,
1960#ifdef CONFIG_COMPAT
1961 .compat_ioctl = kvm_vm_compat_ioctl,
1962#endif
1963	.mmap = kvm_vm_mmap,
1964	.llseek = noop_llseek,
1965};
1966
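/*
 * KVM_CREATE_VM: create the VM and hand it to userspace as an anonymous
 * inode file descriptor backed by kvm_vm_fops; the fd holds the reference
 * taken by kvm_create_vm(), which is dropped with kvm_put_kvm() on failure.
 */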
1967static int kvm_dev_ioctl_create_vm(void)
1968{
1969	int r;
1970 struct kvm *kvm;
1971
1972	kvm = kvm_create_vm();
1973 if (IS_ERR(kvm))
1974 return PTR_ERR(kvm);
1975#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
1976 r = kvm_coalesced_mmio_init(kvm);
1977 if (r < 0) {
1978 kvm_put_kvm(kvm);
1979 return r;
1980 }
1981#endif
1982 r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
1983 if (r < 0)
1984		kvm_put_kvm(kvm);
1985
1986	return r;
1987}
1988
1989static long kvm_dev_ioctl_check_extension_generic(long arg)
1990{
1991 switch (arg) {
1992	case KVM_CAP_USER_MEMORY:
1993	case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
1994	case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS:
1995#ifdef CONFIG_KVM_APIC_ARCHITECTURE
1996 case KVM_CAP_SET_BOOT_CPU_ID:
1997#endif
1998	case KVM_CAP_INTERNAL_ERROR_DATA:
1999		return 1;
2000#ifdef CONFIG_HAVE_KVM_IRQCHIP
2001 case KVM_CAP_IRQ_ROUTING:
2002		return KVM_MAX_IRQ_ROUTES;
2003#endif
2004 default:
2005 break;
2006 }
2007 return kvm_dev_ioctl_check_extension(arg);
2008}
2009
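/*
 * Illustrative only (not part of this file): a minimal userspace sequence
 * against the /dev/kvm ioctls handled below might look like
 *
 *	int kvm = open("/dev/kvm", O_RDWR);
 *	if (ioctl(kvm, KVM_GET_API_VERSION, 0) != KVM_API_VERSION)
 *		exit(1);
 *	int vm = ioctl(kvm, KVM_CREATE_VM, 0);	 // returns a VM fd
 *
 * KVM_CHECK_EXTENSION and KVM_GET_VCPU_MMAP_SIZE are queried the same way.
 */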
2010static long kvm_dev_ioctl(struct file *filp,
2011 unsigned int ioctl, unsigned long arg)
2012{
2013	long r = -EINVAL;
2014
2015 switch (ioctl) {
2016 case KVM_GET_API_VERSION:
2017 r = -EINVAL;
2018 if (arg)
2019 goto out;
2020 r = KVM_API_VERSION;
2021 break;
2022 case KVM_CREATE_VM:
2023 r = -EINVAL;
2024 if (arg)
2025 goto out;
2026 r = kvm_dev_ioctl_create_vm();
2027 break;
2028	case KVM_CHECK_EXTENSION:
2029		r = kvm_dev_ioctl_check_extension_generic(arg);
2030		break;
2031 case KVM_GET_VCPU_MMAP_SIZE:
2032 r = -EINVAL;
2033 if (arg)
2034 goto out;
2035 r = PAGE_SIZE; /* struct kvm_run */
2036#ifdef CONFIG_X86
2037 r += PAGE_SIZE; /* pio data page */
2038#endif
2039#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
2040 r += PAGE_SIZE; /* coalesced mmio ring page */
2041#endif
2042		break;
2043 case KVM_TRACE_ENABLE:
2044 case KVM_TRACE_PAUSE:
2045 case KVM_TRACE_DISABLE:
2046		r = -EOPNOTSUPP;
2047		break;
2048	default:
2049		return kvm_arch_dev_ioctl(filp, ioctl, arg);
2050 }
2051out:
2052 return r;
2053}
2054
2055static struct file_operations kvm_chardev_ops = {
2056 .unlocked_ioctl = kvm_dev_ioctl,
2057 .compat_ioctl = kvm_dev_ioctl,
2058	.llseek = noop_llseek,
2059};
2060
2061static struct miscdevice kvm_dev = {
2062	KVM_MINOR,
2063 "kvm",
2064 &kvm_chardev_ops,
2065};
2066
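/*
 * Per-CPU enable/disable of the virtualization extensions.  The
 * cpus_hardware_enabled mask records which CPUs are already enabled so these
 * helpers stay idempotent; enable failures are counted in
 * hardware_enable_failed for hardware_enable_all() to check.
 */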
2067static void hardware_enable_nolock(void *junk)
2068{
2069 int cpu = raw_smp_processor_id();
2070	int r;
2071
2072	if (cpumask_test_cpu(cpu, cpus_hardware_enabled))
2073		return;
2074
2075	cpumask_set_cpu(cpu, cpus_hardware_enabled);
2076
2077 r = kvm_arch_hardware_enable(NULL);
2078
2079 if (r) {
2080 cpumask_clear_cpu(cpu, cpus_hardware_enabled);
2081 atomic_inc(&hardware_enable_failed);
2082 printk(KERN_INFO "kvm: enabling virtualization on "
2083 "CPU%d failed\n", cpu);
2084 }
2085}
2086
2087static void hardware_enable(void *junk)
2088{
2089 spin_lock(&kvm_lock);
2090 hardware_enable_nolock(junk);
2091 spin_unlock(&kvm_lock);
2092}
2093
2094static void hardware_disable_nolock(void *junk)
2095{
2096 int cpu = raw_smp_processor_id();
2097
2098	if (!cpumask_test_cpu(cpu, cpus_hardware_enabled))
2099		return;
2100	cpumask_clear_cpu(cpu, cpus_hardware_enabled);
2101	kvm_arch_hardware_disable(NULL);
2102}
2103
2104static void hardware_disable(void *junk)
2105{
2106 spin_lock(&kvm_lock);
2107 hardware_disable_nolock(junk);
2108 spin_unlock(&kvm_lock);
2109}
2110
2111static void hardware_disable_all_nolock(void)
2112{
2113 BUG_ON(!kvm_usage_count);
2114
2115 kvm_usage_count--;
2116 if (!kvm_usage_count)
2117		on_each_cpu(hardware_disable_nolock, NULL, 1);
2118}
2119
2120static void hardware_disable_all(void)
2121{
2122 spin_lock(&kvm_lock);
2123 hardware_disable_all_nolock();
2124 spin_unlock(&kvm_lock);
2125}
2126
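/*
 * kvm_usage_count tracks the number of VMs: the first VM enables
 * virtualization on every online CPU and the last one to go away disables it
 * again.  If any CPU fails to enable, the whole operation is rolled back and
 * -EBUSY is returned.
 */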
2127static int hardware_enable_all(void)
2128{
2129 int r = 0;
2130
2131 spin_lock(&kvm_lock);
2132
2133 kvm_usage_count++;
2134 if (kvm_usage_count == 1) {
2135 atomic_set(&hardware_enable_failed, 0);
2136		on_each_cpu(hardware_enable_nolock, NULL, 1);
2137
2138 if (atomic_read(&hardware_enable_failed)) {
2139 hardware_disable_all_nolock();
2140 r = -EBUSY;
2141 }
2142 }
2143
2144 spin_unlock(&kvm_lock);
2145
2146 return r;
2147}
2148
2149static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
2150 void *v)
2151{
2152 int cpu = (long)v;
2153
2154 if (!kvm_usage_count)
2155 return NOTIFY_OK;
2156
2157	val &= ~CPU_TASKS_FROZEN;
2158	switch (val) {
2159	case CPU_DYING:
2160 printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
2161 cpu);
2162 hardware_disable(NULL);
2163 break;
2164	case CPU_STARTING:
2165 printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
2166 cpu);
2167		hardware_enable(NULL);
2168 break;
2169 }
2170 return NOTIFY_OK;
2171}
2172
2173
2174asmlinkage void kvm_handle_fault_on_reboot(void)
2175{
2176	if (kvm_rebooting) {
2177		/* spin while reset goes on */
2178		local_irq_enable();
2179		while (true)
2180			cpu_relax();
2181	}
2182 /* Fault while not rebooting. We want the trace. */
2183 BUG();
2184}
2185EXPORT_SYMBOL_GPL(kvm_handle_fault_on_reboot);
2186
2187static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
2188		      void *v)
2189{
2190 /*
2191 * Some (well, at least mine) BIOSes hang on reboot if
2192 * in vmx root mode.
2193 *
2194	 * And Intel TXT requires VMX to be off on all CPUs when the system shuts down.
2195 */
2196 printk(KERN_INFO "kvm: exiting hardware virtualization\n");
2197 kvm_rebooting = true;
2198	on_each_cpu(hardware_disable_nolock, NULL, 1);
2199 return NOTIFY_OK;
2200}
2201
2202static struct notifier_block kvm_reboot_notifier = {
2203 .notifier_call = kvm_reboot,
2204 .priority = 0,
2205};
2206
2207static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
2208{
2209 int i;
2210
2211 for (i = 0; i < bus->dev_count; i++) {
2212 struct kvm_io_device *pos = bus->devs[i];
2213
2214 kvm_iodevice_destructor(pos);
2215 }
2216	kfree(bus);
2217}
2218
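/*
 * I/O bus dispatch: under SRCU, walk the device array and let the first
 * device that accepts the address handle the access; -EOPNOTSUPP means no
 * device claimed it.
 */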
2219/* kvm_io_bus_write - called under kvm->slots_lock */
2220int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
2221		     int len, const void *val)
2222{
2223 int i;
2224 struct kvm_io_bus *bus;
2225
2226 bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
2227 for (i = 0; i < bus->dev_count; i++)
2228 if (!kvm_iodevice_write(bus->devs[i], addr, len, val))
2229 return 0;
2230 return -EOPNOTSUPP;
2231}
2232
2233/* kvm_io_bus_read - called under kvm->slots_lock */
2234int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
2235 int len, void *val)
2236{
2237 int i;
2238	struct kvm_io_bus *bus;
2239
2240	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
2241 for (i = 0; i < bus->dev_count; i++)
2242 if (!kvm_iodevice_read(bus->devs[i], addr, len, val))
2243 return 0;
2244 return -EOPNOTSUPP;
2245}
2246
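/*
 * Bus updates use a copy-and-replace scheme: a new kvm_io_bus is built,
 * published with rcu_assign_pointer() and readers are flushed with
 * synchronize_srcu_expedited() before the old array is freed.
 */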
2247/* Caller must hold slots_lock. */
2248int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx,
2249 struct kvm_io_device *dev)
2250{
2251	struct kvm_io_bus *new_bus, *bus;
2252
2253	bus = kvm->buses[bus_idx];
2254 if (bus->dev_count > NR_IOBUS_DEVS-1)
2255 return -ENOSPC;
2256
2257 new_bus = kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL);
2258 if (!new_bus)
2259 return -ENOMEM;
2260 memcpy(new_bus, bus, sizeof(struct kvm_io_bus));
2261 new_bus->devs[new_bus->dev_count++] = dev;
2262 rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
2263 synchronize_srcu_expedited(&kvm->srcu);
2264 kfree(bus);
2265
2266 return 0;
2267}
2268
2269/* Caller must hold slots_lock. */
2270int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
2271 struct kvm_io_device *dev)
2272{
2273 int i, r;
2274 struct kvm_io_bus *new_bus, *bus;
2275
2276 new_bus = kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL);
2277 if (!new_bus)
2278 return -ENOMEM;
2279
2280 bus = kvm->buses[bus_idx];
2281 memcpy(new_bus, bus, sizeof(struct kvm_io_bus));
2282
2283 r = -ENOENT;
2284 for (i = 0; i < new_bus->dev_count; i++)
2285 if (new_bus->devs[i] == dev) {
2286 r = 0;
2287 new_bus->devs[i] = new_bus->devs[--new_bus->dev_count];
2288 break;
2289 }
2290
2291 if (r) {
2292 kfree(new_bus);
2293 return r;
2294 }
2295
2296 rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
2297 synchronize_srcu_expedited(&kvm->srcu);
2298 kfree(bus);
2299 return r;
2300}
2301
2302static struct notifier_block kvm_cpu_notifier = {
2303 .notifier_call = kvm_cpu_hotplug,
2304};
2305
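/*
 * debugfs statistics: each debugfs_entries item is an offset into struct kvm
 * or struct kvm_vcpu; the getters below sum that counter over all VMs (and
 * vcpus) under kvm_lock.
 */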
2306static int vm_stat_get(void *_offset, u64 *val)
2307{
2308 unsigned offset = (long)_offset;
2309 struct kvm *kvm;
2310
2311	*val = 0;
2312 spin_lock(&kvm_lock);
2313 list_for_each_entry(kvm, &vm_list, vm_list)
2314		*val += *(u32 *)((void *)kvm + offset);
2315	spin_unlock(&kvm_lock);
2316	return 0;
2317}
2318
2319DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, NULL, "%llu\n");
2320
2321static int vcpu_stat_get(void *_offset, u64 *val)
2322{
2323 unsigned offset = (long)_offset;
2324 struct kvm *kvm;
2325 struct kvm_vcpu *vcpu;
2326 int i;
2327
2328	*val = 0;
2329 spin_lock(&kvm_lock);
2330 list_for_each_entry(kvm, &vm_list, vm_list)
2331 kvm_for_each_vcpu(i, vcpu, kvm)
2332 *val += *(u32 *)((void *)vcpu + offset);
2333
2334	spin_unlock(&kvm_lock);
2335	return 0;
2336}
2337
2338DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, NULL, "%llu\n");
2339
2340static const struct file_operations *stat_fops[] = {
2341 [KVM_STAT_VCPU] = &vcpu_stat_fops,
2342 [KVM_STAT_VM] = &vm_stat_fops,
2343};
2344
2345static void kvm_init_debug(void)
2346{
2347 struct kvm_stats_debugfs_item *p;
2348
2349	kvm_debugfs_dir = debugfs_create_dir("kvm", NULL);
2350	for (p = debugfs_entries; p->name; ++p)
2351		p->dentry = debugfs_create_file(p->name, 0444, kvm_debugfs_dir,
2352						(void *)(long)p->offset,
2353						stat_fops[p->kind]);
2354}
2355
2356static void kvm_exit_debug(void)
2357{
2358 struct kvm_stats_debugfs_item *p;
2359
2360 for (p = debugfs_entries; p->name; ++p)
2361 debugfs_remove(p->dentry);
2362	debugfs_remove(kvm_debugfs_dir);
2363}
2364
2365static int kvm_suspend(struct sys_device *dev, pm_message_t state)
2366{
2367	if (kvm_usage_count)
2368		hardware_disable_nolock(NULL);
2369 return 0;
2370}
2371
2372static int kvm_resume(struct sys_device *dev)
2373{
2374 if (kvm_usage_count) {
2375 WARN_ON(spin_is_locked(&kvm_lock));
2376		hardware_enable_nolock(NULL);
2377	}
2378 return 0;
2379}
2380
2381static struct sysdev_class kvm_sysdev_class = {
2382	.name = "kvm",
2383 .suspend = kvm_suspend,
2384 .resume = kvm_resume,
2385};
2386
2387static struct sys_device kvm_sysdev = {
2388 .id = 0,
2389 .cls = &kvm_sysdev_class,
2390};
2391
2392struct page *bad_page;
2393pfn_t bad_pfn;
2394
2395static inline
2396struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
2397{
2398 return container_of(pn, struct kvm_vcpu, preempt_notifier);
2399}
2400
2401static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
2402{
2403 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
2404
2405	kvm_arch_vcpu_load(vcpu, cpu);
2406}
2407
2408static void kvm_sched_out(struct preempt_notifier *pn,
2409 struct task_struct *next)
2410{
2411 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
2412
2413	kvm_arch_vcpu_put(vcpu);
2414}
2415
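/*
 * Module init: arch init, the reserved bad/hwpoison/fault pages, hardware
 * and processor-compatibility setup, cpu-hotplug/reboot notifiers, sysdev,
 * the vcpu kmem cache, async_pf and finally the /dev/kvm misc device.
 * Error paths unwind in reverse order via the out_free_* labels.
 */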
2416int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
2417		  struct module *module)
2418{
2419 int r;
2420	int cpu;
2421
2422 r = kvm_arch_init(opaque);
2423 if (r)
2424		goto out_fail;
2425
2426 bad_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
2427
2428 if (bad_page == NULL) {
2429 r = -ENOMEM;
2430 goto out;
2431 }
2432
2433 bad_pfn = page_to_pfn(bad_page);
2434
2435 hwpoison_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
2436
2437 if (hwpoison_page == NULL) {
2438 r = -ENOMEM;
2439 goto out_free_0;
2440 }
2441
2442 hwpoison_pfn = page_to_pfn(hwpoison_page);
2443
2444 fault_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
2445
2446 if (fault_page == NULL) {
2447 r = -ENOMEM;
2448 goto out_free_0;
2449 }
2450
2451 fault_pfn = page_to_pfn(fault_page);
2452
2453	if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
2454 r = -ENOMEM;
2455 goto out_free_0;
2456 }
2457
2458	r = kvm_arch_hardware_setup();
2459	if (r < 0)
2460		goto out_free_0a;
2461
2462 for_each_online_cpu(cpu) {
2463 smp_call_function_single(cpu,
2464					 kvm_arch_check_processor_compat,
2465					 &r, 1);
2466		if (r < 0)
2467			goto out_free_1;
2468 }
2469
2470 r = register_cpu_notifier(&kvm_cpu_notifier);
2471 if (r)
2472		goto out_free_2;
2473 register_reboot_notifier(&kvm_reboot_notifier);
2474
2475 r = sysdev_class_register(&kvm_sysdev_class);
2476 if (r)
2477		goto out_free_3;
2478
2479 r = sysdev_register(&kvm_sysdev);
2480 if (r)
2481		goto out_free_4;
2482
2483	/* A kmem cache lets us meet the alignment requirements of fx_save. */
2484 if (!vcpu_align)
2485 vcpu_align = __alignof__(struct kvm_vcpu);
2486 kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size, vcpu_align,
2487					   0, NULL);
2488 if (!kvm_vcpu_cache) {
2489 r = -ENOMEM;
2490		goto out_free_5;
2491 }
2492
2493 r = kvm_async_pf_init();
2494 if (r)
2495 goto out_free;
2496
2497	kvm_chardev_ops.owner = module;
2498 kvm_vm_fops.owner = module;
2499 kvm_vcpu_fops.owner = module;
2500
2501 r = misc_register(&kvm_dev);
2502 if (r) {
2503		printk(KERN_ERR "kvm: misc device register failed\n");
2504		goto out_unreg;
2505 }
2506
2507 kvm_preempt_ops.sched_in = kvm_sched_in;
2508 kvm_preempt_ops.sched_out = kvm_sched_out;
2509
2510 kvm_init_debug();
2511
2512	return 0;
2513
2514out_unreg:
2515 kvm_async_pf_deinit();
2516out_free:
2517	kmem_cache_destroy(kvm_vcpu_cache);
2518out_free_5:
2519	sysdev_unregister(&kvm_sysdev);
2520out_free_4:
2521	sysdev_class_unregister(&kvm_sysdev_class);
2522out_free_3:
2523	unregister_reboot_notifier(&kvm_reboot_notifier);
2524	unregister_cpu_notifier(&kvm_cpu_notifier);
2525out_free_2:
2526out_free_1:
2527	kvm_arch_hardware_unsetup();
2528out_free_0a:
2529 free_cpumask_var(cpus_hardware_enabled);
2530out_free_0:
2531 if (fault_page)
2532 __free_page(fault_page);
2533 if (hwpoison_page)
2534 __free_page(hwpoison_page);
2535	__free_page(bad_page);
2536out:
2537	kvm_arch_exit();
2538out_fail:
2539 return r;
2540}
2541EXPORT_SYMBOL_GPL(kvm_init);
2542
2543void kvm_exit(void)
2544{
2545	kvm_exit_debug();
2546	misc_deregister(&kvm_dev);
2547	kmem_cache_destroy(kvm_vcpu_cache);
2548	kvm_async_pf_deinit();
2549 sysdev_unregister(&kvm_sysdev);
2550 sysdev_class_unregister(&kvm_sysdev_class);
2551	unregister_reboot_notifier(&kvm_reboot_notifier);
2552	unregister_cpu_notifier(&kvm_cpu_notifier);
2553	on_each_cpu(hardware_disable_nolock, NULL, 1);
2554	kvm_arch_hardware_unsetup();
2555	kvm_arch_exit();
2556	free_cpumask_var(cpus_hardware_enabled);
2557	__free_page(hwpoison_page);
2558	__free_page(bad_page);
2559}
2560EXPORT_SYMBOL_GPL(kvm_exit);