arch/x86/kvm/x86.c

   1 /*
   2  * Kernel-based Virtual Machine driver for Linux
   3  *
   4  * derived from drivers/kvm/kvm_main.c
   5  *
   6  * Copyright (C) 2006 Qumranet, Inc.
   7  * Copyright (C) 2008 Qumranet, Inc.
   8  * Copyright IBM Corporation, 2008
   9  *
  10  * Authors:
  11  *   Avi Kivity   <avi@qumranet.com>
  12  *   Yaniv Kamay  <yaniv@qumranet.com>
  13  *   Amit Shah    <amit.shah@qumranet.com>
  14  *   Ben-Ami Yassour <benami@il.ibm.com>
  15  *
  16  * This work is licensed under the terms of the GNU GPL, version 2.  See
  17  * the COPYING file in the top-level directory.
  18  *
  19  */
  20
  21 #include <linux/kvm_host.h>
  22 #include "irq.h"
  23 #include "mmu.h"
  24 #include "i8254.h"
  25 #include "tss.h"
  26 #include "kvm_cache_regs.h"
  27 #include "x86.h"
  28
  29 #include <linux/clocksource.h>
  30 #include <linux/interrupt.h>
  31 #include <linux/kvm.h>
  32 #include <linux/fs.h>
  33 #include <linux/vmalloc.h>
  34 #include <linux/module.h>
  35 #include <linux/mman.h>
  36 #include <linux/highmem.h>
  37 #include <linux/iommu.h>
  38 #include <linux/intel-iommu.h>
  39
  40 #include <asm/uaccess.h>
  41 #include <asm/msr.h>
  42 #include <asm/desc.h>
  43 #include <asm/mtrr.h>
  44
  45 #define MAX_IO_MSRS 256
  46 #define CR0_RESERVED_BITS                                               \
  47         (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
  48                           | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
  49                           | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
  50 #define CR4_RESERVED_BITS                                               \
  51         (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
  52                           | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE     \
  53                           | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR  \
  54                           | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
  55
  56 #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
  57 /* EFER defaults:
  58  * - enable syscall per default because its emulated by KVM
  59  * - enable LME and LMA per default on 64 bit KVM
  60  */
  61 #ifdef CONFIG_X86_64
  62 static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffafeULL;
  63 #else
  64 static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffffeULL;
  65 #endif
  66
  67 #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
  68 #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
  69
  70 static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
  71                                     struct kvm_cpuid_entry2 __user *entries);
  72 struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
  73                                               u32 function, u32 index);
  74
  75 struct kvm_x86_ops *kvm_x86_ops;
  76 EXPORT_SYMBOL_GPL(kvm_x86_ops);
  77
  78 struct kvm_stats_debugfs_item debugfs_entries[] = {
  79         { "pf_fixed", VCPU_STAT(pf_fixed) },
  80         { "pf_guest", VCPU_STAT(pf_guest) },
  81         { "tlb_flush", VCPU_STAT(tlb_flush) },
  82         { "invlpg", VCPU_STAT(invlpg) },
  83         { "exits", VCPU_STAT(exits) },
  84         { "io_exits", VCPU_STAT(io_exits) },
  85         { "mmio_exits", VCPU_STAT(mmio_exits) },
  86         { "signal_exits", VCPU_STAT(signal_exits) },
  87         { "irq_window", VCPU_STAT(irq_window_exits) },
  88         { "nmi_window", VCPU_STAT(nmi_window_exits) },
  89         { "halt_exits", VCPU_STAT(halt_exits) },
  90         { "halt_wakeup", VCPU_STAT(halt_wakeup) },
  91         { "hypercalls", VCPU_STAT(hypercalls) },
  92         { "request_irq", VCPU_STAT(request_irq_exits) },
  93         { "request_nmi", VCPU_STAT(request_nmi_exits) },
  94         { "irq_exits", VCPU_STAT(irq_exits) },
  95         { "host_state_reload", VCPU_STAT(host_state_reload) },
  96         { "efer_reload", VCPU_STAT(efer_reload) },
  97         { "fpu_reload", VCPU_STAT(fpu_reload) },
  98         { "insn_emulation", VCPU_STAT(insn_emulation) },
  99         { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
 100         { "irq_injections", VCPU_STAT(irq_injections) },
 101         { "nmi_injections", VCPU_STAT(nmi_injections) },
 102         { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
 103         { "mmu_pte_write", VM_STAT(mmu_pte_write) },
 104         { "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
 105         { "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },
 106         { "mmu_flooded", VM_STAT(mmu_flooded) },
 107         { "mmu_recycled", VM_STAT(mmu_recycled) },
 108         { "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
 109         { "mmu_unsync", VM_STAT(mmu_unsync) },
 110         { "mmu_unsync_global", VM_STAT(mmu_unsync_global) },
 111         { "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
 112         { "largepages", VM_STAT(lpages) },
 113         { NULL }
 114 };
 115
 116 unsigned long segment_base(u16 selector)
 117 {
 118         struct descriptor_table gdt;
 119         struct desc_struct *d;
 120         unsigned long table_base;
 121         unsigned long v;
 122
 123         if (selector == 0)
 124                 return 0;
 125
 126         asm("sgdt %0" : "=m"(gdt));
 127         table_base = gdt.base;
 128
 129         if (selector & 4) {           /* from ldt */
 130                 u16 ldt_selector;
 131
 132                 asm("sldt %0" : "=g"(ldt_selector));
 133                 table_base = segment_base(ldt_selector);
 134         }
 135         d = (struct desc_struct *)(table_base + (selector & ~7));
 136         v = d->base0 | ((unsigned long)d->base1 << 16) |
 137                 ((unsigned long)d->base2 << 24);
 138 #ifdef CONFIG_X86_64
 139         if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
 140                 v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
 141 #endif
 142         return v;
 143 }
 144 EXPORT_SYMBOL_GPL(segment_base);
 145
 146 u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
 147 {
 148         if (irqchip_in_kernel(vcpu->kvm))
 149                 return vcpu->arch.apic_base;
 150         else
 151                 return vcpu->arch.apic_base;
 152 }
 153 EXPORT_SYMBOL_GPL(kvm_get_apic_base);
 154
 155 void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
 156 {
 157         /* TODO: reserve bits check */
 158         if (irqchip_in_kernel(vcpu->kvm))
 159                 kvm_lapic_set_base(vcpu, data);
 160         else
 161                 vcpu->arch.apic_base = data;
 162 }
 163 EXPORT_SYMBOL_GPL(kvm_set_apic_base);
 164
 165 void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
 166 {
 167         WARN_ON(vcpu->arch.exception.pending);
 168         vcpu->arch.exception.pending = true;
 169         vcpu->arch.exception.has_error_code = false;
 170         vcpu->arch.exception.nr = nr;
 171 }
 172 EXPORT_SYMBOL_GPL(kvm_queue_exception);
 173
 174 void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
 175                            u32 error_code)
 176 {
 177         ++vcpu->stat.pf_guest;
 178
 179         if (vcpu->arch.exception.pending) {
 180                 if (vcpu->arch.exception.nr == PF_VECTOR) {
 181                         printk(KERN_DEBUG "kvm: inject_page_fault:"
 182                                         " double fault 0x%lx\n", addr);
 183                         vcpu->arch.exception.nr = DF_VECTOR;
 184                         vcpu->arch.exception.error_code = 0;
 185                 } else if (vcpu->arch.exception.nr == DF_VECTOR) {
 186                         /* triple fault -> shutdown */
 187                         set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
 188                 }
 189                 return;
 190         }
 191         vcpu->arch.cr2 = addr;
 192         kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
 193 }
 194
 195 void kvm_inject_nmi(struct kvm_vcpu *vcpu)
 196 {
 197         vcpu->arch.nmi_pending = 1;
 198 }
 199 EXPORT_SYMBOL_GPL(kvm_inject_nmi);
 200
 201 void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
 202 {
 203         WARN_ON(vcpu->arch.exception.pending);
 204         vcpu->arch.exception.pending = true;
 205         vcpu->arch.exception.has_error_code = true;
 206         vcpu->arch.exception.nr = nr;
 207         vcpu->arch.exception.error_code = error_code;
 208 }
 209 EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
 210
 211 static void __queue_exception(struct kvm_vcpu *vcpu)
 212 {
 213         kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr,
 214                                      vcpu->arch.exception.has_error_code,
 215                                      vcpu->arch.exception.error_code);
 216 }
 217
 218 /*
 219  * Load the pae pdptrs.  Return true is they are all valid.
 220  */
 221 int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
 222 {
 223         gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
 224         unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
 225         int i;
 226         int ret;
 227         u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
 228
 229         ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte,
 230                                   offset * sizeof(u64), sizeof(pdpte));
 231         if (ret < 0) {
 232                 ret = 0;
 233                 goto out;
 234         }
 235         for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
 236                 if ((pdpte[i] & 1) && (pdpte[i] & 0xfffffff0000001e6ull)) {
 237                         ret = 0;
 238                         goto out;
 239                 }
 240         }
 241         ret = 1;
 242
 243         memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs));
 244 out:
 245
 246         return ret;
 247 }
 248 EXPORT_SYMBOL_GPL(load_pdptrs);
 249
 250 static bool pdptrs_changed(struct kvm_vcpu *vcpu)
 251 {
 252         u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
 253         bool changed = true;
 254         int r;
 255
 256         if (is_long_mode(vcpu) || !is_pae(vcpu))
 257                 return false;
 258
 259         r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte));
 260         if (r < 0)
 261                 goto out;
 262         changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0;
 263 out:
 264
 265         return changed;
 266 }
 267
 268 void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 269 {
 270         if (cr0 & CR0_RESERVED_BITS) {
 271                 printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
 272                        cr0, vcpu->arch.cr0);
 273                 kvm_inject_gp(vcpu, 0);
 274                 return;
 275         }
 276
 277         if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {
 278                 printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n");
 279                 kvm_inject_gp(vcpu, 0);
 280                 return;
 281         }
 282
 283         if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) {
 284                 printk(KERN_DEBUG "set_cr0: #GP, set PG flag "
 285                        "and a clear PE flag\n");
 286                 kvm_inject_gp(vcpu, 0);
 287                 return;
 288         }
 289
 290         if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
 291 #ifdef CONFIG_X86_64
 292                 if ((vcpu->arch.shadow_efer & EFER_LME)) {
 293                         int cs_db, cs_l;
 294
 295                         if (!is_pae(vcpu)) {
 296                                 printk(KERN_DEBUG "set_cr0: #GP, start paging "
 297                                        "in long mode while PAE is disabled\n");
 298                                 kvm_inject_gp(vcpu, 0);
 299                                 return;
 300                         }
 301                         kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
 302                         if (cs_l) {
 303                                 printk(KERN_DEBUG "set_cr0: #GP, start paging "
 304                                        "in long mode while CS.L == 1\n");
 305                                 kvm_inject_gp(vcpu, 0);
 306                                 return;
 307
 308                         }
 309                 } else
 310 #endif
 311                 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
 312                         printk(KERN_DEBUG "set_cr0: #GP, pdptrs "
 313                                "reserved bits\n");
 314                         kvm_inject_gp(vcpu, 0);
 315                         return;
 316                 }
 317
 318         }
 319
 320         kvm_x86_ops->set_cr0(vcpu, cr0);
 321         vcpu->arch.cr0 = cr0;
 322
 323         kvm_mmu_sync_global(vcpu);
 324         kvm_mmu_reset_context(vcpu);
 325         return;
 326 }
 327 EXPORT_SYMBOL_GPL(kvm_set_cr0);
 328
 329 void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
 330 {
 331         kvm_set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f));
 332         KVMTRACE_1D(LMSW, vcpu,
 333                     (u32)((vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)),
 334                     handler);
 335 }
 336 EXPORT_SYMBOL_GPL(kvm_lmsw);
 337
 338 void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 339 {
 340         if (cr4 & CR4_RESERVED_BITS) {
 341                 printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
 342                 kvm_inject_gp(vcpu, 0);
 343                 return;
 344         }
 345
 346         if (is_long_mode(vcpu)) {
 347                 if (!(cr4 & X86_CR4_PAE)) {
 348                         printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while "
 349                                "in long mode\n");
 350                         kvm_inject_gp(vcpu, 0);
 351                         return;
 352                 }
 353         } else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & X86_CR4_PAE)
 354                    && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
 355                 printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
 356                 kvm_inject_gp(vcpu, 0);
 357                 return;
 358         }
 359
 360         if (cr4 & X86_CR4_VMXE) {
 361                 printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
 362                 kvm_inject_gp(vcpu, 0);
 363                 return;
 364         }
 365         kvm_x86_ops->set_cr4(vcpu, cr4);
 366         vcpu->arch.cr4 = cr4;
 367         kvm_mmu_sync_global(vcpu);
 368         kvm_mmu_reset_context(vcpu);
 369 }
 370 EXPORT_SYMBOL_GPL(kvm_set_cr4);
 371
 372 void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 373 {
 374         if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
 375                 kvm_mmu_sync_roots(vcpu);
 376                 kvm_mmu_flush_tlb(vcpu);
 377                 return;
 378         }
 379
 380         if (is_long_mode(vcpu)) {
 381                 if (cr3 & CR3_L_MODE_RESERVED_BITS) {
 382                         printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
 383                         kvm_inject_gp(vcpu, 0);
 384                         return;
 385                 }
 386         } else {
 387                 if (is_pae(vcpu)) {
 388                         if (cr3 & CR3_PAE_RESERVED_BITS) {
 389                                 printk(KERN_DEBUG
 390                                        "set_cr3: #GP, reserved bits\n");
 391                                 kvm_inject_gp(vcpu, 0);
 392                                 return;
 393                         }
 394                         if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) {
 395                                 printk(KERN_DEBUG "set_cr3: #GP, pdptrs "
 396                                        "reserved bits\n");
 397                                 kvm_inject_gp(vcpu, 0);
 398                                 return;
 399                         }
 400                 }
 401                 /*
 402                  * We don't check reserved bits in nonpae mode, because
 403                  * this isn't enforced, and VMware depends on this.
 404                  */
 405         }
 406
 407         /*
 408          * Does the new cr3 value map to physical memory? (Note, we
 409          * catch an invalid cr3 even in real-mode, because it would
 410          * cause trouble later on when we turn on paging anyway.)
 411          *
 412          * A real CPU would silently accept an invalid cr3 and would
 413          * attempt to use it - with largely undefined (and often hard
 414          * to debug) behavior on the guest side.
 415          */
 416         if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
 417                 kvm_inject_gp(vcpu, 0);
 418         else {
 419                 vcpu->arch.cr3 = cr3;
 420                 vcpu->arch.mmu.new_cr3(vcpu);
 421         }
 422 }
 423 EXPORT_SYMBOL_GPL(kvm_set_cr3);
 424
 425 void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
 426 {
 427         if (cr8 & CR8_RESERVED_BITS) {
 428                 printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8);
 429                 kvm_inject_gp(vcpu, 0);
 430                 return;
 431         }
 432         if (irqchip_in_kernel(vcpu->kvm))
 433                 kvm_lapic_set_tpr(vcpu, cr8);
 434         else
 435                 vcpu->arch.cr8 = cr8;
 436 }
 437 EXPORT_SYMBOL_GPL(kvm_set_cr8);
 438
 439 unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
 440 {
 441         if (irqchip_in_kernel(vcpu->kvm))
 442                 return kvm_lapic_get_cr8(vcpu);
 443         else
 444                 return vcpu->arch.cr8;
 445 }
 446 EXPORT_SYMBOL_GPL(kvm_get_cr8);
 447
 448 static inline u32 bit(int bitno)
 449 {
 450         return 1 << (bitno & 31);
 451 }
 452
 453 /*
 454  * List of msr numbers which we expose to userspace through KVM_GET_MSRS
 455  * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
 456  *
 457  * This list is modified at module load time to reflect the
 458  * capabilities of the host cpu.
 459  */
 460 static u32 msrs_to_save[] = {
 461         MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
 462         MSR_K6_STAR,
 463 #ifdef CONFIG_X86_64
 464         MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
 465 #endif
 466         MSR_IA32_TIME_STAMP_COUNTER, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
 467         MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
 468 };
 469
 470 static unsigned num_msrs_to_save;
 471
 472 static u32 emulated_msrs[] = {
 473         MSR_IA32_MISC_ENABLE,
 474 };
 475
 476 static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
 477 {
 478         if (efer & efer_reserved_bits) {
 479                 printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
 480                        efer);
 481                 kvm_inject_gp(vcpu, 0);
 482                 return;
 483         }
 484
 485         if (is_paging(vcpu)
 486             && (vcpu->arch.shadow_efer & EFER_LME) != (efer & EFER_LME)) {
 487                 printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
 488                 kvm_inject_gp(vcpu, 0);
 489                 return;
 490         }
 491
 492         if (efer & EFER_SVME) {
 493                 struct kvm_cpuid_entry2 *feat;
 494
 495                 feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
 496                 if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) {
 497                         printk(KERN_DEBUG "set_efer: #GP, enable SVM w/o SVM\n");
 498                         kvm_inject_gp(vcpu, 0);
 499                         return;
 500                 }
 501         }
 502
 503         kvm_x86_ops->set_efer(vcpu, efer);
 504
 505         efer &= ~EFER_LMA;
 506         efer |= vcpu->arch.shadow_efer & EFER_LMA;
 507
 508         vcpu->arch.shadow_efer = efer;
 509 }
 510
 511 void kvm_enable_efer_bits(u64 mask)
 512 {
 513        efer_reserved_bits &= ~mask;
 514 }
 515 EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
 516
 517
 518 /*
 519  * Writes msr value into into the appropriate "register".
 520  * Returns 0 on success, non-0 otherwise.
 521  * Assumes vcpu_load() was already called.
 522  */
 523 int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
 524 {
 525         return kvm_x86_ops->set_msr(vcpu, msr_index, data);
 526 }
 527
 528 /*
 529  * Adapt set_msr() to msr_io()'s calling convention
 530  */
 531 static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
 532 {
 533         return kvm_set_msr(vcpu, index, *data);
 534 }
 535
 536 static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
 537 {
 538         static int version;
 539         struct pvclock_wall_clock wc;
 540         struct timespec now, sys, boot;
 541
 542         if (!wall_clock)
 543                 return;
 544
 545         version++;
 546
 547         kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
 548
 549         /*
 550          * The guest calculates current wall clock time by adding
 551          * system time (updated by kvm_write_guest_time below) to the
 552          * wall clock specified here.  guest system time equals host
 553          * system time for us, thus we must fill in host boot time here.
 554          */
 555         now = current_kernel_time();
 556         ktime_get_ts(&sys);
 557         boot = ns_to_timespec(timespec_to_ns(&now) - timespec_to_ns(&sys));
 558
 559         wc.sec = boot.tv_sec;
 560         wc.nsec = boot.tv_nsec;
 561         wc.version = version;
 562
 563         kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
 564
 565         version++;
 566         kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
 567 }
 568
 569 static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
 570 {
 571         uint32_t quotient, remainder;
 572
 573         /* Don't try to replace with do_div(), this one calculates
 574          * "(dividend << 32) / divisor" */
 575         __asm__ ( "divl %4"
 576                   : "=a" (quotient), "=d" (remainder)
 577                   : "0" (0), "1" (dividend), "r" (divisor) );
 578         return quotient;
 579 }
 580
 581 static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *hv_clock)
 582 {
 583         uint64_t nsecs = 1000000000LL;
 584         int32_t  shift = 0;
 585         uint64_t tps64;
 586         uint32_t tps32;
 587
 588         tps64 = tsc_khz * 1000LL;
 589         while (tps64 > nsecs*2) {
 590                 tps64 >>= 1;
 591                 shift--;
 592         }
 593
 594         tps32 = (uint32_t)tps64;
 595         while (tps32 <= (uint32_t)nsecs) {
 596                 tps32 <<= 1;
 597                 shift++;
 598         }
 599
 600         hv_clock->tsc_shift = shift;
 601         hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32);
 602
 603         pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n",
 604                  __func__, tsc_khz, hv_clock->tsc_shift,
 605                  hv_clock->tsc_to_system_mul);
 606 }
 607
 608 static void kvm_write_guest_time(struct kvm_vcpu *v)
 609 {
 610         struct timespec ts;
 611         unsigned long flags;
 612         struct kvm_vcpu_arch *vcpu = &v->arch;
 613         void *shared_kaddr;
 614
 615         if ((!vcpu->time_page))
 616                 return;
 617
 618         if (unlikely(vcpu->hv_clock_tsc_khz != tsc_khz)) {
 619                 kvm_set_time_scale(tsc_khz, &vcpu->hv_clock);
 620                 vcpu->hv_clock_tsc_khz = tsc_khz;
 621         }
 622
 623         /* Keep irq disabled to prevent changes to the clock */
 624         local_irq_save(flags);
 625         kvm_get_msr(v, MSR_IA32_TIME_STAMP_COUNTER,
 626                           &vcpu->hv_clock.tsc_timestamp);
 627         ktime_get_ts(&ts);
 628         local_irq_restore(flags);
 629
 630         /* With all the info we got, fill in the values */
 631
 632         vcpu->hv_clock.system_time = ts.tv_nsec +
 633                                      (NSEC_PER_SEC * (u64)ts.tv_sec);
 634         /*
 635          * The interface expects us to write an even number signaling that the
 636          * update is finished. Since the guest won't see the intermediate
 637          * state, we just increase by 2 at the end.
 638          */
 639         vcpu->hv_clock.version += 2;
 640
 641         shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0);
 642
 643         memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock,
 644                sizeof(vcpu->hv_clock));
 645
 646         kunmap_atomic(shared_kaddr, KM_USER0);
 647
 648         mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT);
 649 }
 650
 651 static bool msr_mtrr_valid(unsigned msr)
 652 {
 653         switch (msr) {
 654         case 0x200 ... 0x200 + 2 * KVM_NR_VAR_MTRR - 1:
 655         case MSR_MTRRfix64K_00000:
 656         case MSR_MTRRfix16K_80000:
 657         case MSR_MTRRfix16K_A0000:
 658         case MSR_MTRRfix4K_C0000:
 659         case MSR_MTRRfix4K_C8000:
 660         case MSR_MTRRfix4K_D0000:
 661         case MSR_MTRRfix4K_D8000:
 662         case MSR_MTRRfix4K_E0000:
 663         case MSR_MTRRfix4K_E8000:
 664         case MSR_MTRRfix4K_F0000:
 665         case MSR_MTRRfix4K_F8000:
 666         case MSR_MTRRdefType:
 667         case MSR_IA32_CR_PAT:
 668                 return true;
 669         case 0x2f8:
 670                 return true;
 671         }
 672         return false;
 673 }
 674
 675 static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 676 {
 677         u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
 678
 679         if (!msr_mtrr_valid(msr))
 680                 return 1;
 681
 682         if (msr == MSR_MTRRdefType) {
 683                 vcpu->arch.mtrr_state.def_type = data;
 684                 vcpu->arch.mtrr_state.enabled = (data & 0xc00) >> 10;
 685         } else if (msr == MSR_MTRRfix64K_00000)
 686                 p[0] = data;
 687         else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000)
 688                 p[1 + msr - MSR_MTRRfix16K_80000] = data;
 689         else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000)
 690                 p[3 + msr - MSR_MTRRfix4K_C0000] = data;
 691         else if (msr == MSR_IA32_CR_PAT)
 692                 vcpu->arch.pat = data;
 693         else {  /* Variable MTRRs */
 694                 int idx, is_mtrr_mask;
 695                 u64 *pt;
 696
 697                 idx = (msr - 0x200) / 2;
 698                 is_mtrr_mask = msr - 0x200 - 2 * idx;
 699                 if (!is_mtrr_mask)
 700                         pt =
 701                           (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo;
 702                 else
 703                         pt =
 704                           (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo;
 705                 *pt = data;
 706         }
 707
 708         kvm_mmu_reset_context(vcpu);
 709         return 0;
 710 }
 711
 712 int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 713 {
 714         switch (msr) {
 715         case MSR_EFER:
 716                 set_efer(vcpu, data);
 717                 break;
 718         case MSR_IA32_MC0_STATUS:
 719                 pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n",
 720                        __func__, data);
 721                 break;
 722         case MSR_IA32_MCG_STATUS:
 723                 pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n",
 724                         __func__, data);
 725                 break;
 726         case MSR_IA32_MCG_CTL:
 727                 pr_unimpl(vcpu, "%s: MSR_IA32_MCG_CTL 0x%llx, nop\n",
 728                         __func__, data);
 729                 break;
 730         case MSR_IA32_DEBUGCTLMSR:
 731                 if (!data) {
 732                         /* We support the non-activated case already */
 733                         break;
 734                 } else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) {
 735                         /* Values other than LBR and BTF are vendor-specific,
 736                            thus reserved and should throw a #GP */
 737                         return 1;
 738                 }
 739                 pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
 740                         __func__, data);
 741                 break;
 742         case MSR_IA32_UCODE_REV:
 743         case MSR_IA32_UCODE_WRITE:
 744                 break;
 745         case 0x200 ... 0x2ff:
 746                 return set_msr_mtrr(vcpu, msr, data);
 747         case MSR_IA32_APICBASE:
 748                 kvm_set_apic_base(vcpu, data);
 749                 break;
 750         case MSR_IA32_MISC_ENABLE:
 751                 vcpu->arch.ia32_misc_enable_msr = data;
 752                 break;
 753         case MSR_KVM_WALL_CLOCK:
 754                 vcpu->kvm->arch.wall_clock = data;
 755                 kvm_write_wall_clock(vcpu->kvm, data);
 756                 break;
 757         case MSR_KVM_SYSTEM_TIME: {
 758                 if (vcpu->arch.time_page) {
 759                         kvm_release_page_dirty(vcpu->arch.time_page);
 760                         vcpu->arch.time_page = NULL;
 761                 }
 762
 763                 vcpu->arch.time = data;
 764
 765                 /* we verify if the enable bit is set... */
 766                 if (!(data & 1))
 767                         break;
 768
 769                 /* ...but clean it before doing the actual write */
 770                 vcpu->arch.time_offset = data & ~(PAGE_MASK | 1);
 771
 772                 vcpu->arch.time_page =
 773                                 gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT);
 774
 775                 if (is_error_page(vcpu->arch.time_page)) {
 776                         kvm_release_page_clean(vcpu->arch.time_page);
 777                         vcpu->arch.time_page = NULL;
 778                 }
 779
 780                 kvm_write_guest_time(vcpu);
 781                 break;
 782         }
 783         default:
 784                 pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data);
 785                 return 1;
 786         }
 787         return 0;
 788 }
 789 EXPORT_SYMBOL_GPL(kvm_set_msr_common);
 790
 791
 792 /*
 793  * Reads an msr value (of 'msr_index') into 'pdata'.
 794  * Returns 0 on success, non-0 otherwise.
 795  * Assumes vcpu_load() was already called.
 796  */
 797 int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
 798 {
 799         return kvm_x86_ops->get_msr(vcpu, msr_index, pdata);
 800 }
 801
 802 static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 803 {
 804         u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
 805
 806         if (!msr_mtrr_valid(msr))
 807                 return 1;
 808
 809         if (msr == MSR_MTRRdefType)
 810                 *pdata = vcpu->arch.mtrr_state.def_type +
 811                          (vcpu->arch.mtrr_state.enabled << 10);
 812         else if (msr == MSR_MTRRfix64K_00000)
 813                 *pdata = p[0];
 814         else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000)
 815                 *pdata = p[1 + msr - MSR_MTRRfix16K_80000];
 816         else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000)
 817                 *pdata = p[3 + msr - MSR_MTRRfix4K_C0000];
 818         else if (msr == MSR_IA32_CR_PAT)
 819                 *pdata = vcpu->arch.pat;
 820         else {  /* Variable MTRRs */
 821                 int idx, is_mtrr_mask;
 822                 u64 *pt;
 823
 824                 idx = (msr - 0x200) / 2;
 825                 is_mtrr_mask = msr - 0x200 - 2 * idx;
 826                 if (!is_mtrr_mask)
 827                         pt =
 828                           (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo;
 829                 else
 830                         pt =
 831                           (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo;
 832                 *pdata = *pt;
 833         }
 834
 835         return 0;
 836 }
 837
 838 int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 839 {
 840         u64 data;
 841
 842         switch (msr) {
 843         case 0xc0010010: /* SYSCFG */
 844         case 0xc0010015: /* HWCR */
 845         case MSR_IA32_PLATFORM_ID:
 846         case MSR_IA32_P5_MC_ADDR:
 847         case MSR_IA32_P5_MC_TYPE:
 848         case MSR_IA32_MC0_CTL:
 849         case MSR_IA32_MCG_STATUS:
 850         case MSR_IA32_MCG_CAP:
 851         case MSR_IA32_MCG_CTL:
 852         case MSR_IA32_MC0_MISC:
 853         case MSR_IA32_MC0_MISC+4:
 854         case MSR_IA32_MC0_MISC+8:
 855         case MSR_IA32_MC0_MISC+12:
 856         case MSR_IA32_MC0_MISC+16:
 857         case MSR_IA32_MC0_MISC+20:
 858         case MSR_IA32_UCODE_REV:
 859         case MSR_IA32_EBL_CR_POWERON:
 860         case MSR_IA32_DEBUGCTLMSR:
 861         case MSR_IA32_LASTBRANCHFROMIP:
 862         case MSR_IA32_LASTBRANCHTOIP:
 863         case MSR_IA32_LASTINTFROMIP:
 864         case MSR_IA32_LASTINTTOIP:
 865                 data = 0;
 866                 break;
 867         case MSR_MTRRcap:
 868                 data = 0x500 | KVM_NR_VAR_MTRR;
 869                 break;
 870         case 0x200 ... 0x2ff:
 871                 return get_msr_mtrr(vcpu, msr, pdata);
 872         case 0xcd: /* fsb frequency */
 873                 data = 3;
 874                 break;
 875         case MSR_IA32_APICBASE:
 876                 data = kvm_get_apic_base(vcpu);
 877                 break;
 878         case MSR_IA32_MISC_ENABLE:
 879                 data = vcpu->arch.ia32_misc_enable_msr;
 880                 break;
 881         case MSR_IA32_PERF_STATUS:
 882                 /* TSC increment by tick */
 883                 data = 1000ULL;
 884                 /* CPU multiplier */
 885                 data |= (((uint64_t)4ULL) << 40);
 886                 break;
 887         case MSR_EFER:
 888                 data = vcpu->arch.shadow_efer;
 889                 break;
 890         case MSR_KVM_WALL_CLOCK:
 891                 data = vcpu->kvm->arch.wall_clock;
 892                 break;
 893         case MSR_KVM_SYSTEM_TIME:
 894                 data = vcpu->arch.time;
 895                 break;
 896         default:
 897                 pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
 898                 return 1;
 899         }
 900         *pdata = data;
 901         return 0;
 902 }
 903 EXPORT_SYMBOL_GPL(kvm_get_msr_common);
 904
 905 /*
 906  * Read or write a bunch of msrs. All parameters are kernel addresses.
 907  *
 908  * @return number of msrs set successfully.
 909  */
 910 static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
 911                     struct kvm_msr_entry *entries,
 912                     int (*do_msr)(struct kvm_vcpu *vcpu,
 913                                   unsigned index, u64 *data))
 914 {
 915         int i;
 916
 917         vcpu_load(vcpu);
 918
 919         down_read(&vcpu->kvm->slots_lock);
 920         for (i = 0; i < msrs->nmsrs; ++i)
 921                 if (do_msr(vcpu, entries[i].index, &entries[i].data))
 922                         break;
 923         up_read(&vcpu->kvm->slots_lock);
 924
 925         vcpu_put(vcpu);
 926
 927         return i;
 928 }
 929
 930 /*
 931  * Read or write a bunch of msrs. Parameters are user addresses.
 932  *
 933  * @return number of msrs set successfully.
 934  */
 935 static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
 936                   int (*do_msr)(struct kvm_vcpu *vcpu,
 937                                 unsigned index, u64 *data),
 938                   int writeback)
 939 {
 940         struct kvm_msrs msrs;
 941         struct kvm_msr_entry *entries;
 942         int r, n;
 943         unsigned size;
 944
 945         r = -EFAULT;
 946         if (copy_from_user(&msrs, user_msrs, sizeof msrs))
 947                 goto out;
 948
 949         r = -E2BIG;
 950         if (msrs.nmsrs >= MAX_IO_MSRS)
 951                 goto out;
 952
 953         r = -ENOMEM;
 954         size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
 955         entries = vmalloc(size);
 956         if (!entries)
 957                 goto out;
 958
 959         r = -EFAULT;
 960         if (copy_from_user(entries, user_msrs->entries, size))
 961                 goto out_free;
 962
 963         r = n = __msr_io(vcpu, &msrs, entries, do_msr);
 964         if (r < 0)
 965                 goto out_free;
 966
 967         r = -EFAULT;
 968         if (writeback && copy_to_user(user_msrs->entries, entries, size))
 969                 goto out_free;
 970
 971         r = n;
 972
 973 out_free:
 974         vfree(entries);
 975 out:
 976         return r;
 977 }
 978
 979 int kvm_dev_ioctl_check_extension(long ext)
 980 {
 981         int r;
 982
 983         switch (ext) {
 984         case KVM_CAP_IRQCHIP:
 985         case KVM_CAP_HLT:
 986         case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
 987         case KVM_CAP_SET_TSS_ADDR:
 988         case KVM_CAP_EXT_CPUID:
 989         case KVM_CAP_PIT:
 990         case KVM_CAP_NOP_IO_DELAY:
 991         case KVM_CAP_MP_STATE:
 992         case KVM_CAP_SYNC_MMU:
 993                 r = 1;
 994                 break;
 995         case KVM_CAP_COALESCED_MMIO:
 996                 r = KVM_COALESCED_MMIO_PAGE_OFFSET;
 997                 break;
 998         case KVM_CAP_VAPIC:
 999                 r = !kvm_x86_ops->cpu_has_accelerated_tpr();
1000                 break;
1001         case KVM_CAP_NR_VCPUS:
1002                 r = KVM_MAX_VCPUS;
1003                 break;
1004         case KVM_CAP_NR_MEMSLOTS:
1005                 r = KVM_MEMORY_SLOTS;
1006                 break;
1007         case KVM_CAP_PV_MMU:
1008                 r = !tdp_enabled;
1009                 break;
1010         case KVM_CAP_IOMMU:
1011                 r = iommu_found();
1012                 break;
1013         case KVM_CAP_CLOCKSOURCE:
1014                 r = boot_cpu_has(X86_FEATURE_CONSTANT_TSC);
1015                 break;
1016         default:
1017                 r = 0;
1018                 break;
1019         }
1020         return r;
1021
1022 }
1023
1024 long kvm_arch_dev_ioctl(struct file *filp,
1025                         unsigned int ioctl, unsigned long arg)
1026 {
1027         void __user *argp = (void __user *)arg;
1028         long r;
1029
1030         switch (ioctl) {
1031         case KVM_GET_MSR_INDEX_LIST: {
1032                 struct kvm_msr_list __user *user_msr_list = argp;
1033                 struct kvm_msr_list msr_list;
1034                 unsigned n;
1035
1036                 r = -EFAULT;
1037                 if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
1038                         goto out;
1039                 n = msr_list.nmsrs;
1040                 msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs);
1041                 if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
1042                         goto out;
1043                 r = -E2BIG;
1044                 if (n < num_msrs_to_save)
1045                         goto out;
1046                 r = -EFAULT;
1047                 if (copy_to_user(user_msr_list->indices, &msrs_to_save,
1048                                  num_msrs_to_save * sizeof(u32)))
1049                         goto out;
1050                 if (copy_to_user(user_msr_list->indices
1051                                  + num_msrs_to_save * sizeof(u32),
1052                                  &emulated_msrs,
1053                                  ARRAY_SIZE(emulated_msrs) * sizeof(u32)))
1054                         goto out;
1055                 r = 0;
1056                 break;
1057         }
1058         case KVM_GET_SUPPORTED_CPUID: {
1059                 struct kvm_cpuid2 __user *cpuid_arg = argp;
1060                 struct kvm_cpuid2 cpuid;
1061
1062                 r = -EFAULT;
1063                 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1064                         goto out;
1065                 r = kvm_dev_ioctl_get_supported_cpuid(&cpuid,
1066                         cpuid_arg->entries);
1067                 if (r)
1068                         goto out;
1069
1070                 r = -EFAULT;
1071                 if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
1072                         goto out;
1073                 r = 0;
1074                 break;
1075         }
1076         default:
1077                 r = -EINVAL;
1078         }
1079 out:
1080         return r;
1081 }
1082
1083 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1084 {
1085         kvm_x86_ops->vcpu_load(vcpu, cpu);
1086         kvm_write_guest_time(vcpu);
1087 }
1088
1089 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
1090 {
1091         kvm_x86_ops->vcpu_put(vcpu);
1092         kvm_put_guest_fpu(vcpu);
1093 }
1094
1095 static int is_efer_nx(void)
1096 {
1097         u64 efer;
1098
1099         rdmsrl(MSR_EFER, efer);
1100         return efer & EFER_NX;
1101 }
1102
1103 static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
1104 {
1105         int i;
1106         struct kvm_cpuid_entry2 *e, *entry;
1107
1108         entry = NULL;
1109         for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
1110                 e = &vcpu->arch.cpuid_entries[i];
1111                 if (e->function == 0x80000001) {
1112                         entry = e;
1113                         break;
1114                 }
1115         }
1116         if (entry && (entry->edx & (1 << 20)) && !is_efer_nx()) {
1117                 entry->edx &= ~(1 << 20);
1118                 printk(KERN_INFO "kvm: guest NX capability removed\n");
1119         }
1120 }
1121
1122 /* when an old userspace process fills a new kernel module */
1123 static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
1124                                     struct kvm_cpuid *cpuid,
1125                                     struct kvm_cpuid_entry __user *entries)
1126 {
1127         int r, i;
1128         struct kvm_cpuid_entry *cpuid_entries;
1129
1130         r = -E2BIG;
1131         if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
1132                 goto out;
1133         r = -ENOMEM;
1134         cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) * cpuid->nent);
1135         if (!cpuid_entries)
1136                 goto out;
1137         r = -EFAULT;
1138         if (copy_from_user(cpuid_entries, entries,
1139                            cpuid->nent * sizeof(struct kvm_cpuid_entry)))
1140                 goto out_free;
1141         for (i = 0; i < cpuid->nent; i++) {
1142                 vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function;
1143                 vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax;
1144                 vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx;
1145                 vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx;
1146                 vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx;
1147                 vcpu->arch.cpuid_entries[i].index = 0;
1148                 vcpu->arch.cpuid_entries[i].flags = 0;
1149                 vcpu->arch.cpuid_entries[i].padding[0] = 0;
1150                 vcpu->arch.cpuid_entries[i].padding[1] = 0;
1151                 vcpu->arch.cpuid_entries[i].padding[2] = 0;
1152         }
1153         vcpu->arch.cpuid_nent = cpuid->nent;
1154         cpuid_fix_nx_cap(vcpu);
1155         r = 0;
1156
1157 out_free:
1158         vfree(cpuid_entries);
1159 out:
1160         return r;
1161 }
1162
1163 static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
1164                                     struct kvm_cpuid2 *cpuid,
1165                                     struct kvm_cpuid_entry2 __user *entries)
1166 {
1167         int r;
1168
1169         r = -E2BIG;
1170         if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
1171                 goto out;
1172         r = -EFAULT;
1173         if (copy_from_user(&vcpu->arch.cpuid_entries, entries,
1174                            cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
1175                 goto out;
1176         vcpu->arch.cpuid_nent = cpuid->nent;
1177         return 0;
1178
1179 out:
1180         return r;
1181 }
1182
1183 static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
1184                                     struct kvm_cpuid2 *cpuid,
1185                                     struct kvm_cpuid_entry2 __user *entries)
1186 {
1187         int r;
1188
1189         r = -E2BIG;
1190         if (cpuid->nent < vcpu->arch.cpuid_nent)
1191                 goto out;
1192         r = -EFAULT;
1193         if (copy_to_user(entries, &vcpu->arch.cpuid_entries,
1194                            vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2)))
1195                 goto out;
1196         return 0;
1197
1198 out:
1199         cpuid->nent = vcpu->arch.cpuid_nent;
1200         return r;
1201 }
1202
1203 static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1204                           u32 index)
1205 {
1206         entry->function = function;
1207         entry->index = index;
1208         cpuid_count(entry->function, entry->index,
1209                 &entry->eax, &entry->ebx, &entry->ecx, &entry->edx);
1210         entry->flags = 0;
1211 }
1212
1213 static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1214                          u32 index, int *nent, int maxnent)
1215 {
1216         const u32 kvm_supported_word0_x86_features = bit(X86_FEATURE_FPU) |
1217                 bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) |
1218                 bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) |
1219                 bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) |
1220                 bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) |
1221                 bit(X86_FEATURE_SEP) | bit(X86_FEATURE_PGE) |
1222                 bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) |
1223                 bit(X86_FEATURE_CLFLSH) | bit(X86_FEATURE_MMX) |
1224                 bit(X86_FEATURE_FXSR) | bit(X86_FEATURE_XMM) |
1225                 bit(X86_FEATURE_XMM2) | bit(X86_FEATURE_SELFSNOOP);
1226         const u32 kvm_supported_word1_x86_features = bit(X86_FEATURE_FPU) |
1227                 bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) |
1228                 bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) |
1229                 bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) |
1230                 bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) |
1231                 bit(X86_FEATURE_PGE) |
1232                 bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) |
1233                 bit(X86_FEATURE_MMX) | bit(X86_FEATURE_FXSR) |
1234                 bit(X86_FEATURE_SYSCALL) |
1235                 (bit(X86_FEATURE_NX) && is_efer_nx()) |
1236 #ifdef CONFIG_X86_64
1237                 bit(X86_FEATURE_LM) |
1238 #endif
1239                 bit(X86_FEATURE_MMXEXT) |
1240                 bit(X86_FEATURE_3DNOWEXT) |
1241                 bit(X86_FEATURE_3DNOW);
1242         const u32 kvm_supported_word3_x86_features =
1243                 bit(X86_FEATURE_XMM3) | bit(X86_FEATURE_CX16);
1244         const u32 kvm_supported_word6_x86_features =
1245                 bit(X86_FEATURE_LAHF_LM) | bit(X86_FEATURE_CMP_LEGACY) |
1246                 bit(X86_FEATURE_SVM);
1247
1248         /* all func 2 cpuid_count() should be called on the same cpu */
1249         get_cpu();
1250         do_cpuid_1_ent(entry, function, index);
1251         ++*nent;
1252
1253         switch (function) {
1254         case 0:
1255                 entry->eax = min(entry->eax, (u32)0xb);
1256                 break;
1257         case 1:
1258                 entry->edx &= kvm_supported_word0_x86_features;
1259                 entry->ecx &= kvm_supported_word3_x86_features;
1260                 break;
1261         /* function 2 entries are STATEFUL. That is, repeated cpuid commands
1262          * may return different values. This forces us to get_cpu() before
1263          * issuing the first command, and also to emulate this annoying behavior
1264          * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */
1265         case 2: {
1266                 int t, times = entry->eax & 0xff;
1267
1268                 entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
1269                 entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
1270                 for (t = 1; t < times && *nent < maxnent; ++t) {
1271                         do_cpuid_1_ent(&entry[t], function, 0);
1272                         entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
1273                         ++*nent;
1274                 }
1275                 break;
1276         }
1277         /* function 4 and 0xb have additional index. */
1278         case 4: {
1279                 int i, cache_type;
1280
1281                 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1282                 /* read more entries until cache_type is zero */
1283                 for (i = 1; *nent < maxnent; ++i) {
1284                         cache_type = entry[i - 1].eax & 0x1f;
1285                         if (!cache_type)
1286                                 break;
1287                         do_cpuid_1_ent(&entry[i], function, i);
1288                         entry[i].flags |=
1289                                KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1290                         ++*nent;
1291                 }
1292                 break;
1293         }
1294         case 0xb: {
1295                 int i, level_type;
1296
1297                 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1298                 /* read more entries until level_type is zero */
1299                 for (i = 1; *nent < maxnent; ++i) {
1300                         level_type = entry[i - 1].ecx & 0xff00;
1301                         if (!level_type)
1302                                 break;
1303                         do_cpuid_1_ent(&entry[i], function, i);
1304                         entry[i].flags |=
1305                                KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1306                         ++*nent;
1307                 }
1308                 break;
1309         }
1310         case 0x80000000:
1311                 entry->eax = min(entry->eax, 0x8000001a);
1312                 break;
1313         case 0x80000001:
1314                 entry->edx &= kvm_supported_word1_x86_features;
1315                 entry->ecx &= kvm_supported_word6_x86_features;
1316                 break;
1317         }
1318         put_cpu();
1319 }
1320
1321 static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
1322                                     struct kvm_cpuid_entry2 __user *entries)
1323 {
1324         struct kvm_cpuid_entry2 *cpuid_entries;
1325         int limit, nent = 0, r = -E2BIG;
1326         u32 func;
1327
1328         if (cpuid->nent < 1)
1329                 goto out;
1330         r = -ENOMEM;
1331         cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent);
1332         if (!cpuid_entries)
1333                 goto out;
1334
1335         do_cpuid_ent(&cpuid_entries[0], 0, 0, &nent, cpuid->nent);
1336         limit = cpuid_entries[0].eax;
1337         for (func = 1; func <= limit && nent < cpuid->nent; ++func)
1338                 do_cpuid_ent(&cpuid_entries[nent], func, 0,
1339                                 &nent, cpuid->nent);
1340         r = -E2BIG;
1341         if (nent >= cpuid->nent)
1342                 goto out_free;
1343
1344         do_cpuid_ent(&cpuid_entries[nent], 0x80000000, 0, &nent, cpuid->nent);
1345         limit = cpuid_entries[nent - 1].eax;
1346         for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func)
1347                 do_cpuid_ent(&cpuid_entries[nent], func, 0,
1348                                &nent, cpuid->nent);
1349         r = -EFAULT;
1350         if (copy_to_user(entries, cpuid_entries,
1351                         nent * sizeof(struct kvm_cpuid_entry2)))
1352                 goto out_free;
1353         cpuid->nent = nent;
1354         r = 0;
1355
1356 out_free:
1357         vfree(cpuid_entries);
1358 out:
1359         return r;
1360 }
1361
1362 static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
1363                                     struct kvm_lapic_state *s)
1364 {
1365         vcpu_load(vcpu);
1366         memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s);
1367         vcpu_put(vcpu);
1368
1369         return 0;
1370 }
1371
1372 static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
1373                                     struct kvm_lapic_state *s)
1374 {
1375         vcpu_load(vcpu);
1376         memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s);
1377         kvm_apic_post_state_restore(vcpu);
1378         vcpu_put(vcpu);
1379
1380         return 0;
1381 }
1382
1383 static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
1384                                     struct kvm_interrupt *irq)
1385 {
1386         if (irq->irq < 0 || irq->irq >= 256)
1387                 return -EINVAL;
1388         if (irqchip_in_kernel(vcpu->kvm))
1389                 return -ENXIO;
1390         vcpu_load(vcpu);
1391
1392         set_bit(irq->irq, vcpu->arch.irq_pending);
1393         set_bit(irq->irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
1394
1395         vcpu_put(vcpu);
1396
1397         return 0;
1398 }
1399
1400 static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu)
1401 {
1402         vcpu_load(vcpu);
1403         kvm_inject_nmi(vcpu);
1404         vcpu_put(vcpu);
1405
1406         return 0;
1407 }
1408
1409 static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu,
1410                                            struct kvm_tpr_access_ctl *tac)
1411 {
1412         if (tac->flags)
1413                 return -EINVAL;
1414         vcpu->arch.tpr_access_reporting = !!tac->enabled;
1415         return 0;
1416 }
1417
1418 long kvm_arch_vcpu_ioctl(struct file *filp,
1419                          unsigned int ioctl, unsigned long arg)
1420 {
1421         struct kvm_vcpu *vcpu = filp->private_data;
1422         void __user *argp = (void __user *)arg;
1423         int r;
1424         struct kvm_lapic_state *lapic = NULL;
1425
1426         switch (ioctl) {
1427         case KVM_GET_LAPIC: {
1428                 lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
1429
1430                 r = -ENOMEM;
1431                 if (!lapic)
1432                         goto out;
1433                 r = kvm_vcpu_ioctl_get_lapic(vcpu, lapic);
1434                 if (r)
1435                         goto out;
1436                 r = -EFAULT;
1437                 if (copy_to_user(argp, lapic, sizeof(struct kvm_lapic_state)))
1438                         goto out;
1439                 r = 0;
1440                 break;
1441         }
1442         case KVM_SET_LAPIC: {
1443                 lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
1444                 r = -ENOMEM;
1445                 if (!lapic)
1446                         goto out;
1447                 r = -EFAULT;
1448                 if (copy_from_user(lapic, argp, sizeof(struct kvm_lapic_state)))
1449                         goto out;
1450                 r = kvm_vcpu_ioctl_set_lapic(vcpu, lapic);
1451                 if (r)
1452                         goto out;
1453                 r = 0;
1454                 break;
1455         }
1456         case KVM_INTERRUPT: {
1457                 struct kvm_interrupt irq;
1458
1459                 r = -EFAULT;
1460                 if (copy_from_user(&irq, argp, sizeof irq))
1461                         goto out;
1462                 r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
1463                 if (r)
1464                         goto out;
1465                 r = 0;
1466                 break;
1467         }
1468         case KVM_NMI: {
1469                 r = kvm_vcpu_ioctl_nmi(vcpu);
1470                 if (r)
1471                         goto out;
1472                 r = 0;
1473                 break;
1474         }
1475         case KVM_SET_CPUID: {
1476                 struct kvm_cpuid __user *cpuid_arg = argp;
1477                 struct kvm_cpuid cpuid;
1478
1479                 r = -EFAULT;
1480                 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1481                         goto out;
1482                 r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
1483                 if (r)
1484                         goto out;
1485                 break;
1486         }
1487         case KVM_SET_CPUID2: {
1488                 struct kvm_cpuid2 __user *cpuid_arg = argp;
1489                 struct kvm_cpuid2 cpuid;
1490
1491                 r = -EFAULT;
1492                 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1493                         goto out;
1494                 r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid,
1495                                 cpuid_arg->entries);
1496                 if (r)
1497                         goto out;
1498                 break;
1499         }
1500         case KVM_GET_CPUID2: {
1501                 struct kvm_cpuid2 __user *cpuid_arg = argp;
1502                 struct kvm_cpuid2 cpuid;
1503
1504                 r = -EFAULT;
1505                 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1506                         goto out;
1507                 r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid,
1508                                 cpuid_arg->entries);
1509                 if (r)
1510                         goto out;
1511                 r = -EFAULT;
1512                 if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
1513                         goto out;
1514                 r = 0;
1515                 break;
1516         }
1517         case KVM_GET_MSRS:
1518                 r = msr_io(vcpu, argp, kvm_get_msr, 1);
1519                 break;
1520         case KVM_SET_MSRS:
1521                 r = msr_io(vcpu, argp, do_set_msr, 0);
1522                 break;
1523         case KVM_TPR_ACCESS_REPORTING: {
1524                 struct kvm_tpr_access_ctl tac;
1525
1526                 r = -EFAULT;
1527                 if (copy_from_user(&tac, argp, sizeof tac))
1528                         goto out;
1529                 r = vcpu_ioctl_tpr_access_reporting(vcpu, &tac);
1530                 if (r)
1531                         goto out;
1532                 r = -EFAULT;
1533                 if (copy_to_user(argp, &tac, sizeof tac))
1534                         goto out;
1535                 r = 0;
1536                 break;
1537         };
1538         case KVM_SET_VAPIC_ADDR: {
1539                 struct kvm_vapic_addr va;
1540
1541                 r = -EINVAL;
1542                 if (!irqchip_in_kernel(vcpu->kvm))
1543                         goto out;
1544                 r = -EFAULT;
1545                 if (copy_from_user(&va, argp, sizeof va))
1546                         goto out;
1547                 r = 0;
1548                 kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr);
1549                 break;
1550         }
1551         default:
1552                 r = -EINVAL;
1553         }
1554 out:
1555         if (lapic)
1556                 kfree(lapic);
1557         return r;
1558 }
1559
1560 static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
1561 {
1562         int ret;
1563
1564         if (addr > (unsigned int)(-3 * PAGE_SIZE))
1565                 return -1;
1566         ret = kvm_x86_ops->set_tss_addr(kvm, addr);
1567         return ret;
1568 }
1569
1570 static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
1571                                           u32 kvm_nr_mmu_pages)
1572 {
1573         if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES)
1574                 return -EINVAL;
1575
1576         down_write(&kvm->slots_lock);
1577
1578         kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
1579         kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
1580
1581         up_write(&kvm->slots_lock);
1582         return 0;
1583 }
1584
1585 static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
1586 {
1587         return kvm->arch.n_alloc_mmu_pages;
1588 }
1589
1590 gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
1591 {
1592         int i;
1593         struct kvm_mem_alias *alias;
1594
1595         for (i = 0; i < kvm->arch.naliases; ++i) {
1596                 alias = &kvm->arch.aliases[i];
1597                 if (gfn >= alias->base_gfn
1598                     && gfn < alias->base_gfn + alias->npages)
1599                         return alias->target_gfn + gfn - alias->base_gfn;
1600         }
1601         return gfn;
1602 }
1603
1604 /*
1605  * Set a new alias region.  Aliases map a portion of physical memory into
1606  * another portion.  This is useful for memory windows, for example the PC
1607  * VGA region.
1608  */
1609 static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
1610                                          struct kvm_memory_alias *alias)
1611 {
1612         int r, n;
1613         struct kvm_mem_alias *p;
1614
1615         r = -EINVAL;
1616         /* General sanity checks */
1617         if (alias->memory_size & (PAGE_SIZE - 1))
1618                 goto out;
1619         if (alias->guest_phys_addr & (PAGE_SIZE - 1))
1620                 goto out;
1621         if (alias->slot >= KVM_ALIAS_SLOTS)
1622                 goto out;
1623         if (alias->guest_phys_addr + alias->memory_size
1624             < alias->guest_phys_addr)
1625                 goto out;
1626         if (alias->target_phys_addr + alias->memory_size
1627             < alias->target_phys_addr)
1628                 goto out;
1629
1630         down_write(&kvm->slots_lock);
1631         spin_lock(&kvm->mmu_lock);
1632
1633         p = &kvm->arch.aliases[alias->slot];
1634         p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
1635         p->npages = alias->memory_size >> PAGE_SHIFT;
1636         p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT;
1637
1638         for (n = KVM_ALIAS_SLOTS; n > 0; --n)
1639                 if (kvm->arch.aliases[n - 1].npages)
1640                         break;
1641         kvm->arch.naliases = n;
1642
1643         spin_unlock(&kvm->mmu_lock);
1644         kvm_mmu_zap_all(kvm);
1645
1646         up_write(&kvm->slots_lock);
1647
1648         return 0;
1649
1650 out:
1651         return r;
1652 }
1653
1654 static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
1655 {
1656         int r;
1657
1658         r = 0;
1659         switch (chip->chip_id) {
1660         case KVM_IRQCHIP_PIC_MASTER:
1661                 memcpy(&chip->chip.pic,
1662                         &pic_irqchip(kvm)->pics[0],
1663                         sizeof(struct kvm_pic_state));
1664                 break;
1665         case KVM_IRQCHIP_PIC_SLAVE:
1666                 memcpy(&chip->chip.pic,
1667                         &pic_irqchip(kvm)->pics[1],
1668                         sizeof(struct kvm_pic_state));
1669                 break;
1670         case KVM_IRQCHIP_IOAPIC:
1671                 memcpy(&chip->chip.ioapic,
1672                         ioapic_irqchip(kvm),
1673                         sizeof(struct kvm_ioapic_state));
1674                 break;
1675         default:
1676                 r = -EINVAL;
1677                 break;
1678         }
1679         return r;
1680 }
1681
1682 static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
1683 {
1684         int r;
1685
1686         r = 0;
1687         switch (chip->chip_id) {
1688         case KVM_IRQCHIP_PIC_MASTER:
1689                 memcpy(&pic_irqchip(kvm)->pics[0],
1690                         &chip->chip.pic,
1691                         sizeof(struct kvm_pic_state));
1692                 break;
1693         case KVM_IRQCHIP_PIC_SLAVE:
1694                 memcpy(&pic_irqchip(kvm)->pics[1],
1695                         &chip->chip.pic,
1696                         sizeof(struct kvm_pic_state));
1697                 break;
1698         case KVM_IRQCHIP_IOAPIC:
1699                 memcpy(ioapic_irqchip(kvm),
1700                         &chip->chip.ioapic,
1701                         sizeof(struct kvm_ioapic_state));
1702                 break;
1703         default:
1704                 r = -EINVAL;
1705                 break;
1706         }
1707         kvm_pic_update_irq(pic_irqchip(kvm));
1708         return r;
1709 }
1710
1711 static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
1712 {
1713         int r = 0;
1714
1715         memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state));
1716         return r;
1717 }
1718
1719 static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
1720 {
1721         int r = 0;
1722
1723         memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state));
1724         kvm_pit_load_count(kvm, 0, ps->channels[0].count);
1725         return r;
1726 }
1727
1728 /*
1729  * Get (and clear) the dirty memory log for a memory slot.
1730  */
1731 int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
1732                                       struct kvm_dirty_log *log)
1733 {
1734         int r;
1735         int n;
1736         struct kvm_memory_slot *memslot;
1737         int is_dirty = 0;
1738
1739         down_write(&kvm->slots_lock);
1740
1741         r = kvm_get_dirty_log(kvm, log, &is_dirty);
1742         if (r)
1743                 goto out;
1744
1745         /* If nothing is dirty, don't bother messing with page tables. */
1746         if (is_dirty) {
1747                 kvm_mmu_slot_remove_write_access(kvm, log->slot);
1748                 kvm_flush_remote_tlbs(kvm);
1749                 memslot = &kvm->memslots[log->slot];
1750                 n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
1751                 memset(memslot->dirty_bitmap, 0, n);
1752         }
1753         r = 0;
1754 out:
1755         up_write(&kvm->slots_lock);
1756         return r;
1757 }
1758
1759 long kvm_arch_vm_ioctl(struct file *filp,
1760                        unsigned int ioctl, unsigned long arg)
1761 {
1762         struct kvm *kvm = filp->private_data;
1763         void __user *argp = (void __user *)arg;
1764         int r = -EINVAL;
1765         /*
1766          * This union makes it completely explicit to gcc-3.x
1767          * that these two variables' stack usage should be
1768          * combined, not added together.
1769          */
1770         union {
1771                 struct kvm_pit_state ps;
1772                 struct kvm_memory_alias alias;
1773         } u;
1774
1775         switch (ioctl) {
1776         case KVM_SET_TSS_ADDR:
1777                 r = kvm_vm_ioctl_set_tss_addr(kvm, arg);
1778                 if (r < 0)
1779                         goto out;
1780                 break;
1781         case KVM_SET_MEMORY_REGION: {
1782                 struct kvm_memory_region kvm_mem;
1783                 struct kvm_userspace_memory_region kvm_userspace_mem;
1784
1785                 r = -EFAULT;
1786                 if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem))
1787                         goto out;
1788                 kvm_userspace_mem.slot = kvm_mem.slot;
1789                 kvm_userspace_mem.flags = kvm_mem.flags;
1790                 kvm_userspace_mem.guest_phys_addr = kvm_mem.guest_phys_addr;
1791                 kvm_userspace_mem.memory_size = kvm_mem.memory_size;
1792                 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 0);
1793                 if (r)
1794                         goto out;
1795                 break;
1796         }
1797         case KVM_SET_NR_MMU_PAGES:
1798                 r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
1799                 if (r)
1800                         goto out;
1801                 break;
1802         case KVM_GET_NR_MMU_PAGES:
1803                 r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
1804                 break;
1805         case KVM_SET_MEMORY_ALIAS:
1806                 r = -EFAULT;
1807                 if (copy_from_user(&u.alias, argp, sizeof(struct kvm_memory_alias)))
1808                         goto out;
1809                 r = kvm_vm_ioctl_set_memory_alias(kvm, &u.alias);
1810                 if (r)
1811                         goto out;
1812                 break;
1813         case KVM_CREATE_IRQCHIP:
1814                 r = -ENOMEM;
1815                 kvm->arch.vpic = kvm_create_pic(kvm);
1816                 if (kvm->arch.vpic) {
1817                         r = kvm_ioapic_init(kvm);
1818                         if (r) {
1819                                 kfree(kvm->arch.vpic);
1820                                 kvm->arch.vpic = NULL;
1821                                 goto out;
1822                         }
1823                 } else
1824                         goto out;
1825                 break;
1826         case KVM_CREATE_PIT:
1827                 r = -ENOMEM;
1828                 kvm->arch.vpit = kvm_create_pit(kvm);
1829                 if (kvm->arch.vpit)
1830                         r = 0;
1831                 break;
1832         case KVM_IRQ_LINE: {
1833                 struct kvm_irq_level irq_event;
1834
1835                 r = -EFAULT;
1836                 if (copy_from_user(&irq_event, argp, sizeof irq_event))
1837                         goto out;
1838                 if (irqchip_in_kernel(kvm)) {
1839                         mutex_lock(&kvm->lock);
1840                         kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
1841                                     irq_event.irq, irq_event.level);
1842                         mutex_unlock(&kvm->lock);
1843                         r = 0;
1844                 }
1845                 break;
1846         }
1847         case KVM_GET_IRQCHIP: {
1848                 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
1849                 struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL);
1850
1851                 r = -ENOMEM;
1852                 if (!chip)
1853                         goto out;
1854                 r = -EFAULT;
1855                 if (copy_from_user(chip, argp, sizeof *chip))
1856                         goto get_irqchip_out;
1857                 r = -ENXIO;
1858                 if (!irqchip_in_kernel(kvm))
1859                         goto get_irqchip_out;
1860                 r = kvm_vm_ioctl_get_irqchip(kvm, chip);
1861                 if (r)
1862                         goto get_irqchip_out;
1863                 r = -EFAULT;
1864                 if (copy_to_user(argp, chip, sizeof *chip))
1865                         goto get_irqchip_out;
1866                 r = 0;
1867         get_irqchip_out:
1868                 kfree(chip);
1869                 if (r)
1870                         goto out;
1871                 break;
1872         }
1873         case KVM_SET_IRQCHIP: {
1874                 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
1875                 struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL);
1876
1877                 r = -ENOMEM;
1878                 if (!chip)
1879                         goto out;
1880                 r = -EFAULT;
1881                 if (copy_from_user(chip, argp, sizeof *chip))
1882                         goto set_irqchip_out;
1883                 r = -ENXIO;
1884                 if (!irqchip_in_kernel(kvm))
1885                         goto set_irqchip_out;
1886                 r = kvm_vm_ioctl_set_irqchip(kvm, chip);
1887                 if (r)
1888                         goto set_irqchip_out;
1889                 r = 0;
1890         set_irqchip_out:
1891                 kfree(chip);
1892                 if (r)
1893                         goto out;
1894                 break;
1895         }
1896         case KVM_GET_PIT: {
1897                 r = -EFAULT;
1898                 if (copy_from_user(&u.ps, argp, sizeof(struct kvm_pit_state)))
1899                         goto out;
1900                 r = -ENXIO;
1901                 if (!kvm->arch.vpit)
1902                         goto out;
1903                 r = kvm_vm_ioctl_get_pit(kvm, &u.ps);
1904                 if (r)
1905                         goto out;
1906                 r = -EFAULT;
1907                 if (copy_to_user(argp, &u.ps, sizeof(struct kvm_pit_state)))
1908                         goto out;
1909                 r = 0;
1910                 break;
1911         }
1912         case KVM_SET_PIT: {
1913                 r = -EFAULT;
1914                 if (copy_from_user(&u.ps, argp, sizeof u.ps))
1915                         goto out;
1916                 r = -ENXIO;
1917                 if (!kvm->arch.vpit)
1918                         goto out;
1919                 r = kvm_vm_ioctl_set_pit(kvm, &u.ps);
1920                 if (r)
1921                         goto out;
1922                 r = 0;
1923                 break;
1924         }
1925         default:
1926                 ;
1927         }
1928 out:
1929         return r;
1930 }
1931
1932 static void kvm_init_msr_list(void)
1933 {
1934         u32 dummy[2];
1935         unsigned i, j;
1936
1937         for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) {
1938                 if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
1939                         continue;
1940                 if (j < i)
1941                         msrs_to_save[j] = msrs_to_save[i];
1942                 j++;
1943         }
1944         num_msrs_to_save = j;
1945 }
1946
1947 /*
1948  * Only apic need an MMIO device hook, so shortcut now..
1949  */
1950 static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu,
1951                                                 gpa_t addr, int len,
1952                                                 int is_write)
1953 {
1954         struct kvm_io_device *dev;
1955
1956         if (vcpu->arch.apic) {
1957                 dev = &vcpu->arch.apic->dev;
1958                 if (dev->in_range(dev, addr, len, is_write))
1959                         return dev;
1960         }
1961         return NULL;
1962 }
1963
1964
1965 static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu,
1966                                                 gpa_t addr, int len,
1967                                                 int is_write)
1968 {
1969         struct kvm_io_device *dev;
1970
1971         dev = vcpu_find_pervcpu_dev(vcpu, addr, len, is_write);
1972         if (dev == NULL)
1973                 dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr, len,
1974                                           is_write);
1975         return dev;
1976 }
1977
1978 int emulator_read_std(unsigned long addr,
1979                              void *val,
1980                              unsigned int bytes,
1981                              struct kvm_vcpu *vcpu)
1982 {
1983         void *data = val;
1984         int r = X86EMUL_CONTINUE;
1985
1986         while (bytes) {
1987                 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
1988                 unsigned offset = addr & (PAGE_SIZE-1);
1989                 unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset);
1990                 int ret;
1991
1992                 if (gpa == UNMAPPED_GVA) {
1993                         r = X86EMUL_PROPAGATE_FAULT;
1994                         goto out;
1995                 }
1996                 ret = kvm_read_guest(vcpu->kvm, gpa, data, tocopy);
1997                 if (ret < 0) {
1998                         r = X86EMUL_UNHANDLEABLE;
1999                         goto out;
2000                 }
2001
2002                 bytes -= tocopy;
2003                 data += tocopy;
2004                 addr += tocopy;
2005         }
2006 out:
2007         return r;
2008 }
2009 EXPORT_SYMBOL_GPL(emulator_read_std);
2010
2011 static int emulator_read_emulated(unsigned long addr,
2012                                   void *val,
2013                                   unsigned int bytes,
2014                                   struct kvm_vcpu *vcpu)
2015 {
2016         struct kvm_io_device *mmio_dev;
2017         gpa_t                 gpa;
2018
2019         if (vcpu->mmio_read_completed) {
2020                 memcpy(val, vcpu->mmio_data, bytes);
2021                 vcpu->mmio_read_completed = 0;
2022                 return X86EMUL_CONTINUE;
2023         }
2024
2025         gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
2026
2027         /* For APIC access vmexit */
2028         if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
2029                 goto mmio;
2030
2031         if (emulator_read_std(addr, val, bytes, vcpu)
2032                         == X86EMUL_CONTINUE)
2033                 return X86EMUL_CONTINUE;
2034         if (gpa == UNMAPPED_GVA)
2035                 return X86EMUL_PROPAGATE_FAULT;
2036
2037 mmio:
2038         /*
2039          * Is this MMIO handled locally?
2040          */
2041         mutex_lock(&vcpu->kvm->lock);
2042         mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 0);
2043         if (mmio_dev) {
2044                 kvm_iodevice_read(mmio_dev, gpa, bytes, val);
2045                 mutex_unlock(&vcpu->kvm->lock);
2046                 return X86EMUL_CONTINUE;
2047         }
2048         mutex_unlock(&vcpu->kvm->lock);
2049
2050         vcpu->mmio_needed = 1;
2051         vcpu->mmio_phys_addr = gpa;
2052         vcpu->mmio_size = bytes;
2053         vcpu->mmio_is_write = 0;
2054
2055         return X86EMUL_UNHANDLEABLE;
2056 }
2057
2058 int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
2059                           const void *val, int bytes)
2060 {
2061         int ret;
2062
2063         ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
2064         if (ret < 0)
2065                 return 0;
2066         kvm_mmu_pte_write(vcpu, gpa, val, bytes, 1);
2067         return 1;
2068 }
2069
2070 static int emulator_write_emulated_onepage(unsigned long addr,
2071                                            const void *val,
2072                                            unsigned int bytes,
2073                                            struct kvm_vcpu *vcpu)
2074 {
2075         struct kvm_io_device *mmio_dev;
2076         gpa_t                 gpa;
2077
2078         gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
2079
2080         if (gpa == UNMAPPED_GVA) {
2081                 kvm_inject_page_fault(vcpu, addr, 2);
2082                 return X86EMUL_PROPAGATE_FAULT;
2083         }
2084
2085         /* For APIC access vmexit */
2086         if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
2087                 goto mmio;
2088
2089         if (emulator_write_phys(vcpu, gpa, val, bytes))
2090                 return X86EMUL_CONTINUE;
2091
2092 mmio:
2093         /*
2094          * Is this MMIO handled locally?
2095          */
2096         mutex_lock(&vcpu->kvm->lock);
2097         mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 1);
2098         if (mmio_dev) {
2099                 kvm_iodevice_write(mmio_dev, gpa, bytes, val);
2100                 mutex_unlock(&vcpu->kvm->lock);
2101                 return X86EMUL_CONTINUE;
2102         }
2103         mutex_unlock(&vcpu->kvm->lock);
2104
2105         vcpu->mmio_needed = 1;
2106         vcpu->mmio_phys_addr = gpa;
2107         vcpu->mmio_size = bytes;
2108         vcpu->mmio_is_write = 1;
2109         memcpy(vcpu->mmio_data, val, bytes);
2110
2111         return X86EMUL_CONTINUE;
2112 }
2113
2114 int emulator_write_emulated(unsigned long addr,
2115                                    const void *val,
2116                                    unsigned int bytes,
2117                                    struct kvm_vcpu *vcpu)
2118 {
2119         /* Crossing a page boundary? */
2120         if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
2121                 int rc, now;
2122
2123                 now = -addr & ~PAGE_MASK;
2124                 rc = emulator_write_emulated_onepage(addr, val, now, vcpu);
2125                 if (rc != X86EMUL_CONTINUE)
2126                         return rc;
2127                 addr += now;
2128                 val += now;
2129                 bytes -= now;
2130         }
2131         return emulator_write_emulated_onepage(addr, val, bytes, vcpu);
2132 }
2133 EXPORT_SYMBOL_GPL(emulator_write_emulated);
2134
2135 static int emulator_cmpxchg_emulated(unsigned long addr,
2136                                      const void *old,
2137                                      const void *new,
2138                                      unsigned int bytes,
2139                                      struct kvm_vcpu *vcpu)
2140 {
2141         static int reported;
2142
2143         if (!reported) {
2144                 reported = 1;
2145                 printk(KERN_WARNING "kvm: emulating exchange as write\n");
2146         }
2147 #ifndef CONFIG_X86_64
2148         /* guests cmpxchg8b have to be emulated atomically */
2149         if (bytes == 8) {
2150                 gpa_t gpa;
2151                 struct page *page;
2152                 char *kaddr;
2153                 u64 val;
2154
2155                 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
2156
2157                 if (gpa == UNMAPPED_GVA ||
2158                    (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
2159                         goto emul_write;
2160
2161                 if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
2162                         goto emul_write;
2163
2164                 val = *(u64 *)new;
2165
2166                 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
2167
2168                 kaddr = kmap_atomic(page, KM_USER0);
2169                 set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val);
2170                 kunmap_atomic(kaddr, KM_USER0);
2171                 kvm_release_page_dirty(page);
2172         }
2173 emul_write:
2174 #endif
2175
2176         return emulator_write_emulated(addr, new, bytes, vcpu);
2177 }
2178
2179 static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
2180 {
2181         return kvm_x86_ops->get_segment_base(vcpu, seg);
2182 }
2183
2184 int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
2185 {
2186         kvm_mmu_invlpg(vcpu, address);
2187         return X86EMUL_CONTINUE;
2188 }
2189
2190 int emulate_clts(struct kvm_vcpu *vcpu)
2191 {
2192         KVMTRACE_0D(CLTS, vcpu, handler);
2193         kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS);
2194         return X86EMUL_CONTINUE;
2195 }
2196
2197 int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
2198 {
2199         struct kvm_vcpu *vcpu = ctxt->vcpu;
2200
2201         switch (dr) {
2202         case 0 ... 3:
2203                 *dest = kvm_x86_ops->get_dr(vcpu, dr);
2204                 return X86EMUL_CONTINUE;
2205         default:
2206                 pr_unimpl(vcpu, "%s: unexpected dr %u\n", __func__, dr);
2207                 return X86EMUL_UNHANDLEABLE;
2208         }
2209 }
2210
2211 int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
2212 {
2213         unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U;
2214         int exception;
2215
2216         kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception);
2217         if (exception) {
2218                 /* FIXME: better handling */
2219                 return X86EMUL_UNHANDLEABLE;
2220         }
2221         return X86EMUL_CONTINUE;
2222 }
2223
2224 void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
2225 {
2226         u8 opcodes[4];
2227         unsigned long rip = kvm_rip_read(vcpu);
2228         unsigned long rip_linear;
2229
2230         if (!printk_ratelimit())
2231                 return;
2232
2233         rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS);
2234
2235         emulator_read_std(rip_linear, (void *)opcodes, 4, vcpu);
2236
2237         printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n",
2238                context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
2239 }
2240 EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
2241
2242 static struct x86_emulate_ops emulate_ops = {
2243         .read_std            = emulator_read_std,
2244         .read_emulated       = emulator_read_emulated,
2245         .write_emulated      = emulator_write_emulated,
2246         .cmpxchg_emulated    = emulator_cmpxchg_emulated,
2247 };
2248
2249 static void cache_all_regs(struct kvm_vcpu *vcpu)
2250 {
2251         kvm_register_read(vcpu, VCPU_REGS_RAX);
2252         kvm_register_read(vcpu, VCPU_REGS_RSP);
2253         kvm_register_read(vcpu, VCPU_REGS_RIP);
2254         vcpu->arch.regs_dirty = ~0;
2255 }
2256
2257 int emulate_instruction(struct kvm_vcpu *vcpu,
2258                         struct kvm_run *run,
2259                         unsigned long cr2,
2260                         u16 error_code,
2261                         int emulation_type)
2262 {
2263         int r;
2264         struct decode_cache *c;
2265
2266         kvm_clear_exception_queue(vcpu);
2267         vcpu->arch.mmio_fault_cr2 = cr2;
2268         /*
2269          * TODO: fix x86_emulate.c to use guest_read/write_register
2270          * instead of direct ->regs accesses, can save hundred cycles
2271          * on Intel for instructions that don't read/change RSP, for
2272          * for example.
2273          */
2274         cache_all_regs(vcpu);
2275
2276         vcpu->mmio_is_write = 0;
2277         vcpu->arch.pio.string = 0;
2278
2279         if (!(emulation_type & EMULTYPE_NO_DECODE)) {
2280                 int cs_db, cs_l;
2281                 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
2282
2283                 vcpu->arch.emulate_ctxt.vcpu = vcpu;
2284                 vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
2285                 vcpu->arch.emulate_ctxt.mode =
2286                         (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
2287                         ? X86EMUL_MODE_REAL : cs_l
2288                         ? X86EMUL_MODE_PROT64 : cs_db
2289                         ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
2290
2291                 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
2292
2293                 /* Reject the instructions other than VMCALL/VMMCALL when
2294                  * try to emulate invalid opcode */
2295                 c = &vcpu->arch.emulate_ctxt.decode;
2296                 if ((emulation_type & EMULTYPE_TRAP_UD) &&
2297                     (!(c->twobyte && c->b == 0x01 &&
2298                       (c->modrm_reg == 0 || c->modrm_reg == 3) &&
2299                        c->modrm_mod == 3 && c->modrm_rm == 1)))
2300                         return EMULATE_FAIL;
2301
2302                 ++vcpu->stat.insn_emulation;
2303                 if (r)  {
2304                         ++vcpu->stat.insn_emulation_fail;
2305                         if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
2306                                 return EMULATE_DONE;
2307                         return EMULATE_FAIL;
2308                 }
2309         }
2310
2311         r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
2312
2313         if (vcpu->arch.pio.string)
2314                 return EMULATE_DO_MMIO;
2315
2316         if ((r || vcpu->mmio_is_write) && run) {
2317                 run->exit_reason = KVM_EXIT_MMIO;
2318                 run->mmio.phys_addr = vcpu->mmio_phys_addr;
2319                 memcpy(run->mmio.data, vcpu->mmio_data, 8);
2320                 run->mmio.len = vcpu->mmio_size;
2321                 run->mmio.is_write = vcpu->mmio_is_write;
2322         }
2323
2324         if (r) {
2325                 if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
2326                         return EMULATE_DONE;
2327                 if (!vcpu->mmio_needed) {
2328                         kvm_report_emulation_failure(vcpu, "mmio");
2329                         return EMULATE_FAIL;
2330                 }
2331                 return EMULATE_DO_MMIO;
2332         }
2333
2334         kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
2335
2336         if (vcpu->mmio_is_write) {
2337                 vcpu->mmio_needed = 0;
2338                 return EMULATE_DO_MMIO;
2339         }
2340
2341         return EMULATE_DONE;
2342 }
2343 EXPORT_SYMBOL_GPL(emulate_instruction);
2344
2345 static void free_pio_guest_pages(struct kvm_vcpu *vcpu)
2346 {
2347         int i;
2348
2349         for (i = 0; i < ARRAY_SIZE(vcpu->arch.pio.guest_pages); ++i)
2350                 if (vcpu->arch.pio.guest_pages[i]) {
2351                         kvm_release_page_dirty(vcpu->arch.pio.guest_pages[i]);
2352                         vcpu->arch.pio.guest_pages[i] = NULL;
2353                 }
2354 }
2355
2356 static int pio_copy_data(struct kvm_vcpu *vcpu)
2357 {
2358         void *p = vcpu->arch.pio_data;
2359         void *q;
2360         unsigned bytes;
2361         int nr_pages = vcpu->arch.pio.guest_pages[1] ? 2 : 1;
2362
2363         q = vmap(vcpu->arch.pio.guest_pages, nr_pages, VM_READ|VM_WRITE,
2364                  PAGE_KERNEL);
2365         if (!q) {
2366                 free_pio_guest_pages(vcpu);
2367                 return -ENOMEM;
2368         }
2369         q += vcpu->arch.pio.guest_page_offset;
2370         bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count;
2371         if (vcpu->arch.pio.in)
2372                 memcpy(q, p, bytes);
2373         else
2374                 memcpy(p, q, bytes);
2375         q -= vcpu->arch.pio.guest_page_offset;
2376         vunmap(q);
2377         free_pio_guest_pages(vcpu);
2378         return 0;
2379 }
2380
2381 int complete_pio(struct kvm_vcpu *vcpu)
2382 {
2383         struct kvm_pio_request *io = &vcpu->arch.pio;
2384         long delta;
2385         int r;
2386         unsigned long val;
2387
2388         if (!io->string) {
2389                 if (io->in) {
2390                         val = kvm_register_read(vcpu, VCPU_REGS_RAX);
2391                         memcpy(&val, vcpu->arch.pio_data, io->size);
2392                         kvm_register_write(vcpu, VCPU_REGS_RAX, val);
2393                 }
2394         } else {
2395                 if (io->in) {
2396                         r = pio_copy_data(vcpu);
2397                         if (r)
2398                                 return r;
2399                 }
2400
2401                 delta = 1;
2402                 if (io->rep) {
2403                         delta *= io->cur_count;
2404                         /*
2405                          * The size of the register should really depend on
2406                          * current address size.
2407                          */
2408                         val = kvm_register_read(vcpu, VCPU_REGS_RCX);
2409                         val -= delta;
2410                         kvm_register_write(vcpu, VCPU_REGS_RCX, val);
2411                 }
2412                 if (io->down)
2413                         delta = -delta;
2414                 delta *= io->size;
2415                 if (io->in) {
2416                         val = kvm_register_read(vcpu, VCPU_REGS_RDI);
2417                         val += delta;
2418                         kvm_register_write(vcpu, VCPU_REGS_RDI, val);
2419                 } else {
2420                         val = kvm_register_read(vcpu, VCPU_REGS_RSI);
2421                         val += delta;
2422                         kvm_register_write(vcpu, VCPU_REGS_RSI, val);
2423                 }
2424         }
2425
2426         io->count -= io->cur_count;
2427         io->cur_count = 0;
2428
2429         return 0;
2430 }
2431
2432 static void kernel_pio(struct kvm_io_device *pio_dev,
2433                        struct kvm_vcpu *vcpu,
2434                        void *pd)
2435 {
2436         /* TODO: String I/O for in kernel device */
2437
2438         mutex_lock(&vcpu->kvm->lock);
2439         if (vcpu->arch.pio.in)
2440                 kvm_iodevice_read(pio_dev, vcpu->arch.pio.port,
2441                                   vcpu->arch.pio.size,
2442                                   pd);
2443         else
2444                 kvm_iodevice_write(pio_dev, vcpu->arch.pio.port,
2445                                    vcpu->arch.pio.size,
2446                                    pd);
2447         mutex_unlock(&vcpu->kvm->lock);
2448 }
2449
2450 static void pio_string_write(struct kvm_io_device *pio_dev,
2451                              struct kvm_vcpu *vcpu)
2452 {
2453         struct kvm_pio_request *io = &vcpu->arch.pio;
2454         void *pd = vcpu->arch.pio_data;
2455         int i;
2456
2457         mutex_lock(&vcpu->kvm->lock);
2458         for (i = 0; i < io->cur_count; i++) {
2459                 kvm_iodevice_write(pio_dev, io->port,
2460                                    io->size,
2461                                    pd);
2462                 pd += io->size;
2463         }
2464         mutex_unlock(&vcpu->kvm->lock);
2465 }
2466
2467 static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu,
2468                                                gpa_t addr, int len,
2469                                                int is_write)
2470 {
2471         return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr, len, is_write);
2472 }
2473
2474 int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
2475                   int size, unsigned port)
2476 {
2477         struct kvm_io_device *pio_dev;
2478         unsigned long val;
2479
2480         vcpu->run->exit_reason = KVM_EXIT_IO;
2481         vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
2482         vcpu->run->io.size = vcpu->arch.pio.size = size;
2483         vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
2484         vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = 1;
2485         vcpu->run->io.port = vcpu->arch.pio.port = port;
2486         vcpu->arch.pio.in = in;
2487         vcpu->arch.pio.string = 0;
2488         vcpu->arch.pio.down = 0;
2489         vcpu->arch.pio.guest_page_offset = 0;
2490         vcpu->arch.pio.rep = 0;
2491
2492         if (vcpu->run->io.direction == KVM_EXIT_IO_IN)
2493                 KVMTRACE_2D(IO_READ, vcpu, vcpu->run->io.port, (u32)size,
2494                             handler);
2495         else
2496                 KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size,
2497                             handler);
2498
2499         val = kvm_register_read(vcpu, VCPU_REGS_RAX);
2500         memcpy(vcpu->arch.pio_data, &val, 4);
2501
2502         pio_dev = vcpu_find_pio_dev(vcpu, port, size, !in);
2503         if (pio_dev) {
2504                 kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data);
2505                 complete_pio(vcpu);
2506                 return 1;
2507         }
2508         return 0;
2509 }
2510 EXPORT_SYMBOL_GPL(kvm_emulate_pio);
2511
2512 int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
2513                   int size, unsigned long count, int down,
2514                   gva_t address, int rep, unsigned port)
2515 {
2516         unsigned now, in_page;
2517         int i, ret = 0;
2518         int nr_pages = 1;
2519         struct page *page;
2520         struct kvm_io_device *pio_dev;
2521
2522         vcpu->run->exit_reason = KVM_EXIT_IO;
2523         vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
2524         vcpu->run->io.size = vcpu->arch.pio.size = size;
2525         vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
2526         vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = count;
2527         vcpu->run->io.port = vcpu->arch.pio.port = port;
2528         vcpu->arch.pio.in = in;
2529         vcpu->arch.pio.string = 1;
2530         vcpu->arch.pio.down = down;
2531         vcpu->arch.pio.guest_page_offset = offset_in_page(address);
2532         vcpu->arch.pio.rep = rep;
2533
2534         if (vcpu->run->io.direction == KVM_EXIT_IO_IN)
2535                 KVMTRACE_2D(IO_READ, vcpu, vcpu->run->io.port, (u32)size,
2536                             handler);
2537         else
2538                 KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size,
2539                             handler);
2540
2541         if (!count) {
2542                 kvm_x86_ops->skip_emulated_instruction(vcpu);
2543                 return 1;
2544         }
2545
2546         if (!down)
2547                 in_page = PAGE_SIZE - offset_in_page(address);
2548         else
2549                 in_page = offset_in_page(address) + size;
2550         now = min(count, (unsigned long)in_page / size);
2551         if (!now) {
2552                 /*
2553                  * String I/O straddles page boundary.  Pin two guest pages
2554                  * so that we satisfy atomicity constraints.  Do just one
2555                  * transaction to avoid complexity.
2556                  */
2557                 nr_pages = 2;
2558                 now = 1;
2559         }
2560         if (down) {
2561                 /*
2562                  * String I/O in reverse.  Yuck.  Kill the guest, fix later.
2563                  */
2564                 pr_unimpl(vcpu, "guest string pio down\n");
2565                 kvm_inject_gp(vcpu, 0);
2566                 return 1;
2567         }
2568         vcpu->run->io.count = now;
2569         vcpu->arch.pio.cur_count = now;
2570
2571         if (vcpu->arch.pio.cur_count == vcpu->arch.pio.count)
2572                 kvm_x86_ops->skip_emulated_instruction(vcpu);
2573
2574         for (i = 0; i < nr_pages; ++i) {
2575                 page = gva_to_page(vcpu, address + i * PAGE_SIZE);
2576                 vcpu->arch.pio.guest_pages[i] = page;
2577                 if (!page) {
2578                         kvm_inject_gp(vcpu, 0);
2579                         free_pio_guest_pages(vcpu);
2580                         return 1;
2581                 }
2582         }
2583
2584         pio_dev = vcpu_find_pio_dev(vcpu, port,
2585                                     vcpu->arch.pio.cur_count,
2586                                     !vcpu->arch.pio.in);
2587         if (!vcpu->arch.pio.in) {
2588                 /* string PIO write */
2589                 ret = pio_copy_data(vcpu);
2590                 if (ret >= 0 && pio_dev) {
2591                         pio_string_write(pio_dev, vcpu);
2592                         complete_pio(vcpu);
2593                         if (vcpu->arch.pio.count == 0)
2594                                 ret = 1;
2595                 }
2596         } else if (pio_dev)
2597                 pr_unimpl(vcpu, "no string pio read support yet, "
2598                        "port %x size %d count %ld\n",
2599                         port, size, count);
2600
2601         return ret;
2602 }
2603 EXPORT_SYMBOL_GPL(kvm_emulate_pio_string);
2604
2605 int kvm_arch_init(void *opaque)
2606 {
2607         int r;
2608         struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque;
2609
2610         if (kvm_x86_ops) {
2611                 printk(KERN_ERR "kvm: already loaded the other module\n");
2612                 r = -EEXIST;
2613                 goto out;
2614         }
2615
2616         if (!ops->cpu_has_kvm_support()) {
2617                 printk(KERN_ERR "kvm: no hardware support\n");
2618                 r = -EOPNOTSUPP;
2619                 goto out;
2620         }
2621         if (ops->disabled_by_bios()) {
2622                 printk(KERN_ERR "kvm: disabled by bios\n");
2623                 r = -EOPNOTSUPP;
2624                 goto out;
2625         }
2626
2627         r = kvm_mmu_module_init();
2628         if (r)
2629                 goto out;
2630
2631         kvm_init_msr_list();
2632
2633         kvm_x86_ops = ops;
2634         kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
2635         kvm_mmu_set_base_ptes(PT_PRESENT_MASK);
2636         kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
2637                         PT_DIRTY_MASK, PT64_NX_MASK, 0, 0);
2638         return 0;
2639
2640 out:
2641         return r;
2642 }
2643
2644 void kvm_arch_exit(void)
2645 {
2646         kvm_x86_ops = NULL;
2647         kvm_mmu_module_exit();
2648 }
2649
2650 int kvm_emulate_halt(struct kvm_vcpu *vcpu)
2651 {
2652         ++vcpu->stat.halt_exits;
2653         KVMTRACE_0D(HLT, vcpu, handler);
2654         if (irqchip_in_kernel(vcpu->kvm)) {
2655                 vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
2656                 return 1;
2657         } else {
2658                 vcpu->run->exit_reason = KVM_EXIT_HLT;
2659                 return 0;
2660         }
2661 }
2662 EXPORT_SYMBOL_GPL(kvm_emulate_halt);
2663
2664 static inline gpa_t hc_gpa(struct kvm_vcpu *vcpu, unsigned long a0,
2665                            unsigned long a1)
2666 {
2667         if (is_long_mode(vcpu))
2668                 return a0;
2669         else
2670                 return a0 | ((gpa_t)a1 << 32);
2671 }
2672
2673 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
2674 {
2675         unsigned long nr, a0, a1, a2, a3, ret;
2676         int r = 1;
2677
2678         nr = kvm_register_read(vcpu, VCPU_REGS_RAX);
2679         a0 = kvm_register_read(vcpu, VCPU_REGS_RBX);
2680         a1 = kvm_register_read(vcpu, VCPU_REGS_RCX);
2681         a2 = kvm_register_read(vcpu, VCPU_REGS_RDX);
2682         a3 = kvm_register_read(vcpu, VCPU_REGS_RSI);
2683
2684         KVMTRACE_1D(VMMCALL, vcpu, (u32)nr, handler);
2685
2686         if (!is_long_mode(vcpu)) {
2687                 nr &= 0xFFFFFFFF;
2688                 a0 &= 0xFFFFFFFF;
2689                 a1 &= 0xFFFFFFFF;
2690                 a2 &= 0xFFFFFFFF;
2691                 a3 &= 0xFFFFFFFF;
2692         }
2693
2694         switch (nr) {
2695         case KVM_HC_VAPIC_POLL_IRQ:
2696                 ret = 0;
2697                 break;
2698         case KVM_HC_MMU_OP:
2699                 r = kvm_pv_mmu_op(vcpu, a0, hc_gpa(vcpu, a1, a2), &ret);
2700                 break;
2701         default:
2702                 ret = -KVM_ENOSYS;
2703                 break;
2704         }
2705         kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
2706         ++vcpu->stat.hypercalls;
2707         return r;
2708 }
2709 EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
2710
2711 int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
2712 {
2713         char instruction[3];
2714         int ret = 0;
2715         unsigned long rip = kvm_rip_read(vcpu);
2716
2717
2718         /*
2719          * Blow out the MMU to ensure that no other VCPU has an active mapping
2720          * to ensure that the updated hypercall appears atomically across all
2721          * VCPUs.
2722          */
2723         kvm_mmu_zap_all(vcpu->kvm);
2724
2725         kvm_x86_ops->patch_hypercall(vcpu, instruction);
2726         if (emulator_write_emulated(rip, instruction, 3, vcpu)
2727             != X86EMUL_CONTINUE)
2728                 ret = -EFAULT;
2729
2730         return ret;
2731 }
2732
2733 static u64 mk_cr_64(u64 curr_cr, u32 new_val)
2734 {
2735         return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
2736 }
2737
2738 void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
2739 {
2740         struct descriptor_table dt = { limit, base };
2741
2742         kvm_x86_ops->set_gdt(vcpu, &dt);
2743 }
2744
2745 void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
2746 {
2747         struct descriptor_table dt = { limit, base };
2748
2749         kvm_x86_ops->set_idt(vcpu, &dt);
2750 }
2751
2752 void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
2753                    unsigned long *rflags)
2754 {
2755         kvm_lmsw(vcpu, msw);
2756         *rflags = kvm_x86_ops->get_rflags(vcpu);
2757 }
2758
2759 unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
2760 {
2761         unsigned long value;
2762
2763         kvm_x86_ops->decache_cr4_guest_bits(vcpu);
2764         switch (cr) {
2765         case 0:
2766                 value = vcpu->arch.cr0;
2767                 break;
2768         case 2:
2769                 value = vcpu->arch.cr2;
2770                 break;
2771         case 3:
2772                 value = vcpu->arch.cr3;
2773                 break;
2774         case 4:
2775                 value = vcpu->arch.cr4;
2776                 break;
2777         case 8:
2778                 value = kvm_get_cr8(vcpu);
2779                 break;
2780         default:
2781                 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
2782                 return 0;
2783         }
2784         KVMTRACE_3D(CR_READ, vcpu, (u32)cr, (u32)value,
2785                     (u32)((u64)value >> 32), handler);
2786
2787         return value;
2788 }
2789
2790 void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
2791                      unsigned long *rflags)
2792 {
2793         KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr, (u32)val,
2794                     (u32)((u64)val >> 32), handler);
2795
2796         switch (cr) {
2797         case 0:
2798                 kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val));
2799                 *rflags = kvm_x86_ops->get_rflags(vcpu);
2800                 break;
2801         case 2:
2802                 vcpu->arch.cr2 = val;
2803                 break;
2804         case 3:
2805                 kvm_set_cr3(vcpu, val);
2806                 break;
2807         case 4:
2808                 kvm_set_cr4(vcpu, mk_cr_64(vcpu->arch.cr4, val));
2809                 break;
2810         case 8:
2811                 kvm_set_cr8(vcpu, val & 0xfUL);
2812                 break;
2813         default:
2814                 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
2815         }
2816 }
2817
2818 static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
2819 {
2820         struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i];
2821         int j, nent = vcpu->arch.cpuid_nent;
2822
2823         e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT;
2824         /* when no next entry is found, the current entry[i] is reselected */
2825         for (j = i + 1; ; j = (j + 1) % nent) {
2826                 struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j];
2827                 if (ej->function == e->function) {
2828                         ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
2829                         return j;
2830                 }
2831         }
2832         return 0; /* silence gcc, even though control never reaches here */
2833 }
2834
2835 /* find an entry with matching function, matching index (if needed), and that
2836  * should be read next (if it's stateful) */
2837 static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e,
2838         u32 function, u32 index)
2839 {
2840         if (e->function != function)
2841                 return 0;
2842         if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index)
2843                 return 0;
2844         if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) &&
2845                 !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT))
2846                 return 0;
2847         return 1;
2848 }
2849
2850 struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
2851                                               u32 function, u32 index)
2852 {
2853         int i;
2854         struct kvm_cpuid_entry2 *best = NULL;
2855
2856         for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
2857                 struct kvm_cpuid_entry2 *e;
2858
2859                 e = &vcpu->arch.cpuid_entries[i];
2860                 if (is_matching_cpuid_entry(e, function, index)) {
2861                         if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC)
2862                                 move_to_next_stateful_cpuid_entry(vcpu, i);
2863                         best = e;
2864                         break;
2865                 }
2866                 /*
2867                  * Both basic or both extended?
2868                  */
2869                 if (((e->function ^ function) & 0x80000000) == 0)
2870                         if (!best || e->function > best->function)
2871                                 best = e;
2872         }
2873
2874         return best;
2875 }
2876
2877 void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
2878 {
2879         u32 function, index;
2880         struct kvm_cpuid_entry2 *best;
2881
2882         function = kvm_register_read(vcpu, VCPU_REGS_RAX);
2883         index = kvm_register_read(vcpu, VCPU_REGS_RCX);
2884         kvm_register_write(vcpu, VCPU_REGS_RAX, 0);
2885         kvm_register_write(vcpu, VCPU_REGS_RBX, 0);
2886         kvm_register_write(vcpu, VCPU_REGS_RCX, 0);
2887         kvm_register_write(vcpu, VCPU_REGS_RDX, 0);
2888         best = kvm_find_cpuid_entry(vcpu, function, index);
2889         if (best) {
2890                 kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax);
2891                 kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx);
2892                 kvm_register_write(vcpu, VCPU_REGS_RCX, best->ecx);
2893                 kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx);
2894         }
2895         kvm_x86_ops->skip_emulated_instruction(vcpu);
2896         KVMTRACE_5D(CPUID, vcpu, function,
2897                     (u32)kvm_register_read(vcpu, VCPU_REGS_RAX),
2898                     (u32)kvm_register_read(vcpu, VCPU_REGS_RBX),
2899                     (u32)kvm_register_read(vcpu, VCPU_REGS_RCX),
2900                     (u32)kvm_register_read(vcpu, VCPU_REGS_RDX), handler);
2901 }
2902 EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
2903
2904 /*
2905  * Check if userspace requested an interrupt window, and that the
2906  * interrupt window is open.
2907  *
2908  * No need to exit to userspace if we already have an interrupt queued.
2909  */
2910 static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu,
2911                                           struct kvm_run *kvm_run)
2912 {
2913         return (!vcpu->arch.irq_summary &&
2914                 kvm_run->request_interrupt_window &&
2915                 vcpu->arch.interrupt_window_open &&
2916                 (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF));
2917 }
2918
2919 static void post_kvm_run_save(struct kvm_vcpu *vcpu,
2920                               struct kvm_run *kvm_run)
2921 {
2922         kvm_run->if_flag = (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
2923         kvm_run->cr8 = kvm_get_cr8(vcpu);
2924         kvm_run->apic_base = kvm_get_apic_base(vcpu);
2925         if (irqchip_in_kernel(vcpu->kvm))
2926                 kvm_run->ready_for_interrupt_injection = 1;
2927         else
2928                 kvm_run->ready_for_interrupt_injection =
2929                                         (vcpu->arch.interrupt_window_open &&
2930                                          vcpu->arch.irq_summary == 0);
2931 }
2932
2933 static void vapic_enter(struct kvm_vcpu *vcpu)
2934 {
2935         struct kvm_lapic *apic = vcpu->arch.apic;
2936         struct page *page;
2937
2938         if (!apic || !apic->vapic_addr)
2939                 return;
2940
2941         page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
2942
2943         vcpu->arch.apic->vapic_page = page;
2944 }
2945
2946 static void vapic_exit(struct kvm_vcpu *vcpu)
2947 {
2948         struct kvm_lapic *apic = vcpu->arch.apic;
2949
2950         if (!apic || !apic->vapic_addr)
2951                 return;
2952
2953         down_read(&vcpu->kvm->slots_lock);
2954         kvm_release_page_dirty(apic->vapic_page);
2955         mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
2956         up_read(&vcpu->kvm->slots_lock);
2957 }
2958
2959 static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2960 {
2961         int r;
2962
2963         if (vcpu->requests)
2964                 if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
2965                         kvm_mmu_unload(vcpu);
2966
2967         r = kvm_mmu_reload(vcpu);
2968         if (unlikely(r))
2969                 goto out;
2970
2971         if (vcpu->requests) {
2972                 if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests))
2973                         __kvm_migrate_timers(vcpu);
2974                 if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests))
2975                         kvm_mmu_sync_roots(vcpu);
2976                 if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
2977                         kvm_x86_ops->tlb_flush(vcpu);
2978                 if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS,
2979                                        &vcpu->requests)) {
2980                         kvm_run->exit_reason = KVM_EXIT_TPR_ACCESS;
2981                         r = 0;
2982                         goto out;
2983                 }
2984                 if (test_and_clear_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests)) {
2985                         kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
2986                         r = 0;
2987                         goto out;
2988                 }
2989         }
2990
2991         clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
2992         kvm_inject_pending_timer_irqs(vcpu);
2993
2994         preempt_disable();
2995
2996         kvm_x86_ops->prepare_guest_switch(vcpu);
2997         kvm_load_guest_fpu(vcpu);
2998
2999         local_irq_disable();
3000
3001         if (vcpu->requests || need_resched() || signal_pending(current)) {
3002                 local_irq_enable();
3003                 preempt_enable();
3004                 r = 1;
3005                 goto out;
3006         }
3007
3008         vcpu->guest_mode = 1;
3009         /*
3010          * Make sure that guest_mode assignment won't happen after
3011          * testing the pending IRQ vector bitmap.
3012          */
3013         smp_wmb();
3014
3015         if (vcpu->arch.exception.pending)
3016                 __queue_exception(vcpu);
3017         else if (irqchip_in_kernel(vcpu->kvm))
3018                 kvm_x86_ops->inject_pending_irq(vcpu);
3019         else
3020                 kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run);
3021
3022         kvm_lapic_sync_to_vapic(vcpu);
3023
3024         up_read(&vcpu->kvm->slots_lock);
3025
3026         kvm_guest_enter();
3027
3028         get_debugreg(vcpu->arch.host_dr6, 6);
3029         get_debugreg(vcpu->arch.host_dr7, 7);
3030         if (unlikely(vcpu->arch.switch_db_regs)) {
3031                 get_debugreg(vcpu->arch.host_db[0], 0);
3032                 get_debugreg(vcpu->arch.host_db[1], 1);
3033                 get_debugreg(vcpu->arch.host_db[2], 2);
3034                 get_debugreg(vcpu->arch.host_db[3], 3);
3035
3036                 set_debugreg(0, 7);
3037                 set_debugreg(vcpu->arch.eff_db[0], 0);
3038                 set_debugreg(vcpu->arch.eff_db[1], 1);
3039                 set_debugreg(vcpu->arch.eff_db[2], 2);
3040                 set_debugreg(vcpu->arch.eff_db[3], 3);
3041         }
3042
3043         KVMTRACE_0D(VMENTRY, vcpu, entryexit);
3044         kvm_x86_ops->run(vcpu, kvm_run);
3045
3046         if (unlikely(vcpu->arch.switch_db_regs)) {
3047                 set_debugreg(0, 7);
3048                 set_debugreg(vcpu->arch.host_db[0], 0);
3049                 set_debugreg(vcpu->arch.host_db[1], 1);
3050                 set_debugreg(vcpu->arch.host_db[2], 2);
3051                 set_debugreg(vcpu->arch.host_db[3], 3);
3052         }
3053         set_debugreg(vcpu->arch.host_dr6, 6);
3054         set_debugreg(vcpu->arch.host_dr7, 7);
3055
3056         vcpu->guest_mode = 0;
3057         local_irq_enable();
3058
3059         ++vcpu->stat.exits;
3060
3061         /*
3062          * We must have an instruction between local_irq_enable() and
3063          * kvm_guest_exit(), so the timer interrupt isn't delayed by
3064          * the interrupt shadow.  The stat.exits increment will do nicely.
3065          * But we need to prevent reordering, hence this barrier():
3066          */
3067         barrier();
3068
3069         kvm_guest_exit();
3070
3071         preempt_enable();
3072
3073         down_read(&vcpu->kvm->slots_lock);
3074
3075         /*
3076          * Profile KVM exit RIPs:
3077          */
3078         if (unlikely(prof_on == KVM_PROFILING)) {
3079                 unsigned long rip = kvm_rip_read(vcpu);
3080                 profile_hit(KVM_PROFILING, (void *)rip);
3081         }
3082
3083         if (vcpu->arch.exception.pending && kvm_x86_ops->exception_injected(vcpu))
3084                 vcpu->arch.exception.pending = false;
3085
3086         kvm_lapic_sync_from_vapic(vcpu);
3087
3088         r = kvm_x86_ops->handle_exit(kvm_run, vcpu);
3089 out:
3090         return r;
3091 }
3092
3093 static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3094 {
3095         int r;
3096
3097         if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) {
3098                 pr_debug("vcpu %d received sipi with vector # %x\n",
3099                          vcpu->vcpu_id, vcpu->arch.sipi_vector);
3100                 kvm_lapic_reset(vcpu);
3101                 r = kvm_arch_vcpu_reset(vcpu);
3102                 if (r)
3103                         return r;
3104                 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
3105         }
3106
3107         down_read(&vcpu->kvm->slots_lock);
3108         vapic_enter(vcpu);
3109
3110         r = 1;
3111         while (r > 0) {
3112                 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
3113                         r = vcpu_enter_guest(vcpu, kvm_run);
3114                 else {
3115                         up_read(&vcpu->kvm->slots_lock);
3116                         kvm_vcpu_block(vcpu);
3117                         down_read(&vcpu->kvm->slots_lock);
3118                         if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests))
3119                                 if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
3120                                         vcpu->arch.mp_state =
3121                                                         KVM_MP_STATE_RUNNABLE;
3122                         if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE)
3123                                 r = -EINTR;
3124                 }
3125
3126                 if (r > 0) {
3127                         if (dm_request_for_irq_injection(vcpu, kvm_run)) {
3128                                 r = -EINTR;
3129                                 kvm_run->exit_reason = KVM_EXIT_INTR;
3130                                 ++vcpu->stat.request_irq_exits;
3131                         }
3132                         if (signal_pending(current)) {
3133                                 r = -EINTR;
3134                                 kvm_run->exit_reason = KVM_EXIT_INTR;
3135                                 ++vcpu->stat.signal_exits;
3136                         }
3137                         if (need_resched()) {
3138                                 up_read(&vcpu->kvm->slots_lock);
3139                                 kvm_resched(vcpu);
3140                                 down_read(&vcpu->kvm->slots_lock);
3141                         }
3142                 }
3143         }
3144
3145         up_read(&vcpu->kvm->slots_lock);
3146         post_kvm_run_save(vcpu, kvm_run);
3147
3148         vapic_exit(vcpu);
3149
3150         return r;
3151 }
3152
3153 int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3154 {
3155         int r;
3156         sigset_t sigsaved;
3157
3158         vcpu_load(vcpu);
3159
3160         if (vcpu->sigset_active)
3161                 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
3162
3163         if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
3164                 kvm_vcpu_block(vcpu);
3165                 clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
3166                 r = -EAGAIN;
3167                 goto out;
3168         }
3169
3170         /* re-sync apic's tpr */
3171         if (!irqchip_in_kernel(vcpu->kvm))
3172                 kvm_set_cr8(vcpu, kvm_run->cr8);
3173
3174         if (vcpu->arch.pio.cur_count) {
3175                 r = complete_pio(vcpu);
3176                 if (r)
3177                         goto out;
3178         }
3179 #if CONFIG_HAS_IOMEM
3180         if (vcpu->mmio_needed) {
3181                 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
3182                 vcpu->mmio_read_completed = 1;
3183                 vcpu->mmio_needed = 0;
3184
3185                 down_read(&vcpu->kvm->slots_lock);
3186                 r = emulate_instruction(vcpu, kvm_run,
3187                                         vcpu->arch.mmio_fault_cr2, 0,
3188                                         EMULTYPE_NO_DECODE);
3189                 up_read(&vcpu->kvm->slots_lock);
3190                 if (r == EMULATE_DO_MMIO) {
3191                         /*
3192                          * Read-modify-write.  Back to userspace.
3193                          */
3194                         r = 0;
3195                         goto out;
3196                 }
3197         }
3198 #endif
3199         if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL)
3200                 kvm_register_write(vcpu, VCPU_REGS_RAX,
3201                                      kvm_run->hypercall.ret);
3202
3203         r = __vcpu_run(vcpu, kvm_run);
3204
3205 out:
3206         if (vcpu->sigset_active)
3207                 sigprocmask(SIG_SETMASK, &sigsaved, NULL);
3208
3209         vcpu_put(vcpu);
3210         return r;
3211 }
3212
3213 int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
3214 {
3215         vcpu_load(vcpu);
3216
3217         regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
3218         regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX);
3219         regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX);
3220         regs->rdx = kvm_register_read(vcpu, VCPU_REGS_RDX);
3221         regs->rsi = kvm_register_read(vcpu, VCPU_REGS_RSI);
3222         regs->rdi = kvm_register_read(vcpu, VCPU_REGS_RDI);
3223         regs->rsp = kvm_register_read(vcpu, VCPU_REGS_RSP);
3224         regs->rbp = kvm_register_read(vcpu, VCPU_REGS_RBP);
3225 #ifdef CONFIG_X86_64
3226         regs->r8 = kvm_register_read(vcpu, VCPU_REGS_R8);
3227         regs->r9 = kvm_register_read(vcpu, VCPU_REGS_R9);
3228         regs->r10 = kvm_register_read(vcpu, VCPU_REGS_R10);
3229         regs->r11 = kvm_register_read(vcpu, VCPU_REGS_R11);
3230         regs->r12 = kvm_register_read(vcpu, VCPU_REGS_R12);
3231         regs->r13 = kvm_register_read(vcpu, VCPU_REGS_R13);
3232         regs->r14 = kvm_register_read(vcpu, VCPU_REGS_R14);
3233         regs->r15 = kvm_register_read(vcpu, VCPU_REGS_R15);
3234 #endif
3235
3236         regs->rip = kvm_rip_read(vcpu);
3237         regs->rflags = kvm_x86_ops->get_rflags(vcpu);
3238
3239         /*
3240          * Don't leak debug flags in case they were set for guest debugging
3241          */
3242         if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
3243                 regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
3244
3245         vcpu_put(vcpu);
3246
3247         return 0;
3248 }
3249
3250 int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
3251 {
3252         vcpu_load(vcpu);
3253
3254         kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax);
3255         kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx);
3256         kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx);
3257         kvm_register_write(vcpu, VCPU_REGS_RDX, regs->rdx);
3258         kvm_register_write(vcpu, VCPU_REGS_RSI, regs->rsi);
3259         kvm_register_write(vcpu, VCPU_REGS_RDI, regs->rdi);
3260         kvm_register_write(vcpu, VCPU_REGS_RSP, regs->rsp);
3261         kvm_register_write(vcpu, VCPU_REGS_RBP, regs->rbp);
3262 #ifdef CONFIG_X86_64
3263         kvm_register_write(vcpu, VCPU_REGS_R8, regs->r8);
3264         kvm_register_write(vcpu, VCPU_REGS_R9, regs->r9);
3265         kvm_register_write(vcpu, VCPU_REGS_R10, regs->r10);
3266         kvm_register_write(vcpu, VCPU_REGS_R11, regs->r11);
3267         kvm_register_write(vcpu, VCPU_REGS_R12, regs->r12);
3268         kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13);
3269         kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14);
3270         kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15);
3271
3272 #endif
3273
3274         kvm_rip_write(vcpu, regs->rip);
3275         kvm_x86_ops->set_rflags(vcpu, regs->rflags);
3276
3277
3278         vcpu->arch.exception.pending = false;
3279
3280         vcpu_put(vcpu);
3281
3282         return 0;
3283 }
3284
3285 void kvm_get_segment(struct kvm_vcpu *vcpu,
3286                      struct kvm_segment *var, int seg)
3287 {
3288         kvm_x86_ops->get_segment(vcpu, var, seg);
3289 }
3290
3291 void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
3292 {
3293         struct kvm_segment cs;
3294
3295         kvm_get_segment(vcpu, &cs, VCPU_SREG_CS);
3296         *db = cs.db;
3297         *l = cs.l;
3298 }
3299 EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
3300
3301 int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
3302                                   struct kvm_sregs *sregs)
3303 {
3304         struct descriptor_table dt;
3305         int pending_vec;
3306
3307         vcpu_load(vcpu);
3308
3309         kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
3310         kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
3311         kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
3312         kvm_get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
3313         kvm_get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
3314         kvm_get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
3315
3316         kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
3317         kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
3318
3319         kvm_x86_ops->get_idt(vcpu, &dt);
3320         sregs->idt.limit = dt.limit;
3321         sregs->idt.base = dt.base;
3322         kvm_x86_ops->get_gdt(vcpu, &dt);
3323         sregs->gdt.limit = dt.limit;
3324         sregs->gdt.base = dt.base;
3325
3326         kvm_x86_ops->decache_cr4_guest_bits(vcpu);
3327         sregs->cr0 = vcpu->arch.cr0;
3328         sregs->cr2 = vcpu->arch.cr2;
3329         sregs->cr3 = vcpu->arch.cr3;
3330         sregs->cr4 = vcpu->arch.cr4;
3331         sregs->cr8 = kvm_get_cr8(vcpu);
3332         sregs->efer = vcpu->arch.shadow_efer;
3333         sregs->apic_base = kvm_get_apic_base(vcpu);
3334
3335         if (irqchip_in_kernel(vcpu->kvm)) {
3336                 memset(sregs->interrupt_bitmap, 0,
3337                        sizeof sregs->interrupt_bitmap);
3338                 pending_vec = kvm_x86_ops->get_irq(vcpu);
3339                 if (pending_vec >= 0)
3340                         set_bit(pending_vec,
3341                                 (unsigned long *)sregs->interrupt_bitmap);
3342         } else
3343                 memcpy(sregs->interrupt_bitmap, vcpu->arch.irq_pending,
3344                        sizeof sregs->interrupt_bitmap);
3345
3346         vcpu_put(vcpu);
3347
3348         return 0;
3349 }
3350
3351 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
3352                                     struct kvm_mp_state *mp_state)
3353 {
3354         vcpu_load(vcpu);
3355         mp_state->mp_state = vcpu->arch.mp_state;
3356         vcpu_put(vcpu);
3357         return 0;
3358 }
3359
3360 int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
3361                                     struct kvm_mp_state *mp_state)
3362 {
3363         vcpu_load(vcpu);
3364         vcpu->arch.mp_state = mp_state->mp_state;
3365         vcpu_put(vcpu);
3366         return 0;
3367 }
3368
3369 static void kvm_set_segment(struct kvm_vcpu *vcpu,
3370                         struct kvm_segment *var, int seg)
3371 {
3372         kvm_x86_ops->set_segment(vcpu, var, seg);
3373 }
3374
3375 static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector,
3376                                    struct kvm_segment *kvm_desct)
3377 {
3378         kvm_desct->base = seg_desc->base0;
3379         kvm_desct->base |= seg_desc->base1 << 16;
3380         kvm_desct->base |= seg_desc->base2 << 24;
3381         kvm_desct->limit = seg_desc->limit0;
3382         kvm_desct->limit |= seg_desc->limit << 16;
3383         if (seg_desc->g) {
3384                 kvm_desct->limit <<= 12;
3385                 kvm_desct->limit |= 0xfff;
3386         }
3387         kvm_desct->selector = selector;
3388         kvm_desct->type = seg_desc->type;
3389         kvm_desct->present = seg_desc->p;
3390         kvm_desct->dpl = seg_desc->dpl;
3391         kvm_desct->db = seg_desc->d;
3392         kvm_desct->s = seg_desc->s;
3393         kvm_desct->l = seg_desc->l;
3394         kvm_desct->g = seg_desc->g;
3395         kvm_desct->avl = seg_desc->avl;
3396         if (!selector)
3397                 kvm_desct->unusable = 1;
3398         else
3399                 kvm_desct->unusable = 0;
3400         kvm_desct->padding = 0;
3401 }
3402
3403 static void get_segment_descriptor_dtable(struct kvm_vcpu *vcpu,
3404                                           u16 selector,
3405                                           struct descriptor_table *dtable)
3406 {
3407         if (selector & 1 << 2) {
3408                 struct kvm_segment kvm_seg;
3409
3410                 kvm_get_segment(vcpu, &kvm_seg, VCPU_SREG_LDTR);
3411
3412                 if (kvm_seg.unusable)
3413                         dtable->limit = 0;
3414                 else
3415                         dtable->limit = kvm_seg.limit;
3416                 dtable->base = kvm_seg.base;
3417         }
3418         else
3419                 kvm_x86_ops->get_gdt(vcpu, dtable);
3420 }
3421
3422 /* allowed just for 8 bytes segments */
3423 static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
3424                                          struct desc_struct *seg_desc)
3425 {
3426         gpa_t gpa;
3427         struct descriptor_table dtable;
3428         u16 index = selector >> 3;
3429
3430         get_segment_descriptor_dtable(vcpu, selector, &dtable);
3431
3432         if (dtable.limit < index * 8 + 7) {
3433                 kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc);
3434                 return 1;
3435         }
3436         gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, dtable.base);
3437         gpa += index * 8;
3438         return kvm_read_guest(vcpu->kvm, gpa, seg_desc, 8);
3439 }
3440
3441 /* allowed just for 8 bytes segments */
3442 static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
3443                                          struct desc_struct *seg_desc)
3444 {
3445         gpa_t gpa;
3446         struct descriptor_table dtable;
3447         u16 index = selector >> 3;
3448
3449         get_segment_descriptor_dtable(vcpu, selector, &dtable);
3450
3451         if (dtable.limit < index * 8 + 7)
3452                 return 1;
3453         gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, dtable.base);
3454         gpa += index * 8;
3455         return kvm_write_guest(vcpu->kvm, gpa, seg_desc, 8);
3456 }
3457
3458 static u32 get_tss_base_addr(struct kvm_vcpu *vcpu,
3459                              struct desc_struct *seg_desc)
3460 {
3461         u32 base_addr;
3462
3463         base_addr = seg_desc->base0;
3464         base_addr |= (seg_desc->base1 << 16);
3465         base_addr |= (seg_desc->base2 << 24);
3466
3467         return vcpu->arch.mmu.gva_to_gpa(vcpu, base_addr);
3468 }
3469
3470 static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg)
3471 {
3472         struct kvm_segment kvm_seg;
3473
3474         kvm_get_segment(vcpu, &kvm_seg, seg);
3475         return kvm_seg.selector;
3476 }
3477
3478 static int load_segment_descriptor_to_kvm_desct(struct kvm_vcpu *vcpu,
3479                                                 u16 selector,
3480                                                 struct kvm_segment *kvm_seg)
3481 {
3482         struct desc_struct seg_desc;
3483
3484         if (load_guest_segment_descriptor(vcpu, selector, &seg_desc))
3485                 return 1;
3486         seg_desct_to_kvm_desct(&seg_desc, selector, kvm_seg);
3487         return 0;
3488 }
3489
3490 static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg)
3491 {
3492         struct kvm_segment segvar = {
3493                 .base = selector << 4,
3494                 .limit = 0xffff,
3495                 .selector = selector,
3496                 .type = 3,
3497                 .present = 1,
3498                 .dpl = 3,
3499                 .db = 0,
3500                 .s = 1,
3501                 .l = 0,
3502                 .g = 0,
3503                 .avl = 0,
3504                 .unusable = 0,
3505         };
3506         kvm_x86_ops->set_segment(vcpu, &segvar, seg);
3507         return 0;
3508 }
3509
3510 int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
3511                                 int type_bits, int seg)
3512 {
3513         struct kvm_segment kvm_seg;
3514
3515         if (!(vcpu->arch.cr0 & X86_CR0_PE))
3516                 return kvm_load_realmode_segment(vcpu, selector, seg);
3517         if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg))
3518                 return 1;
3519         kvm_seg.type |= type_bits;
3520
3521         if (seg != VCPU_SREG_SS && seg != VCPU_SREG_CS &&
3522             seg != VCPU_SREG_LDTR)
3523                 if (!kvm_seg.s)
3524                         kvm_seg.unusable = 1;
3525
3526         kvm_set_segment(vcpu, &kvm_seg, seg);
3527         return 0;
3528 }
3529
3530 static void save_state_to_tss32(struct kvm_vcpu *vcpu,
3531                                 struct tss_segment_32 *tss)
3532 {
3533         tss->cr3 = vcpu->arch.cr3;
3534         tss->eip = kvm_rip_read(vcpu);
3535         tss->eflags = kvm_x86_ops->get_rflags(vcpu);
3536         tss->eax = kvm_register_read(vcpu, VCPU_REGS_RAX);
3537         tss->ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
3538         tss->edx = kvm_register_read(vcpu, VCPU_REGS_RDX);
3539         tss->ebx = kvm_register_read(vcpu, VCPU_REGS_RBX);
3540         tss->esp = kvm_register_read(vcpu, VCPU_REGS_RSP);
3541         tss->ebp = kvm_register_read(vcpu, VCPU_REGS_RBP);
3542         tss->esi = kvm_register_read(vcpu, VCPU_REGS_RSI);
3543         tss->edi = kvm_register_read(vcpu, VCPU_REGS_RDI);
3544         tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
3545         tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
3546         tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
3547         tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS);
3548         tss->fs = get_segment_selector(vcpu, VCPU_SREG_FS);
3549         tss->gs = get_segment_selector(vcpu, VCPU_SREG_GS);
3550         tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR);
3551         tss->prev_task_link = get_segment_selector(vcpu, VCPU_SREG_TR);
3552 }
3553
3554 static int load_state_from_tss32(struct kvm_vcpu *vcpu,
3555                                   struct tss_segment_32 *tss)
3556 {
3557         kvm_set_cr3(vcpu, tss->cr3);
3558
3559         kvm_rip_write(vcpu, tss->eip);
3560         kvm_x86_ops->set_rflags(vcpu, tss->eflags | 2);
3561
3562         kvm_register_write(vcpu, VCPU_REGS_RAX, tss->eax);
3563         kvm_register_write(vcpu, VCPU_REGS_RCX, tss->ecx);
3564         kvm_register_write(vcpu, VCPU_REGS_RDX, tss->edx);
3565         kvm_register_write(vcpu, VCPU_REGS_RBX, tss->ebx);
3566         kvm_register_write(vcpu, VCPU_REGS_RSP, tss->esp);
3567         kvm_register_write(vcpu, VCPU_REGS_RBP, tss->ebp);
3568         kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi);
3569         kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi);
3570
3571         if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR))
3572                 return 1;
3573
3574         if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES))
3575                 return 1;
3576
3577         if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS))
3578                 return 1;
3579
3580         if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS))
3581                 return 1;
3582
3583         if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS))
3584                 return 1;
3585
3586         if (kvm_load_segment_descriptor(vcpu, tss->fs, 1, VCPU_SREG_FS))
3587                 return 1;
3588
3589         if (kvm_load_segment_descriptor(vcpu, tss->gs, 1, VCPU_SREG_GS))
3590                 return 1;
3591         return 0;
3592 }
3593
3594 static void save_state_to_tss16(struct kvm_vcpu *vcpu,
3595                                 struct tss_segment_16 *tss)
3596 {
3597         tss->ip = kvm_rip_read(vcpu);
3598         tss->flag = kvm_x86_ops->get_rflags(vcpu);
3599         tss->ax = kvm_register_read(vcpu, VCPU_REGS_RAX);
3600         tss->cx = kvm_register_read(vcpu, VCPU_REGS_RCX);
3601         tss->dx = kvm_register_read(vcpu, VCPU_REGS_RDX);
3602         tss->bx = kvm_register_read(vcpu, VCPU_REGS_RBX);
3603         tss->sp = kvm_register_read(vcpu, VCPU_REGS_RSP);
3604         tss->bp = kvm_register_read(vcpu, VCPU_REGS_RBP);
3605         tss->si = kvm_register_read(vcpu, VCPU_REGS_RSI);
3606         tss->di = kvm_register_read(vcpu, VCPU_REGS_RDI);
3607
3608         tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
3609         tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
3610         tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
3611         tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS);
3612         tss->ldt = get_segment_selector(vcpu, VCPU_SREG_LDTR);
3613         tss->prev_task_link = get_segment_selector(vcpu, VCPU_SREG_TR);
3614 }
3615
3616 static int load_state_from_tss16(struct kvm_vcpu *vcpu,
3617                                  struct tss_segment_16 *tss)
3618 {
3619         kvm_rip_write(vcpu, tss->ip);
3620         kvm_x86_ops->set_rflags(vcpu, tss->flag | 2);
3621         kvm_register_write(vcpu, VCPU_REGS_RAX, tss->ax);
3622         kvm_register_write(vcpu, VCPU_REGS_RCX, tss->cx);
3623         kvm_register_write(vcpu, VCPU_REGS_RDX, tss->dx);
3624         kvm_register_write(vcpu, VCPU_REGS_RBX, tss->bx);
3625         kvm_register_write(vcpu, VCPU_REGS_RSP, tss->sp);
3626         kvm_register_write(vcpu, VCPU_REGS_RBP, tss->bp);
3627         kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si);
3628         kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di);
3629
3630         if (kvm_load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR))
3631                 return 1;
3632
3633         if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES))
3634                 return 1;
3635
3636         if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS))
3637                 return 1;
3638
3639         if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS))
3640                 return 1;
3641
3642         if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS))
3643                 return 1;
3644         return 0;
3645 }
3646
3647 static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector,
3648                        u32 old_tss_base,
3649                        struct desc_struct *nseg_desc)
3650 {
3651         struct tss_segment_16 tss_segment_16;
3652         int ret = 0;
3653
3654         if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_16,
3655                            sizeof tss_segment_16))
3656                 goto out;
3657
3658         save_state_to_tss16(vcpu, &tss_segment_16);
3659
3660         if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_16,
3661                             sizeof tss_segment_16))
3662                 goto out;
3663
3664         if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc),
3665                            &tss_segment_16, sizeof tss_segment_16))
3666                 goto out;
3667
3668         if (load_state_from_tss16(vcpu, &tss_segment_16))
3669                 goto out;
3670
3671         ret = 1;
3672 out:
3673         return ret;
3674 }
3675
3676 static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector,
3677                        u32 old_tss_base,
3678                        struct desc_struct *nseg_desc)
3679 {
3680         struct tss_segment_32 tss_segment_32;
3681         int ret = 0;
3682
3683         if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_32,
3684                            sizeof tss_segment_32))
3685                 goto out;
3686
3687         save_state_to_tss32(vcpu, &tss_segment_32);
3688
3689         if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_32,
3690                             sizeof tss_segment_32))
3691                 goto out;
3692
3693         if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc),
3694                            &tss_segment_32, sizeof tss_segment_32))
3695                 goto out;
3696
3697         if (load_state_from_tss32(vcpu, &tss_segment_32))
3698                 goto out;
3699
3700         ret = 1;
3701 out:
3702         return ret;
3703 }
3704
3705 int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
3706 {
3707         struct kvm_segment tr_seg;
3708         struct desc_struct cseg_desc;
3709         struct desc_struct nseg_desc;
3710         int ret = 0;
3711         u32 old_tss_base = get_segment_base(vcpu, VCPU_SREG_TR);
3712         u16 old_tss_sel = get_segment_selector(vcpu, VCPU_SREG_TR);
3713
3714         old_tss_base = vcpu->arch.mmu.gva_to_gpa(vcpu, old_tss_base);
3715
3716         /* FIXME: Handle errors. Failure to read either TSS or their
3717          * descriptors should generate a pagefault.
3718          */
3719         if (load_guest_segment_descriptor(vcpu, tss_selector, &nseg_desc))
3720                 goto out;
3721
3722         if (load_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc))
3723                 goto out;
3724
3725         if (reason != TASK_SWITCH_IRET) {
3726                 int cpl;
3727
3728                 cpl = kvm_x86_ops->get_cpl(vcpu);
3729                 if ((tss_selector & 3) > nseg_desc.dpl || cpl > nseg_desc.dpl) {
3730                         kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
3731                         return 1;
3732                 }
3733         }
3734
3735         if (!nseg_desc.p || (nseg_desc.limit0 | nseg_desc.limit << 16) < 0x67) {
3736                 kvm_queue_exception_e(vcpu, TS_VECTOR, tss_selector & 0xfffc);
3737                 return 1;
3738         }
3739
3740         if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) {
3741                 cseg_desc.type &= ~(1 << 1); //clear the B flag
3742                 save_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc);
3743         }
3744
3745         if (reason == TASK_SWITCH_IRET) {
3746                 u32 eflags = kvm_x86_ops->get_rflags(vcpu);
3747                 kvm_x86_ops->set_rflags(vcpu, eflags & ~X86_EFLAGS_NT);
3748         }
3749
3750         kvm_x86_ops->skip_emulated_instruction(vcpu);
3751
3752         if (nseg_desc.type & 8)
3753                 ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_base,
3754                                          &nseg_desc);
3755         else
3756                 ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_base,
3757                                          &nseg_desc);
3758
3759         if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) {
3760                 u32 eflags = kvm_x86_ops->get_rflags(vcpu);
3761                 kvm_x86_ops->set_rflags(vcpu, eflags | X86_EFLAGS_NT);
3762         }
3763
3764         if (reason != TASK_SWITCH_IRET) {
3765                 nseg_desc.type |= (1 << 1);
3766                 save_guest_segment_descriptor(vcpu, tss_selector,
3767                                               &nseg_desc);
3768         }
3769
3770         kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 | X86_CR0_TS);
3771         seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg);
3772         tr_seg.type = 11;
3773         kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR);
3774 out:
3775         return ret;
3776 }
3777 EXPORT_SYMBOL_GPL(kvm_task_switch);
3778
3779 int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
3780                                   struct kvm_sregs *sregs)
3781 {
3782         int mmu_reset_needed = 0;
3783         int i, pending_vec, max_bits;
3784         struct descriptor_table dt;
3785
3786         vcpu_load(vcpu);
3787
3788         dt.limit = sregs->idt.limit;
3789         dt.base = sregs->idt.base;
3790         kvm_x86_ops->set_idt(vcpu, &dt);
3791         dt.limit = sregs->gdt.limit;
3792         dt.base = sregs->gdt.base;
3793         kvm_x86_ops->set_gdt(vcpu, &dt);
3794
3795         vcpu->arch.cr2 = sregs->cr2;
3796         mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3;
3797         vcpu->arch.cr3 = sregs->cr3;
3798
3799         kvm_set_cr8(vcpu, sregs->cr8);
3800
3801         mmu_reset_needed |= vcpu->arch.shadow_efer != sregs->efer;
3802         kvm_x86_ops->set_efer(vcpu, sregs->efer);
3803         kvm_set_apic_base(vcpu, sregs->apic_base);
3804
3805         kvm_x86_ops->decache_cr4_guest_bits(vcpu);
3806
3807         mmu_reset_needed |= vcpu->arch.cr0 != sregs->cr0;
3808         kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
3809         vcpu->arch.cr0 = sregs->cr0;
3810
3811         mmu_reset_needed |= vcpu->arch.cr4 != sregs->cr4;
3812         kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
3813         if (!is_long_mode(vcpu) && is_pae(vcpu))
3814                 load_pdptrs(vcpu, vcpu->arch.cr3);
3815
3816         if (mmu_reset_needed)
3817                 kvm_mmu_reset_context(vcpu);
3818
3819         if (!irqchip_in_kernel(vcpu->kvm)) {
3820                 memcpy(vcpu->arch.irq_pending, sregs->interrupt_bitmap,
3821                        sizeof vcpu->arch.irq_pending);
3822                 vcpu->arch.irq_summary = 0;
3823                 for (i = 0; i < ARRAY_SIZE(vcpu->arch.irq_pending); ++i)
3824                         if (vcpu->arch.irq_pending[i])
3825                                 __set_bit(i, &vcpu->arch.irq_summary);
3826         } else {
3827                 max_bits = (sizeof sregs->interrupt_bitmap) << 3;
3828                 pending_vec = find_first_bit(
3829                         (const unsigned long *)sregs->interrupt_bitmap,
3830                         max_bits);
3831                 /* Only pending external irq is handled here */
3832                 if (pending_vec < max_bits) {
3833                         kvm_x86_ops->set_irq(vcpu, pending_vec);
3834                         pr_debug("Set back pending irq %d\n",
3835                                  pending_vec);
3836                 }
3837                 kvm_pic_clear_isr_ack(vcpu->kvm);
3838         }
3839
3840         kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
3841         kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
3842         kvm_set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
3843         kvm_set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
3844         kvm_set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
3845         kvm_set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
3846
3847         kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
3848         kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
3849
3850         /* Older userspace won't unhalt the vcpu on reset. */
3851         if (vcpu->vcpu_id == 0 && kvm_rip_read(vcpu) == 0xfff0 &&
3852             sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 &&
3853             !(vcpu->arch.cr0 & X86_CR0_PE))
3854                 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
3855
3856         vcpu_put(vcpu);
3857
3858         return 0;
3859 }
3860
3861 int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
3862                                         struct kvm_guest_debug *dbg)
3863 {
3864         int r;
3865
3866         vcpu_load(vcpu);
3867
3868         r = kvm_x86_ops->set_guest_debug(vcpu, dbg);
3869
3870         if (dbg->control & KVM_GUESTDBG_INJECT_DB)
3871                 kvm_queue_exception(vcpu, DB_VECTOR);
3872         else if (dbg->control & KVM_GUESTDBG_INJECT_BP)
3873                 kvm_queue_exception(vcpu, BP_VECTOR);
3874
3875         vcpu_put(vcpu);
3876
3877         return r;
3878 }
3879
3880 /*
3881  * fxsave fpu state.  Taken from x86_64/processor.h.  To be killed when
3882  * we have asm/x86/processor.h
3883  */
3884 struct fxsave {
3885         u16     cwd;
3886         u16     swd;
3887         u16     twd;
3888         u16     fop;
3889         u64     rip;
3890         u64     rdp;
3891         u32     mxcsr;
3892         u32     mxcsr_mask;
3893         u32     st_space[32];   /* 8*16 bytes for each FP-reg = 128 bytes */
3894 #ifdef CONFIG_X86_64
3895         u32     xmm_space[64];  /* 16*16 bytes for each XMM-reg = 256 bytes */
3896 #else
3897         u32     xmm_space[32];  /* 8*16 bytes for each XMM-reg = 128 bytes */
3898 #endif
3899 };
3900
3901 /*
3902  * Translate a guest virtual address to a guest physical address.
3903  */
3904 int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
3905                                     struct kvm_translation *tr)
3906 {
3907         unsigned long vaddr = tr->linear_address;
3908         gpa_t gpa;
3909
3910         vcpu_load(vcpu);
3911         down_read(&vcpu->kvm->slots_lock);
3912         gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vaddr);
3913         up_read(&vcpu->kvm->slots_lock);
3914         tr->physical_address = gpa;
3915         tr->valid = gpa != UNMAPPED_GVA;
3916         tr->writeable = 1;
3917         tr->usermode = 0;
3918         vcpu_put(vcpu);
3919
3920         return 0;
3921 }
3922
3923 int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
3924 {
3925         struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
3926
3927         vcpu_load(vcpu);
3928
3929         memcpy(fpu->fpr, fxsave->st_space, 128);
3930         fpu->fcw = fxsave->cwd;
3931         fpu->fsw = fxsave->swd;
3932         fpu->ftwx = fxsave->twd;
3933         fpu->last_opcode = fxsave->fop;
3934         fpu->last_ip = fxsave->rip;
3935         fpu->last_dp = fxsave->rdp;
3936         memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);
3937
3938         vcpu_put(vcpu);
3939
3940         return 0;
3941 }
3942
3943 int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
3944 {
3945         struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
3946
3947         vcpu_load(vcpu);
3948
3949         memcpy(fxsave->st_space, fpu->fpr, 128);
3950         fxsave->cwd = fpu->fcw;
3951         fxsave->swd = fpu->fsw;
3952         fxsave->twd = fpu->ftwx;
3953         fxsave->fop = fpu->last_opcode;
3954         fxsave->rip = fpu->last_ip;
3955         fxsave->rdp = fpu->last_dp;
3956         memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);
3957
3958         vcpu_put(vcpu);
3959
3960         return 0;
3961 }
3962
3963 void fx_init(struct kvm_vcpu *vcpu)
3964 {
3965         unsigned after_mxcsr_mask;
3966
3967         /*
3968          * Touch the fpu the first time in non atomic context as if
3969          * this is the first fpu instruction the exception handler
3970          * will fire before the instruction returns and it'll have to
3971          * allocate ram with GFP_KERNEL.
3972          */
3973         if (!used_math())
3974                 kvm_fx_save(&vcpu->arch.host_fx_image);
3975
3976         /* Initialize guest FPU by resetting ours and saving into guest's */
3977         preempt_disable();
3978         kvm_fx_save(&vcpu->arch.host_fx_image);
3979         kvm_fx_finit();
3980         kvm_fx_save(&vcpu->arch.guest_fx_image);
3981         kvm_fx_restore(&vcpu->arch.host_fx_image);
3982         preempt_enable();
3983
3984         vcpu->arch.cr0 |= X86_CR0_ET;
3985         after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space);
3986         vcpu->arch.guest_fx_image.mxcsr = 0x1f80;
3987         memset((void *)&vcpu->arch.guest_fx_image + after_mxcsr_mask,
3988                0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask);
3989 }
3990 EXPORT_SYMBOL_GPL(fx_init);
3991
3992 void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
3993 {
3994         if (!vcpu->fpu_active || vcpu->guest_fpu_loaded)
3995                 return;
3996
3997         vcpu->guest_fpu_loaded = 1;
3998         kvm_fx_save(&vcpu->arch.host_fx_image);
3999         kvm_fx_restore(&vcpu->arch.guest_fx_image);
4000 }
4001 EXPORT_SYMBOL_GPL(kvm_load_guest_fpu);
4002
4003 void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
4004 {
4005         if (!vcpu->guest_fpu_loaded)
4006                 return;
4007
4008         vcpu->guest_fpu_loaded = 0;
4009         kvm_fx_save(&vcpu->arch.guest_fx_image);
4010         kvm_fx_restore(&vcpu->arch.host_fx_image);
4011         ++vcpu->stat.fpu_reload;
4012 }
4013 EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);
4014
4015 void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
4016 {
4017         kvm_x86_ops->vcpu_free(vcpu);
4018 }
4019
4020 struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
4021                                                 unsigned int id)
4022 {
4023         return kvm_x86_ops->vcpu_create(kvm, id);
4024 }
4025
4026 int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
4027 {
4028         int r;
4029
4030         /* We do fxsave: this must be aligned. */
4031         BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF);
4032
4033         vcpu->arch.mtrr_state.have_fixed = 1;
4034         vcpu_load(vcpu);
4035         r = kvm_arch_vcpu_reset(vcpu);
4036         if (r == 0)
4037                 r = kvm_mmu_setup(vcpu);
4038         vcpu_put(vcpu);
4039         if (r < 0)
4040                 goto free_vcpu;
4041
4042         return 0;
4043 free_vcpu:
4044         kvm_x86_ops->vcpu_free(vcpu);
4045         return r;
4046 }
4047
4048 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
4049 {
4050         vcpu_load(vcpu);
4051         kvm_mmu_unload(vcpu);
4052         vcpu_put(vcpu);
4053
4054         kvm_x86_ops->vcpu_free(vcpu);
4055 }
4056
4057 int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
4058 {
4059         vcpu->arch.nmi_pending = false;
4060         vcpu->arch.nmi_injected = false;
4061
4062         vcpu->arch.switch_db_regs = 0;
4063         memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db));
4064         vcpu->arch.dr6 = DR6_FIXED_1;
4065         vcpu->arch.dr7 = DR7_FIXED_1;
4066
4067         return kvm_x86_ops->vcpu_reset(vcpu);
4068 }
4069
4070 void kvm_arch_hardware_enable(void *garbage)
4071 {
4072         kvm_x86_ops->hardware_enable(garbage);
4073 }
4074
4075 void kvm_arch_hardware_disable(void *garbage)
4076 {
4077         kvm_x86_ops->hardware_disable(garbage);
4078 }
4079
4080 int kvm_arch_hardware_setup(void)
4081 {
4082         return kvm_x86_ops->hardware_setup();
4083 }
4084
4085 void kvm_arch_hardware_unsetup(void)
4086 {
4087         kvm_x86_ops->hardware_unsetup();
4088 }
4089
4090 void kvm_arch_check_processor_compat(void *rtn)
4091 {
4092         kvm_x86_ops->check_processor_compatibility(rtn);
4093 }
4094
4095 int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
4096 {
4097         struct page *page;
4098         struct kvm *kvm;
4099         int r;
4100
4101         BUG_ON(vcpu->kvm == NULL);
4102         kvm = vcpu->kvm;
4103
4104         vcpu->arch.mmu.root_hpa = INVALID_PAGE;
4105         if (!irqchip_in_kernel(kvm) || vcpu->vcpu_id == 0)
4106                 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
4107         else
4108                 vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
4109
4110         page = alloc_page(GFP_KERNEL | __GFP_ZERO);
4111         if (!page) {
4112                 r = -ENOMEM;
4113                 goto fail;
4114         }
4115         vcpu->arch.pio_data = page_address(page);
4116
4117         r = kvm_mmu_create(vcpu);
4118         if (r < 0)
4119                 goto fail_free_pio_data;
4120
4121         if (irqchip_in_kernel(kvm)) {
4122                 r = kvm_create_lapic(vcpu);
4123                 if (r < 0)
4124                         goto fail_mmu_destroy;
4125         }
4126
4127         return 0;
4128
4129 fail_mmu_destroy:
4130         kvm_mmu_destroy(vcpu);
4131 fail_free_pio_data:
4132         free_page((unsigned long)vcpu->arch.pio_data);
4133 fail:
4134         return r;
4135 }
4136
4137 void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
4138 {
4139         kvm_free_lapic(vcpu);
4140         down_read(&vcpu->kvm->slots_lock);
4141         kvm_mmu_destroy(vcpu);
4142         up_read(&vcpu->kvm->slots_lock);
4143         free_page((unsigned long)vcpu->arch.pio_data);
4144 }
4145
4146 struct  kvm *kvm_arch_create_vm(void)
4147 {
4148         struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
4149
4150         if (!kvm)
4151                 return ERR_PTR(-ENOMEM);
4152
4153         INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
4154         INIT_LIST_HEAD(&kvm->arch.oos_global_pages);
4155         INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
4156
4157         /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
4158         set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
4159
4160         return kvm;
4161 }
4162
4163 static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
4164 {
4165         vcpu_load(vcpu);
4166         kvm_mmu_unload(vcpu);
4167         vcpu_put(vcpu);
4168 }
4169
4170 static void kvm_free_vcpus(struct kvm *kvm)
4171 {
4172         unsigned int i;
4173
4174         /*
4175          * Unpin any mmu pages first.
4176          */
4177         for (i = 0; i < KVM_MAX_VCPUS; ++i)
4178                 if (kvm->vcpus[i])
4179                         kvm_unload_vcpu_mmu(kvm->vcpus[i]);
4180         for (i = 0; i < KVM_MAX_VCPUS; ++i) {
4181                 if (kvm->vcpus[i]) {
4182                         kvm_arch_vcpu_free(kvm->vcpus[i]);
4183                         kvm->vcpus[i] = NULL;
4184                 }
4185         }
4186
4187 }
4188
4189 void kvm_arch_sync_events(struct kvm *kvm)
4190 {
4191         kvm_free_all_assigned_devices(kvm);
4192 }
4193
4194 void kvm_arch_destroy_vm(struct kvm *kvm)
4195 {
4196         kvm_iommu_unmap_guest(kvm);
4197         kvm_free_pit(kvm);
4198         kfree(kvm->arch.vpic);
4199         kfree(kvm->arch.vioapic);
4200         kvm_free_vcpus(kvm);
4201         kvm_free_physmem(kvm);
4202         if (kvm->arch.apic_access_page)
4203                 put_page(kvm->arch.apic_access_page);
4204         if (kvm->arch.ept_identity_pagetable)
4205                 put_page(kvm->arch.ept_identity_pagetable);
4206         kfree(kvm);
4207 }
4208
4209 int kvm_arch_set_memory_region(struct kvm *kvm,
4210                                 struct kvm_userspace_memory_region *mem,
4211                                 struct kvm_memory_slot old,
4212                                 int user_alloc)
4213 {
4214         int npages = mem->memory_size >> PAGE_SHIFT;
4215         struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot];
4216
4217         /*To keep backward compatibility with older userspace,
4218          *x86 needs to hanlde !user_alloc case.
4219          */
4220         if (!user_alloc) {
4221                 if (npages && !old.rmap) {
4222                         unsigned long userspace_addr;
4223
4224                         down_write(&current->mm->mmap_sem);
4225                         userspace_addr = do_mmap(NULL, 0,
4226                                                  npages * PAGE_SIZE,
4227                                                  PROT_READ | PROT_WRITE,
4228                                                  MAP_PRIVATE | MAP_ANONYMOUS,
4229                                                  0);
4230                         up_write(&current->mm->mmap_sem);
4231
4232                         if (IS_ERR((void *)userspace_addr))
4233                                 return PTR_ERR((void *)userspace_addr);
4234
4235                         /* set userspace_addr atomically for kvm_hva_to_rmapp */
4236                         spin_lock(&kvm->mmu_lock);
4237                         memslot->userspace_addr = userspace_addr;
4238                         spin_unlock(&kvm->mmu_lock);
4239                 } else {
4240                         if (!old.user_alloc && old.rmap) {
4241                                 int ret;
4242
4243                                 down_write(&current->mm->mmap_sem);
4244                                 ret = do_munmap(current->mm, old.userspace_addr,
4245                                                 old.npages * PAGE_SIZE);
4246                                 up_write(&current->mm->mmap_sem);
4247                                 if (ret < 0)
4248                                         printk(KERN_WARNING
4249                                        "kvm_vm_ioctl_set_memory_region: "
4250                                        "failed to munmap memory\n");
4251                         }
4252                 }
4253         }
4254
4255         if (!kvm->arch.n_requested_mmu_pages) {
4256                 unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
4257                 kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
4258         }
4259
4260         kvm_mmu_slot_remove_write_access(kvm, mem->slot);
4261         kvm_flush_remote_tlbs(kvm);
4262
4263         return 0;
4264 }
4265
4266 void kvm_arch_flush_shadow(struct kvm *kvm)
4267 {
4268         kvm_mmu_zap_all(kvm);
4269 }
4270
4271 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
4272 {
4273         return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE
4274                || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED
4275                || vcpu->arch.nmi_pending;
4276 }
4277
4278 static void vcpu_kick_intr(void *info)
4279 {
4280 #ifdef DEBUG
4281         struct kvm_vcpu *vcpu = (struct kvm_vcpu *)info;
4282         printk(KERN_DEBUG "vcpu_kick_intr %p \n", vcpu);
4283 #endif
4284 }
4285
4286 void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
4287 {
4288         int ipi_pcpu = vcpu->cpu;
4289         int cpu = get_cpu();
4290
4291         if (waitqueue_active(&vcpu->wq)) {
4292                 wake_up_interruptible(&vcpu->wq);
4293                 ++vcpu->stat.halt_wakeup;
4294         }
4295         /*
4296          * We may be called synchronously with irqs disabled in guest mode,
4297          * So need not to call smp_call_function_single() in that case.
4298          */
4299         if (vcpu->guest_mode && vcpu->cpu != cpu)
4300                 smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0);
4301         put_cpu();
4302 }