arch/x86/kvm/x86.c

   1 /*
   2  * Kernel-based Virtual Machine driver for Linux
   3  *
   4  * derived from drivers/kvm/kvm_main.c
   5  *
   6  * Copyright (C) 2006 Qumranet, Inc.
   7  * Copyright (C) 2008 Qumranet, Inc.
   8  * Copyright IBM Corporation, 2008
   9  *
  10  * Authors:
  11  *   Avi Kivity   <avi@qumranet.com>
  12  *   Yaniv Kamay  <yaniv@qumranet.com>
  13  *   Amit Shah    <amit.shah@qumranet.com>
  14  *   Ben-Ami Yassour <benami@il.ibm.com>
  15  *
  16  * This work is licensed under the terms of the GNU GPL, version 2.  See
  17  * the COPYING file in the top-level directory.
  18  *
  19  */
  20
  21 #include <linux/kvm_host.h>
  22 #include "irq.h"
  23 #include "mmu.h"
  24 #include "i8254.h"
  25 #include "tss.h"
  26 #include "kvm_cache_regs.h"
  27 #include "x86.h"
  28
  29 #include <linux/clocksource.h>
  30 #include <linux/interrupt.h>
  31 #include <linux/kvm.h>
  32 #include <linux/fs.h>
  33 #include <linux/vmalloc.h>
  34 #include <linux/module.h>
  35 #include <linux/mman.h>
  36 #include <linux/highmem.h>
  37 #include <linux/iommu.h>
  38 #include <linux/intel-iommu.h>
  39 #include <linux/cpufreq.h>
  40 #include <linux/user-return-notifier.h>
  41 #include <trace/events/kvm.h>
  42 #undef TRACE_INCLUDE_FILE
  43 #define CREATE_TRACE_POINTS
  44 #include "trace.h"
  45
  46 #include <asm/uaccess.h>
  47 #include <asm/msr.h>
  48 #include <asm/desc.h>
  49 #include <asm/mtrr.h>
  50 #include <asm/mce.h>
  51
  52 #define MAX_IO_MSRS 256
  53 #define CR0_RESERVED_BITS                                               \
  54         (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
  55                           | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
  56                           | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
  57 #define CR4_RESERVED_BITS                                               \
  58         (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
  59                           | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE     \
  60                           | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR  \
  61                           | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
  62
  63 #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
  64
  65 #define KVM_MAX_MCE_BANKS 32
  66 #define KVM_MCE_CAP_SUPPORTED MCG_CTL_P
  67
  68 /* EFER defaults:
  69  * - enable syscall per default because it's emulated by KVM
  70  * - enable LME and LMA per default on 64 bit KVM
  71  */
  72 #ifdef CONFIG_X86_64
  73 static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffafeULL;
  74 #else
  75 static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffffeULL;
  76 #endif
  77
  78 #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
  79 #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
  80
  81 static void update_cr8_intercept(struct kvm_vcpu *vcpu);
  82 static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
  83                                     struct kvm_cpuid_entry2 __user *entries);
  84
  85 struct kvm_x86_ops *kvm_x86_ops;
  86 EXPORT_SYMBOL_GPL(kvm_x86_ops);
  87
  88 int ignore_msrs = 0;
  89 module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR);
  90
  91 #define KVM_NR_SHARED_MSRS 16
  92
  93 struct kvm_shared_msrs_global {
  94         int nr;
  95         struct kvm_shared_msr {
  96                 u32 msr;
  97                 u64 value;
  98         } msrs[KVM_NR_SHARED_MSRS];
  99 };
 100
/*
 * Per-CPU shared-MSR state: the user-return notifier used to restore the
 * MSRs, whether that notifier is currently registered, and the value last
 * written to (or read from) each shared MSR slot on this CPU.
 */
struct kvm_shared_msrs {
        struct user_return_notifier urn;
        bool registered;
        u64 current_value[KVM_NR_SHARED_MSRS];
};

/* Single global slot table, plus one kvm_shared_msrs instance per CPU. */
static struct kvm_shared_msrs_global __read_mostly shared_msrs_global;
static DEFINE_PER_CPU(struct kvm_shared_msrs, shared_msrs);
 109
/*
 * Statistics table: each entry pairs a name with the offset of a counter
 * inside struct kvm_vcpu (VCPU_STAT) or struct kvm (VM_STAT) and the
 * matching KVM_STAT_* kind.  The list is terminated by a { NULL } entry.
 */
struct kvm_stats_debugfs_item debugfs_entries[] = {
        { "pf_fixed", VCPU_STAT(pf_fixed) },
        { "pf_guest", VCPU_STAT(pf_guest) },
        { "tlb_flush", VCPU_STAT(tlb_flush) },
        { "invlpg", VCPU_STAT(invlpg) },
        { "exits", VCPU_STAT(exits) },
        { "io_exits", VCPU_STAT(io_exits) },
        { "mmio_exits", VCPU_STAT(mmio_exits) },
        { "signal_exits", VCPU_STAT(signal_exits) },
        { "irq_window", VCPU_STAT(irq_window_exits) },
        { "nmi_window", VCPU_STAT(nmi_window_exits) },
        { "halt_exits", VCPU_STAT(halt_exits) },
        { "halt_wakeup", VCPU_STAT(halt_wakeup) },
        { "hypercalls", VCPU_STAT(hypercalls) },
        { "request_irq", VCPU_STAT(request_irq_exits) },
        { "irq_exits", VCPU_STAT(irq_exits) },
        { "host_state_reload", VCPU_STAT(host_state_reload) },
        { "efer_reload", VCPU_STAT(efer_reload) },
        { "fpu_reload", VCPU_STAT(fpu_reload) },
        { "insn_emulation", VCPU_STAT(insn_emulation) },
        { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
        { "irq_injections", VCPU_STAT(irq_injections) },
        { "nmi_injections", VCPU_STAT(nmi_injections) },
        /* Per-VM counters start here. */
        { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
        { "mmu_pte_write", VM_STAT(mmu_pte_write) },
        { "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
        { "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },
        { "mmu_flooded", VM_STAT(mmu_flooded) },
        { "mmu_recycled", VM_STAT(mmu_recycled) },
        { "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
        { "mmu_unsync", VM_STAT(mmu_unsync) },
        { "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
        { "largepages", VM_STAT(lpages) },
        { NULL }
};
 145
 146 static void kvm_on_user_return(struct user_return_notifier *urn)
 147 {
 148         unsigned slot;
 149         struct kvm_shared_msr *global;
 150         struct kvm_shared_msrs *locals
 151                 = container_of(urn, struct kvm_shared_msrs, urn);
 152
 153         for (slot = 0; slot < shared_msrs_global.nr; ++slot) {
 154                 global = &shared_msrs_global.msrs[slot];
 155                 if (global->value != locals->current_value[slot]) {
 156                         wrmsrl(global->msr, global->value);
 157                         locals->current_value[slot] = global->value;
 158                 }
 159         }
 160         locals->registered = false;
 161         user_return_notifier_unregister(urn);
 162 }
 163
 164 void kvm_define_shared_msr(unsigned slot, u32 msr)
 165 {
 166         int cpu;
 167         u64 value;
 168
 169         if (slot >= shared_msrs_global.nr)
 170                 shared_msrs_global.nr = slot + 1;
 171         shared_msrs_global.msrs[slot].msr = msr;
 172         rdmsrl_safe(msr, &value);
 173         shared_msrs_global.msrs[slot].value = value;
 174         for_each_online_cpu(cpu)
 175                 per_cpu(shared_msrs, cpu).current_value[slot] = value;
 176 }
 177 EXPORT_SYMBOL_GPL(kvm_define_shared_msr);
 178
 179 static void kvm_shared_msr_cpu_online(void)
 180 {
 181         unsigned i;
 182         struct kvm_shared_msrs *locals = &__get_cpu_var(shared_msrs);
 183
 184         for (i = 0; i < shared_msrs_global.nr; ++i)
 185                 locals->current_value[i] = shared_msrs_global.msrs[i].value;
 186 }
 187
 188 void kvm_set_shared_msr(unsigned slot, u64 value)
 189 {
 190         struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs);
 191
 192         if (value == smsr->current_value[slot])
 193                 return;
 194         smsr->current_value[slot] = value;
 195         wrmsrl(shared_msrs_global.msrs[slot].msr, value);
 196         if (!smsr->registered) {
 197                 smsr->urn.on_user_return = kvm_on_user_return;
 198                 user_return_notifier_register(&smsr->urn);
 199                 smsr->registered = true;
 200         }
 201 }
 202 EXPORT_SYMBOL_GPL(kvm_set_shared_msr);
 203
 204 unsigned long segment_base(u16 selector)
 205 {
 206         struct descriptor_table gdt;
 207         struct desc_struct *d;
 208         unsigned long table_base;
 209         unsigned long v;
 210
 211         if (selector == 0)
 212                 return 0;
 213
 214         kvm_get_gdt(&gdt);
 215         table_base = gdt.base;
 216
 217         if (selector & 4) {           /* from ldt */
 218                 u16 ldt_selector = kvm_read_ldt();
 219
 220                 table_base = segment_base(ldt_selector);
 221         }
 222         d = (struct desc_struct *)(table_base + (selector & ~7));
 223         v = get_desc_base(d);
 224 #ifdef CONFIG_X86_64
 225         if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
 226                 v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
 227 #endif
 228         return v;
 229 }
 230 EXPORT_SYMBOL_GPL(segment_base);
 231
 232 u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
 233 {
 234         if (irqchip_in_kernel(vcpu->kvm))
 235                 return vcpu->arch.apic_base;
 236         else
 237                 return vcpu->arch.apic_base;
 238 }
 239 EXPORT_SYMBOL_GPL(kvm_get_apic_base);
 240
 241 void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
 242 {
 243         /* TODO: reserve bits check */
 244         if (irqchip_in_kernel(vcpu->kvm))
 245                 kvm_lapic_set_base(vcpu, data);
 246         else
 247                 vcpu->arch.apic_base = data;
 248 }
 249 EXPORT_SYMBOL_GPL(kvm_set_apic_base);
 250
 251 void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
 252 {
 253         WARN_ON(vcpu->arch.exception.pending);
 254         vcpu->arch.exception.pending = true;
 255         vcpu->arch.exception.has_error_code = false;
 256         vcpu->arch.exception.nr = nr;
 257 }
 258 EXPORT_SYMBOL_GPL(kvm_queue_exception);
 259
 260 void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
 261                            u32 error_code)
 262 {
 263         ++vcpu->stat.pf_guest;
 264
 265         if (vcpu->arch.exception.pending) {
 266                 switch(vcpu->arch.exception.nr) {
 267                 case DF_VECTOR:
 268                         /* triple fault -> shutdown */
 269                         set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
 270                         return;
 271                 case PF_VECTOR:
 272                         vcpu->arch.exception.nr = DF_VECTOR;
 273                         vcpu->arch.exception.error_code = 0;
 274                         return;
 275                 default:
 276                         /* replace previous exception with a new one in a hope
 277                            that instruction re-execution will regenerate lost
 278                            exception */
 279                         vcpu->arch.exception.pending = false;
 280                         break;
 281                 }
 282         }
 283         vcpu->arch.cr2 = addr;
 284         kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
 285 }
 286
/* Flag an NMI as pending on @vcpu by setting arch.nmi_pending. */
void kvm_inject_nmi(struct kvm_vcpu *vcpu)
{
        vcpu->arch.nmi_pending = 1;
}
EXPORT_SYMBOL_GPL(kvm_inject_nmi);
 292
 293 void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
 294 {
 295         WARN_ON(vcpu->arch.exception.pending);
 296         vcpu->arch.exception.pending = true;
 297         vcpu->arch.exception.has_error_code = true;
 298         vcpu->arch.exception.nr = nr;
 299         vcpu->arch.exception.error_code = error_code;
 300 }
 301 EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
 302
 303 /*
 304  * Checks if cpl <= required_cpl; if true, return true.  Otherwise queue
 305  * a #GP and return false.
 306  */
 307 bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
 308 {
 309         if (kvm_x86_ops->get_cpl(vcpu) <= required_cpl)
 310                 return true;
 311         kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
 312         return false;
 313 }
 314 EXPORT_SYMBOL_GPL(kvm_require_cpl);
 315
 316 /*
 317  * Load the pae pdptrs.  Return true is they are all valid.
 318  */
 319 int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
 320 {
 321         gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
 322         unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
 323         int i;
 324         int ret;
 325         u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
 326
 327         ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte,
 328                                   offset * sizeof(u64), sizeof(pdpte));
 329         if (ret < 0) {
 330                 ret = 0;
 331                 goto out;
 332         }
 333         for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
 334                 if (is_present_gpte(pdpte[i]) &&
 335                     (pdpte[i] & vcpu->arch.mmu.rsvd_bits_mask[0][2])) {
 336                         ret = 0;
 337                         goto out;
 338                 }
 339         }
 340         ret = 1;
 341
 342         memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs));
 343         __set_bit(VCPU_EXREG_PDPTR,
 344                   (unsigned long *)&vcpu->arch.regs_avail);
 345         __set_bit(VCPU_EXREG_PDPTR,
 346                   (unsigned long *)&vcpu->arch.regs_dirty);
 347 out:
 348
 349         return ret;
 350 }
 351 EXPORT_SYMBOL_GPL(load_pdptrs);
 352
 353 static bool pdptrs_changed(struct kvm_vcpu *vcpu)
 354 {
 355         u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
 356         bool changed = true;
 357         int r;
 358
 359         if (is_long_mode(vcpu) || !is_pae(vcpu))
 360                 return false;
 361
 362         if (!test_bit(VCPU_EXREG_PDPTR,
 363                       (unsigned long *)&vcpu->arch.regs_avail))
 364                 return true;
 365
 366         r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte));
 367         if (r < 0)
 368                 goto out;
 369         changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0;
 370 out:
 371
 372         return changed;
 373 }
 374
 375 void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 376 {
 377         if (cr0 & CR0_RESERVED_BITS) {
 378                 printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
 379                        cr0, vcpu->arch.cr0);
 380                 kvm_inject_gp(vcpu, 0);
 381                 return;
 382         }
 383
 384         if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {
 385                 printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n");
 386                 kvm_inject_gp(vcpu, 0);
 387                 return;
 388         }
 389
 390         if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) {
 391                 printk(KERN_DEBUG "set_cr0: #GP, set PG flag "
 392                        "and a clear PE flag\n");
 393                 kvm_inject_gp(vcpu, 0);
 394                 return;
 395         }
 396
 397         if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
 398 #ifdef CONFIG_X86_64
 399                 if ((vcpu->arch.shadow_efer & EFER_LME)) {
 400                         int cs_db, cs_l;
 401
 402                         if (!is_pae(vcpu)) {
 403                                 printk(KERN_DEBUG "set_cr0: #GP, start paging "
 404                                        "in long mode while PAE is disabled\n");
 405                                 kvm_inject_gp(vcpu, 0);
 406                                 return;
 407                         }
 408                         kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
 409                         if (cs_l) {
 410                                 printk(KERN_DEBUG "set_cr0: #GP, start paging "
 411                                        "in long mode while CS.L == 1\n");
 412                                 kvm_inject_gp(vcpu, 0);
 413                                 return;
 414
 415                         }
 416                 } else
 417 #endif
 418                 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
 419                         printk(KERN_DEBUG "set_cr0: #GP, pdptrs "
 420                                "reserved bits\n");
 421                         kvm_inject_gp(vcpu, 0);
 422                         return;
 423                 }
 424
 425         }
 426
 427         kvm_x86_ops->set_cr0(vcpu, cr0);
 428         vcpu->arch.cr0 = cr0;
 429
 430         kvm_mmu_reset_context(vcpu);
 431         return;
 432 }
 433 EXPORT_SYMBOL_GPL(kvm_set_cr0);
 434
 435 void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
 436 {
 437         kvm_set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f));
 438 }
 439 EXPORT_SYMBOL_GPL(kvm_lmsw);
 440
 441 void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 442 {
 443         unsigned long old_cr4 = vcpu->arch.cr4;
 444         unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE;
 445
 446         if (cr4 & CR4_RESERVED_BITS) {
 447                 printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
 448                 kvm_inject_gp(vcpu, 0);
 449                 return;
 450         }
 451
 452         if (is_long_mode(vcpu)) {
 453                 if (!(cr4 & X86_CR4_PAE)) {
 454                         printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while "
 455                                "in long mode\n");
 456                         kvm_inject_gp(vcpu, 0);
 457                         return;
 458                 }
 459         } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
 460                    && ((cr4 ^ old_cr4) & pdptr_bits)
 461                    && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
 462                 printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
 463                 kvm_inject_gp(vcpu, 0);
 464                 return;
 465         }
 466
 467         if (cr4 & X86_CR4_VMXE) {
 468                 printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
 469                 kvm_inject_gp(vcpu, 0);
 470                 return;
 471         }
 472         kvm_x86_ops->set_cr4(vcpu, cr4);
 473         vcpu->arch.cr4 = cr4;
 474         vcpu->arch.mmu.base_role.cr4_pge = (cr4 & X86_CR4_PGE) && !tdp_enabled;
 475         kvm_mmu_reset_context(vcpu);
 476 }
 477 EXPORT_SYMBOL_GPL(kvm_set_cr4);
 478
 479 void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 480 {
 481         if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
 482                 kvm_mmu_sync_roots(vcpu);
 483                 kvm_mmu_flush_tlb(vcpu);
 484                 return;
 485         }
 486
 487         if (is_long_mode(vcpu)) {
 488                 if (cr3 & CR3_L_MODE_RESERVED_BITS) {
 489                         printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
 490                         kvm_inject_gp(vcpu, 0);
 491                         return;
 492                 }
 493         } else {
 494                 if (is_pae(vcpu)) {
 495                         if (cr3 & CR3_PAE_RESERVED_BITS) {
 496                                 printk(KERN_DEBUG
 497                                        "set_cr3: #GP, reserved bits\n");
 498                                 kvm_inject_gp(vcpu, 0);
 499                                 return;
 500                         }
 501                         if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) {
 502                                 printk(KERN_DEBUG "set_cr3: #GP, pdptrs "
 503                                        "reserved bits\n");
 504                                 kvm_inject_gp(vcpu, 0);
 505                                 return;
 506                         }
 507                 }
 508                 /*
 509                  * We don't check reserved bits in nonpae mode, because
 510                  * this isn't enforced, and VMware depends on this.
 511                  */
 512         }
 513
 514         /*
 515          * Does the new cr3 value map to physical memory? (Note, we
 516          * catch an invalid cr3 even in real-mode, because it would
 517          * cause trouble later on when we turn on paging anyway.)
 518          *
 519          * A real CPU would silently accept an invalid cr3 and would
 520          * attempt to use it - with largely undefined (and often hard
 521          * to debug) behavior on the guest side.
 522          */
 523         if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
 524                 kvm_inject_gp(vcpu, 0);
 525         else {
 526                 vcpu->arch.cr3 = cr3;
 527                 vcpu->arch.mmu.new_cr3(vcpu);
 528         }
 529 }
 530 EXPORT_SYMBOL_GPL(kvm_set_cr3);
 531
 532 void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
 533 {
 534         if (cr8 & CR8_RESERVED_BITS) {
 535                 printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8);
 536                 kvm_inject_gp(vcpu, 0);
 537                 return;
 538         }
 539         if (irqchip_in_kernel(vcpu->kvm))
 540                 kvm_lapic_set_tpr(vcpu, cr8);
 541         else
 542                 vcpu->arch.cr8 = cr8;
 543 }
 544 EXPORT_SYMBOL_GPL(kvm_set_cr8);
 545
 546 unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
 547 {
 548         if (irqchip_in_kernel(vcpu->kvm))
 549                 return kvm_lapic_get_cr8(vcpu);
 550         else
 551                 return vcpu->arch.cr8;
 552 }
 553 EXPORT_SYMBOL_GPL(kvm_get_cr8);
 554
 555 static inline u32 bit(int bitno)
 556 {
 557         return 1 << (bitno & 31);
 558 }
 559
/*
 * List of msr numbers which we expose to userspace through KVM_GET_MSRS
 * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
 *
 * This list is modified at module load time to reflect the
 * capabilities of the host cpu. The capabilities test skips MSRs that are
 * kvm-specific. Those are put at the beginning of the list.
 */

/* Number of leading kvm-specific entries in msrs_to_save[]. */
#define KVM_SAVE_MSRS_BEGIN     2
static u32 msrs_to_save[] = {
        /* kvm-specific MSRs (the first KVM_SAVE_MSRS_BEGIN entries) */
        MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
        MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
        MSR_K6_STAR,
#ifdef CONFIG_X86_64
        MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
#endif
        MSR_IA32_TSC, MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
};

/* NOTE(review): presumably the count of usable msrs_to_save[] entries; it is set outside this excerpt. */
static unsigned num_msrs_to_save;

/* NOTE(review): the name suggests MSRs emulated by KVM itself; confirm at the use sites. */
static u32 emulated_msrs[] = {
        MSR_IA32_MISC_ENABLE,
};
 585
 586 static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
 587 {
 588         if (efer & efer_reserved_bits) {
 589                 printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
 590                        efer);
 591                 kvm_inject_gp(vcpu, 0);
 592                 return;
 593         }
 594
 595         if (is_paging(vcpu)
 596             && (vcpu->arch.shadow_efer & EFER_LME) != (efer & EFER_LME)) {
 597                 printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
 598                 kvm_inject_gp(vcpu, 0);
 599                 return;
 600         }
 601
 602         if (efer & EFER_FFXSR) {
 603                 struct kvm_cpuid_entry2 *feat;
 604
 605                 feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
 606                 if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) {
 607                         printk(KERN_DEBUG "set_efer: #GP, enable FFXSR w/o CPUID capability\n");
 608                         kvm_inject_gp(vcpu, 0);
 609                         return;
 610                 }
 611         }
 612
 613         if (efer & EFER_SVME) {
 614                 struct kvm_cpuid_entry2 *feat;
 615
 616                 feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
 617                 if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) {
 618                         printk(KERN_DEBUG "set_efer: #GP, enable SVM w/o SVM\n");
 619                         kvm_inject_gp(vcpu, 0);
 620                         return;
 621                 }
 622         }
 623
 624         kvm_x86_ops->set_efer(vcpu, efer);
 625
 626         efer &= ~EFER_LMA;
 627         efer |= vcpu->arch.shadow_efer & EFER_LMA;
 628
 629         vcpu->arch.shadow_efer = efer;
 630
 631         vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
 632         kvm_mmu_reset_context(vcpu);
 633 }
 634
/*
 * Remove the bits in @mask from efer_reserved_bits, so that set_efer() no
 * longer rejects guest writes that set them.
 */
void kvm_enable_efer_bits(u64 mask)
{
       efer_reserved_bits &= ~mask;
}
EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
 640
 641
/*
 * Writes msr value into the appropriate "register".
 * Returns 0 on success, non-0 otherwise.
 * Assumes vcpu_load() was already called.
 *
 * The write is delegated to the vendor backend's set_msr() hook.
 */
int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
{
        return kvm_x86_ops->set_msr(vcpu, msr_index, data);
}
 651
/*
 * Adapt set_msr() to msr_io()'s calling convention: the value is passed by
 * pointer and dereferenced here before calling kvm_set_msr().
 */
static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
{
        return kvm_set_msr(vcpu, index, *data);
}
 659
 660 static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
 661 {
 662         static int version;
 663         struct pvclock_wall_clock wc;
 664         struct timespec now, sys, boot;
 665
 666         if (!wall_clock)
 667                 return;
 668
 669         version++;
 670
 671         kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
 672
 673         /*
 674          * The guest calculates current wall clock time by adding
 675          * system time (updated by kvm_write_guest_time below) to the
 676          * wall clock specified here.  guest system time equals host
 677          * system time for us, thus we must fill in host boot time here.
 678          */
 679         now = current_kernel_time();
 680         ktime_get_ts(&sys);
 681         boot = ns_to_timespec(timespec_to_ns(&now) - timespec_to_ns(&sys));
 682
 683         wc.sec = boot.tv_sec;
 684         wc.nsec = boot.tv_nsec;
 685         wc.version = version;
 686
 687         kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
 688
 689         version++;
 690         kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
 691 }
 692
/*
 * Compute the 32-bit quotient of (dividend << 32) / divisor.
 *
 * The 64-bit numerator is formed by placing @dividend in the "d" register
 * and 0 in the "a" register before a single divl; the remainder is
 * discarded.
 * NOTE(review): divl faults if the quotient does not fit in 32 bits, so
 * callers presumably guarantee dividend < divisor - confirm.
 */
static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
{
        uint32_t quotient, remainder;

        /* Don't try to replace with do_div(), this one calculates
         * "(dividend << 32) / divisor" */
        __asm__ ( "divl %4"
                  : "=a" (quotient), "=d" (remainder)
                  : "0" (0), "1" (dividend), "r" (divisor) );
        return quotient;
}
 704
 705 static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *hv_clock)
 706 {
 707         uint64_t nsecs = 1000000000LL;
 708         int32_t  shift = 0;
 709         uint64_t tps64;
 710         uint32_t tps32;
 711
 712         tps64 = tsc_khz * 1000LL;
 713         while (tps64 > nsecs*2) {
 714                 tps64 >>= 1;
 715                 shift--;
 716         }
 717
 718         tps32 = (uint32_t)tps64;
 719         while (tps32 <= (uint32_t)nsecs) {
 720                 tps32 <<= 1;
 721                 shift++;
 722         }
 723
 724         hv_clock->tsc_shift = shift;
 725         hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32);
 726
 727         pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n",
 728                  __func__, tsc_khz, hv_clock->tsc_shift,
 729                  hv_clock->tsc_to_system_mul);
 730 }
 731
 732 static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
 733
 734 static void kvm_write_guest_time(struct kvm_vcpu *v)
 735 {
 736         struct timespec ts;
 737         unsigned long flags;
 738         struct kvm_vcpu_arch *vcpu = &v->arch;
 739         void *shared_kaddr;
 740         unsigned long this_tsc_khz;
 741
 742         if ((!vcpu->time_page))
 743                 return;
 744
 745         this_tsc_khz = get_cpu_var(cpu_tsc_khz);
 746         if (unlikely(vcpu->hv_clock_tsc_khz != this_tsc_khz)) {
 747                 kvm_set_time_scale(this_tsc_khz, &vcpu->hv_clock);
 748                 vcpu->hv_clock_tsc_khz = this_tsc_khz;
 749         }
 750         put_cpu_var(cpu_tsc_khz);
 751
 752         /* Keep irq disabled to prevent changes to the clock */
 753         local_irq_save(flags);
 754         kvm_get_msr(v, MSR_IA32_TSC, &vcpu->hv_clock.tsc_timestamp);
 755         ktime_get_ts(&ts);
 756         local_irq_restore(flags);
 757
 758         /* With all the info we got, fill in the values */
 759
 760         vcpu->hv_clock.system_time = ts.tv_nsec +
 761                                      (NSEC_PER_SEC * (u64)ts.tv_sec) + v->kvm->arch.kvmclock_offset;
 762
 763         /*
 764          * The interface expects us to write an even number signaling that the
 765          * update is finished. Since the guest won't see the intermediate
 766          * state, we just increase by 2 at the end.
 767          */
 768         vcpu->hv_clock.version += 2;
 769
 770         shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0);
 771
 772         memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock,
 773                sizeof(vcpu->hv_clock));
 774
 775         kunmap_atomic(shared_kaddr, KM_USER0);
 776
 777         mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT);
 778 }
 779
 780 static int kvm_request_guest_time_update(struct kvm_vcpu *v)
 781 {
 782         struct kvm_vcpu_arch *vcpu = &v->arch;
 783
 784         if (!vcpu->time_page)
 785                 return 0;
 786         set_bit(KVM_REQ_KVMCLOCK_UPDATE, &v->requests);
 787         return 1;
 788 }
 789
 790 static bool msr_mtrr_valid(unsigned msr)
 791 {
 792         switch (msr) {
 793         case 0x200 ... 0x200 + 2 * KVM_NR_VAR_MTRR - 1:
 794         case MSR_MTRRfix64K_00000:
 795         case MSR_MTRRfix16K_80000:
 796         case MSR_MTRRfix16K_A0000:
 797         case MSR_MTRRfix4K_C0000:
 798         case MSR_MTRRfix4K_C8000:
 799         case MSR_MTRRfix4K_D0000:
 800         case MSR_MTRRfix4K_D8000:
 801         case MSR_MTRRfix4K_E0000:
 802         case MSR_MTRRfix4K_E8000:
 803         case MSR_MTRRfix4K_F0000:
 804         case MSR_MTRRfix4K_F8000:
 805         case MSR_MTRRdefType:
 806         case MSR_IA32_CR_PAT:
 807                 return true;
 808         case 0x2f8:
 809                 return true;
 810         }
 811         return false;
 812 }
 813
 814 static bool valid_pat_type(unsigned t)
 815 {
 816         return t < 8 && (1 << t) & 0xf3; /* 0, 1, 4, 5, 6, 7 */
 817 }
 818
 819 static bool valid_mtrr_type(unsigned t)
 820 {
 821         return t < 8 && (1 << t) & 0x73; /* 0, 1, 4, 5, 6 */
 822 }
 823
 824 static bool mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 825 {
 826         int i;
 827
 828         if (!msr_mtrr_valid(msr))
 829                 return false;
 830
 831         if (msr == MSR_IA32_CR_PAT) {
 832                 for (i = 0; i < 8; i++)
 833                         if (!valid_pat_type((data >> (i * 8)) & 0xff))
 834                                 return false;
 835                 return true;
 836         } else if (msr == MSR_MTRRdefType) {
 837                 if (data & ~0xcff)
 838                         return false;
 839                 return valid_mtrr_type(data & 0xff);
 840         } else if (msr >= MSR_MTRRfix64K_00000 && msr <= MSR_MTRRfix4K_F8000) {
 841                 for (i = 0; i < 8 ; i++)
 842                         if (!valid_mtrr_type((data >> (i * 8)) & 0xff))
 843                                 return false;
 844                 return true;
 845         }
 846
 847         /* variable MTRRs */
 848         return valid_mtrr_type(data & 0xff);
 849 }
 850
 851 static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 852 {
 853         u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
 854
 855         if (!mtrr_valid(vcpu, msr, data))
 856                 return 1;
 857
 858         if (msr == MSR_MTRRdefType) {
 859                 vcpu->arch.mtrr_state.def_type = data;
 860                 vcpu->arch.mtrr_state.enabled = (data & 0xc00) >> 10;
 861         } else if (msr == MSR_MTRRfix64K_00000)
 862                 p[0] = data;
 863         else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000)
 864                 p[1 + msr - MSR_MTRRfix16K_80000] = data;
 865         else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000)
 866                 p[3 + msr - MSR_MTRRfix4K_C0000] = data;
 867         else if (msr == MSR_IA32_CR_PAT)
 868                 vcpu->arch.pat = data;
 869         else {  /* Variable MTRRs */
 870                 int idx, is_mtrr_mask;
 871                 u64 *pt;
 872
 873                 idx = (msr - 0x200) / 2;
 874                 is_mtrr_mask = msr - 0x200 - 2 * idx;
 875                 if (!is_mtrr_mask)
 876                         pt =
 877                           (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo;
 878                 else
 879                         pt =
 880                           (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo;
 881                 *pt = data;
 882         }
 883
 884         kvm_mmu_reset_context(vcpu);
 885         return 0;
 886 }
 887
 888 static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 889 {
 890         u64 mcg_cap = vcpu->arch.mcg_cap;
 891         unsigned bank_num = mcg_cap & 0xff;
 892
 893         switch (msr) {
 894         case MSR_IA32_MCG_STATUS:
 895                 vcpu->arch.mcg_status = data;
 896                 break;
 897         case MSR_IA32_MCG_CTL:
 898                 if (!(mcg_cap & MCG_CTL_P))
 899                         return 1;
 900                 if (data != 0 && data != ~(u64)0)
 901                         return -1;
 902                 vcpu->arch.mcg_ctl = data;
 903                 break;
 904         default:
 905                 if (msr >= MSR_IA32_MC0_CTL &&
 906                     msr < MSR_IA32_MC0_CTL + 4 * bank_num) {
 907                         u32 offset = msr - MSR_IA32_MC0_CTL;
 908                         /* only 0 or all 1s can be written to IA32_MCi_CTL */
 909                         if ((offset & 0x3) == 0 &&
 910                             data != 0 && data != ~(u64)0)
 911                                 return -1;
 912                         vcpu->arch.mce_banks[offset] = data;
 913                         break;
 914                 }
 915                 return 1;
 916         }
 917         return 0;
 918 }
 919
 920 static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data)
 921 {
 922         struct kvm *kvm = vcpu->kvm;
 923         int lm = is_long_mode(vcpu);
 924         u8 *blob_addr = lm ? (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_64
 925                 : (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_32;
 926         u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64
 927                 : kvm->arch.xen_hvm_config.blob_size_32;
 928         u32 page_num = data & ~PAGE_MASK;
 929         u64 page_addr = data & PAGE_MASK;
 930         u8 *page;
 931         int r;
 932
 933         r = -E2BIG;
 934         if (page_num >= blob_size)
 935                 goto out;
 936         r = -ENOMEM;
 937         page = kzalloc(PAGE_SIZE, GFP_KERNEL);
 938         if (!page)
 939                 goto out;
 940         r = -EFAULT;
 941         if (copy_from_user(page, blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE))
 942                 goto out_free;
 943         if (kvm_write_guest(kvm, page_addr, page, PAGE_SIZE))
 944                 goto out_free;
 945         r = 0;
 946 out_free:
 947         kfree(page);
 948 out:
 949         return r;
 950 }
 951
 952 int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 953 {
 954         switch (msr) {
 955         case MSR_EFER:
 956                 set_efer(vcpu, data);
 957                 break;
 958         case MSR_K7_HWCR:
 959                 data &= ~(u64)0x40;     /* ignore flush filter disable */
 960                 if (data != 0) {
 961                         pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
 962                                 data);
 963                         return 1;
 964                 }
 965                 break;
 966         case MSR_FAM10H_MMIO_CONF_BASE:
 967                 if (data != 0) {
 968                         pr_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: "
 969                                 "0x%llx\n", data);
 970                         return 1;
 971                 }
 972                 break;
 973         case MSR_AMD64_NB_CFG:
 974                 break;
 975         case MSR_IA32_DEBUGCTLMSR:
 976                 if (!data) {
 977                         /* We support the non-activated case already */
 978                         break;
 979                 } else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) {
 980                         /* Values other than LBR and BTF are vendor-specific,
 981                            thus reserved and should throw a #GP */
 982                         return 1;
 983                 }
 984                 pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
 985                         __func__, data);
 986                 break;
 987         case MSR_IA32_UCODE_REV:
 988         case MSR_IA32_UCODE_WRITE:
 989         case MSR_VM_HSAVE_PA:
 990         case MSR_AMD64_PATCH_LOADER:
 991                 break;
 992         case 0x200 ... 0x2ff:
 993                 return set_msr_mtrr(vcpu, msr, data);
 994         case MSR_IA32_APICBASE:
 995                 kvm_set_apic_base(vcpu, data);
 996                 break;
 997         case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
 998                 return kvm_x2apic_msr_write(vcpu, msr, data);
 999         case MSR_IA32_MISC_ENABLE:
1000                 vcpu->arch.ia32_misc_enable_msr = data;
1001                 break;
1002         case MSR_KVM_WALL_CLOCK:
1003                 vcpu->kvm->arch.wall_clock = data;
1004                 kvm_write_wall_clock(vcpu->kvm, data);
1005                 break;
1006         case MSR_KVM_SYSTEM_TIME: {
1007                 if (vcpu->arch.time_page) {
1008                         kvm_release_page_dirty(vcpu->arch.time_page);
1009                         vcpu->arch.time_page = NULL;
1010                 }
1011
1012                 vcpu->arch.time = data;
1013
1014                 /* we verify if the enable bit is set... */
1015                 if (!(data & 1))
1016                         break;
1017
1018                 /* ...but clean it before doing the actual write */
1019                 vcpu->arch.time_offset = data & ~(PAGE_MASK | 1);
1020
1021                 vcpu->arch.time_page =
1022                                 gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT);
1023
1024                 if (is_error_page(vcpu->arch.time_page)) {
1025                         kvm_release_page_clean(vcpu->arch.time_page);
1026                         vcpu->arch.time_page = NULL;
1027                 }
1028
1029                 kvm_request_guest_time_update(vcpu);
1030                 break;
1031         }
1032         case MSR_IA32_MCG_CTL:
1033         case MSR_IA32_MCG_STATUS:
1034         case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
1035                 return set_msr_mce(vcpu, msr, data);
1036
1037         /* Performance counters are not protected by a CPUID bit,
1038          * so we should check all of them in the generic path for the sake of
1039          * cross vendor migration.
1040          * Writing a zero into the event select MSRs disables them,
1041          * which we perfectly emulate ;-). Any other value should be at least
1042          * reported, some guests depend on them.
1043          */
1044         case MSR_P6_EVNTSEL0:
1045         case MSR_P6_EVNTSEL1:
1046         case MSR_K7_EVNTSEL0:
1047         case MSR_K7_EVNTSEL1:
1048         case MSR_K7_EVNTSEL2:
1049         case MSR_K7_EVNTSEL3:
1050                 if (data != 0)
1051                         pr_unimpl(vcpu, "unimplemented perfctr wrmsr: "
1052                                 "0x%x data 0x%llx\n", msr, data);
1053                 break;
1054         /* at least RHEL 4 unconditionally writes to the perfctr registers,
1055          * so we ignore writes to make it happy.
1056          */
1057         case MSR_P6_PERFCTR0:
1058         case MSR_P6_PERFCTR1:
1059         case MSR_K7_PERFCTR0:
1060         case MSR_K7_PERFCTR1:
1061         case MSR_K7_PERFCTR2:
1062         case MSR_K7_PERFCTR3:
1063                 pr_unimpl(vcpu, "unimplemented perfctr wrmsr: "
1064                         "0x%x data 0x%llx\n", msr, data);
1065                 break;
1066         default:
1067                 if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
1068                         return xen_hvm_config(vcpu, data);
1069                 if (!ignore_msrs) {
1070                         pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n",
1071                                 msr, data);
1072                         return 1;
1073                 } else {
1074                         pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n",
1075                                 msr, data);
1076                         break;
1077                 }
1078         }
1079         return 0;
1080 }
1081 EXPORT_SYMBOL_GPL(kvm_set_msr_common);
1082
1083
1084 /*
1085  * Reads an msr value (of 'msr_index') into 'pdata'.
1086  * Returns 0 on success, non-0 otherwise.
1087  * Assumes vcpu_load() was already called.
1088  */
1089 int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
1090 {
1091         return kvm_x86_ops->get_msr(vcpu, msr_index, pdata);
1092 }
1093
1094 static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1095 {
1096         u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
1097
1098         if (!msr_mtrr_valid(msr))
1099                 return 1;
1100
1101         if (msr == MSR_MTRRdefType)
1102                 *pdata = vcpu->arch.mtrr_state.def_type +
1103                          (vcpu->arch.mtrr_state.enabled << 10);
1104         else if (msr == MSR_MTRRfix64K_00000)
1105                 *pdata = p[0];
1106         else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000)
1107                 *pdata = p[1 + msr - MSR_MTRRfix16K_80000];
1108         else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000)
1109                 *pdata = p[3 + msr - MSR_MTRRfix4K_C0000];
1110         else if (msr == MSR_IA32_CR_PAT)
1111                 *pdata = vcpu->arch.pat;
1112         else {  /* Variable MTRRs */
1113                 int idx, is_mtrr_mask;
1114                 u64 *pt;
1115
1116                 idx = (msr - 0x200) / 2;
1117                 is_mtrr_mask = msr - 0x200 - 2 * idx;
1118                 if (!is_mtrr_mask)
1119                         pt =
1120                           (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo;
1121                 else
1122                         pt =
1123                           (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo;
1124                 *pdata = *pt;
1125         }
1126
1127         return 0;
1128 }
1129
1130 static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1131 {
1132         u64 data;
1133         u64 mcg_cap = vcpu->arch.mcg_cap;
1134         unsigned bank_num = mcg_cap & 0xff;
1135
1136         switch (msr) {
1137         case MSR_IA32_P5_MC_ADDR:
1138         case MSR_IA32_P5_MC_TYPE:
1139                 data = 0;
1140                 break;
1141         case MSR_IA32_MCG_CAP:
1142                 data = vcpu->arch.mcg_cap;
1143                 break;
1144         case MSR_IA32_MCG_CTL:
1145                 if (!(mcg_cap & MCG_CTL_P))
1146                         return 1;
1147                 data = vcpu->arch.mcg_ctl;
1148                 break;
1149         case MSR_IA32_MCG_STATUS:
1150                 data = vcpu->arch.mcg_status;
1151                 break;
1152         default:
1153                 if (msr >= MSR_IA32_MC0_CTL &&
1154                     msr < MSR_IA32_MC0_CTL + 4 * bank_num) {
1155                         u32 offset = msr - MSR_IA32_MC0_CTL;
1156                         data = vcpu->arch.mce_banks[offset];
1157                         break;
1158                 }
1159                 return 1;
1160         }
1161         *pdata = data;
1162         return 0;
1163 }
1164
1165 int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1166 {
1167         u64 data;
1168
1169         switch (msr) {
1170         case MSR_IA32_PLATFORM_ID:
1171         case MSR_IA32_UCODE_REV:
1172         case MSR_IA32_EBL_CR_POWERON:
1173         case MSR_IA32_DEBUGCTLMSR:
1174         case MSR_IA32_LASTBRANCHFROMIP:
1175         case MSR_IA32_LASTBRANCHTOIP:
1176         case MSR_IA32_LASTINTFROMIP:
1177         case MSR_IA32_LASTINTTOIP:
1178         case MSR_K8_SYSCFG:
1179         case MSR_K7_HWCR:
1180         case MSR_VM_HSAVE_PA:
1181         case MSR_P6_PERFCTR0:
1182         case MSR_P6_PERFCTR1:
1183         case MSR_P6_EVNTSEL0:
1184         case MSR_P6_EVNTSEL1:
1185         case MSR_K7_EVNTSEL0:
1186         case MSR_K7_PERFCTR0:
1187         case MSR_K8_INT_PENDING_MSG:
1188         case MSR_AMD64_NB_CFG:
1189         case MSR_FAM10H_MMIO_CONF_BASE:
1190                 data = 0;
1191                 break;
1192         case MSR_MTRRcap:
1193                 data = 0x500 | KVM_NR_VAR_MTRR;
1194                 break;
1195         case 0x200 ... 0x2ff:
1196                 return get_msr_mtrr(vcpu, msr, pdata);
1197         case 0xcd: /* fsb frequency */
1198                 data = 3;
1199                 break;
1200         case MSR_IA32_APICBASE:
1201                 data = kvm_get_apic_base(vcpu);
1202                 break;
1203         case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
1204                 return kvm_x2apic_msr_read(vcpu, msr, pdata);
1205                 break;
1206         case MSR_IA32_MISC_ENABLE:
1207                 data = vcpu->arch.ia32_misc_enable_msr;
1208                 break;
1209         case MSR_IA32_PERF_STATUS:
1210                 /* TSC increment by tick */
1211                 data = 1000ULL;
1212                 /* CPU multiplier */
1213                 data |= (((uint64_t)4ULL) << 40);
1214                 break;
1215         case MSR_EFER:
1216                 data = vcpu->arch.shadow_efer;
1217                 break;
1218         case MSR_KVM_WALL_CLOCK:
1219                 data = vcpu->kvm->arch.wall_clock;
1220                 break;
1221         case MSR_KVM_SYSTEM_TIME:
1222                 data = vcpu->arch.time;
1223                 break;
1224         case MSR_IA32_P5_MC_ADDR:
1225         case MSR_IA32_P5_MC_TYPE:
1226         case MSR_IA32_MCG_CAP:
1227         case MSR_IA32_MCG_CTL:
1228         case MSR_IA32_MCG_STATUS:
1229         case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
1230                 return get_msr_mce(vcpu, msr, pdata);
1231         default:
1232                 if (!ignore_msrs) {
1233                         pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
1234                         return 1;
1235                 } else {
1236                         pr_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr);
1237                         data = 0;
1238                 }
1239                 break;
1240         }
1241         *pdata = data;
1242         return 0;
1243 }
1244 EXPORT_SYMBOL_GPL(kvm_get_msr_common);
1245
1246 /*
1247  * Read or write a bunch of msrs. All parameters are kernel addresses.
1248  *
1249  * @return number of msrs set successfully.
1250  */
1251 static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
1252                     struct kvm_msr_entry *entries,
1253                     int (*do_msr)(struct kvm_vcpu *vcpu,
1254                                   unsigned index, u64 *data))
1255 {
1256         int i;
1257
1258         vcpu_load(vcpu);
1259
1260         down_read(&vcpu->kvm->slots_lock);
1261         for (i = 0; i < msrs->nmsrs; ++i)
1262                 if (do_msr(vcpu, entries[i].index, &entries[i].data))
1263                         break;
1264         up_read(&vcpu->kvm->slots_lock);
1265
1266         vcpu_put(vcpu);
1267
1268         return i;
1269 }
1270
1271 /*
1272  * Read or write a bunch of msrs. Parameters are user addresses.
1273  *
1274  * @return number of msrs set successfully.
1275  */
1276 static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
1277                   int (*do_msr)(struct kvm_vcpu *vcpu,
1278                                 unsigned index, u64 *data),
1279                   int writeback)
1280 {
1281         struct kvm_msrs msrs;
1282         struct kvm_msr_entry *entries;
1283         int r, n;
1284         unsigned size;
1285
1286         r = -EFAULT;
1287         if (copy_from_user(&msrs, user_msrs, sizeof msrs))
1288                 goto out;
1289
1290         r = -E2BIG;
1291         if (msrs.nmsrs >= MAX_IO_MSRS)
1292                 goto out;
1293
1294         r = -ENOMEM;
1295         size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
1296         entries = vmalloc(size);
1297         if (!entries)
1298                 goto out;
1299
1300         r = -EFAULT;
1301         if (copy_from_user(entries, user_msrs->entries, size))
1302                 goto out_free;
1303
1304         r = n = __msr_io(vcpu, &msrs, entries, do_msr);
1305         if (r < 0)
1306                 goto out_free;
1307
1308         r = -EFAULT;
1309         if (writeback && copy_to_user(user_msrs->entries, entries, size))
1310                 goto out_free;
1311
1312         r = n;
1313
1314 out_free:
1315         vfree(entries);
1316 out:
1317         return r;
1318 }
1319
1320 int kvm_dev_ioctl_check_extension(long ext)
1321 {
1322         int r;
1323
1324         switch (ext) {
1325         case KVM_CAP_IRQCHIP:
1326         case KVM_CAP_HLT:
1327         case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
1328         case KVM_CAP_SET_TSS_ADDR:
1329         case KVM_CAP_EXT_CPUID:
1330         case KVM_CAP_CLOCKSOURCE:
1331         case KVM_CAP_PIT:
1332         case KVM_CAP_NOP_IO_DELAY:
1333         case KVM_CAP_MP_STATE:
1334         case KVM_CAP_SYNC_MMU:
1335         case KVM_CAP_REINJECT_CONTROL:
1336         case KVM_CAP_IRQ_INJECT_STATUS:
1337         case KVM_CAP_ASSIGN_DEV_IRQ:
1338         case KVM_CAP_IRQFD:
1339         case KVM_CAP_IOEVENTFD:
1340         case KVM_CAP_PIT2:
1341         case KVM_CAP_PIT_STATE2:
1342         case KVM_CAP_SET_IDENTITY_MAP_ADDR:
1343         case KVM_CAP_XEN_HVM:
1344         case KVM_CAP_ADJUST_CLOCK:
1345                 r = 1;
1346                 break;
1347         case KVM_CAP_COALESCED_MMIO:
1348                 r = KVM_COALESCED_MMIO_PAGE_OFFSET;
1349                 break;
1350         case KVM_CAP_VAPIC:
1351                 r = !kvm_x86_ops->cpu_has_accelerated_tpr();
1352                 break;
1353         case KVM_CAP_NR_VCPUS:
1354                 r = KVM_MAX_VCPUS;
1355                 break;
1356         case KVM_CAP_NR_MEMSLOTS:
1357                 r = KVM_MEMORY_SLOTS;
1358                 break;
1359         case KVM_CAP_PV_MMU:    /* obsolete */
1360                 r = 0;
1361                 break;
1362         case KVM_CAP_IOMMU:
1363                 r = iommu_found();
1364                 break;
1365         case KVM_CAP_MCE:
1366                 r = KVM_MAX_MCE_BANKS;
1367                 break;
1368         default:
1369                 r = 0;
1370                 break;
1371         }
1372         return r;
1373
1374 }
1375
1376 long kvm_arch_dev_ioctl(struct file *filp,
1377                         unsigned int ioctl, unsigned long arg)
1378 {
1379         void __user *argp = (void __user *)arg;
1380         long r;
1381
1382         switch (ioctl) {
1383         case KVM_GET_MSR_INDEX_LIST: {
1384                 struct kvm_msr_list __user *user_msr_list = argp;
1385                 struct kvm_msr_list msr_list;
1386                 unsigned n;
1387
1388                 r = -EFAULT;
1389                 if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
1390                         goto out;
1391                 n = msr_list.nmsrs;
1392                 msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs);
1393                 if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
1394                         goto out;
1395                 r = -E2BIG;
1396                 if (n < msr_list.nmsrs)
1397                         goto out;
1398                 r = -EFAULT;
1399                 if (copy_to_user(user_msr_list->indices, &msrs_to_save,
1400                                  num_msrs_to_save * sizeof(u32)))
1401                         goto out;
1402                 if (copy_to_user(user_msr_list->indices + num_msrs_to_save,
1403                                  &emulated_msrs,
1404                                  ARRAY_SIZE(emulated_msrs) * sizeof(u32)))
1405                         goto out;
1406                 r = 0;
1407                 break;
1408         }
1409         case KVM_GET_SUPPORTED_CPUID: {
1410                 struct kvm_cpuid2 __user *cpuid_arg = argp;
1411                 struct kvm_cpuid2 cpuid;
1412
1413                 r = -EFAULT;
1414                 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1415                         goto out;
1416                 r = kvm_dev_ioctl_get_supported_cpuid(&cpuid,
1417                                                       cpuid_arg->entries);
1418                 if (r)
1419                         goto out;
1420
1421                 r = -EFAULT;
1422                 if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
1423                         goto out;
1424                 r = 0;
1425                 break;
1426         }
1427         case KVM_X86_GET_MCE_CAP_SUPPORTED: {
1428                 u64 mce_cap;
1429
1430                 mce_cap = KVM_MCE_CAP_SUPPORTED;
1431                 r = -EFAULT;
1432                 if (copy_to_user(argp, &mce_cap, sizeof mce_cap))
1433                         goto out;
1434                 r = 0;
1435                 break;
1436         }
1437         default:
1438                 r = -EINVAL;
1439         }
1440 out:
1441         return r;
1442 }
1443
1444 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1445 {
1446         kvm_x86_ops->vcpu_load(vcpu, cpu);
1447         if (unlikely(per_cpu(cpu_tsc_khz, cpu) == 0)) {
1448                 unsigned long khz = cpufreq_quick_get(cpu);
1449                 if (!khz)
1450                         khz = tsc_khz;
1451                 per_cpu(cpu_tsc_khz, cpu) = khz;
1452         }
1453         kvm_request_guest_time_update(vcpu);
1454 }
1455
1456 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
1457 {
1458         kvm_x86_ops->vcpu_put(vcpu);
1459         kvm_put_guest_fpu(vcpu);
1460 }
1461
1462 static int is_efer_nx(void)
1463 {
1464         unsigned long long efer = 0;
1465
1466         rdmsrl_safe(MSR_EFER, &efer);
1467         return efer & EFER_NX;
1468 }
1469
1470 static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
1471 {
1472         int i;
1473         struct kvm_cpuid_entry2 *e, *entry;
1474
1475         entry = NULL;
1476         for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
1477                 e = &vcpu->arch.cpuid_entries[i];
1478                 if (e->function == 0x80000001) {
1479                         entry = e;
1480                         break;
1481                 }
1482         }
1483         if (entry && (entry->edx & (1 << 20)) && !is_efer_nx()) {
1484                 entry->edx &= ~(1 << 20);
1485                 printk(KERN_INFO "kvm: guest NX capability removed\n");
1486         }
1487 }
1488
1489 /* when an old userspace process fills a new kernel module */
1490 static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
1491                                     struct kvm_cpuid *cpuid,
1492                                     struct kvm_cpuid_entry __user *entries)
1493 {
1494         int r, i;
1495         struct kvm_cpuid_entry *cpuid_entries;
1496
1497         r = -E2BIG;
1498         if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
1499                 goto out;
1500         r = -ENOMEM;
1501         cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) * cpuid->nent);
1502         if (!cpuid_entries)
1503                 goto out;
1504         r = -EFAULT;
1505         if (copy_from_user(cpuid_entries, entries,
1506                            cpuid->nent * sizeof(struct kvm_cpuid_entry)))
1507                 goto out_free;
1508         for (i = 0; i < cpuid->nent; i++) {
1509                 vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function;
1510                 vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax;
1511                 vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx;
1512                 vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx;
1513                 vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx;
1514                 vcpu->arch.cpuid_entries[i].index = 0;
1515                 vcpu->arch.cpuid_entries[i].flags = 0;
1516                 vcpu->arch.cpuid_entries[i].padding[0] = 0;
1517                 vcpu->arch.cpuid_entries[i].padding[1] = 0;
1518                 vcpu->arch.cpuid_entries[i].padding[2] = 0;
1519         }
1520         vcpu->arch.cpuid_nent = cpuid->nent;
1521         cpuid_fix_nx_cap(vcpu);
1522         r = 0;
1523         kvm_apic_set_version(vcpu);
1524
1525 out_free:
1526         vfree(cpuid_entries);
1527 out:
1528         return r;
1529 }
1530
1531 static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
1532                                      struct kvm_cpuid2 *cpuid,
1533                                      struct kvm_cpuid_entry2 __user *entries)
1534 {
1535         int r;
1536
1537         r = -E2BIG;
1538         if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
1539                 goto out;
1540         r = -EFAULT;
1541         if (copy_from_user(&vcpu->arch.cpuid_entries, entries,
1542                            cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
1543                 goto out;
1544         vcpu->arch.cpuid_nent = cpuid->nent;
1545         kvm_apic_set_version(vcpu);
1546         return 0;
1547
1548 out:
1549         return r;
1550 }
1551
1552 static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
1553                                      struct kvm_cpuid2 *cpuid,
1554                                      struct kvm_cpuid_entry2 __user *entries)
1555 {
1556         int r;
1557
1558         r = -E2BIG;
1559         if (cpuid->nent < vcpu->arch.cpuid_nent)
1560                 goto out;
1561         r = -EFAULT;
1562         if (copy_to_user(entries, &vcpu->arch.cpuid_entries,
1563                          vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2)))
1564                 goto out;
1565         return 0;
1566
1567 out:
1568         cpuid->nent = vcpu->arch.cpuid_nent;
1569         return r;
1570 }
1571
1572 static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1573                            u32 index)
1574 {
1575         entry->function = function;
1576         entry->index = index;
1577         cpuid_count(entry->function, entry->index,
1578                     &entry->eax, &entry->ebx, &entry->ecx, &entry->edx);
1579         entry->flags = 0;
1580 }
1581
1582 #define F(x) bit(X86_FEATURE_##x)
1583
1584 static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1585                          u32 index, int *nent, int maxnent)
1586 {
1587         unsigned f_nx = is_efer_nx() ? F(NX) : 0;
1588         unsigned f_gbpages = kvm_x86_ops->gb_page_enable() ? F(GBPAGES) : 0;
1589 #ifdef CONFIG_X86_64
1590         unsigned f_lm = F(LM);
1591 #else
1592         unsigned f_lm = 0;
1593 #endif
1594
1595         /* cpuid 1.edx */
1596         const u32 kvm_supported_word0_x86_features =
1597                 F(FPU) | F(VME) | F(DE) | F(PSE) |
1598                 F(TSC) | F(MSR) | F(PAE) | F(MCE) |
1599                 F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) |
1600                 F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
1601                 F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLSH) |
1602                 0 /* Reserved, DS, ACPI */ | F(MMX) |
1603                 F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) |
1604                 0 /* HTT, TM, Reserved, PBE */;
1605         /* cpuid 0x80000001.edx */
1606         const u32 kvm_supported_word1_x86_features =
1607                 F(FPU) | F(VME) | F(DE) | F(PSE) |
1608                 F(TSC) | F(MSR) | F(PAE) | F(MCE) |
1609                 F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) |
1610                 F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
1611                 F(PAT) | F(PSE36) | 0 /* Reserved */ |
1612                 f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) |
1613                 F(FXSR) | F(FXSR_OPT) | f_gbpages | 0 /* RDTSCP */ |
1614                 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
1615         /* cpuid 1.ecx */
1616         const u32 kvm_supported_word4_x86_features =
1617                 F(XMM3) | 0 /* Reserved, DTES64, MONITOR */ |
1618                 0 /* DS-CPL, VMX, SMX, EST */ |
1619                 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
1620                 0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ |
1621                 0 /* Reserved, DCA */ | F(XMM4_1) |
1622                 F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
1623                 0 /* Reserved, XSAVE, OSXSAVE */;
1624         /* cpuid 0x80000001.ecx */
1625         const u32 kvm_supported_word6_x86_features =
1626                 F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ |
1627                 F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
1628                 F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(SSE5) |
1629                 0 /* SKINIT */ | 0 /* WDT */;
1630
1631         /* all calls to cpuid_count() should be made on the same cpu */
1632         get_cpu();
1633         do_cpuid_1_ent(entry, function, index);
1634         ++*nent;
1635
1636         switch (function) {
1637         case 0:
1638                 entry->eax = min(entry->eax, (u32)0xb);
1639                 break;
1640         case 1:
1641                 entry->edx &= kvm_supported_word0_x86_features;
1642                 entry->ecx &= kvm_supported_word4_x86_features;
1643                 /* we support x2apic emulation even if host does not support
1644                  * it since we emulate x2apic in software */
1645                 entry->ecx |= F(X2APIC);
1646                 break;
1647         /* function 2 entries are STATEFUL. That is, repeated cpuid commands
1648          * may return different values. This forces us to get_cpu() before
1649          * issuing the first command, and also to emulate this annoying behavior
1650          * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */
1651         case 2: {
1652                 int t, times = entry->eax & 0xff;
1653
1654                 entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
1655                 entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
1656                 for (t = 1; t < times && *nent < maxnent; ++t) {
1657                         do_cpuid_1_ent(&entry[t], function, 0);
1658                         entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
1659                         ++*nent;
1660                 }
1661                 break;
1662         }
1663         /* function 4 and 0xb have additional index. */
1664         case 4: {
1665                 int i, cache_type;
1666
1667                 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1668                 /* read more entries until cache_type is zero */
1669                 for (i = 1; *nent < maxnent; ++i) {
1670                         cache_type = entry[i - 1].eax & 0x1f;
1671                         if (!cache_type)
1672                                 break;
1673                         do_cpuid_1_ent(&entry[i], function, i);
1674                         entry[i].flags |=
1675                                KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1676                         ++*nent;
1677                 }
1678                 break;
1679         }
1680         case 0xb: {
1681                 int i, level_type;
1682
1683                 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1684                 /* read more entries until level_type is zero */
1685                 for (i = 1; *nent < maxnent; ++i) {
1686                         level_type = entry[i - 1].ecx & 0xff00;
1687                         if (!level_type)
1688                                 break;
1689                         do_cpuid_1_ent(&entry[i], function, i);
1690                         entry[i].flags |=
1691                                KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1692                         ++*nent;
1693                 }
1694                 break;
1695         }
1696         case 0x80000000:
1697                 entry->eax = min(entry->eax, 0x8000001a);
1698                 break;
1699         case 0x80000001:
1700                 entry->edx &= kvm_supported_word1_x86_features;
1701                 entry->ecx &= kvm_supported_word6_x86_features;
1702                 break;
1703         }
1704         put_cpu();
1705 }
1706
1707 #undef F
1708
1709 static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
1710                                      struct kvm_cpuid_entry2 __user *entries)
1711 {
1712         struct kvm_cpuid_entry2 *cpuid_entries;
1713         int limit, nent = 0, r = -E2BIG;
1714         u32 func;
1715
1716         if (cpuid->nent < 1)
1717                 goto out;
1718         if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
1719                 cpuid->nent = KVM_MAX_CPUID_ENTRIES;
1720         r = -ENOMEM;
1721         cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent);
1722         if (!cpuid_entries)
1723                 goto out;
1724
1725         do_cpuid_ent(&cpuid_entries[0], 0, 0, &nent, cpuid->nent);
1726         limit = cpuid_entries[0].eax;
1727         for (func = 1; func <= limit && nent < cpuid->nent; ++func)
1728                 do_cpuid_ent(&cpuid_entries[nent], func, 0,
1729                              &nent, cpuid->nent);
1730         r = -E2BIG;
1731         if (nent >= cpuid->nent)
1732                 goto out_free;
1733
1734         do_cpuid_ent(&cpuid_entries[nent], 0x80000000, 0, &nent, cpuid->nent);
1735         limit = cpuid_entries[nent - 1].eax;
1736         for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func)
1737                 do_cpuid_ent(&cpuid_entries[nent], func, 0,
1738                              &nent, cpuid->nent);
1739         r = -E2BIG;
1740         if (nent >= cpuid->nent)
1741                 goto out_free;
1742
1743         r = -EFAULT;
1744         if (copy_to_user(entries, cpuid_entries,
1745                          nent * sizeof(struct kvm_cpuid_entry2)))
1746                 goto out_free;
1747         cpuid->nent = nent;
1748         r = 0;
1749
1750 out_free:
1751         vfree(cpuid_entries);
1752 out:
1753         return r;
1754 }
1755
1756 static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
1757                                     struct kvm_lapic_state *s)
1758 {
1759         vcpu_load(vcpu);
1760         memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s);
1761         vcpu_put(vcpu);
1762
1763         return 0;
1764 }
1765
1766 static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
1767                                     struct kvm_lapic_state *s)
1768 {
1769         vcpu_load(vcpu);
1770         memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s);
1771         kvm_apic_post_state_restore(vcpu);
1772         update_cr8_intercept(vcpu);
1773         vcpu_put(vcpu);
1774
1775         return 0;
1776 }
1777
1778 static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
1779                                     struct kvm_interrupt *irq)
1780 {
1781         if (irq->irq < 0 || irq->irq >= 256)
1782                 return -EINVAL;
1783         if (irqchip_in_kernel(vcpu->kvm))
1784                 return -ENXIO;
1785         vcpu_load(vcpu);
1786
1787         kvm_queue_interrupt(vcpu, irq->irq, false);
1788
1789         vcpu_put(vcpu);
1790
1791         return 0;
1792 }
1793
/*
 * Inject an NMI into @vcpu via kvm_inject_nmi(), bracketed by vcpu_load()
 * and vcpu_put().  Always returns 0.
 */
static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu)
{
	vcpu_load(vcpu);

	kvm_inject_nmi(vcpu);

	vcpu_put(vcpu);
	return 0;
}
1802
1803 static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu,
1804                                            struct kvm_tpr_access_ctl *tac)
1805 {
1806         if (tac->flags)
1807                 return -EINVAL;
1808         vcpu->arch.tpr_access_reporting = !!tac->enabled;
1809         return 0;
1810 }
1811
1812 static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
1813                                         u64 mcg_cap)
1814 {
1815         int r;
1816         unsigned bank_num = mcg_cap & 0xff, bank;
1817
1818         r = -EINVAL;
1819         if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS)
1820                 goto out;
1821         if (mcg_cap & ~(KVM_MCE_CAP_SUPPORTED | 0xff | 0xff0000))
1822                 goto out;
1823         r = 0;
1824         vcpu->arch.mcg_cap = mcg_cap;
1825         /* Init IA32_MCG_CTL to all 1s */
1826         if (mcg_cap & MCG_CTL_P)
1827                 vcpu->arch.mcg_ctl = ~(u64)0;
1828         /* Init IA32_MCi_CTL to all 1s */
1829         for (bank = 0; bank < bank_num; bank++)
1830                 vcpu->arch.mce_banks[bank*4] = ~(u64)0;
1831 out:
1832         return r;
1833 }
1834
1835 static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
1836                                       struct kvm_x86_mce *mce)
1837 {
1838         u64 mcg_cap = vcpu->arch.mcg_cap;
1839         unsigned bank_num = mcg_cap & 0xff;
1840         u64 *banks = vcpu->arch.mce_banks;
1841
1842         if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL))
1843                 return -EINVAL;
1844         /*
1845          * if IA32_MCG_CTL is not all 1s, the uncorrected error
1846          * reporting is disabled
1847          */
1848         if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) &&
1849             vcpu->arch.mcg_ctl != ~(u64)0)
1850                 return 0;
1851         banks += 4 * mce->bank;
1852         /*
1853          * if IA32_MCi_CTL is not all 1s, the uncorrected error
1854          * reporting is disabled for the bank
1855          */
1856         if ((mce->status & MCI_STATUS_UC) && banks[0] != ~(u64)0)
1857                 return 0;
1858         if (mce->status & MCI_STATUS_UC) {
1859                 if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) ||
1860                     !(vcpu->arch.cr4 & X86_CR4_MCE)) {
1861                         printk(KERN_DEBUG "kvm: set_mce: "
1862                                "injects mce exception while "
1863                                "previous one is in progress!\n");
1864                         set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
1865                         return 0;
1866                 }
1867                 if (banks[1] & MCI_STATUS_VAL)
1868                         mce->status |= MCI_STATUS_OVER;
1869                 banks[2] = mce->addr;
1870                 banks[3] = mce->misc;
1871                 vcpu->arch.mcg_status = mce->mcg_status;
1872                 banks[1] = mce->status;
1873                 kvm_queue_exception(vcpu, MC_VECTOR);
1874         } else if (!(banks[1] & MCI_STATUS_VAL)
1875                    || !(banks[1] & MCI_STATUS_UC)) {
1876                 if (banks[1] & MCI_STATUS_VAL)
1877                         mce->status |= MCI_STATUS_OVER;
1878                 banks[2] = mce->addr;
1879                 banks[3] = mce->misc;
1880                 banks[1] = mce->status;
1881         } else
1882                 banks[1] |= MCI_STATUS_OVER;
1883         return 0;
1884 }
1885
1886 long kvm_arch_vcpu_ioctl(struct file *filp,
1887                          unsigned int ioctl, unsigned long arg)
1888 {
1889         struct kvm_vcpu *vcpu = filp->private_data;
1890         void __user *argp = (void __user *)arg;
1891         int r;
1892         struct kvm_lapic_state *lapic = NULL;
1893
1894         switch (ioctl) {
1895         case KVM_GET_LAPIC: {
1896                 r = -EINVAL;
1897                 if (!vcpu->arch.apic)
1898                         goto out;
1899                 lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
1900
1901                 r = -ENOMEM;
1902                 if (!lapic)
1903                         goto out;
1904                 r = kvm_vcpu_ioctl_get_lapic(vcpu, lapic);
1905                 if (r)
1906                         goto out;
1907                 r = -EFAULT;
1908                 if (copy_to_user(argp, lapic, sizeof(struct kvm_lapic_state)))
1909                         goto out;
1910                 r = 0;
1911                 break;
1912         }
1913         case KVM_SET_LAPIC: {
1914                 r = -EINVAL;
1915                 if (!vcpu->arch.apic)
1916                         goto out;
1917                 lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
1918                 r = -ENOMEM;
1919                 if (!lapic)
1920                         goto out;
1921                 r = -EFAULT;
1922                 if (copy_from_user(lapic, argp, sizeof(struct kvm_lapic_state)))
1923                         goto out;
1924                 r = kvm_vcpu_ioctl_set_lapic(vcpu, lapic);
1925                 if (r)
1926                         goto out;
1927                 r = 0;
1928                 break;
1929         }
1930         case KVM_INTERRUPT: {
1931                 struct kvm_interrupt irq;
1932
1933                 r = -EFAULT;
1934                 if (copy_from_user(&irq, argp, sizeof irq))
1935                         goto out;
1936                 r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
1937                 if (r)
1938                         goto out;
1939                 r = 0;
1940                 break;
1941         }
1942         case KVM_NMI: {
1943                 r = kvm_vcpu_ioctl_nmi(vcpu);
1944                 if (r)
1945                         goto out;
1946                 r = 0;
1947                 break;
1948         }
1949         case KVM_SET_CPUID: {
1950                 struct kvm_cpuid __user *cpuid_arg = argp;
1951                 struct kvm_cpuid cpuid;
1952
1953                 r = -EFAULT;
1954                 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1955                         goto out;
1956                 r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
1957                 if (r)
1958                         goto out;
1959                 break;
1960         }
1961         case KVM_SET_CPUID2: {
1962                 struct kvm_cpuid2 __user *cpuid_arg = argp;
1963                 struct kvm_cpuid2 cpuid;
1964
1965                 r = -EFAULT;
1966                 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1967                         goto out;
1968                 r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid,
1969                                               cpuid_arg->entries);
1970                 if (r)
1971                         goto out;
1972                 break;
1973         }
1974         case KVM_GET_CPUID2: {
1975                 struct kvm_cpuid2 __user *cpuid_arg = argp;
1976                 struct kvm_cpuid2 cpuid;
1977
1978                 r = -EFAULT;
1979                 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1980                         goto out;
1981                 r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid,
1982                                               cpuid_arg->entries);
1983                 if (r)
1984                         goto out;
1985                 r = -EFAULT;
1986                 if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
1987                         goto out;
1988                 r = 0;
1989                 break;
1990         }
1991         case KVM_GET_MSRS:
1992                 r = msr_io(vcpu, argp, kvm_get_msr, 1);
1993                 break;
1994         case KVM_SET_MSRS:
1995                 r = msr_io(vcpu, argp, do_set_msr, 0);
1996                 break;
1997         case KVM_TPR_ACCESS_REPORTING: {
1998                 struct kvm_tpr_access_ctl tac;
1999
2000                 r = -EFAULT;
2001                 if (copy_from_user(&tac, argp, sizeof tac))
2002                         goto out;
2003                 r = vcpu_ioctl_tpr_access_reporting(vcpu, &tac);
2004                 if (r)
2005                         goto out;
2006                 r = -EFAULT;
2007                 if (copy_to_user(argp, &tac, sizeof tac))
2008                         goto out;
2009                 r = 0;
2010                 break;
2011         };
2012         case KVM_SET_VAPIC_ADDR: {
2013                 struct kvm_vapic_addr va;
2014
2015                 r = -EINVAL;
2016                 if (!irqchip_in_kernel(vcpu->kvm))
2017                         goto out;
2018                 r = -EFAULT;
2019                 if (copy_from_user(&va, argp, sizeof va))
2020                         goto out;
2021                 r = 0;
2022                 kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr);
2023                 break;
2024         }
2025         case KVM_X86_SETUP_MCE: {
2026                 u64 mcg_cap;
2027
2028                 r = -EFAULT;
2029                 if (copy_from_user(&mcg_cap, argp, sizeof mcg_cap))
2030                         goto out;
2031                 r = kvm_vcpu_ioctl_x86_setup_mce(vcpu, mcg_cap);
2032                 break;
2033         }
2034         case KVM_X86_SET_MCE: {
2035                 struct kvm_x86_mce mce;
2036
2037                 r = -EFAULT;
2038                 if (copy_from_user(&mce, argp, sizeof mce))
2039                         goto out;
2040                 r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce);
2041                 break;
2042         }
2043         default:
2044                 r = -EINVAL;
2045         }
2046 out:
2047         kfree(lapic);
2048         return r;
2049 }
2050
2051 static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
2052 {
2053         int ret;
2054
2055         if (addr > (unsigned int)(-3 * PAGE_SIZE))
2056                 return -1;
2057         ret = kvm_x86_ops->set_tss_addr(kvm, addr);
2058         return ret;
2059 }
2060
2061 static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm,
2062                                               u64 ident_addr)
2063 {
2064         kvm->arch.ept_identity_map_addr = ident_addr;
2065         return 0;
2066 }
2067
2068 static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
2069                                           u32 kvm_nr_mmu_pages)
2070 {
2071         if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES)
2072                 return -EINVAL;
2073
2074         down_write(&kvm->slots_lock);
2075         spin_lock(&kvm->mmu_lock);
2076
2077         kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
2078         kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
2079
2080         spin_unlock(&kvm->mmu_lock);
2081         up_write(&kvm->slots_lock);
2082         return 0;
2083 }
2084
2085 static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
2086 {
2087         return kvm->arch.n_alloc_mmu_pages;
2088 }
2089
2090 gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
2091 {
2092         int i;
2093         struct kvm_mem_alias *alias;
2094
2095         for (i = 0; i < kvm->arch.naliases; ++i) {
2096                 alias = &kvm->arch.aliases[i];
2097                 if (gfn >= alias->base_gfn
2098                     && gfn < alias->base_gfn + alias->npages)
2099                         return alias->target_gfn + gfn - alias->base_gfn;
2100         }
2101         return gfn;
2102 }
2103
2104 /*
2105  * Set a new alias region.  Aliases map a portion of physical memory into
2106  * another portion.  This is useful for memory windows, for example the PC
2107  * VGA region.
2108  */
2109 static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
2110                                          struct kvm_memory_alias *alias)
2111 {
2112         int r, n;
2113         struct kvm_mem_alias *p;
2114
2115         r = -EINVAL;
2116         /* General sanity checks */
2117         if (alias->memory_size & (PAGE_SIZE - 1))
2118                 goto out;
2119         if (alias->guest_phys_addr & (PAGE_SIZE - 1))
2120                 goto out;
2121         if (alias->slot >= KVM_ALIAS_SLOTS)
2122                 goto out;
2123         if (alias->guest_phys_addr + alias->memory_size
2124             < alias->guest_phys_addr)
2125                 goto out;
2126         if (alias->target_phys_addr + alias->memory_size
2127             < alias->target_phys_addr)
2128                 goto out;
2129
2130         down_write(&kvm->slots_lock);
2131         spin_lock(&kvm->mmu_lock);
2132
2133         p = &kvm->arch.aliases[alias->slot];
2134         p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
2135         p->npages = alias->memory_size >> PAGE_SHIFT;
2136         p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT;
2137
2138         for (n = KVM_ALIAS_SLOTS; n > 0; --n)
2139                 if (kvm->arch.aliases[n - 1].npages)
2140                         break;
2141         kvm->arch.naliases = n;
2142
2143         spin_unlock(&kvm->mmu_lock);
2144         kvm_mmu_zap_all(kvm);
2145
2146         up_write(&kvm->slots_lock);
2147
2148         return 0;
2149
2150 out:
2151         return r;
2152 }
2153
2154 static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
2155 {
2156         int r;
2157
2158         r = 0;
2159         switch (chip->chip_id) {
2160         case KVM_IRQCHIP_PIC_MASTER:
2161                 memcpy(&chip->chip.pic,
2162                         &pic_irqchip(kvm)->pics[0],
2163                         sizeof(struct kvm_pic_state));
2164                 break;
2165         case KVM_IRQCHIP_PIC_SLAVE:
2166                 memcpy(&chip->chip.pic,
2167                         &pic_irqchip(kvm)->pics[1],
2168                         sizeof(struct kvm_pic_state));
2169                 break;
2170         case KVM_IRQCHIP_IOAPIC:
2171                 r = kvm_get_ioapic(kvm, &chip->chip.ioapic);
2172                 break;
2173         default:
2174                 r = -EINVAL;
2175                 break;
2176         }
2177         return r;
2178 }
2179
2180 static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
2181 {
2182         int r;
2183
2184         r = 0;
2185         switch (chip->chip_id) {
2186         case KVM_IRQCHIP_PIC_MASTER:
2187                 spin_lock(&pic_irqchip(kvm)->lock);
2188                 memcpy(&pic_irqchip(kvm)->pics[0],
2189                         &chip->chip.pic,
2190                         sizeof(struct kvm_pic_state));
2191                 spin_unlock(&pic_irqchip(kvm)->lock);
2192                 break;
2193         case KVM_IRQCHIP_PIC_SLAVE:
2194                 spin_lock(&pic_irqchip(kvm)->lock);
2195                 memcpy(&pic_irqchip(kvm)->pics[1],
2196                         &chip->chip.pic,
2197                         sizeof(struct kvm_pic_state));
2198                 spin_unlock(&pic_irqchip(kvm)->lock);
2199                 break;
2200         case KVM_IRQCHIP_IOAPIC:
2201                 r = kvm_set_ioapic(kvm, &chip->chip.ioapic);
2202                 break;
2203         default:
2204                 r = -EINVAL;
2205                 break;
2206         }
2207         kvm_pic_update_irq(pic_irqchip(kvm));
2208         return r;
2209 }
2210
2211 static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
2212 {
2213         int r = 0;
2214
2215         mutex_lock(&kvm->arch.vpit->pit_state.lock);
2216         memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state));
2217         mutex_unlock(&kvm->arch.vpit->pit_state.lock);
2218         return r;
2219 }
2220
2221 static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
2222 {
2223         int r = 0;
2224
2225         mutex_lock(&kvm->arch.vpit->pit_state.lock);
2226         memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state));
2227         kvm_pit_load_count(kvm, 0, ps->channels[0].count, 0);
2228         mutex_unlock(&kvm->arch.vpit->pit_state.lock);
2229         return r;
2230 }
2231
2232 static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
2233 {
2234         int r = 0;
2235
2236         mutex_lock(&kvm->arch.vpit->pit_state.lock);
2237         memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels,
2238                 sizeof(ps->channels));
2239         ps->flags = kvm->arch.vpit->pit_state.flags;
2240         mutex_unlock(&kvm->arch.vpit->pit_state.lock);
2241         return r;
2242 }
2243
2244 static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
2245 {
2246         int r = 0, start = 0;
2247         u32 prev_legacy, cur_legacy;
2248         mutex_lock(&kvm->arch.vpit->pit_state.lock);
2249         prev_legacy = kvm->arch.vpit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
2250         cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY;
2251         if (!prev_legacy && cur_legacy)
2252                 start = 1;
2253         memcpy(&kvm->arch.vpit->pit_state.channels, &ps->channels,
2254                sizeof(kvm->arch.vpit->pit_state.channels));
2255         kvm->arch.vpit->pit_state.flags = ps->flags;
2256         kvm_pit_load_count(kvm, 0, kvm->arch.vpit->pit_state.channels[0].count, start);
2257         mutex_unlock(&kvm->arch.vpit->pit_state.lock);
2258         return r;
2259 }
2260
2261 static int kvm_vm_ioctl_reinject(struct kvm *kvm,
2262                                  struct kvm_reinject_control *control)
2263 {
2264         if (!kvm->arch.vpit)
2265                 return -ENXIO;
2266         mutex_lock(&kvm->arch.vpit->pit_state.lock);
2267         kvm->arch.vpit->pit_state.pit_timer.reinject = control->pit_reinject;
2268         mutex_unlock(&kvm->arch.vpit->pit_state.lock);
2269         return 0;
2270 }
2271
2272 /*
2273  * Get (and clear) the dirty memory log for a memory slot.
2274  */
2275 int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
2276                                       struct kvm_dirty_log *log)
2277 {
2278         int r;
2279         int n;
2280         struct kvm_memory_slot *memslot;
2281         int is_dirty = 0;
2282
2283         down_write(&kvm->slots_lock);
2284
2285         r = kvm_get_dirty_log(kvm, log, &is_dirty);
2286         if (r)
2287                 goto out;
2288
2289         /* If nothing is dirty, don't bother messing with page tables. */
2290         if (is_dirty) {
2291                 spin_lock(&kvm->mmu_lock);
2292                 kvm_mmu_slot_remove_write_access(kvm, log->slot);
2293                 spin_unlock(&kvm->mmu_lock);
2294                 memslot = &kvm->memslots[log->slot];
2295                 n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
2296                 memset(memslot->dirty_bitmap, 0, n);
2297         }
2298         r = 0;
2299 out:
2300         up_write(&kvm->slots_lock);
2301         return r;
2302 }
2303
2304 long kvm_arch_vm_ioctl(struct file *filp,
2305                        unsigned int ioctl, unsigned long arg)
2306 {
2307         struct kvm *kvm = filp->private_data;
2308         void __user *argp = (void __user *)arg;
2309         int r = -ENOTTY;
2310         /*
2311          * This union makes it completely explicit to gcc-3.x
2312          * that these two variables' stack usage should be
2313          * combined, not added together.
2314          */
2315         union {
2316                 struct kvm_pit_state ps;
2317                 struct kvm_pit_state2 ps2;
2318                 struct kvm_memory_alias alias;
2319                 struct kvm_pit_config pit_config;
2320         } u;
2321
2322         switch (ioctl) {
2323         case KVM_SET_TSS_ADDR:
2324                 r = kvm_vm_ioctl_set_tss_addr(kvm, arg);
2325                 if (r < 0)
2326                         goto out;
2327                 break;
2328         case KVM_SET_IDENTITY_MAP_ADDR: {
2329                 u64 ident_addr;
2330
2331                 r = -EFAULT;
2332                 if (copy_from_user(&ident_addr, argp, sizeof ident_addr))
2333                         goto out;
2334                 r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr);
2335                 if (r < 0)
2336                         goto out;
2337                 break;
2338         }
2339         case KVM_SET_MEMORY_REGION: {
2340                 struct kvm_memory_region kvm_mem;
2341                 struct kvm_userspace_memory_region kvm_userspace_mem;
2342
2343                 r = -EFAULT;
2344                 if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem))
2345                         goto out;
2346                 kvm_userspace_mem.slot = kvm_mem.slot;
2347                 kvm_userspace_mem.flags = kvm_mem.flags;
2348                 kvm_userspace_mem.guest_phys_addr = kvm_mem.guest_phys_addr;
2349                 kvm_userspace_mem.memory_size = kvm_mem.memory_size;
2350                 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 0);
2351                 if (r)
2352                         goto out;
2353                 break;
2354         }
2355         case KVM_SET_NR_MMU_PAGES:
2356                 r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
2357                 if (r)
2358                         goto out;
2359                 break;
2360         case KVM_GET_NR_MMU_PAGES:
2361                 r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
2362                 break;
2363         case KVM_SET_MEMORY_ALIAS:
2364                 r = -EFAULT;
2365                 if (copy_from_user(&u.alias, argp, sizeof(struct kvm_memory_alias)))
2366                         goto out;
2367                 r = kvm_vm_ioctl_set_memory_alias(kvm, &u.alias);
2368                 if (r)
2369                         goto out;
2370                 break;
2371         case KVM_CREATE_IRQCHIP: {
2372                 struct kvm_pic *vpic;
2373
2374                 mutex_lock(&kvm->lock);
2375                 r = -EEXIST;
2376                 if (kvm->arch.vpic)
2377                         goto create_irqchip_unlock;
2378                 r = -ENOMEM;
2379                 vpic = kvm_create_pic(kvm);
2380                 if (vpic) {
2381                         r = kvm_ioapic_init(kvm);
2382                         if (r) {
2383                                 kfree(vpic);
2384                                 goto create_irqchip_unlock;
2385                         }
2386                 } else
2387                         goto create_irqchip_unlock;
2388                 smp_wmb();
2389                 kvm->arch.vpic = vpic;
2390                 smp_wmb();
2391                 r = kvm_setup_default_irq_routing(kvm);
2392                 if (r) {
2393                         mutex_lock(&kvm->irq_lock);
2394                         kfree(kvm->arch.vpic);
2395                         kfree(kvm->arch.vioapic);
2396                         kvm->arch.vpic = NULL;
2397                         kvm->arch.vioapic = NULL;
2398                         mutex_unlock(&kvm->irq_lock);
2399                 }
2400         create_irqchip_unlock:
2401                 mutex_unlock(&kvm->lock);
2402                 break;
2403         }
2404         case KVM_CREATE_PIT:
2405                 u.pit_config.flags = KVM_PIT_SPEAKER_DUMMY;
2406                 goto create_pit;
2407         case KVM_CREATE_PIT2:
2408                 r = -EFAULT;
2409                 if (copy_from_user(&u.pit_config, argp,
2410                                    sizeof(struct kvm_pit_config)))
2411                         goto out;
2412         create_pit:
2413                 down_write(&kvm->slots_lock);
2414                 r = -EEXIST;
2415                 if (kvm->arch.vpit)
2416                         goto create_pit_unlock;
2417                 r = -ENOMEM;
2418                 kvm->arch.vpit = kvm_create_pit(kvm, u.pit_config.flags);
2419                 if (kvm->arch.vpit)
2420                         r = 0;
2421         create_pit_unlock:
2422                 up_write(&kvm->slots_lock);
2423                 break;
2424         case KVM_IRQ_LINE_STATUS:
2425         case KVM_IRQ_LINE: {
2426                 struct kvm_irq_level irq_event;
2427
2428                 r = -EFAULT;
2429                 if (copy_from_user(&irq_event, argp, sizeof irq_event))
2430                         goto out;
2431                 if (irqchip_in_kernel(kvm)) {
2432                         __s32 status;
2433                         status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
2434                                         irq_event.irq, irq_event.level);
2435                         if (ioctl == KVM_IRQ_LINE_STATUS) {
2436                                 irq_event.status = status;
2437                                 if (copy_to_user(argp, &irq_event,
2438                                                         sizeof irq_event))
2439                                         goto out;
2440                         }
2441                         r = 0;
2442                 }
2443                 break;
2444         }
2445         case KVM_GET_IRQCHIP: {
2446                 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
2447                 struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL);
2448
2449                 r = -ENOMEM;
2450                 if (!chip)
2451                         goto out;
2452                 r = -EFAULT;
2453                 if (copy_from_user(chip, argp, sizeof *chip))
2454                         goto get_irqchip_out;
2455                 r = -ENXIO;
2456                 if (!irqchip_in_kernel(kvm))
2457                         goto get_irqchip_out;
2458                 r = kvm_vm_ioctl_get_irqchip(kvm, chip);
2459                 if (r)
2460                         goto get_irqchip_out;
2461                 r = -EFAULT;
2462                 if (copy_to_user(argp, chip, sizeof *chip))
2463                         goto get_irqchip_out;
2464                 r = 0;
2465         get_irqchip_out:
2466                 kfree(chip);
2467                 if (r)
2468                         goto out;
2469                 break;
2470         }
2471         case KVM_SET_IRQCHIP: {
2472                 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
2473                 struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL);
2474
2475                 r = -ENOMEM;
2476                 if (!chip)
2477                         goto out;
2478                 r = -EFAULT;
2479                 if (copy_from_user(chip, argp, sizeof *chip))
2480                         goto set_irqchip_out;
2481                 r = -ENXIO;
2482                 if (!irqchip_in_kernel(kvm))
2483                         goto set_irqchip_out;
2484                 r = kvm_vm_ioctl_set_irqchip(kvm, chip);
2485                 if (r)
2486                         goto set_irqchip_out;
2487                 r = 0;
2488         set_irqchip_out:
2489                 kfree(chip);
2490                 if (r)
2491                         goto out;
2492                 break;
2493         }
2494         case KVM_GET_PIT: {
2495                 r = -EFAULT;
2496                 if (copy_from_user(&u.ps, argp, sizeof(struct kvm_pit_state)))
2497                         goto out;
2498                 r = -ENXIO;
2499                 if (!kvm->arch.vpit)
2500                         goto out;
2501                 r = kvm_vm_ioctl_get_pit(kvm, &u.ps);
2502                 if (r)
2503                         goto out;
2504                 r = -EFAULT;
2505                 if (copy_to_user(argp, &u.ps, sizeof(struct kvm_pit_state)))
2506                         goto out;
2507                 r = 0;
2508                 break;
2509         }
2510         case KVM_SET_PIT: {
2511                 r = -EFAULT;
2512                 if (copy_from_user(&u.ps, argp, sizeof u.ps))
2513                         goto out;
2514                 r = -ENXIO;
2515                 if (!kvm->arch.vpit)
2516                         goto out;
2517                 r = kvm_vm_ioctl_set_pit(kvm, &u.ps);
2518                 if (r)
2519                         goto out;
2520                 r = 0;
2521                 break;
2522         }
2523         case KVM_GET_PIT2: {
2524                 r = -ENXIO;
2525                 if (!kvm->arch.vpit)
2526                         goto out;
2527                 r = kvm_vm_ioctl_get_pit2(kvm, &u.ps2);
2528                 if (r)
2529                         goto out;
2530                 r = -EFAULT;
2531                 if (copy_to_user(argp, &u.ps2, sizeof(u.ps2)))
2532                         goto out;
2533                 r = 0;
2534                 break;
2535         }
2536         case KVM_SET_PIT2: {
2537                 r = -EFAULT;
2538                 if (copy_from_user(&u.ps2, argp, sizeof(u.ps2)))
2539                         goto out;
2540                 r = -ENXIO;
2541                 if (!kvm->arch.vpit)
2542                         goto out;
2543                 r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2);
2544                 if (r)
2545                         goto out;
2546                 r = 0;
2547                 break;
2548         }
2549         case KVM_REINJECT_CONTROL: {
2550                 struct kvm_reinject_control control;
2551                 r =  -EFAULT;
2552                 if (copy_from_user(&control, argp, sizeof(control)))
2553                         goto out;
2554                 r = kvm_vm_ioctl_reinject(kvm, &control);
2555                 if (r)
2556                         goto out;
2557                 r = 0;
2558                 break;
2559         }
2560         case KVM_XEN_HVM_CONFIG: {
2561                 r = -EFAULT;
2562                 if (copy_from_user(&kvm->arch.xen_hvm_config, argp,
2563                                    sizeof(struct kvm_xen_hvm_config)))
2564                         goto out;
2565                 r = -EINVAL;
2566                 if (kvm->arch.xen_hvm_config.flags)
2567                         goto out;
2568                 r = 0;
2569                 break;
2570         }
2571         case KVM_SET_CLOCK: {
2572                 struct timespec now;
2573                 struct kvm_clock_data user_ns;
2574                 u64 now_ns;
2575                 s64 delta;
2576
2577                 r = -EFAULT;
2578                 if (copy_from_user(&user_ns, argp, sizeof(user_ns)))
2579                         goto out;
2580
2581                 r = -EINVAL;
2582                 if (user_ns.flags)
2583                         goto out;
2584
2585                 r = 0;
2586                 ktime_get_ts(&now);
2587                 now_ns = timespec_to_ns(&now);
2588                 delta = user_ns.clock - now_ns;
2589                 kvm->arch.kvmclock_offset = delta;
2590                 break;
2591         }
2592         case KVM_GET_CLOCK: {
2593                 struct timespec now;
2594                 struct kvm_clock_data user_ns;
2595                 u64 now_ns;
2596
2597                 ktime_get_ts(&now);
2598                 now_ns = timespec_to_ns(&now);
2599                 user_ns.clock = kvm->arch.kvmclock_offset + now_ns;
2600                 user_ns.flags = 0;
2601
2602                 r = -EFAULT;
2603                 if (copy_to_user(argp, &user_ns, sizeof(user_ns)))
2604                         goto out;
2605                 r = 0;
2606                 break;
2607         }
2608
2609         default:
2610                 ;
2611         }
2612 out:
2613         return r;
2614 }
2615
2616 static void kvm_init_msr_list(void)
2617 {
2618         u32 dummy[2];
2619         unsigned i, j;
2620
2621         /* skip the first msrs in the list. KVM-specific */
2622         for (i = j = KVM_SAVE_MSRS_BEGIN; i < ARRAY_SIZE(msrs_to_save); i++) {
2623                 if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
2624                         continue;
2625                 if (j < i)
2626                         msrs_to_save[j] = msrs_to_save[i];
2627                 j++;
2628         }
2629         num_msrs_to_save = j;
2630 }
2631
2632 static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
2633                            const void *v)
2634 {
2635         if (vcpu->arch.apic &&
2636             !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, len, v))
2637                 return 0;
2638
2639         return kvm_io_bus_write(&vcpu->kvm->mmio_bus, addr, len, v);
2640 }
2641
2642 static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
2643 {
2644         if (vcpu->arch.apic &&
2645             !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, len, v))
2646                 return 0;
2647
2648         return kvm_io_bus_read(&vcpu->kvm->mmio_bus, addr, len, v);
2649 }
2650
2651 static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
2652                                struct kvm_vcpu *vcpu)
2653 {
2654         void *data = val;
2655         int r = X86EMUL_CONTINUE;
2656
2657         while (bytes) {
2658                 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
2659                 unsigned offset = addr & (PAGE_SIZE-1);
2660                 unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset);
2661                 int ret;
2662
2663                 if (gpa == UNMAPPED_GVA) {
2664                         r = X86EMUL_PROPAGATE_FAULT;
2665                         goto out;
2666                 }
2667                 ret = kvm_read_guest(vcpu->kvm, gpa, data, toread);
2668                 if (ret < 0) {
2669                         r = X86EMUL_UNHANDLEABLE;
2670                         goto out;
2671                 }
2672
2673                 bytes -= toread;
2674                 data += toread;
2675                 addr += toread;
2676         }
2677 out:
2678         return r;
2679 }
2680
2681 static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes,
2682                                 struct kvm_vcpu *vcpu)
2683 {
2684         void *data = val;
2685         int r = X86EMUL_CONTINUE;
2686
2687         while (bytes) {
2688                 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
2689                 unsigned offset = addr & (PAGE_SIZE-1);
2690                 unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
2691                 int ret;
2692
2693                 if (gpa == UNMAPPED_GVA) {
2694                         r = X86EMUL_PROPAGATE_FAULT;
2695                         goto out;
2696                 }
2697                 ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite);
2698                 if (ret < 0) {
2699                         r = X86EMUL_UNHANDLEABLE;
2700                         goto out;
2701                 }
2702
2703                 bytes -= towrite;
2704                 data += towrite;
2705                 addr += towrite;
2706         }
2707 out:
2708         return r;
2709 }
2710
2711
2712 static int emulator_read_emulated(unsigned long addr,
2713                                   void *val,
2714                                   unsigned int bytes,
2715                                   struct kvm_vcpu *vcpu)
2716 {
2717         gpa_t                 gpa;
2718
2719         if (vcpu->mmio_read_completed) {
2720                 memcpy(val, vcpu->mmio_data, bytes);
2721                 trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes,
2722                                vcpu->mmio_phys_addr, *(u64 *)val);
2723                 vcpu->mmio_read_completed = 0;
2724                 return X86EMUL_CONTINUE;
2725         }
2726
2727         gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
2728
2729         /* For APIC access vmexit */
2730         if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
2731                 goto mmio;
2732
2733         if (kvm_read_guest_virt(addr, val, bytes, vcpu)
2734                                 == X86EMUL_CONTINUE)
2735                 return X86EMUL_CONTINUE;
2736         if (gpa == UNMAPPED_GVA)
2737                 return X86EMUL_PROPAGATE_FAULT;
2738
2739 mmio:
2740         /*
2741          * Is this MMIO handled locally?
2742          */
2743         if (!vcpu_mmio_read(vcpu, gpa, bytes, val)) {
2744                 trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes, gpa, *(u64 *)val);
2745                 return X86EMUL_CONTINUE;
2746         }
2747
2748         trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0);
2749
2750         vcpu->mmio_needed = 1;
2751         vcpu->mmio_phys_addr = gpa;
2752         vcpu->mmio_size = bytes;
2753         vcpu->mmio_is_write = 0;
2754
2755         return X86EMUL_UNHANDLEABLE;
2756 }
2757
2758 int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
2759                           const void *val, int bytes)
2760 {
2761         int ret;
2762
2763         ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
2764         if (ret < 0)
2765                 return 0;
2766         kvm_mmu_pte_write(vcpu, gpa, val, bytes, 1);
2767         return 1;
2768 }
2769
2770 static int emulator_write_emulated_onepage(unsigned long addr,
2771                                            const void *val,
2772                                            unsigned int bytes,
2773                                            struct kvm_vcpu *vcpu)
2774 {
2775         gpa_t                 gpa;
2776
2777         gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
2778
2779         if (gpa == UNMAPPED_GVA) {
2780                 kvm_inject_page_fault(vcpu, addr, 2);
2781                 return X86EMUL_PROPAGATE_FAULT;
2782         }
2783
2784         /* For APIC access vmexit */
2785         if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
2786                 goto mmio;
2787
2788         if (emulator_write_phys(vcpu, gpa, val, bytes))
2789                 return X86EMUL_CONTINUE;
2790
2791 mmio:
2792         trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
2793         /*
2794          * Is this MMIO handled locally?
2795          */
2796         if (!vcpu_mmio_write(vcpu, gpa, bytes, val))
2797                 return X86EMUL_CONTINUE;
2798
2799         vcpu->mmio_needed = 1;
2800         vcpu->mmio_phys_addr = gpa;
2801         vcpu->mmio_size = bytes;
2802         vcpu->mmio_is_write = 1;
2803         memcpy(vcpu->mmio_data, val, bytes);
2804
2805         return X86EMUL_CONTINUE;
2806 }
2807
2808 int emulator_write_emulated(unsigned long addr,
2809                                    const void *val,
2810                                    unsigned int bytes,
2811                                    struct kvm_vcpu *vcpu)
2812 {
2813         /* Crossing a page boundary? */
2814         if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
2815                 int rc, now;
2816
2817                 now = -addr & ~PAGE_MASK;
2818                 rc = emulator_write_emulated_onepage(addr, val, now, vcpu);
2819                 if (rc != X86EMUL_CONTINUE)
2820                         return rc;
2821                 addr += now;
2822                 val += now;
2823                 bytes -= now;
2824         }
2825         return emulator_write_emulated_onepage(addr, val, bytes, vcpu);
2826 }
2827 EXPORT_SYMBOL_GPL(emulator_write_emulated);
2828
/*
 * Emulator callback for compare-and-exchange.
 *
 * No comparison is performed: @old is never read and @new is always
 * written, which is why a one-time warning is printed.  The return value
 * is that of emulator_write_emulated().
 *
 * On 32-bit hosts (CONFIG_X86_64 not set), an 8-byte access is first
 * stored with a single set_64bit() directly into the guest page, provided
 * the address is mapped, is not in the default APIC page, and does not
 * cross a page boundary.  The regular emulated write is still performed
 * afterwards in every case.
 */
static int emulator_cmpxchg_emulated(unsigned long addr,
                                     const void *old,
                                     const void *new,
                                     unsigned int bytes,
                                     struct kvm_vcpu *vcpu)
{
        printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
#ifndef CONFIG_X86_64
        /* guests cmpxchg8b have to be emulated atomically */
        if (bytes == 8) {
                gpa_t gpa;
                struct page *page;
                char *kaddr;
                u64 val;

                gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);

                /* Unmapped or APIC addresses: leave it to the generic path. */
                if (gpa == UNMAPPED_GVA ||
                   (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
                        goto emul_write;

                /* A single mapping cannot cover an access spanning two pages. */
                if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
                        goto emul_write;

                val = *(u64 *)new;

                page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);

                /* Map the guest page, store all 64 bits at once, unmap. */
                kaddr = kmap_atomic(page, KM_USER0);
                set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val);
                kunmap_atomic(kaddr, KM_USER0);
                kvm_release_page_dirty(page);
        }
emul_write:
#endif

        return emulator_write_emulated(addr, new, bytes, vcpu);
}
2867
/*
 * Return the base address of segment @seg for @vcpu, as reported by the
 * vendor-specific get_segment_base() hook in kvm_x86_ops.
 */
static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
{
        return kvm_x86_ops->get_segment_base(vcpu, seg);
}
2872
/*
 * Emulate INVLPG for guest virtual address @address by forwarding it to
 * kvm_mmu_invlpg().  Always returns X86EMUL_CONTINUE.
 */
int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
{
        kvm_mmu_invlpg(vcpu, address);
        return X86EMUL_CONTINUE;
}
2878
/*
 * Emulate CLTS: install the vcpu's cached CR0 value with the TS bit
 * cleared through the set_cr0() hook.  Always returns X86EMUL_CONTINUE.
 */
int emulate_clts(struct kvm_vcpu *vcpu)
{
        kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS);
        return X86EMUL_CONTINUE;
}
2884
2885 int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
2886 {
2887         struct kvm_vcpu *vcpu = ctxt->vcpu;
2888
2889         switch (dr) {
2890         case 0 ... 3:
2891                 *dest = kvm_x86_ops->get_dr(vcpu, dr);
2892                 return X86EMUL_CONTINUE;
2893         default:
2894                 pr_unimpl(vcpu, "%s: unexpected dr %u\n", __func__, dr);
2895                 return X86EMUL_UNHANDLEABLE;
2896         }
2897 }
2898
2899 int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
2900 {
2901         unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U;
2902         int exception;
2903
2904         kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception);
2905         if (exception) {
2906                 /* FIXME: better handling */
2907                 return X86EMUL_UNHANDLEABLE;
2908         }
2909         return X86EMUL_CONTINUE;
2910 }
2911
2912 void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
2913 {
2914         u8 opcodes[4];
2915         unsigned long rip = kvm_rip_read(vcpu);
2916         unsigned long rip_linear;
2917
2918         if (!printk_ratelimit())
2919                 return;
2920
2921         rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS);
2922
2923         kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu);
2924
2925         printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n",
2926                context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
2927 }
2928 EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
2929
/*
 * Memory-access callbacks passed to x86_decode_insn() and
 * x86_emulate_insn() by emulate_instruction().
 */
static struct x86_emulate_ops emulate_ops = {
        .read_std            = kvm_read_guest_virt,      /* plain guest-virtual read */
        .read_emulated       = emulator_read_emulated,   /* read that may hit MMIO */
        .write_emulated      = emulator_write_emulated,  /* write that may hit MMIO */
        .cmpxchg_emulated    = emulator_cmpxchg_emulated, /* exchange, done as a write */
};
2936
2937 static void cache_all_regs(struct kvm_vcpu *vcpu)
2938 {
2939         kvm_register_read(vcpu, VCPU_REGS_RAX);
2940         kvm_register_read(vcpu, VCPU_REGS_RSP);
2941         kvm_register_read(vcpu, VCPU_REGS_RIP);
2942         vcpu->arch.regs_dirty = ~0;
2943 }
2944
/*
 * Decode (unless EMULTYPE_NO_DECODE is set) and emulate the instruction
 * at the guest's current position.
 *
 * @cr2:            faulting address; stored in arch.mmio_fault_cr2 and
 *                  used when trying to unprotect the page after a failure.
 * @error_code:     not used by this function.
 * @emulation_type: EMULTYPE_* flags:
 *                  - EMULTYPE_NO_DECODE: reuse the existing decode state;
 *                  - EMULTYPE_TRAP_UD:   only accept the few opcodes listed
 *                                        below;
 *                  - EMULTYPE_SKIP:      only advance RIP past the decoded
 *                                        instruction, do not execute it.
 *
 * Returns EMULATE_DONE when the instruction was handled (or the faulting
 * page was unprotected), EMULATE_DO_MMIO when an MMIO or string PIO
 * request is pending, and EMULATE_FAIL when emulation is not
 * possible.
 */
int emulate_instruction(struct kvm_vcpu *vcpu,
                        unsigned long cr2,
                        u16 error_code,
                        int emulation_type)
{
        int r, shadow_mask;
        struct decode_cache *c;
        struct kvm_run *run = vcpu->run;

        kvm_clear_exception_queue(vcpu);
        vcpu->arch.mmio_fault_cr2 = cr2;
        /*
         * TODO: fix emulate.c to use guest_read/write_register
         * instead of direct ->regs accesses, can save hundred cycles
         * on Intel for instructions that don't read/change RSP, for
         * example.
         */
        cache_all_regs(vcpu);

        /* Reset per-instruction state that the callbacks may set below. */
        vcpu->mmio_is_write = 0;
        vcpu->arch.pio.string = 0;

        if (!(emulation_type & EMULTYPE_NO_DECODE)) {
                int cs_db, cs_l;
                kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);

                vcpu->arch.emulate_ctxt.vcpu = vcpu;
                vcpu->arch.emulate_ctxt.eflags = kvm_get_rflags(vcpu);
                /*
                 * Pick the emulation mode: EFLAGS.VM selects real mode,
                 * otherwise CS.L selects 64-bit, otherwise CS.D/B selects
                 * between 32-bit and 16-bit protected mode.
                 */
                vcpu->arch.emulate_ctxt.mode =
                        (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
                        ? X86EMUL_MODE_REAL : cs_l
                        ? X86EMUL_MODE_PROT64 : cs_db
                        ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;

                r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);

                /* Only allow emulation of specific instructions on #UD
                 * (namely VMMCALL, sysenter, sysexit, syscall)*/
                c = &vcpu->arch.emulate_ctxt.decode;
                if (emulation_type & EMULTYPE_TRAP_UD) {
                        if (!c->twobyte)
                                return EMULATE_FAIL;
                        switch (c->b) {
                        case 0x01: /* VMMCALL */
                                if (c->modrm_mod != 3 || c->modrm_rm != 1)
                                        return EMULATE_FAIL;
                                break;
                        case 0x34: /* sysenter */
                        case 0x35: /* sysexit */
                                if (c->modrm_mod != 0 || c->modrm_rm != 0)
                                        return EMULATE_FAIL;
                                break;
                        case 0x05: /* syscall */
                                if (c->modrm_mod != 0 || c->modrm_rm != 0)
                                        return EMULATE_FAIL;
                                break;
                        default:
                                return EMULATE_FAIL;
                        }

                        if (!(c->modrm_reg == 0 || c->modrm_reg == 3))
                                return EMULATE_FAIL;
                }

                ++vcpu->stat.insn_emulation;
                if (r)  {
                        /*
                         * Decode failed: if unprotecting the page at cr2
                         * succeeds, report EMULATE_DONE instead of failing.
                         */
                        ++vcpu->stat.insn_emulation_fail;
                        if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
                                return EMULATE_DONE;
                        return EMULATE_FAIL;
                }
        }

        if (emulation_type & EMULTYPE_SKIP) {
                /* Step over the instruction without executing it. */
                kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.decode.eip);
                return EMULATE_DONE;
        }

        r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
        shadow_mask = vcpu->arch.emulate_ctxt.interruptibility;

        /* Interrupt shadow is only applied when emulation succeeded. */
        if (r == 0)
                kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask);

        /* A string PIO was started while emulating. */
        if (vcpu->arch.pio.string)
                return EMULATE_DO_MMIO;

        /* Publish the MMIO request details in the kvm_run structure. */
        if ((r || vcpu->mmio_is_write) && run) {
                run->exit_reason = KVM_EXIT_MMIO;
                run->mmio.phys_addr = vcpu->mmio_phys_addr;
                memcpy(run->mmio.data, vcpu->mmio_data, 8);
                run->mmio.len = vcpu->mmio_size;
                run->mmio.is_write = vcpu->mmio_is_write;
        }

        if (r) {
                if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
                        return EMULATE_DONE;
                /* Failure that is not a pending MMIO read is a real error. */
                if (!vcpu->mmio_needed) {
                        kvm_report_emulation_failure(vcpu, "mmio");
                        return EMULATE_FAIL;
                }
                return EMULATE_DO_MMIO;
        }

        /* Success: commit the flags computed by the emulator. */
        kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);

        if (vcpu->mmio_is_write) {
                vcpu->mmio_needed = 0;
                return EMULATE_DO_MMIO;
        }

        return EMULATE_DONE;
}
3059 EXPORT_SYMBOL_GPL(emulate_instruction);
3060
3061 static int pio_copy_data(struct kvm_vcpu *vcpu)
3062 {
3063         void *p = vcpu->arch.pio_data;
3064         gva_t q = vcpu->arch.pio.guest_gva;
3065         unsigned bytes;
3066         int ret;
3067
3068         bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count;
3069         if (vcpu->arch.pio.in)
3070                 ret = kvm_write_guest_virt(q, p, bytes, vcpu);
3071         else
3072                 ret = kvm_read_guest_virt(q, p, bytes, vcpu);
3073         return ret;
3074 }
3075
/*
 * Finish the PIO batch described by vcpu->arch.pio and update guest
 * registers accordingly.
 *
 * Non-string "in":  the low io->size bytes of RAX are replaced with the
 *                   data in pio_data.
 * String I/O:       for "in", the data is first copied to guest memory;
 *                   then RCX is decremented (when REP is set) and RDI
 *                   ("in") or RSI ("out") is advanced by the number of
 *                   bytes transferred, backwards if io->down is set.
 *
 * Finally io->count is reduced by the batch size and io->cur_count is
 * cleared.  Returns 0 on success, or the non-zero status from
 * pio_copy_data() if copying to the guest failed (in which case no
 * register or counter is updated).
 */
int complete_pio(struct kvm_vcpu *vcpu)
{
        struct kvm_pio_request *io = &vcpu->arch.pio;
        long delta;
        int r;
        unsigned long val;

        if (!io->string) {
                if (io->in) {
                        /* Merge the result into RAX, keeping its upper bytes. */
                        val = kvm_register_read(vcpu, VCPU_REGS_RAX);
                        memcpy(&val, vcpu->arch.pio_data, io->size);
                        kvm_register_write(vcpu, VCPU_REGS_RAX, val);
                }
        } else {
                if (io->in) {
                        r = pio_copy_data(vcpu);
                        if (r)
                                return r;
                }

                /* delta counts elements first, then is scaled to bytes. */
                delta = 1;
                if (io->rep) {
                        delta *= io->cur_count;
                        /*
                         * The size of the register should really depend on
                         * current address size.
                         */
                        val = kvm_register_read(vcpu, VCPU_REGS_RCX);
                        val -= delta;
                        kvm_register_write(vcpu, VCPU_REGS_RCX, val);
                }
                if (io->down)
                        delta = -delta;
                delta *= io->size;
                if (io->in) {
                        val = kvm_register_read(vcpu, VCPU_REGS_RDI);
                        val += delta;
                        kvm_register_write(vcpu, VCPU_REGS_RDI, val);
                } else {
                        val = kvm_register_read(vcpu, VCPU_REGS_RSI);
                        val += delta;
                        kvm_register_write(vcpu, VCPU_REGS_RSI, val);
                }
        }

        io->count -= io->cur_count;
        io->cur_count = 0;

        return 0;
}
3126
3127 static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
3128 {
3129         /* TODO: String I/O for in kernel device */
3130         int r;
3131
3132         if (vcpu->arch.pio.in)
3133                 r = kvm_io_bus_read(&vcpu->kvm->pio_bus, vcpu->arch.pio.port,
3134                                     vcpu->arch.pio.size, pd);
3135         else
3136                 r = kvm_io_bus_write(&vcpu->kvm->pio_bus, vcpu->arch.pio.port,
3137                                      vcpu->arch.pio.size, pd);
3138         return r;
3139 }
3140
3141 static int pio_string_write(struct kvm_vcpu *vcpu)
3142 {
3143         struct kvm_pio_request *io = &vcpu->arch.pio;
3144         void *pd = vcpu->arch.pio_data;
3145         int i, r = 0;
3146
3147         for (i = 0; i < io->cur_count; i++) {
3148                 if (kvm_io_bus_write(&vcpu->kvm->pio_bus,
3149                                      io->port, io->size, pd)) {
3150                         r = -EOPNOTSUPP;
3151                         break;
3152                 }
3153                 pd += io->size;
3154         }
3155         return r;
3156 }
3157
3158 int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in, int size, unsigned port)
3159 {
3160         unsigned long val;
3161
3162         vcpu->run->exit_reason = KVM_EXIT_IO;
3163         vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
3164         vcpu->run->io.size = vcpu->arch.pio.size = size;
3165         vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
3166         vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = 1;
3167         vcpu->run->io.port = vcpu->arch.pio.port = port;
3168         vcpu->arch.pio.in = in;
3169         vcpu->arch.pio.string = 0;
3170         vcpu->arch.pio.down = 0;
3171         vcpu->arch.pio.rep = 0;
3172
3173         trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port,
3174                       size, 1);
3175
3176         val = kvm_register_read(vcpu, VCPU_REGS_RAX);
3177         memcpy(vcpu->arch.pio_data, &val, 4);
3178
3179         if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
3180                 complete_pio(vcpu);
3181                 return 1;
3182         }
3183         return 0;
3184 }
3185 EXPORT_SYMBOL_GPL(kvm_emulate_pio);
3186
/*
 * Emulate a string port I/O instruction.
 *
 * @in:      non-zero for a read from the port, zero for a write.
 * @size:    size in bytes of each element.
 * @count:   number of elements requested.
 * @down:    non-zero when the string runs towards lower addresses; this
 *           is not supported and results in a #GP being injected.
 * @address: guest virtual address of the string.
 * @rep:     non-zero when the instruction carries a REP prefix.
 * @port:    I/O port number.
 *
 * The kvm_run exit area and vcpu->arch.pio are filled in to describe the
 * access.  At most one page worth of elements is handled per call.  For
 * writes the data is fetched from guest memory and offered to the
 * in-kernel PIO bus; reads are not handled in the kernel.
 *
 * Returns 1 when nothing more is needed for this call (zero count, #GP
 * injected, or the whole string was written in the kernel); otherwise
 * the value of 'ret' at that point (0, or the non-zero status of the
 * guest-memory copy).
 */
int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in,
                  int size, unsigned long count, int down,
                  gva_t address, int rep, unsigned port)
{
        unsigned now, in_page;
        int ret = 0;

        vcpu->run->exit_reason = KVM_EXIT_IO;
        vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
        vcpu->run->io.size = vcpu->arch.pio.size = size;
        vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
        vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = count;
        vcpu->run->io.port = vcpu->arch.pio.port = port;
        vcpu->arch.pio.in = in;
        vcpu->arch.pio.string = 1;
        vcpu->arch.pio.down = down;
        vcpu->arch.pio.rep = rep;

        trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port,
                      size, count);

        /* Nothing to transfer: just step over the instruction. */
        if (!count) {
                kvm_x86_ops->skip_emulated_instruction(vcpu);
                return 1;
        }

        /* Bytes available in the current page in the direction of travel. */
        if (!down)
                in_page = PAGE_SIZE - offset_in_page(address);
        else
                in_page = offset_in_page(address) + size;
        /* Elements to handle now; always at least one. */
        now = min(count, (unsigned long)in_page / size);
        if (!now)
                now = 1;
        if (down) {
                /*
                 * String I/O in reverse.  Yuck.  Kill the guest, fix later.
                 */
                pr_unimpl(vcpu, "guest string pio down\n");
                kvm_inject_gp(vcpu, 0);
                return 1;
        }
        vcpu->run->io.count = now;
        vcpu->arch.pio.cur_count = now;

        /* The instruction is only skipped once its last batch is issued. */
        if (vcpu->arch.pio.cur_count == vcpu->arch.pio.count)
                kvm_x86_ops->skip_emulated_instruction(vcpu);

        vcpu->arch.pio.guest_gva = address;

        if (!vcpu->arch.pio.in) {
                /* string PIO write */
                ret = pio_copy_data(vcpu);
                if (ret == X86EMUL_PROPAGATE_FAULT) {
                        kvm_inject_gp(vcpu, 0);
                        return 1;
                }
                if (ret == 0 && !pio_string_write(vcpu)) {
                        complete_pio(vcpu);
                        if (vcpu->arch.pio.count == 0)
                                ret = 1;
                }
        }
        /* no string PIO read support yet */

        return ret;
}
3253 EXPORT_SYMBOL_GPL(kvm_emulate_pio_string);
3254
/*
 * Intentionally empty IPI handler.  kvmclock_cpufreq_notifier() runs it
 * on a remote cpu via smp_call_function_single(); per the comment there,
 * the interrupt itself is what matters (it kicks the cpu out of guest
 * context).
 */
static void bounce_off(void *info)
{
        /* nothing */
}
3259
/*
 * cpufreq transition notifier: keep the per-cpu TSC frequency used for
 * kvmclock up to date and ask affected vcpus to refresh their guest time.
 *
 * @val:  CPUFREQ_PRECHANGE or CPUFREQ_POSTCHANGE.
 * @data: struct cpufreq_freqs describing the cpu and old/new frequency.
 *
 * The new frequency is recorded before the change when the frequency goes
 * up, and after the change when it goes down; the other notification of
 * each pair is ignored.  Always returns 0.
 */
static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
                                     void *data)
{
        struct cpufreq_freqs *freq = data;
        struct kvm *kvm;
        struct kvm_vcpu *vcpu;
        int i, send_ipi = 0;

        /* Frequency decrease: wait for the POSTCHANGE notification. */
        if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
                return 0;
        /* Frequency increase: already handled at PRECHANGE. */
        if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
                return 0;
        per_cpu(cpu_tsc_khz, freq->cpu) = freq->new;

        /* Request a guest time update on every vcpu last run on that cpu. */
        spin_lock(&kvm_lock);
        list_for_each_entry(kvm, &vm_list, vm_list) {
                kvm_for_each_vcpu(i, vcpu, kvm) {
                        if (vcpu->cpu != freq->cpu)
                                continue;
                        if (!kvm_request_guest_time_update(vcpu))
                                continue;
                        if (vcpu->cpu != smp_processor_id())
                                send_ipi++;
                }
        }
        spin_unlock(&kvm_lock);

        if (freq->old < freq->new && send_ipi) {
                /*
                 * We upscale the frequency.  We must make sure the guest
                 * doesn't see old kvmclock values while running with
                 * the new frequency, otherwise we risk that the guest
                 * sees time go backwards.
                 *
                 * In case we update the frequency for another cpu
                 * (which might be in guest context) send an interrupt
                 * to kick the cpu out of guest context.  Next time
                 * guest context is entered kvmclock will be updated,
                 * so the guest will not see stale values.
                 */
                smp_call_function_single(freq->cpu, bounce_off, NULL, 1);
        }
        return 0;
}
3304
/*
 * Notifier block registered with cpufreq by kvm_timer_init() and
 * unregistered by kvm_arch_exit() on hosts without a constant TSC.
 */
static struct notifier_block kvmclock_cpufreq_notifier_block = {
        .notifier_call  = kvmclock_cpufreq_notifier
};
3308
3309 static void kvm_timer_init(void)
3310 {
3311         int cpu;
3312
3313         if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
3314                 cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
3315                                           CPUFREQ_TRANSITION_NOTIFIER);
3316                 for_each_online_cpu(cpu) {
3317                         unsigned long khz = cpufreq_get(cpu);
3318                         if (!khz)
3319                                 khz = tsc_khz;
3320                         per_cpu(cpu_tsc_khz, cpu) = khz;
3321                 }
3322         } else {
3323                 for_each_possible_cpu(cpu)
3324                         per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
3325         }
3326 }
3327
3328 int kvm_arch_init(void *opaque)
3329 {
3330         int r;
3331         struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque;
3332
3333         if (kvm_x86_ops) {
3334                 printk(KERN_ERR "kvm: already loaded the other module\n");
3335                 r = -EEXIST;
3336                 goto out;
3337         }
3338
3339         if (!ops->cpu_has_kvm_support()) {
3340                 printk(KERN_ERR "kvm: no hardware support\n");
3341                 r = -EOPNOTSUPP;
3342                 goto out;
3343         }
3344         if (ops->disabled_by_bios()) {
3345                 printk(KERN_ERR "kvm: disabled by bios\n");
3346                 r = -EOPNOTSUPP;
3347                 goto out;
3348         }
3349
3350         r = kvm_mmu_module_init();
3351         if (r)
3352                 goto out;
3353
3354         kvm_init_msr_list();
3355
3356         kvm_x86_ops = ops;
3357         kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
3358         kvm_mmu_set_base_ptes(PT_PRESENT_MASK);
3359         kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
3360                         PT_DIRTY_MASK, PT64_NX_MASK, 0);
3361
3362         kvm_timer_init();
3363
3364         return 0;
3365
3366 out:
3367         return r;
3368 }
3369
3370 void kvm_arch_exit(void)
3371 {
3372         if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
3373                 cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
3374                                             CPUFREQ_TRANSITION_NOTIFIER);
3375         kvm_x86_ops = NULL;
3376         kvm_mmu_module_exit();
3377 }
3378
3379 int kvm_emulate_halt(struct kvm_vcpu *vcpu)
3380 {
3381         ++vcpu->stat.halt_exits;
3382         if (irqchip_in_kernel(vcpu->kvm)) {
3383                 vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
3384                 return 1;
3385         } else {
3386                 vcpu->run->exit_reason = KVM_EXIT_HLT;
3387                 return 0;
3388         }
3389 }
3390 EXPORT_SYMBOL_GPL(kvm_emulate_halt);
3391
3392 static inline gpa_t hc_gpa(struct kvm_vcpu *vcpu, unsigned long a0,
3393                            unsigned long a1)
3394 {
3395         if (is_long_mode(vcpu))
3396                 return a0;
3397         else
3398                 return a0 | ((gpa_t)a1 << 32);
3399 }
3400
3401 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
3402 {
3403         unsigned long nr, a0, a1, a2, a3, ret;
3404         int r = 1;
3405
3406         nr = kvm_register_read(vcpu, VCPU_REGS_RAX);
3407         a0 = kvm_register_read(vcpu, VCPU_REGS_RBX);
3408         a1 = kvm_register_read(vcpu, VCPU_REGS_RCX);
3409         a2 = kvm_register_read(vcpu, VCPU_REGS_RDX);
3410         a3 = kvm_register_read(vcpu, VCPU_REGS_RSI);
3411
3412         trace_kvm_hypercall(nr, a0, a1, a2, a3);
3413
3414         if (!is_long_mode(vcpu)) {
3415                 nr &= 0xFFFFFFFF;
3416                 a0 &= 0xFFFFFFFF;
3417                 a1 &= 0xFFFFFFFF;
3418                 a2 &= 0xFFFFFFFF;
3419                 a3 &= 0xFFFFFFFF;
3420         }
3421
3422         if (kvm_x86_ops->get_cpl(vcpu) != 0) {
3423                 ret = -KVM_EPERM;
3424                 goto out;
3425         }
3426
3427         switch (nr) {
3428         case KVM_HC_VAPIC_POLL_IRQ:
3429                 ret = 0;
3430                 break;
3431         case KVM_HC_MMU_OP:
3432                 r = kvm_pv_mmu_op(vcpu, a0, hc_gpa(vcpu, a1, a2), &ret);
3433                 break;
3434         default:
3435                 ret = -KVM_ENOSYS;
3436                 break;
3437         }
3438 out:
3439         kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
3440         ++vcpu->stat.hypercalls;
3441         return r;
3442 }
3443 EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
3444
3445 int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
3446 {
3447         char instruction[3];
3448         int ret = 0;
3449         unsigned long rip = kvm_rip_read(vcpu);
3450
3451
3452         /*
3453          * Blow out the MMU to ensure that no other VCPU has an active mapping
3454          * to ensure that the updated hypercall appears atomically across all
3455          * VCPUs.
3456          */
3457         kvm_mmu_zap_all(vcpu->kvm);
3458
3459         kvm_x86_ops->patch_hypercall(vcpu, instruction);
3460         if (emulator_write_emulated(rip, instruction, 3, vcpu)
3461             != X86EMUL_CONTINUE)
3462                 ret = -EFAULT;
3463
3464         return ret;
3465 }
3466
3467 static u64 mk_cr_64(u64 curr_cr, u32 new_val)
3468 {
3469         return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
3470 }
3471
3472 void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
3473 {
3474         struct descriptor_table dt = { limit, base };
3475
3476         kvm_x86_ops->set_gdt(vcpu, &dt);
3477 }
3478
3479 void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
3480 {
3481         struct descriptor_table dt = { limit, base };
3482
3483         kvm_x86_ops->set_idt(vcpu, &dt);
3484 }
3485
/*
 * Apply @msw through kvm_lmsw() and hand the resulting guest RFLAGS back
 * to the caller through @rflags.
 */
void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
                   unsigned long *rflags)
{
        unsigned long flags_after;

        kvm_lmsw(vcpu, msw);
        flags_after = kvm_get_rflags(vcpu);
        *rflags = flags_after;
}
3492
3493 unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
3494 {
3495         unsigned long value;
3496
3497         kvm_x86_ops->decache_cr4_guest_bits(vcpu);
3498         switch (cr) {
3499         case 0:
3500                 value = vcpu->arch.cr0;
3501                 break;
3502         case 2:
3503                 value = vcpu->arch.cr2;
3504                 break;
3505         case 3:
3506                 value = vcpu->arch.cr3;
3507                 break;
3508         case 4:
3509                 value = vcpu->arch.cr4;
3510                 break;
3511         case 8:
3512                 value = kvm_get_cr8(vcpu);
3513                 break;
3514         default:
3515                 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
3516                 return 0;
3517         }
3518
3519         return value;
3520 }
3521
3522 void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
3523                      unsigned long *rflags)
3524 {
3525         switch (cr) {
3526         case 0:
3527                 kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val));
3528                 *rflags = kvm_get_rflags(vcpu);
3529                 break;
3530         case 2:
3531                 vcpu->arch.cr2 = val;
3532                 break;
3533         case 3:
3534                 kvm_set_cr3(vcpu, val);
3535                 break;
3536         case 4:
3537                 kvm_set_cr4(vcpu, mk_cr_64(vcpu->arch.cr4, val));
3538                 break;
3539         case 8:
3540                 kvm_set_cr8(vcpu, val & 0xfUL);
3541                 break;
3542         default:
3543                 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
3544         }
3545 }
3546
3547 static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
3548 {
3549         struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i];
3550         int j, nent = vcpu->arch.cpuid_nent;
3551
3552         e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT;
3553         /* when no next entry is found, the current entry[i] is reselected */
3554         for (j = i + 1; ; j = (j + 1) % nent) {
3555                 struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j];
3556                 if (ej->function == e->function) {
3557                         ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
3558                         return j;
3559                 }
3560         }
3561         return 0; /* silence gcc, even though control never reaches here */
3562 }
3563
3564 /* find an entry with matching function, matching index (if needed), and that
3565  * should be read next (if it's stateful) */
3566 static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e,
3567         u32 function, u32 index)
3568 {
3569         if (e->function != function)
3570                 return 0;
3571         if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index)
3572                 return 0;
3573         if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) &&
3574             !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT))
3575                 return 0;
3576         return 1;
3577 }
3578
3579 struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
3580                                               u32 function, u32 index)
3581 {
3582         int i;
3583         struct kvm_cpuid_entry2 *best = NULL;
3584
3585         for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
3586                 struct kvm_cpuid_entry2 *e;
3587
3588                 e = &vcpu->arch.cpuid_entries[i];
3589                 if (is_matching_cpuid_entry(e, function, index)) {
3590                         if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC)
3591                                 move_to_next_stateful_cpuid_entry(vcpu, i);
3592                         best = e;
3593                         break;
3594                 }
3595                 /*
3596                  * Both basic or both extended?
3597                  */
3598                 if (((e->function ^ function) & 0x80000000) == 0)
3599                         if (!best || e->function > best->function)
3600                                 best = e;
3601         }
3602         return best;
3603 }
3604
3605 int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
3606 {
3607         struct kvm_cpuid_entry2 *best;
3608
3609         best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
3610         if (best)
3611                 return best->eax & 0xff;
3612         return 36;
3613 }
3614
3615 void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
3616 {
3617         u32 function, index;
3618         struct kvm_cpuid_entry2 *best;
3619
3620         function = kvm_register_read(vcpu, VCPU_REGS_RAX);
3621         index = kvm_register_read(vcpu, VCPU_REGS_RCX);
3622         kvm_register_write(vcpu, VCPU_REGS_RAX, 0);
3623         kvm_register_write(vcpu, VCPU_REGS_RBX, 0);
3624         kvm_register_write(vcpu, VCPU_REGS_RCX, 0);
3625         kvm_register_write(vcpu, VCPU_REGS_RDX, 0);
3626         best = kvm_find_cpuid_entry(vcpu, function, index);
3627         if (best) {
3628                 kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax);
3629                 kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx);
3630                 kvm_register_write(vcpu, VCPU_REGS_RCX, best->ecx);
3631                 kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx);
3632         }
3633         kvm_x86_ops->skip_emulated_instruction(vcpu);
3634         trace_kvm_cpuid(function,
3635                         kvm_register_read(vcpu, VCPU_REGS_RAX),
3636                         kvm_register_read(vcpu, VCPU_REGS_RBX),
3637                         kvm_register_read(vcpu, VCPU_REGS_RCX),
3638                         kvm_register_read(vcpu, VCPU_REGS_RDX));
3639 }
3640 EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
3641
3642 /*
3643  * Check if userspace requested an interrupt window, and that the
3644  * interrupt window is open.
3645  *
3646  * No need to exit to userspace if we already have an interrupt queued.
3647  */
3648 static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu)
3649 {
3650         return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) &&
3651                 vcpu->run->request_interrupt_window &&
3652                 kvm_arch_interrupt_allowed(vcpu));
3653 }
3654
3655 static void post_kvm_run_save(struct kvm_vcpu *vcpu)
3656 {
3657         struct kvm_run *kvm_run = vcpu->run;
3658
3659         kvm_run->if_flag = (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
3660         kvm_run->cr8 = kvm_get_cr8(vcpu);
3661         kvm_run->apic_base = kvm_get_apic_base(vcpu);
3662         if (irqchip_in_kernel(vcpu->kvm))
3663                 kvm_run->ready_for_interrupt_injection = 1;
3664         else
3665                 kvm_run->ready_for_interrupt_injection =
3666                         kvm_arch_interrupt_allowed(vcpu) &&
3667                         !kvm_cpu_has_interrupt(vcpu) &&
3668                         !kvm_event_needs_reinjection(vcpu);
3669 }
3670
3671 static void vapic_enter(struct kvm_vcpu *vcpu)
3672 {
3673         struct kvm_lapic *apic = vcpu->arch.apic;
3674         struct page *page;
3675
3676         if (!apic || !apic->vapic_addr)
3677                 return;
3678
3679         page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
3680
3681         vcpu->arch.apic->vapic_page = page;
3682 }
3683
3684 static void vapic_exit(struct kvm_vcpu *vcpu)
3685 {
3686         struct kvm_lapic *apic = vcpu->arch.apic;
3687
3688         if (!apic || !apic->vapic_addr)
3689                 return;
3690
3691         down_read(&vcpu->kvm->slots_lock);
3692         kvm_release_page_dirty(apic->vapic_page);
3693         mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
3694         up_read(&vcpu->kvm->slots_lock);
3695 }
3696
3697 static void update_cr8_intercept(struct kvm_vcpu *vcpu)
3698 {
3699         int max_irr, tpr;
3700
3701         if (!kvm_x86_ops->update_cr8_intercept)
3702                 return;
3703
3704         if (!vcpu->arch.apic)
3705                 return;
3706
3707         if (!vcpu->arch.apic->vapic_addr)
3708                 max_irr = kvm_lapic_find_highest_irr(vcpu);
3709         else
3710                 max_irr = -1;
3711
3712         if (max_irr != -1)
3713                 max_irr >>= 4;
3714
3715         tpr = kvm_lapic_get_cr8(vcpu);
3716
3717         kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr);
3718 }
3719
3720 static void inject_pending_event(struct kvm_vcpu *vcpu)
3721 {
3722         /* try to reinject previous events if any */
3723         if (vcpu->arch.exception.pending) {
3724                 kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr,
3725                                           vcpu->arch.exception.has_error_code,
3726                                           vcpu->arch.exception.error_code);
3727                 return;
3728         }
3729
3730         if (vcpu->arch.nmi_injected) {
3731                 kvm_x86_ops->set_nmi(vcpu);
3732                 return;
3733         }
3734
3735         if (vcpu->arch.interrupt.pending) {
3736                 kvm_x86_ops->set_irq(vcpu);
3737                 return;
3738         }
3739
3740         /* try to inject new event if pending */
3741         if (vcpu->arch.nmi_pending) {
3742                 if (kvm_x86_ops->nmi_allowed(vcpu)) {
3743                         vcpu->arch.nmi_pending = false;
3744                         vcpu->arch.nmi_injected = true;
3745                         kvm_x86_ops->set_nmi(vcpu);
3746                 }
3747         } else if (kvm_cpu_has_interrupt(vcpu)) {
3748                 if (kvm_x86_ops->interrupt_allowed(vcpu)) {
3749                         kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu),
3750                                             false);
3751                         kvm_x86_ops->set_irq(vcpu);
3752                 }
3753         }
3754 }
3755
/*
 * Make one attempt at running the guest on this vcpu.
 *
 * Services the bits pending in vcpu->requests, injects a pending event,
 * runs the guest through kvm_x86_ops->run() and passes the exit to
 * kvm_x86_ops->handle_exit().
 *
 * slots_lock is released (up_read) right before guest entry and taken
 * again (down_read) after the exit; every early "goto out" happens before
 * the release.  NOTE(review): this presumes the caller holds slots_lock
 * for read on entry, as __vcpu_run() does.
 *
 * Returns:
 *   - the nonzero result of kvm_mmu_reload() when it fails,
 *   - 0 after setting exit_reason to KVM_EXIT_TPR_ACCESS or
 *     KVM_EXIT_SHUTDOWN,
 *   - 1 when guest entry was abandoned because of pending requests, a
 *     needed reschedule or a pending signal,
 *   - otherwise the return value of kvm_x86_ops->handle_exit().
 */
static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
{
        int r;
        /* userspace irqchip asked to be told when an interrupt can be injected */
        bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
                vcpu->run->request_interrupt_window;

        /* an MMU reload request must be handled before kvm_mmu_reload() below */
        if (vcpu->requests)
                if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
                        kvm_mmu_unload(vcpu);

        r = kvm_mmu_reload(vcpu);
        if (unlikely(r))
                goto out;

        /* service the remaining request bits; two of them end the run early */
        if (vcpu->requests) {
                if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests))
                        __kvm_migrate_timers(vcpu);
                if (test_and_clear_bit(KVM_REQ_KVMCLOCK_UPDATE, &vcpu->requests))
                        kvm_write_guest_time(vcpu);
                if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests))
                        kvm_mmu_sync_roots(vcpu);
                if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
                        kvm_x86_ops->tlb_flush(vcpu);
                if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS,
                                       &vcpu->requests)) {
                        vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
                        r = 0;
                        goto out;
                }
                if (test_and_clear_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests)) {
                        vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
                        r = 0;
                        goto out;
                }
        }

        preempt_disable();

        kvm_x86_ops->prepare_guest_switch(vcpu);
        kvm_load_guest_fpu(vcpu);

        local_irq_disable();

        /* KVM_REQ_KICK stays clear from here until after the guest has run */
        clear_bit(KVM_REQ_KICK, &vcpu->requests);
        smp_mb__after_clear_bit();

        /*
         * Last check, with interrupts off: anything that showed up in the
         * meantime aborts this entry and is left for the caller's loop.
         */
        if (vcpu->requests || need_resched() || signal_pending(current)) {
                set_bit(KVM_REQ_KICK, &vcpu->requests);
                local_irq_enable();
                preempt_enable();
                r = 1;
                goto out;
        }

        inject_pending_event(vcpu);

        /* enable NMI/IRQ window open exits if needed */
        if (vcpu->arch.nmi_pending)
                kvm_x86_ops->enable_nmi_window(vcpu);
        else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
                kvm_x86_ops->enable_irq_window(vcpu);

        if (kvm_lapic_enabled(vcpu)) {
                update_cr8_intercept(vcpu);
                kvm_lapic_sync_to_vapic(vcpu);
        }

        /* slots_lock is not held while the guest runs */
        up_read(&vcpu->kvm->slots_lock);

        kvm_guest_enter();

        /* load the effective guest breakpoints, with DR7 cleared first */
        if (unlikely(vcpu->arch.switch_db_regs)) {
                set_debugreg(0, 7);
                set_debugreg(vcpu->arch.eff_db[0], 0);
                set_debugreg(vcpu->arch.eff_db[1], 1);
                set_debugreg(vcpu->arch.eff_db[2], 2);
                set_debugreg(vcpu->arch.eff_db[3], 3);
        }

        trace_kvm_entry(vcpu->vcpu_id);
        kvm_x86_ops->run(vcpu);

        /* put the current thread's own debug registers back */
        if (unlikely(vcpu->arch.switch_db_regs || test_thread_flag(TIF_DEBUG))) {
                set_debugreg(current->thread.debugreg0, 0);
                set_debugreg(current->thread.debugreg1, 1);
                set_debugreg(current->thread.debugreg2, 2);
                set_debugreg(current->thread.debugreg3, 3);
                set_debugreg(current->thread.debugreg6, 6);
                set_debugreg(current->thread.debugreg7, 7);
        }

        set_bit(KVM_REQ_KICK, &vcpu->requests);
        local_irq_enable();

        ++vcpu->stat.exits;

        /*
         * We must have an instruction between local_irq_enable() and
         * kvm_guest_exit(), so the timer interrupt isn't delayed by
         * the interrupt shadow.  The stat.exits increment will do nicely.
         * But we need to prevent reordering, hence this barrier():
         */
        barrier();

        kvm_guest_exit();

        preempt_enable();

        /* retake the lock dropped before guest entry */
        down_read(&vcpu->kvm->slots_lock);

        /*
         * Profile KVM exit RIPs:
         */
        if (unlikely(prof_on == KVM_PROFILING)) {
                unsigned long rip = kvm_rip_read(vcpu);
                profile_hit(KVM_PROFILING, (void *)rip);
        }


        kvm_lapic_sync_from_vapic(vcpu);

        r = kvm_x86_ops->handle_exit(vcpu);
out:
        return r;
}
3881
3882
/*
 * Inner run loop of a vcpu.
 *
 * After handling a received SIPI (lapic and vcpu reset), the loop keeps
 * entering the guest while the vcpu is runnable, or blocks it otherwise,
 * for as long as r stays positive.  slots_lock is held for read around
 * the loop and dropped while the vcpu blocks or is rescheduled.
 *
 * Returns the nonzero result of kvm_arch_vcpu_reset() if that fails,
 * -EINTR when the loop is left because of a pending signal, a userspace
 * interrupt-window request or an unexpected mp_state after unblocking,
 * and otherwise the non-positive value returned by vcpu_enter_guest().
 */
static int __vcpu_run(struct kvm_vcpu *vcpu)
{
        int r;

        /* a SIPI was received: reset lapic and vcpu, then make it runnable */
        if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) {
                pr_debug("vcpu %d received sipi with vector # %x\n",
                         vcpu->vcpu_id, vcpu->arch.sipi_vector);
                kvm_lapic_reset(vcpu);
                r = kvm_arch_vcpu_reset(vcpu);
                if (r)
                        return r;
                vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
        }

        down_read(&vcpu->kvm->slots_lock);
        vapic_enter(vcpu);

        r = 1;
        while (r > 0) {
                if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
                        r = vcpu_enter_guest(vcpu);
                else {
                        /* not runnable: block without holding slots_lock */
                        up_read(&vcpu->kvm->slots_lock);
                        kvm_vcpu_block(vcpu);
                        down_read(&vcpu->kvm->slots_lock);
                        if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests))
                        {
                                switch(vcpu->arch.mp_state) {
                                case KVM_MP_STATE_HALTED:
                                        vcpu->arch.mp_state =
                                                KVM_MP_STATE_RUNNABLE;
                                        /* fall through */
                                case KVM_MP_STATE_RUNNABLE:
                                        break;
                                case KVM_MP_STATE_SIPI_RECEIVED:
                                default:
                                        r = -EINTR;
                                        break;
                                }
                        }
                }

                if (r <= 0)
                        break;

                clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
                if (kvm_cpu_has_pending_timer(vcpu))
                        kvm_inject_pending_timer_irqs(vcpu);

                /* userspace wants to inject an interrupt and the window is open */
                if (dm_request_for_irq_injection(vcpu)) {
                        r = -EINTR;
                        vcpu->run->exit_reason = KVM_EXIT_INTR;
                        ++vcpu->stat.request_irq_exits;
                }
                if (signal_pending(current)) {
                        r = -EINTR;
                        vcpu->run->exit_reason = KVM_EXIT_INTR;
                        ++vcpu->stat.signal_exits;
                }
                /* reschedule without holding slots_lock */
                if (need_resched()) {
                        up_read(&vcpu->kvm->slots_lock);
                        kvm_resched(vcpu);
                        down_read(&vcpu->kvm->slots_lock);
                }
        }

        up_read(&vcpu->kvm->slots_lock);
        post_kvm_run_save(vcpu);

        vapic_exit(vcpu);

        return r;
}
3955
/*
 * Architecture part of the vcpu run ioctl.
 *
 * Before entering the run loop this completes whatever the previous exit
 * to userspace left unfinished: a pending PIO transfer, an MMIO access
 * whose data userspace placed in kvm_run->mmio.data, or a hypercall whose
 * result userspace placed in kvm_run->hypercall.ret.
 *
 * The vcpu's signal mask is installed for the duration of the call when
 * sigset_active is set, and the previous mask is restored on the way out.
 *
 * Returns -EAGAIN after blocking an uninitialized vcpu, the nonzero
 * result of complete_pio(), 0 when the re-executed MMIO instruction needs
 * another round trip to userspace, and otherwise whatever __vcpu_run()
 * returns.
 */
int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
        int r;
        sigset_t sigsaved;

        vcpu_load(vcpu);

        if (vcpu->sigset_active)
                sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);

        /* an uninitialized vcpu only blocks; the caller gets -EAGAIN */
        if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
                kvm_vcpu_block(vcpu);
                clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
                r = -EAGAIN;
                goto out;
        }

        /* re-sync apic's tpr */
        if (!irqchip_in_kernel(vcpu->kvm))
                kvm_set_cr8(vcpu, kvm_run->cr8);

        /* finish a PIO transfer that is still in progress */
        if (vcpu->arch.pio.cur_count) {
                r = complete_pio(vcpu);
                if (r)
                        goto out;
        }
        /* finish an MMIO access: take userspace's data and re-run the instruction */
        if (vcpu->mmio_needed) {
                memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
                vcpu->mmio_read_completed = 1;
                vcpu->mmio_needed = 0;

                down_read(&vcpu->kvm->slots_lock);
                r = emulate_instruction(vcpu, vcpu->arch.mmio_fault_cr2, 0,
                                        EMULTYPE_NO_DECODE);
                up_read(&vcpu->kvm->slots_lock);
                if (r == EMULATE_DO_MMIO) {
                        /*
                         * Read-modify-write.  Back to userspace.
                         */
                        r = 0;
                        goto out;
                }
        }
        /* the previous exit was a hypercall: RAX receives userspace's result */
        if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL)
                kvm_register_write(vcpu, VCPU_REGS_RAX,
                                     kvm_run->hypercall.ret);

        r = __vcpu_run(vcpu);

out:
        if (vcpu->sigset_active)
                sigprocmask(SIG_SETMASK, &sigsaved, NULL);

        vcpu_put(vcpu);
        return r;
}
4012
4013 int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
4014 {
4015         vcpu_load(vcpu);
4016
4017         regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
4018         regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX);
4019         regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX);
4020         regs->rdx = kvm_register_read(vcpu, VCPU_REGS_RDX);
4021         regs->rsi = kvm_register_read(vcpu, VCPU_REGS_RSI);
4022         regs->rdi = kvm_register_read(vcpu, VCPU_REGS_RDI);
4023         regs->rsp = kvm_register_read(vcpu, VCPU_REGS_RSP);
4024         regs->rbp = kvm_register_read(vcpu, VCPU_REGS_RBP);
4025 #ifdef CONFIG_X86_64
4026         regs->r8 = kvm_register_read(vcpu, VCPU_REGS_R8);
4027         regs->r9 = kvm_register_read(vcpu, VCPU_REGS_R9);
4028         regs->r10 = kvm_register_read(vcpu, VCPU_REGS_R10);
4029         regs->r11 = kvm_register_read(vcpu, VCPU_REGS_R11);
4030         regs->r12 = kvm_register_read(vcpu, VCPU_REGS_R12);
4031         regs->r13 = kvm_register_read(vcpu, VCPU_REGS_R13);
4032         regs->r14 = kvm_register_read(vcpu, VCPU_REGS_R14);
4033         regs->r15 = kvm_register_read(vcpu, VCPU_REGS_R15);
4034 #endif
4035
4036         regs->rip = kvm_rip_read(vcpu);
4037         regs->rflags = kvm_get_rflags(vcpu);
4038
4039         vcpu_put(vcpu);
4040
4041         return 0;
4042 }
4043
4044 int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
4045 {
4046         vcpu_load(vcpu);
4047
4048         kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax);
4049         kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx);
4050         kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx);
4051         kvm_register_write(vcpu, VCPU_REGS_RDX, regs->rdx);
4052         kvm_register_write(vcpu, VCPU_REGS_RSI, regs->rsi);
4053         kvm_register_write(vcpu, VCPU_REGS_RDI, regs->rdi);
4054         kvm_register_write(vcpu, VCPU_REGS_RSP, regs->rsp);
4055         kvm_register_write(vcpu, VCPU_REGS_RBP, regs->rbp);
4056 #ifdef CONFIG_X86_64
4057         kvm_register_write(vcpu, VCPU_REGS_R8, regs->r8);
4058         kvm_register_write(vcpu, VCPU_REGS_R9, regs->r9);
4059         kvm_register_write(vcpu, VCPU_REGS_R10, regs->r10);
4060         kvm_register_write(vcpu, VCPU_REGS_R11, regs->r11);
4061         kvm_register_write(vcpu, VCPU_REGS_R12, regs->r12);
4062         kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13);
4063         kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14);
4064         kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15);
4065 #endif
4066
4067         kvm_rip_write(vcpu, regs->rip);
4068         kvm_set_rflags(vcpu, regs->rflags);
4069
4070         vcpu->arch.exception.pending = false;
4071
4072         vcpu_put(vcpu);
4073
4074         return 0;
4075 }
4076
/*
 * Fetch guest segment register @seg (a VCPU_SREG_* value) into @var by
 * delegating to the vendor's get_segment hook.
 */
void kvm_get_segment(struct kvm_vcpu *vcpu,
                     struct kvm_segment *var, int seg)
{
        kvm_x86_ops->get_segment(vcpu, var, seg);
}
4082
4083 void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
4084 {
4085         struct kvm_segment cs;
4086
4087         kvm_get_segment(vcpu, &cs, VCPU_SREG_CS);
4088         *db = cs.db;
4089         *l = cs.l;
4090 }
4091 EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
4092
4093 int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
4094                                   struct kvm_sregs *sregs)
4095 {
4096         struct descriptor_table dt;
4097
4098         vcpu_load(vcpu);
4099
4100         kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
4101         kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
4102         kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
4103         kvm_get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
4104         kvm_get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
4105         kvm_get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
4106
4107         kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
4108         kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
4109
4110         kvm_x86_ops->get_idt(vcpu, &dt);
4111         sregs->idt.limit = dt.limit;
4112         sregs->idt.base = dt.base;
4113         kvm_x86_ops->get_gdt(vcpu, &dt);
4114         sregs->gdt.limit = dt.limit;
4115         sregs->gdt.base = dt.base;
4116
4117         kvm_x86_ops->decache_cr4_guest_bits(vcpu);
4118         sregs->cr0 = vcpu->arch.cr0;
4119         sregs->cr2 = vcpu->arch.cr2;
4120         sregs->cr3 = vcpu->arch.cr3;
4121         sregs->cr4 = vcpu->arch.cr4;
4122         sregs->cr8 = kvm_get_cr8(vcpu);
4123         sregs->efer = vcpu->arch.shadow_efer;
4124         sregs->apic_base = kvm_get_apic_base(vcpu);
4125
4126         memset(sregs->interrupt_bitmap, 0, sizeof sregs->interrupt_bitmap);
4127
4128         if (vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft)
4129                 set_bit(vcpu->arch.interrupt.nr,
4130                         (unsigned long *)sregs->interrupt_bitmap);
4131
4132         vcpu_put(vcpu);
4133
4134         return 0;
4135 }
4136
/*
 * Report the vcpu's current mp_state through @mp_state.
 * Always returns 0.
 */
int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
                                    struct kvm_mp_state *mp_state)
{
        vcpu_load(vcpu);
        mp_state->mp_state = vcpu->arch.mp_state;
        vcpu_put(vcpu);
        return 0;
}
4145
/*
 * Set the vcpu's mp_state from @mp_state.  The value is stored as given;
 * no validation is done here.  Always returns 0.
 */
int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
                                    struct kvm_mp_state *mp_state)
{
        vcpu_load(vcpu);
        vcpu->arch.mp_state = mp_state->mp_state;
        vcpu_put(vcpu);
        return 0;
}
4154
/*
 * Load guest segment register @seg (a VCPU_SREG_* value) from @var by
 * delegating to the vendor's set_segment hook.
 */
static void kvm_set_segment(struct kvm_vcpu *vcpu,
                        struct kvm_segment *var, int seg)
{
        kvm_x86_ops->set_segment(vcpu, var, seg);
}
4160
4161 static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector,
4162                                    struct kvm_segment *kvm_desct)
4163 {
4164         kvm_desct->base = get_desc_base(seg_desc);
4165         kvm_desct->limit = get_desc_limit(seg_desc);
4166         if (seg_desc->g) {
4167                 kvm_desct->limit <<= 12;
4168                 kvm_desct->limit |= 0xfff;
4169         }
4170         kvm_desct->selector = selector;
4171         kvm_desct->type = seg_desc->type;
4172         kvm_desct->present = seg_desc->p;
4173         kvm_desct->dpl = seg_desc->dpl;
4174         kvm_desct->db = seg_desc->d;
4175         kvm_desct->s = seg_desc->s;
4176         kvm_desct->l = seg_desc->l;
4177         kvm_desct->g = seg_desc->g;
4178         kvm_desct->avl = seg_desc->avl;
4179         if (!selector)
4180                 kvm_desct->unusable = 1;
4181         else
4182                 kvm_desct->unusable = 0;
4183         kvm_desct->padding = 0;
4184 }
4185
4186 static void get_segment_descriptor_dtable(struct kvm_vcpu *vcpu,
4187                                           u16 selector,
4188                                           struct descriptor_table *dtable)
4189 {
4190         if (selector & 1 << 2) {
4191                 struct kvm_segment kvm_seg;
4192
4193                 kvm_get_segment(vcpu, &kvm_seg, VCPU_SREG_LDTR);
4194
4195                 if (kvm_seg.unusable)
4196                         dtable->limit = 0;
4197                 else
4198                         dtable->limit = kvm_seg.limit;
4199                 dtable->base = kvm_seg.base;
4200         }
4201         else
4202                 kvm_x86_ops->get_gdt(vcpu, dtable);
4203 }
4204
4205 /* allowed just for 8 bytes segments */
4206 static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
4207                                          struct desc_struct *seg_desc)
4208 {
4209         struct descriptor_table dtable;
4210         u16 index = selector >> 3;
4211
4212         get_segment_descriptor_dtable(vcpu, selector, &dtable);
4213
4214         if (dtable.limit < index * 8 + 7) {
4215                 kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc);
4216                 return 1;
4217         }
4218         return kvm_read_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu);
4219 }
4220
4221 /* allowed just for 8 bytes segments */
4222 static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
4223                                          struct desc_struct *seg_desc)
4224 {
4225         struct descriptor_table dtable;
4226         u16 index = selector >> 3;
4227
4228         get_segment_descriptor_dtable(vcpu, selector, &dtable);
4229
4230         if (dtable.limit < index * 8 + 7)
4231                 return 1;
4232         return kvm_write_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu);
4233 }
4234
4235 static gpa_t get_tss_base_addr(struct kvm_vcpu *vcpu,
4236                              struct desc_struct *seg_desc)
4237 {
4238         u32 base_addr = get_desc_base(seg_desc);
4239
4240         return vcpu->arch.mmu.gva_to_gpa(vcpu, base_addr);
4241 }
4242
4243 static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg)
4244 {
4245         struct kvm_segment kvm_seg;
4246
4247         kvm_get_segment(vcpu, &kvm_seg, seg);
4248         return kvm_seg.selector;
4249 }
4250
4251 static int load_segment_descriptor_to_kvm_desct(struct kvm_vcpu *vcpu,
4252                                                 u16 selector,
4253                                                 struct kvm_segment *kvm_seg)
4254 {
4255         struct desc_struct seg_desc;
4256
4257         if (load_guest_segment_descriptor(vcpu, selector, &seg_desc))
4258                 return 1;
4259         seg_desct_to_kvm_desct(&seg_desc, selector, kvm_seg);
4260         return 0;
4261 }
4262
4263 static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg)
4264 {
4265         struct kvm_segment segvar = {
4266                 .base = selector << 4,
4267                 .limit = 0xffff,
4268                 .selector = selector,
4269                 .type = 3,
4270                 .present = 1,
4271                 .dpl = 3,
4272                 .db = 0,
4273                 .s = 1,
4274                 .l = 0,
4275                 .g = 0,
4276                 .avl = 0,
4277                 .unusable = 0,
4278         };
4279         kvm_x86_ops->set_segment(vcpu, &segvar, seg);
4280         return 0;
4281 }
4282
4283 static int is_vm86_segment(struct kvm_vcpu *vcpu, int seg)
4284 {
4285         return (seg != VCPU_SREG_LDTR) &&
4286                 (seg != VCPU_SREG_TR) &&
4287                 (kvm_get_rflags(vcpu) & X86_EFLAGS_VM);
4288 }
4289
4290 int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
4291                                 int type_bits, int seg)
4292 {
4293         struct kvm_segment kvm_seg;
4294
4295         if (is_vm86_segment(vcpu, seg) || !(vcpu->arch.cr0 & X86_CR0_PE))
4296                 return kvm_load_realmode_segment(vcpu, selector, seg);
4297         if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg))
4298                 return 1;
4299         kvm_seg.type |= type_bits;
4300
4301         if (seg != VCPU_SREG_SS && seg != VCPU_SREG_CS &&
4302             seg != VCPU_SREG_LDTR)
4303                 if (!kvm_seg.s)
4304                         kvm_seg.unusable = 1;
4305
4306         kvm_set_segment(vcpu, &kvm_seg, seg);
4307         return 0;
4308 }
4309
4310 static void save_state_to_tss32(struct kvm_vcpu *vcpu,
4311                                 struct tss_segment_32 *tss)
4312 {
4313         tss->cr3 = vcpu->arch.cr3;
4314         tss->eip = kvm_rip_read(vcpu);
4315         tss->eflags = kvm_get_rflags(vcpu);
4316         tss->eax = kvm_register_read(vcpu, VCPU_REGS_RAX);
4317         tss->ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
4318         tss->edx = kvm_register_read(vcpu, VCPU_REGS_RDX);
4319         tss->ebx = kvm_register_read(vcpu, VCPU_REGS_RBX);
4320         tss->esp = kvm_register_read(vcpu, VCPU_REGS_RSP);
4321         tss->ebp = kvm_register_read(vcpu, VCPU_REGS_RBP);
4322         tss->esi = kvm_register_read(vcpu, VCPU_REGS_RSI);
4323         tss->edi = kvm_register_read(vcpu, VCPU_REGS_RDI);
4324         tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
4325         tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
4326         tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
4327         tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS);
4328         tss->fs = get_segment_selector(vcpu, VCPU_SREG_FS);
4329         tss->gs = get_segment_selector(vcpu, VCPU_SREG_GS);
4330         tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR);
4331 }
4332
4333 static int load_state_from_tss32(struct kvm_vcpu *vcpu,
4334                                   struct tss_segment_32 *tss)
4335 {
4336         kvm_set_cr3(vcpu, tss->cr3);
4337
4338         kvm_rip_write(vcpu, tss->eip);
4339         kvm_set_rflags(vcpu, tss->eflags | 2);
4340
4341         kvm_register_write(vcpu, VCPU_REGS_RAX, tss->eax);
4342         kvm_register_write(vcpu, VCPU_REGS_RCX, tss->ecx);
4343         kvm_register_write(vcpu, VCPU_REGS_RDX, tss->edx);
4344         kvm_register_write(vcpu, VCPU_REGS_RBX, tss->ebx);
4345         kvm_register_write(vcpu, VCPU_REGS_RSP, tss->esp);
4346         kvm_register_write(vcpu, VCPU_REGS_RBP, tss->ebp);
4347         kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi);
4348         kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi);
4349
4350         if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR))
4351                 return 1;
4352
4353         if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES))
4354                 return 1;
4355
4356         if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS))
4357                 return 1;
4358
4359         if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS))
4360                 return 1;
4361
4362         if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS))
4363                 return 1;
4364
4365         if (kvm_load_segment_descriptor(vcpu, tss->fs, 1, VCPU_SREG_FS))
4366                 return 1;
4367
4368         if (kvm_load_segment_descriptor(vcpu, tss->gs, 1, VCPU_SREG_GS))
4369                 return 1;
4370         return 0;
4371 }
4372
4373 static void save_state_to_tss16(struct kvm_vcpu *vcpu,
4374                                 struct tss_segment_16 *tss)
4375 {
4376         tss->ip = kvm_rip_read(vcpu);
4377         tss->flag = kvm_get_rflags(vcpu);
4378         tss->ax = kvm_register_read(vcpu, VCPU_REGS_RAX);
4379         tss->cx = kvm_register_read(vcpu, VCPU_REGS_RCX);
4380         tss->dx = kvm_register_read(vcpu, VCPU_REGS_RDX);
4381         tss->bx = kvm_register_read(vcpu, VCPU_REGS_RBX);
4382         tss->sp = kvm_register_read(vcpu, VCPU_REGS_RSP);
4383         tss->bp = kvm_register_read(vcpu, VCPU_REGS_RBP);
4384         tss->si = kvm_register_read(vcpu, VCPU_REGS_RSI);
4385         tss->di = kvm_register_read(vcpu, VCPU_REGS_RDI);
4386
4387         tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
4388         tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
4389         tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
4390         tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS);
4391         tss->ldt = get_segment_selector(vcpu, VCPU_SREG_LDTR);
4392 }
4393
4394 static int load_state_from_tss16(struct kvm_vcpu *vcpu,
4395                                  struct tss_segment_16 *tss)
4396 {
4397         kvm_rip_write(vcpu, tss->ip);
4398         kvm_set_rflags(vcpu, tss->flag | 2);
4399         kvm_register_write(vcpu, VCPU_REGS_RAX, tss->ax);
4400         kvm_register_write(vcpu, VCPU_REGS_RCX, tss->cx);
4401         kvm_register_write(vcpu, VCPU_REGS_RDX, tss->dx);
4402         kvm_register_write(vcpu, VCPU_REGS_RBX, tss->bx);
4403         kvm_register_write(vcpu, VCPU_REGS_RSP, tss->sp);
4404         kvm_register_write(vcpu, VCPU_REGS_RBP, tss->bp);
4405         kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si);
4406         kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di);
4407
4408         if (kvm_load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR))
4409                 return 1;
4410
4411         if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES))
4412                 return 1;
4413
4414         if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS))
4415                 return 1;
4416
4417         if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS))
4418                 return 1;
4419
4420         if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS))
4421                 return 1;
4422         return 0;
4423 }
4424
4425 static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector,
4426                               u16 old_tss_sel, u32 old_tss_base,
4427                               struct desc_struct *nseg_desc)
4428 {
4429         struct tss_segment_16 tss_segment_16;
4430         int ret = 0;
4431
4432         if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_16,
4433                            sizeof tss_segment_16))
4434                 goto out;
4435
4436         save_state_to_tss16(vcpu, &tss_segment_16);
4437
4438         if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_16,
4439                             sizeof tss_segment_16))
4440                 goto out;
4441
4442         if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc),
4443                            &tss_segment_16, sizeof tss_segment_16))
4444                 goto out;
4445
4446         if (old_tss_sel != 0xffff) {
4447                 tss_segment_16.prev_task_link = old_tss_sel;
4448
4449                 if (kvm_write_guest(vcpu->kvm,
4450                                     get_tss_base_addr(vcpu, nseg_desc),
4451                                     &tss_segment_16.prev_task_link,
4452                                     sizeof tss_segment_16.prev_task_link))
4453                         goto out;
4454         }
4455
4456         if (load_state_from_tss16(vcpu, &tss_segment_16))
4457                 goto out;
4458
4459         ret = 1;
4460 out:
4461         return ret;
4462 }
4463
4464 static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector,
4465                        u16 old_tss_sel, u32 old_tss_base,
4466                        struct desc_struct *nseg_desc)
4467 {
4468         struct tss_segment_32 tss_segment_32;
4469         int ret = 0;
4470
4471         if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_32,
4472                            sizeof tss_segment_32))
4473                 goto out;
4474
4475         save_state_to_tss32(vcpu, &tss_segment_32);
4476
4477         if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_32,
4478                             sizeof tss_segment_32))
4479                 goto out;
4480
4481         if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc),
4482                            &tss_segment_32, sizeof tss_segment_32))
4483                 goto out;
4484
4485         if (old_tss_sel != 0xffff) {
4486                 tss_segment_32.prev_task_link = old_tss_sel;
4487
4488                 if (kvm_write_guest(vcpu->kvm,
4489                                     get_tss_base_addr(vcpu, nseg_desc),
4490                                     &tss_segment_32.prev_task_link,
4491                                     sizeof tss_segment_32.prev_task_link))
4492                         goto out;
4493         }
4494
4495         if (load_state_from_tss32(vcpu, &tss_segment_32))
4496                 goto out;
4497
4498         ret = 1;
4499 out:
4500         return ret;
4501 }
4502
/*
 * Emulate a task switch of @vcpu to the task whose TSS descriptor is named
 * by @tss_selector.  @reason is one of the TASK_SWITCH_* values (IRET, JMP,
 * CALL, GATE) and controls the privilege check, busy-flag and NT-flag
 * handling below.
 *
 * Return value:
 *   1 - an exception (#GP or #TS) was queued for the guest, or the
 *       16/32-bit state transfer helper reported success;
 *   0 - one of the two TSS descriptors could not be loaded, or the state
 *       transfer helper failed.
 *
 * NOTE(review): once both descriptors are loaded and the checks pass, TR is
 * reloaded and CR0.TS is set even if the state transfer helper returned 0.
 */
int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
{
        struct kvm_segment tr_seg;
        struct desc_struct cseg_desc;
        struct desc_struct nseg_desc;
        int ret = 0;
        u32 old_tss_base = get_segment_base(vcpu, VCPU_SREG_TR);
        u16 old_tss_sel = get_segment_selector(vcpu, VCPU_SREG_TR);

        /* From here on old_tss_base holds a guest-physical address. */
        old_tss_base = vcpu->arch.mmu.gva_to_gpa(vcpu, old_tss_base);

        /* FIXME: Handle errors. Failure to read either TSS or their
         * descriptors should generate a pagefault.
         */
        if (load_guest_segment_descriptor(vcpu, tss_selector, &nseg_desc))
                goto out;

        if (load_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc))
                goto out;

        /* Privilege check against the new TSS descriptor; skipped for IRET. */
        if (reason != TASK_SWITCH_IRET) {
                int cpl;

                cpl = kvm_x86_ops->get_cpl(vcpu);
                if ((tss_selector & 3) > nseg_desc.dpl || cpl > nseg_desc.dpl) {
                        kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
                        return 1;
                }
        }

        /* The new TSS must be present and have a limit of at least 0x67. */
        if (!nseg_desc.p || get_desc_limit(&nseg_desc) < 0x67) {
                kvm_queue_exception_e(vcpu, TS_VECTOR, tss_selector & 0xfffc);
                return 1;
        }

        if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) {
                cseg_desc.type &= ~(1 << 1); //clear the B flag
                save_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc);
        }

        if (reason == TASK_SWITCH_IRET) {
                u32 eflags = kvm_get_rflags(vcpu);
                kvm_set_rflags(vcpu, eflags & ~X86_EFLAGS_NT);
        }

        /* set back link to prev task only if NT bit is set in eflags
           note that old_tss_sel is not used after this point */
        if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
                old_tss_sel = 0xffff;

        /* Bit 3 of the descriptor type selects the 32-bit TSS path. */
        if (nseg_desc.type & 8)
                ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_sel,
                                         old_tss_base, &nseg_desc);
        else
                ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_sel,
                                         old_tss_base, &nseg_desc);

        if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) {
                u32 eflags = kvm_get_rflags(vcpu);
                kvm_set_rflags(vcpu, eflags | X86_EFLAGS_NT);
        }

        /* Set bit 1 (the B flag cleared above for the old task) on the new TSS. */
        if (reason != TASK_SWITCH_IRET) {
                nseg_desc.type |= (1 << 1);
                save_guest_segment_descriptor(vcpu, tss_selector,
                                              &nseg_desc);
        }

        kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 | X86_CR0_TS);
        seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg);
        /* Type is forced to 11; presumably "busy 32-bit TSS" - confirm against the SDM. */
        tr_seg.type = 11;
        kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR);
out:
        return ret;
}
EXPORT_SYMBOL_GPL(kvm_task_switch);
4579
/*
 * Install the special-register state in @sregs (descriptor tables, control
 * registers, EFER, APIC base, pending interrupt bitmap and segment
 * registers) into @vcpu.  The MMU context is reset when CR0, CR3, CR4 or
 * EFER changed, or when PAE paging outside long mode requires the PDPTRs
 * to be reloaded.  Always returns 0.
 */
int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
                                  struct kvm_sregs *sregs)
{
        int mmu_reset_needed = 0;
        int pending_vec, max_bits;
        struct descriptor_table dt;

        vcpu_load(vcpu);

        /* Descriptor tables: IDT first, then GDT, reusing the same temporary. */
        dt.limit = sregs->idt.limit;
        dt.base = sregs->idt.base;
        kvm_x86_ops->set_idt(vcpu, &dt);
        dt.limit = sregs->gdt.limit;
        dt.base = sregs->gdt.base;
        kvm_x86_ops->set_gdt(vcpu, &dt);

        vcpu->arch.cr2 = sregs->cr2;
        mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3;
        vcpu->arch.cr3 = sregs->cr3;

        kvm_set_cr8(vcpu, sregs->cr8);

        mmu_reset_needed |= vcpu->arch.shadow_efer != sregs->efer;
        kvm_x86_ops->set_efer(vcpu, sregs->efer);
        kvm_set_apic_base(vcpu, sregs->apic_base);

        kvm_x86_ops->decache_cr4_guest_bits(vcpu);

        mmu_reset_needed |= vcpu->arch.cr0 != sregs->cr0;
        kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
        vcpu->arch.cr0 = sregs->cr0;

        mmu_reset_needed |= vcpu->arch.cr4 != sregs->cr4;
        kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
        if (!is_long_mode(vcpu) && is_pae(vcpu)) {
                load_pdptrs(vcpu, vcpu->arch.cr3);
                mmu_reset_needed = 1;
        }

        if (mmu_reset_needed)
                kvm_mmu_reset_context(vcpu);

        /* Re-queue the lowest-numbered vector set in the interrupt bitmap, if any. */
        max_bits = (sizeof sregs->interrupt_bitmap) << 3;
        pending_vec = find_first_bit(
                (const unsigned long *)sregs->interrupt_bitmap, max_bits);
        if (pending_vec < max_bits) {
                kvm_queue_interrupt(vcpu, pending_vec, false);
                pr_debug("Set back pending irq %d\n", pending_vec);
                if (irqchip_in_kernel(vcpu->kvm))
                        kvm_pic_clear_isr_ack(vcpu->kvm);
        }

        kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
        kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
        kvm_set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
        kvm_set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
        kvm_set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
        kvm_set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);

        kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
        kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);

        update_cr8_intercept(vcpu);

        /* Older userspace won't unhalt the vcpu on reset. */
        if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 &&
            sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 &&
            !(vcpu->arch.cr0 & X86_CR0_PE))
                vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;

        vcpu_put(vcpu);

        return 0;
}
4654
4655 int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
4656                                         struct kvm_guest_debug *dbg)
4657 {
4658         unsigned long rflags;
4659         int i;
4660
4661         vcpu_load(vcpu);
4662
4663         /*
4664          * Read rflags as long as potentially injected trace flags are still
4665          * filtered out.
4666          */
4667         rflags = kvm_get_rflags(vcpu);
4668
4669         vcpu->guest_debug = dbg->control;
4670         if (!(vcpu->guest_debug & KVM_GUESTDBG_ENABLE))
4671                 vcpu->guest_debug = 0;
4672
4673         if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
4674                 for (i = 0; i < KVM_NR_DB_REGS; ++i)
4675                         vcpu->arch.eff_db[i] = dbg->arch.debugreg[i];
4676                 vcpu->arch.switch_db_regs =
4677                         (dbg->arch.debugreg[7] & DR7_BP_EN_MASK);
4678         } else {
4679                 for (i = 0; i < KVM_NR_DB_REGS; i++)
4680                         vcpu->arch.eff_db[i] = vcpu->arch.db[i];
4681                 vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK);
4682         }
4683
4684         if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
4685                 vcpu->arch.singlestep_cs =
4686                         get_segment_selector(vcpu, VCPU_SREG_CS);
4687                 vcpu->arch.singlestep_rip = kvm_rip_read(vcpu);
4688         }
4689
4690         /*
4691          * Trigger an rflags update that will inject or remove the trace
4692          * flags.
4693          */
4694         kvm_set_rflags(vcpu, rflags);
4695
4696         kvm_x86_ops->set_guest_debug(vcpu, dbg);
4697
4698         if (vcpu->guest_debug & KVM_GUESTDBG_INJECT_DB)
4699                 kvm_queue_exception(vcpu, DB_VECTOR);
4700         else if (vcpu->guest_debug & KVM_GUESTDBG_INJECT_BP)
4701                 kvm_queue_exception(vcpu, BP_VECTOR);
4702
4703         vcpu_put(vcpu);
4704
4705         return 0;
4706 }
4707
/*
 * fxsave fpu state.  Taken from x86_64/processor.h.  To be killed when
 * we have asm/x86/processor.h
 *
 * Used below to overlay vcpu->arch.guest_fx_image when copying FPU state
 * to and from struct kvm_fpu.
 */
struct fxsave {
        u16     cwd;            /* copied to/from kvm_fpu.fcw */
        u16     swd;            /* copied to/from kvm_fpu.fsw */
        u16     twd;            /* copied to/from kvm_fpu.ftwx */
        u16     fop;            /* copied to/from kvm_fpu.last_opcode */
        u64     rip;            /* copied to/from kvm_fpu.last_ip */
        u64     rdp;            /* copied to/from kvm_fpu.last_dp */
        u32     mxcsr;
        u32     mxcsr_mask;
        u32     st_space[32];   /* 8*16 bytes for each FP-reg = 128 bytes */
#ifdef CONFIG_X86_64
        u32     xmm_space[64];  /* 16*16 bytes for each XMM-reg = 256 bytes */
#else
        u32     xmm_space[32];  /* 8*16 bytes for each XMM-reg = 128 bytes */
#endif
};
4728
4729 /*
4730  * Translate a guest virtual address to a guest physical address.
4731  */
4732 int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
4733                                     struct kvm_translation *tr)
4734 {
4735         unsigned long vaddr = tr->linear_address;
4736         gpa_t gpa;
4737
4738         vcpu_load(vcpu);
4739         down_read(&vcpu->kvm->slots_lock);
4740         gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vaddr);
4741         up_read(&vcpu->kvm->slots_lock);
4742         tr->physical_address = gpa;
4743         tr->valid = gpa != UNMAPPED_GVA;
4744         tr->writeable = 1;
4745         tr->usermode = 0;
4746         vcpu_put(vcpu);
4747
4748         return 0;
4749 }
4750
4751 int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
4752 {
4753         struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
4754
4755         vcpu_load(vcpu);
4756
4757         memcpy(fpu->fpr, fxsave->st_space, 128);
4758         fpu->fcw = fxsave->cwd;
4759         fpu->fsw = fxsave->swd;
4760         fpu->ftwx = fxsave->twd;
4761         fpu->last_opcode = fxsave->fop;
4762         fpu->last_ip = fxsave->rip;
4763         fpu->last_dp = fxsave->rdp;
4764         memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);
4765
4766         vcpu_put(vcpu);
4767
4768         return 0;
4769 }
4770
4771 int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
4772 {
4773         struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
4774
4775         vcpu_load(vcpu);
4776
4777         memcpy(fxsave->st_space, fpu->fpr, 128);
4778         fxsave->cwd = fpu->fcw;
4779         fxsave->swd = fpu->fsw;
4780         fxsave->twd = fpu->ftwx;
4781         fxsave->fop = fpu->last_opcode;
4782         fxsave->rip = fpu->last_ip;
4783         fxsave->rdp = fpu->last_dp;
4784         memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);
4785
4786         vcpu_put(vcpu);
4787
4788         return 0;
4789 }
4790
/*
 * Initialize the guest FPU image of @vcpu.
 *
 * The host FPU state is saved, the FPU is reset, the reset state is
 * captured as the guest image, and the host state is restored, all with
 * preemption disabled.  Afterwards CR0.ET is set in the cached CR0, the
 * guest MXCSR is set to 0x1f80, and everything in the guest image from
 * st_space onwards is zeroed.
 */
void fx_init(struct kvm_vcpu *vcpu)
{
        unsigned after_mxcsr_mask;

        /*
         * Touch the fpu the first time in non atomic context as if
         * this is the first fpu instruction the exception handler
         * will fire before the instruction returns and it'll have to
         * allocate ram with GFP_KERNEL.
         */
        if (!used_math())
                kvm_fx_save(&vcpu->arch.host_fx_image);

        /* Initialize guest FPU by resetting ours and saving into guest's */
        preempt_disable();
        kvm_fx_save(&vcpu->arch.host_fx_image);
        kvm_fx_finit();
        kvm_fx_save(&vcpu->arch.guest_fx_image);
        kvm_fx_restore(&vcpu->arch.host_fx_image);
        preempt_enable();

        vcpu->arch.cr0 |= X86_CR0_ET;
        /* Byte offset of st_space: the first field cleared by the memset. */
        after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space);
        vcpu->arch.guest_fx_image.mxcsr = 0x1f80;
        memset((void *)&vcpu->arch.guest_fx_image + after_mxcsr_mask,
               0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask);
}
EXPORT_SYMBOL_GPL(fx_init);
4819
4820 void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
4821 {
4822         if (!vcpu->fpu_active || vcpu->guest_fpu_loaded)
4823                 return;
4824
4825         vcpu->guest_fpu_loaded = 1;
4826         kvm_fx_save(&vcpu->arch.host_fx_image);
4827         kvm_fx_restore(&vcpu->arch.guest_fx_image);
4828 }
4829 EXPORT_SYMBOL_GPL(kvm_load_guest_fpu);
4830
4831 void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
4832 {
4833         if (!vcpu->guest_fpu_loaded)
4834                 return;
4835
4836         vcpu->guest_fpu_loaded = 0;
4837         kvm_fx_save(&vcpu->arch.guest_fx_image);
4838         kvm_fx_restore(&vcpu->arch.host_fx_image);
4839         ++vcpu->stat.fpu_reload;
4840 }
4841 EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);
4842
/*
 * Free @vcpu: release the time page reference if one is held (marking the
 * page dirty), then hand the vcpu to the vendor-specific vcpu_free hook.
 */
void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
{
        if (vcpu->arch.time_page) {
                kvm_release_page_dirty(vcpu->arch.time_page);
                vcpu->arch.time_page = NULL;
        }

        kvm_x86_ops->vcpu_free(vcpu);
}
4852
/*
 * Create vcpu number @id for @kvm by delegating to the vendor-specific
 * vcpu_create hook; its return value is passed through unchanged.
 */
struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
                                                unsigned int id)
{
        return kvm_x86_ops->vcpu_create(kvm, id);
}
4858
4859 int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
4860 {
4861         int r;
4862
4863         /* We do fxsave: this must be aligned. */
4864         BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF);
4865
4866         vcpu->arch.mtrr_state.have_fixed = 1;
4867         vcpu_load(vcpu);
4868         r = kvm_arch_vcpu_reset(vcpu);
4869         if (r == 0)
4870                 r = kvm_mmu_setup(vcpu);
4871         vcpu_put(vcpu);
4872         if (r < 0)
4873                 goto free_vcpu;
4874
4875         return 0;
4876 free_vcpu:
4877         kvm_x86_ops->vcpu_free(vcpu);
4878         return r;
4879 }
4880
/*
 * Destroy @vcpu: unload its MMU while the vcpu is loaded, then free it
 * through the vendor-specific vcpu_free hook.
 */
void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
{
        vcpu_load(vcpu);
        kvm_mmu_unload(vcpu);
        vcpu_put(vcpu);

        kvm_x86_ops->vcpu_free(vcpu);
}
4889
/*
 * Reset the arch-generic part of @vcpu: clear pending/injected NMI state,
 * zero the debug registers, set DR6/DR7 to their fixed-bit defaults, and
 * then run the vendor-specific vcpu_reset hook, whose result is returned.
 */
int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
{
        vcpu->arch.nmi_pending = false;
        vcpu->arch.nmi_injected = false;

        vcpu->arch.switch_db_regs = 0;
        memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db));
        vcpu->arch.dr6 = DR6_FIXED_1;
        vcpu->arch.dr7 = DR7_FIXED_1;

        return kvm_x86_ops->vcpu_reset(vcpu);
}
4902
/*
 * Enable virtualization support on the current CPU.  On CPUs without a
 * constant TSC the cached per-cpu TSC frequency is cleared first.  Returns
 * the result of the vendor-specific hardware_enable hook, to which
 * @garbage is passed through.
 */
int kvm_arch_hardware_enable(void *garbage)
{
        /*
         * Since this may be called from a hotplug notification,
         * we can't get the CPU frequency directly.
         */
        if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
                int cpu = raw_smp_processor_id();
                per_cpu(cpu_tsc_khz, cpu) = 0;
        }

        kvm_shared_msr_cpu_online();

        return kvm_x86_ops->hardware_enable(garbage);
}
4918
/* Forward to the vendor-specific hardware_disable hook; @garbage is passed through. */
void kvm_arch_hardware_disable(void *garbage)
{
        kvm_x86_ops->hardware_disable(garbage);
}
4923
/* Run the vendor-specific hardware_setup hook and return its result. */
int kvm_arch_hardware_setup(void)
{
        return kvm_x86_ops->hardware_setup();
}
4928
/* Run the vendor-specific hardware_unsetup hook. */
void kvm_arch_hardware_unsetup(void)
{
        kvm_x86_ops->hardware_unsetup();
}
4933
/*
 * Forward to the vendor-specific check_processor_compatibility hook.
 * @rtn is passed through untouched; presumably the hook reports its result
 * through it - confirm against the vendor implementation.
 */
void kvm_arch_check_processor_compat(void *rtn)
{
        kvm_x86_ops->check_processor_compatibility(rtn);
}
4938
4939 int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
4940 {
4941         struct page *page;
4942         struct kvm *kvm;
4943         int r;
4944
4945         BUG_ON(vcpu->kvm == NULL);
4946         kvm = vcpu->kvm;
4947
4948         vcpu->arch.mmu.root_hpa = INVALID_PAGE;
4949         if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu))
4950                 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
4951         else
4952                 vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
4953
4954         page = alloc_page(GFP_KERNEL | __GFP_ZERO);
4955         if (!page) {
4956                 r = -ENOMEM;
4957                 goto fail;
4958         }
4959         vcpu->arch.pio_data = page_address(page);
4960
4961         r = kvm_mmu_create(vcpu);
4962         if (r < 0)
4963                 goto fail_free_pio_data;
4964
4965         if (irqchip_in_kernel(kvm)) {
4966                 r = kvm_create_lapic(vcpu);
4967                 if (r < 0)
4968                         goto fail_mmu_destroy;
4969         }
4970
4971         vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4,
4972                                        GFP_KERNEL);
4973         if (!vcpu->arch.mce_banks) {
4974                 r = -ENOMEM;
4975                 goto fail_mmu_destroy;
4976         }
4977         vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
4978
4979         return 0;
4980
4981 fail_mmu_destroy:
4982         kvm_mmu_destroy(vcpu);
4983 fail_free_pio_data:
4984         free_page((unsigned long)vcpu->arch.pio_data);
4985 fail:
4986         return r;
4987 }
4988
4989 void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
4990 {
4991         kvm_free_lapic(vcpu);
4992         down_read(&vcpu->kvm->slots_lock);
4993         kvm_mmu_destroy(vcpu);
4994         up_read(&vcpu->kvm->slots_lock);
4995         free_page((unsigned long)vcpu->arch.pio_data);
4996 }
4997
4998 struct  kvm *kvm_arch_create_vm(void)
4999 {
5000         struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
5001
5002         if (!kvm)
5003                 return ERR_PTR(-ENOMEM);
5004
5005         INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
5006         INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
5007
5008         /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
5009         set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
5010
5011         rdtscll(kvm->arch.vm_init_tsc);
5012
5013         return kvm;
5014 }
5015
/* Unload the MMU of @vcpu, bracketed by vcpu_load()/vcpu_put(). */
static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
{
        vcpu_load(vcpu);
        kvm_mmu_unload(vcpu);
        vcpu_put(vcpu);
}
5022
/*
 * Free every vcpu of @kvm.  All vcpu MMUs are unloaded in a first pass
 * before any vcpu is freed in a second pass; finally, under kvm->lock, the
 * vcpus[] slots are cleared and online_vcpus is reset to zero.
 */
static void kvm_free_vcpus(struct kvm *kvm)
{
        unsigned int i;
        struct kvm_vcpu *vcpu;

        /*
         * Unpin any mmu pages first.
         */
        kvm_for_each_vcpu(i, vcpu, kvm)
                kvm_unload_vcpu_mmu(vcpu);
        kvm_for_each_vcpu(i, vcpu, kvm)
                kvm_arch_vcpu_free(vcpu);

        mutex_lock(&kvm->lock);
        for (i = 0; i < atomic_read(&kvm->online_vcpus); i++)
                kvm->vcpus[i] = NULL;

        atomic_set(&kvm->online_vcpus, 0);
        mutex_unlock(&kvm->lock);
}
5043
/* Release all devices assigned to @kvm. */
void kvm_arch_sync_events(struct kvm *kvm)
{
        kvm_free_all_assigned_devices(kvm);
}
5048
/*
 * Tear down @kvm: unmap the guest from the IOMMU, free the PIT, PIC and
 * IOAPIC state, free all vcpus and guest physical memory, drop the
 * references on the APIC access page and the EPT identity page table if
 * present, and finally free the struct kvm itself.
 */
void kvm_arch_destroy_vm(struct kvm *kvm)
{
        kvm_iommu_unmap_guest(kvm);
        kvm_free_pit(kvm);
        kfree(kvm->arch.vpic);
        kfree(kvm->arch.vioapic);
        kvm_free_vcpus(kvm);
        kvm_free_physmem(kvm);
        if (kvm->arch.apic_access_page)
                put_page(kvm->arch.apic_access_page);
        if (kvm->arch.ept_identity_pagetable)
                put_page(kvm->arch.ept_identity_pagetable);
        kfree(kvm);
}
5063
5064 int kvm_arch_set_memory_region(struct kvm *kvm,
5065                                 struct kvm_userspace_memory_region *mem,
5066                                 struct kvm_memory_slot old,
5067                                 int user_alloc)
5068 {
5069         int npages = mem->memory_size >> PAGE_SHIFT;
5070         struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot];
5071
5072         /*To keep backward compatibility with older userspace,
5073          *x86 needs to hanlde !user_alloc case.
5074          */
5075         if (!user_alloc) {
5076                 if (npages && !old.rmap) {
5077                         unsigned long userspace_addr;
5078
5079                         down_write(&current->mm->mmap_sem);
5080                         userspace_addr = do_mmap(NULL, 0,
5081                                                  npages * PAGE_SIZE,
5082                                                  PROT_READ | PROT_WRITE,
5083                                                  MAP_PRIVATE | MAP_ANONYMOUS,
5084                                                  0);
5085                         up_write(&current->mm->mmap_sem);
5086
5087                         if (IS_ERR((void *)userspace_addr))
5088                                 return PTR_ERR((void *)userspace_addr);
5089
5090                         /* set userspace_addr atomically for kvm_hva_to_rmapp */
5091                         spin_lock(&kvm->mmu_lock);
5092                         memslot->userspace_addr = userspace_addr;
5093                         spin_unlock(&kvm->mmu_lock);
5094                 } else {
5095                         if (!old.user_alloc && old.rmap) {
5096                                 int ret;
5097
5098                                 down_write(&current->mm->mmap_sem);
5099                                 ret = do_munmap(current->mm, old.userspace_addr,
5100                                                 old.npages * PAGE_SIZE);
5101                                 up_write(&current->mm->mmap_sem);
5102                                 if (ret < 0)
5103                                         printk(KERN_WARNING
5104                                        "kvm_vm_ioctl_set_memory_region: "
5105                                        "failed to munmap memory\n");
5106                         }
5107                 }
5108         }
5109
5110         spin_lock(&kvm->mmu_lock);
5111         if (!kvm->arch.n_requested_mmu_pages) {
5112                 unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
5113                 kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
5114         }
5115
5116         kvm_mmu_slot_remove_write_access(kvm, mem->slot);
5117         spin_unlock(&kvm->mmu_lock);
5118
5119         return 0;
5120 }
5121
/*
 * Flush the shadow MMU state of @kvm: zap all MMU pages, then request
 * that the remote MMUs be reloaded.
 */
void kvm_arch_flush_shadow(struct kvm *kvm)
{
        kvm_mmu_zap_all(kvm);
        kvm_reload_remote_mmus(kvm);
}
5127
5128 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
5129 {
5130         return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE
5131                 || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED
5132                 || vcpu->arch.nmi_pending ||
5133                 (kvm_arch_interrupt_allowed(vcpu) &&
5134                  kvm_cpu_has_interrupt(vcpu));
5135 }
5136
5137 void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
5138 {
5139         int me;
5140         int cpu = vcpu->cpu;
5141
5142         if (waitqueue_active(&vcpu->wq)) {
5143                 wake_up_interruptible(&vcpu->wq);
5144                 ++vcpu->stat.halt_wakeup;
5145         }
5146
5147         me = get_cpu();
5148         if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
5149                 if (!test_and_set_bit(KVM_REQ_KICK, &vcpu->requests))
5150                         smp_send_reschedule(cpu);
5151         put_cpu();
5152 }
5153
5154 int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
5155 {
5156         return kvm_x86_ops->interrupt_allowed(vcpu);
5157 }
5158
5159 unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu)
5160 {
5161         unsigned long rflags;
5162
5163         rflags = kvm_x86_ops->get_rflags(vcpu);
5164         if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
5165                 rflags &= ~(unsigned long)(X86_EFLAGS_TF | X86_EFLAGS_RF);
5166         return rflags;
5167 }
5168 EXPORT_SYMBOL_GPL(kvm_get_rflags);
5169
5170 void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
5171 {
5172         if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP &&
5173             vcpu->arch.singlestep_cs ==
5174                         get_segment_selector(vcpu, VCPU_SREG_CS) &&
5175             vcpu->arch.singlestep_rip == kvm_rip_read(vcpu))
5176                 rflags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
5177         kvm_x86_ops->set_rflags(vcpu, rflags);
5178 }
5179 EXPORT_SYMBOL_GPL(kvm_set_rflags);
5180
5181 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
5182 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
5183 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
5184 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr);
5185 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_cr);
5186 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmrun);
5187 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit);
5188 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject);
5189 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit);
5190 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga);
5191 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit);