i386: move xen
author     Thomas Gleixner <tglx@linutronix.de>
           Thu, 11 Oct 2007 09:16:51 +0000 (11:16 +0200)
committer  Thomas Gleixner <tglx@linutronix.de>
           Thu, 11 Oct 2007 09:16:51 +0000 (11:16 +0200)
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
38 files changed:
arch/i386/Kconfig
arch/i386/Makefile
arch/i386/kernel/head_32.S
arch/i386/kernel/vsyscall-note_32.S
arch/i386/xen/Kconfig [deleted file]
arch/i386/xen/Makefile [deleted file]
arch/i386/xen/enlighten.c [deleted file]
arch/i386/xen/events.c [deleted file]
arch/i386/xen/features.c [deleted file]
arch/i386/xen/manage.c [deleted file]
arch/i386/xen/mmu.c [deleted file]
arch/i386/xen/mmu.h [deleted file]
arch/i386/xen/multicalls.c [deleted file]
arch/i386/xen/multicalls.h [deleted file]
arch/i386/xen/setup.c [deleted file]
arch/i386/xen/smp.c [deleted file]
arch/i386/xen/time.c [deleted file]
arch/i386/xen/vdso.h [deleted file]
arch/i386/xen/xen-asm.S [deleted file]
arch/i386/xen/xen-head.S [deleted file]
arch/i386/xen/xen-ops.h [deleted file]
arch/x86/xen/Kconfig [new file with mode: 0644]
arch/x86/xen/Makefile [new file with mode: 0644]
arch/x86/xen/enlighten.c [new file with mode: 0644]
arch/x86/xen/events.c [new file with mode: 0644]
arch/x86/xen/features.c [new file with mode: 0644]
arch/x86/xen/manage.c [new file with mode: 0644]
arch/x86/xen/mmu.c [new file with mode: 0644]
arch/x86/xen/mmu.h [new file with mode: 0644]
arch/x86/xen/multicalls.c [new file with mode: 0644]
arch/x86/xen/multicalls.h [new file with mode: 0644]
arch/x86/xen/setup.c [new file with mode: 0644]
arch/x86/xen/smp.c [new file with mode: 0644]
arch/x86/xen/time.c [new file with mode: 0644]
arch/x86/xen/vdso.h [new file with mode: 0644]
arch/x86/xen/xen-asm.S [new file with mode: 0644]
arch/x86/xen/xen-head.S [new file with mode: 0644]
arch/x86/xen/xen-ops.h [new file with mode: 0644]

index 561cc21914b2b376a1678a122b239e2f11e3d17c..0f6186f03034bfd5389045063c3f2520a9f25946 100644 (file)
@@ -226,7 +226,7 @@ config PARAVIRT
          However, when run without a hypervisor the kernel is
          theoretically slower.  If in doubt, say N.
 
-source "arch/i386/xen/Kconfig"
+source "arch/x86/xen/Kconfig"
 
 config VMI
        bool "VMI Paravirt-ops support"
index c462803a4db2f23cce7ea0f907a20ce9cdee1744..fc85d6674fcb9e8480e1fabd612c05a32c264416 100644 (file)
@@ -94,7 +94,7 @@ mcore-$(CONFIG_X86_ES7000)    := arch/x86/mach-default
 core-$(CONFIG_X86_ES7000)      := arch/x86/mach-es7000/
 
 # Xen paravirtualization support
-core-$(CONFIG_XEN)             += arch/i386/xen/
+core-$(CONFIG_XEN)             += arch/x86/xen/
 
 # default subarch .h files
 mflags-y += -Iinclude/asm-i386/mach-default
index 8f0382161c91ec8e0a1c4f75b8ed22824c35eb90..9150ca9b5f809285e987d1ed84db3b353357ae16 100644 (file)
@@ -537,7 +537,7 @@ fault_msg:
        .ascii "Int %d: CR2 %p  err %p  EIP %p  CS %p  flags %p\n"
        .asciz "Stack: %p %p %p %p %p %p %p %p\n"
 
-#include "../xen/xen-head.S"
+#include "../../x86/xen/xen-head.S"
 
 /*
  * The IDT and GDT 'descriptors' are a strange 48-bit object
index 07c0daf78237bcb1b80ea1401ada31f0f1065cb2..fcf376a37f79c142bb35216234e56e15adf0b41c 100644 (file)
@@ -33,7 +33,7 @@ ELFNOTE_END
  * at boot time we set VDSO_NOTE_NONEGSEG_BIT if running under Xen.
  */
 
-#include "../xen/vdso.h"       /* Defines VDSO_NOTE_NONEGSEG_BIT.  */
+#include "../../x86/xen/vdso.h"        /* Defines VDSO_NOTE_NONEGSEG_BIT.  */
 
        .globl VDSO_NOTE_MASK
 ELFNOTE_START(GNU, 2, "a")
diff --git a/arch/i386/xen/Kconfig b/arch/i386/xen/Kconfig
deleted file mode 100644 (file)
index 9df99e1..0000000
+++ /dev/null
@@ -1,11 +0,0 @@
-#
-# This Kconfig describes xen options
-#
-
-config XEN
-       bool "Enable support for Xen hypervisor"
-       depends on PARAVIRT && X86_CMPXCHG && X86_TSC && !NEED_MULTIPLE_NODES
-       help
-         This is the Linux Xen port.  Enabling this will allow the
-         kernel to boot in a paravirtualized environment under the
-         Xen hypervisor.
diff --git a/arch/i386/xen/Makefile b/arch/i386/xen/Makefile
deleted file mode 100644 (file)
index 343df24..0000000
+++ /dev/null
@@ -1,4 +0,0 @@
-obj-y          := enlighten.o setup.o features.o multicalls.o mmu.o \
-                       events.o time.o manage.o xen-asm.o
-
-obj-$(CONFIG_SMP)      += smp.o
diff --git a/arch/i386/xen/enlighten.c b/arch/i386/xen/enlighten.c
deleted file mode 100644 (file)
index f01bfcd..0000000
+++ /dev/null
@@ -1,1146 +0,0 @@
-/*
- * Core of Xen paravirt_ops implementation.
- *
- * This file contains the xen_paravirt_ops structure itself, and the
- * implementations for:
- * - privileged instructions
- * - interrupt flags
- * - segment operations
- * - booting and setup
- *
- * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
- */
-
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/smp.h>
-#include <linux/preempt.h>
-#include <linux/hardirq.h>
-#include <linux/percpu.h>
-#include <linux/delay.h>
-#include <linux/start_kernel.h>
-#include <linux/sched.h>
-#include <linux/bootmem.h>
-#include <linux/module.h>
-#include <linux/mm.h>
-#include <linux/page-flags.h>
-#include <linux/highmem.h>
-#include <linux/smp.h>
-
-#include <xen/interface/xen.h>
-#include <xen/interface/physdev.h>
-#include <xen/interface/vcpu.h>
-#include <xen/interface/sched.h>
-#include <xen/features.h>
-#include <xen/page.h>
-
-#include <asm/paravirt.h>
-#include <asm/page.h>
-#include <asm/xen/hypercall.h>
-#include <asm/xen/hypervisor.h>
-#include <asm/fixmap.h>
-#include <asm/processor.h>
-#include <asm/setup.h>
-#include <asm/desc.h>
-#include <asm/pgtable.h>
-#include <asm/tlbflush.h>
-#include <asm/reboot.h>
-
-#include "xen-ops.h"
-#include "mmu.h"
-#include "multicalls.h"
-
-EXPORT_SYMBOL_GPL(hypercall_page);
-
-DEFINE_PER_CPU(enum paravirt_lazy_mode, xen_lazy_mode);
-
-DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu);
-DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
-DEFINE_PER_CPU(unsigned long, xen_cr3);
-
-struct start_info *xen_start_info;
-EXPORT_SYMBOL_GPL(xen_start_info);
-
-static /* __initdata */ struct shared_info dummy_shared_info;
-
-/*
- * Point at some empty memory to start with. We map the real shared_info
- * page as soon as fixmap is up and running.
- */
-struct shared_info *HYPERVISOR_shared_info = (void *)&dummy_shared_info;
-
-/*
- * Flag to determine whether vcpu info placement is available on all
- * VCPUs.  We assume it is to start with, and then set it to zero on
- * the first failure.  This is because it can succeed on some VCPUs
- * and not others, since it can involve hypervisor memory allocation,
- * or because the guest failed to guarantee all the appropriate
- * constraints on all VCPUs (ie buffer can't cross a page boundary).
- *
- * Note that any particular CPU may be using a placed vcpu structure,
- * but we can only optimise if they all are.
- *
- * 0: not available, 1: available
- */
-static int have_vcpu_info_placement = 1;
-
-static void __init xen_vcpu_setup(int cpu)
-{
-       struct vcpu_register_vcpu_info info;
-       int err;
-       struct vcpu_info *vcpup;
-
-       per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
-
-       if (!have_vcpu_info_placement)
-               return;         /* already tested, not available */
-
-       vcpup = &per_cpu(xen_vcpu_info, cpu);
-
-       info.mfn = virt_to_mfn(vcpup);
-       info.offset = offset_in_page(vcpup);
-
-       printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %llx, offset %d\n",
-              cpu, vcpup, info.mfn, info.offset);
-
-       /* Check to see if the hypervisor will put the vcpu_info
-          structure where we want it, which allows direct access via
-          a percpu-variable. */
-       err = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_info, cpu, &info);
-
-       if (err) {
-               printk(KERN_DEBUG "register_vcpu_info failed: err=%d\n", err);
-               have_vcpu_info_placement = 0;
-       } else {
-               /* This cpu is using the registered vcpu info, even if
-                  later ones fail to. */
-               per_cpu(xen_vcpu, cpu) = vcpup;
-
-               printk(KERN_DEBUG "cpu %d using vcpu_info at %p\n",
-                      cpu, vcpup);
-       }
-}
-
-static void __init xen_banner(void)
-{
-       printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
-              paravirt_ops.name);
-       printk(KERN_INFO "Hypervisor signature: %s\n", xen_start_info->magic);
-}
-
-static void xen_cpuid(unsigned int *eax, unsigned int *ebx,
-                     unsigned int *ecx, unsigned int *edx)
-{
-       unsigned maskedx = ~0;
-
-       /*
-        * Mask out inconvenient features, to try and disable as many
-        * unsupported kernel subsystems as possible.
-        */
-       if (*eax == 1)
-               maskedx = ~((1 << X86_FEATURE_APIC) |  /* disable APIC */
-                           (1 << X86_FEATURE_ACPI) |  /* disable ACPI */
-                           (1 << X86_FEATURE_ACC));   /* thermal monitoring */
-
-       asm(XEN_EMULATE_PREFIX "cpuid"
-               : "=a" (*eax),
-                 "=b" (*ebx),
-                 "=c" (*ecx),
-                 "=d" (*edx)
-               : "0" (*eax), "2" (*ecx));
-       *edx &= maskedx;
-}
-
-static void xen_set_debugreg(int reg, unsigned long val)
-{
-       HYPERVISOR_set_debugreg(reg, val);
-}
-
-static unsigned long xen_get_debugreg(int reg)
-{
-       return HYPERVISOR_get_debugreg(reg);
-}
-
-static unsigned long xen_save_fl(void)
-{
-       struct vcpu_info *vcpu;
-       unsigned long flags;
-
-       vcpu = x86_read_percpu(xen_vcpu);
-
-       /* flag has opposite sense of mask */
-       flags = !vcpu->evtchn_upcall_mask;
-
-       /* convert to IF type flag
-          -0 -> 0x00000000
-          -1 -> 0xffffffff
-       */
-       return (-flags) & X86_EFLAGS_IF;
-}
-
-static void xen_restore_fl(unsigned long flags)
-{
-       struct vcpu_info *vcpu;
-
-       /* convert from IF type flag */
-       flags = !(flags & X86_EFLAGS_IF);
-
-       /* There's a one instruction preempt window here.  We need to
-          make sure we don't switch CPUs between getting the vcpu
-          pointer and updating the mask. */
-       preempt_disable();
-       vcpu = x86_read_percpu(xen_vcpu);
-       vcpu->evtchn_upcall_mask = flags;
-       preempt_enable_no_resched();
-
-       /* Doesn't matter if we get preempted here, because any
-          pending event will get dealt with anyway. */
-
-       if (flags == 0) {
-               preempt_check_resched();
-               barrier(); /* unmask then check (avoid races) */
-               if (unlikely(vcpu->evtchn_upcall_pending))
-                       force_evtchn_callback();
-       }
-}
-
-static void xen_irq_disable(void)
-{
-       /* There's a one instruction preempt window here.  We need to
-          make sure we don't switch CPUs between getting the vcpu
-          pointer and updating the mask. */
-       preempt_disable();
-       x86_read_percpu(xen_vcpu)->evtchn_upcall_mask = 1;
-       preempt_enable_no_resched();
-}
-
-static void xen_irq_enable(void)
-{
-       struct vcpu_info *vcpu;
-
-       /* There's a one instruction preempt window here.  We need to
-          make sure we don't switch CPUs between getting the vcpu
-          pointer and updating the mask. */
-       preempt_disable();
-       vcpu = x86_read_percpu(xen_vcpu);
-       vcpu->evtchn_upcall_mask = 0;
-       preempt_enable_no_resched();
-
-       /* Doesn't matter if we get preempted here, because any
-          pending event will get dealt with anyway. */
-
-       barrier(); /* unmask then check (avoid races) */
-       if (unlikely(vcpu->evtchn_upcall_pending))
-               force_evtchn_callback();
-}
-
-static void xen_safe_halt(void)
-{
-       /* Blocking includes an implicit local_irq_enable(). */
-       if (HYPERVISOR_sched_op(SCHEDOP_block, 0) != 0)
-               BUG();
-}
-
-static void xen_halt(void)
-{
-       if (irqs_disabled())
-               HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
-       else
-               xen_safe_halt();
-}
-
-static void xen_set_lazy_mode(enum paravirt_lazy_mode mode)
-{
-       BUG_ON(preemptible());
-
-       switch (mode) {
-       case PARAVIRT_LAZY_NONE:
-               BUG_ON(x86_read_percpu(xen_lazy_mode) == PARAVIRT_LAZY_NONE);
-               break;
-
-       case PARAVIRT_LAZY_MMU:
-       case PARAVIRT_LAZY_CPU:
-               BUG_ON(x86_read_percpu(xen_lazy_mode) != PARAVIRT_LAZY_NONE);
-               break;
-
-       case PARAVIRT_LAZY_FLUSH:
-               /* flush if necessary, but don't change state */
-               if (x86_read_percpu(xen_lazy_mode) != PARAVIRT_LAZY_NONE)
-                       xen_mc_flush();
-               return;
-       }
-
-       xen_mc_flush();
-       x86_write_percpu(xen_lazy_mode, mode);
-}
-
-static unsigned long xen_store_tr(void)
-{
-       return 0;
-}
-
-static void xen_set_ldt(const void *addr, unsigned entries)
-{
-       unsigned long linear_addr = (unsigned long)addr;
-       struct mmuext_op *op;
-       struct multicall_space mcs = xen_mc_entry(sizeof(*op));
-
-       op = mcs.args;
-       op->cmd = MMUEXT_SET_LDT;
-       if (linear_addr) {
-               /* ldt may be vmalloced, use arbitrary_virt_to_machine */
-               xmaddr_t maddr;
-               maddr = arbitrary_virt_to_machine((unsigned long)addr);
-               linear_addr = (unsigned long)maddr.maddr;
-       }
-       op->arg1.linear_addr = linear_addr;
-       op->arg2.nr_ents = entries;
-
-       MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
-
-       xen_mc_issue(PARAVIRT_LAZY_CPU);
-}
-
-static void xen_load_gdt(const struct Xgt_desc_struct *dtr)
-{
-       unsigned long *frames;
-       unsigned long va = dtr->address;
-       unsigned int size = dtr->size + 1;
-       unsigned pages = (size + PAGE_SIZE - 1) / PAGE_SIZE;
-       int f;
-       struct multicall_space mcs;
-
-       /* A GDT can be up to 64k in size, which corresponds to 8192
-          8-byte entries, or 16 4k pages. */
-
-       BUG_ON(size > 65536);
-       BUG_ON(va & ~PAGE_MASK);
-
-       mcs = xen_mc_entry(sizeof(*frames) * pages);
-       frames = mcs.args;
-
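-       /* Record the machine frame of each GDT page and make it
-          read-only; Xen requires that GDT frames not be writable
-          by the guest. */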
-       for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) {
-               frames[f] = virt_to_mfn(va);
-               make_lowmem_page_readonly((void *)va);
-       }
-
-       MULTI_set_gdt(mcs.mc, frames, size / sizeof(struct desc_struct));
-
-       xen_mc_issue(PARAVIRT_LAZY_CPU);
-}
-
-static void load_TLS_descriptor(struct thread_struct *t,
-                               unsigned int cpu, unsigned int i)
-{
-       struct desc_struct *gdt = get_cpu_gdt_table(cpu);
-       xmaddr_t maddr = virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
-       struct multicall_space mc = __xen_mc_entry(0);
-
-       MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]);
-}
-
-static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
-{
-       xen_mc_batch();
-
-       load_TLS_descriptor(t, cpu, 0);
-       load_TLS_descriptor(t, cpu, 1);
-       load_TLS_descriptor(t, cpu, 2);
-
-       xen_mc_issue(PARAVIRT_LAZY_CPU);
-
-       /*
-        * XXX sleazy hack: If we're being called in a lazy-cpu zone,
-        * it means we're in a context switch, and %gs has just been
-        * saved.  This means we can zero it out to prevent faults on
-        * exit from the hypervisor if the next process has no %gs.
-        * Either way, it has been saved, and the new value will get
-        * loaded properly.  This will go away as soon as Xen has been
-        * modified to not save/restore %gs for normal hypercalls.
-        */
-       if (xen_get_lazy_mode() == PARAVIRT_LAZY_CPU)
-               loadsegment(gs, 0);
-}
-
-static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
-                               u32 low, u32 high)
-{
-       unsigned long lp = (unsigned long)&dt[entrynum];
-       xmaddr_t mach_lp = virt_to_machine(lp);
-       u64 entry = (u64)high << 32 | low;
-
-       preempt_disable();
-
-       xen_mc_flush();
-       if (HYPERVISOR_update_descriptor(mach_lp.maddr, entry))
-               BUG();
-
-       preempt_enable();
-}
-
-static int cvt_gate_to_trap(int vector, u32 low, u32 high,
-                           struct trap_info *info)
-{
-       u8 type, dpl;
-
-       type = (high >> 8) & 0x1f;
-       dpl = (high >> 13) & 3;
-
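-       /* Only 32-bit trap gates (type 0xf) and interrupt gates
-          (type 0xe) can be expressed as Xen trap_info entries. */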
-       if (type != 0xf && type != 0xe)
-               return 0;
-
-       info->vector = vector;
-       info->address = (high & 0xffff0000) | (low & 0x0000ffff);
-       info->cs = low >> 16;
-       info->flags = dpl;
-       /* interrupt gates clear IF */
-       if (type == 0xe)
-               info->flags |= 4;
-
-       return 1;
-}
-
-/* Locations of each CPU's IDT */
-static DEFINE_PER_CPU(struct Xgt_desc_struct, idt_desc);
-
-/* Set an IDT entry.  If the entry is part of the current IDT, then
-   also update Xen. */
-static void xen_write_idt_entry(struct desc_struct *dt, int entrynum,
-                               u32 low, u32 high)
-{
-       unsigned long p = (unsigned long)&dt[entrynum];
-       unsigned long start, end;
-
-       preempt_disable();
-
-       start = __get_cpu_var(idt_desc).address;
-       end = start + __get_cpu_var(idt_desc).size + 1;
-
-       xen_mc_flush();
-
-       write_dt_entry(dt, entrynum, low, high);
-
-       if (p >= start && (p + 8) <= end) {
-               struct trap_info info[2];
-
-               info[1].address = 0;
-
-               if (cvt_gate_to_trap(entrynum, low, high, &info[0]))
-                       if (HYPERVISOR_set_trap_table(info))
-                               BUG();
-       }
-
-       preempt_enable();
-}
-
-static void xen_convert_trap_info(const struct Xgt_desc_struct *desc,
-                                 struct trap_info *traps)
-{
-       unsigned in, out, count;
-
-       count = (desc->size+1) / 8;
-       BUG_ON(count > 256);
-
-       for (in = out = 0; in < count; in++) {
-               const u32 *entry = (u32 *)(desc->address + in * 8);
-
-               if (cvt_gate_to_trap(in, entry[0], entry[1], &traps[out]))
-                       out++;
-       }
-       traps[out].address = 0;
-}
-
-void xen_copy_trap_info(struct trap_info *traps)
-{
-       const struct Xgt_desc_struct *desc = &__get_cpu_var(idt_desc);
-
-       xen_convert_trap_info(desc, traps);
-}
-
-/* Load a new IDT into Xen.  In principle this can be per-CPU, so we
-   hold a spinlock to protect the static traps[] array (static because
-   it avoids allocation, and saves stack space). */
-static void xen_load_idt(const struct Xgt_desc_struct *desc)
-{
-       static DEFINE_SPINLOCK(lock);
-       static struct trap_info traps[257];
-
-       spin_lock(&lock);
-
-       __get_cpu_var(idt_desc) = *desc;
-
-       xen_convert_trap_info(desc, traps);
-
-       xen_mc_flush();
-       if (HYPERVISOR_set_trap_table(traps))
-               BUG();
-
-       spin_unlock(&lock);
-}
-
-/* Write a GDT descriptor entry.  Ignore LDT descriptors, since
-   they're handled differently. */
-static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
-                               u32 low, u32 high)
-{
-       preempt_disable();
-
-       switch ((high >> 8) & 0xff) {
-       case DESCTYPE_LDT:
-       case DESCTYPE_TSS:
-               /* ignore */
-               break;
-
-       default: {
-               xmaddr_t maddr = virt_to_machine(&dt[entry]);
-               u64 desc = (u64)high << 32 | low;
-
-               xen_mc_flush();
-               if (HYPERVISOR_update_descriptor(maddr.maddr, desc))
-                       BUG();
-       }
-
-       }
-
-       preempt_enable();
-}
-
-static void xen_load_esp0(struct tss_struct *tss,
-                         struct thread_struct *thread)
-{
-       struct multicall_space mcs = xen_mc_entry(0);
-       MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->esp0);
-       xen_mc_issue(PARAVIRT_LAZY_CPU);
-}
-
-static void xen_set_iopl_mask(unsigned mask)
-{
-       struct physdev_set_iopl set_iopl;
-
-       /* Force the change at ring 0. */
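-       /* EFLAGS.IOPL occupies bits 12-13 of the mask; a zero mask
-          is mapped to iopl 1, since the guest kernel itself runs
-          outside ring 0 under Xen. */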
-       set_iopl.iopl = (mask == 0) ? 1 : (mask >> 12) & 3;
-       HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
-}
-
-static void xen_io_delay(void)
-{
-}
-
-#ifdef CONFIG_X86_LOCAL_APIC
-static unsigned long xen_apic_read(unsigned long reg)
-{
-       return 0;
-}
-
-static void xen_apic_write(unsigned long reg, unsigned long val)
-{
-       /* Warn to see if there are any stray references */
-       WARN_ON(1);
-}
-#endif
-
-static void xen_flush_tlb(void)
-{
-       struct mmuext_op *op;
-       struct multicall_space mcs = xen_mc_entry(sizeof(*op));
-
-       op = mcs.args;
-       op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
-       MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
-
-       xen_mc_issue(PARAVIRT_LAZY_MMU);
-}
-
-static void xen_flush_tlb_single(unsigned long addr)
-{
-       struct mmuext_op *op;
-       struct multicall_space mcs = xen_mc_entry(sizeof(*op));
-
-       op = mcs.args;
-       op->cmd = MMUEXT_INVLPG_LOCAL;
-       op->arg1.linear_addr = addr & PAGE_MASK;
-       MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
-
-       xen_mc_issue(PARAVIRT_LAZY_MMU);
-}
-
-static void xen_flush_tlb_others(const cpumask_t *cpus, struct mm_struct *mm,
-                                unsigned long va)
-{
-       struct {
-               struct mmuext_op op;
-               cpumask_t mask;
-       } *args;
-       cpumask_t cpumask = *cpus;
-       struct multicall_space mcs;
-
-       /*
-        * A couple of (to be removed) sanity checks:
-        *
-        * - current CPU must not be in mask
-        * - mask must exist :)
-        */
-       BUG_ON(cpus_empty(cpumask));
-       BUG_ON(cpu_isset(smp_processor_id(), cpumask));
-       BUG_ON(!mm);
-
-       /* If a CPU which we ran on has gone down, OK. */
-       cpus_and(cpumask, cpumask, cpu_online_map);
-       if (cpus_empty(cpumask))
-               return;
-
-       mcs = xen_mc_entry(sizeof(*args));
-       args = mcs.args;
-       args->mask = cpumask;
-       args->op.arg2.vcpumask = &args->mask;
-
-       if (va == TLB_FLUSH_ALL) {
-               args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
-       } else {
-               args->op.cmd = MMUEXT_INVLPG_MULTI;
-               args->op.arg1.linear_addr = va;
-       }
-
-       MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
-
-       xen_mc_issue(PARAVIRT_LAZY_MMU);
-}
-
-static void xen_write_cr2(unsigned long cr2)
-{
-       x86_read_percpu(xen_vcpu)->arch.cr2 = cr2;
-}
-
-static unsigned long xen_read_cr2(void)
-{
-       return x86_read_percpu(xen_vcpu)->arch.cr2;
-}
-
-static unsigned long xen_read_cr2_direct(void)
-{
-       return x86_read_percpu(xen_vcpu_info.arch.cr2);
-}
-
-static void xen_write_cr4(unsigned long cr4)
-{
-       /* Just ignore cr4 changes; Xen doesn't allow us to do
-          anything anyway. */
-}
-
-static unsigned long xen_read_cr3(void)
-{
-       return x86_read_percpu(xen_cr3);
-}
-
-static void xen_write_cr3(unsigned long cr3)
-{
-       BUG_ON(preemptible());
-
-       if (cr3 == x86_read_percpu(xen_cr3)) {
-               /* just a simple tlb flush */
-               xen_flush_tlb();
-               return;
-       }
-
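-       /* Update the cached cr3 before queueing the hypercall so
-          that xen_read_cr3() already returns the new value. */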
-       x86_write_percpu(xen_cr3, cr3);
-
-
-       {
-               struct mmuext_op *op;
-               struct multicall_space mcs = xen_mc_entry(sizeof(*op));
-               unsigned long mfn = pfn_to_mfn(PFN_DOWN(cr3));
-
-               op = mcs.args;
-               op->cmd = MMUEXT_NEW_BASEPTR;
-               op->arg1.mfn = mfn;
-
-               MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
-
-               xen_mc_issue(PARAVIRT_LAZY_CPU);
-       }
-}
-
-/* Early in boot, while setting up the initial pagetable, assume
-   everything is pinned. */
-static __init void xen_alloc_pt_init(struct mm_struct *mm, u32 pfn)
-{
-       BUG_ON(mem_map);        /* should only be used early */
-       make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
-}
-
-/* This needs to make sure the new pte page is pinned iff it's being
-   attached to a pinned pagetable. */
-static void xen_alloc_pt(struct mm_struct *mm, u32 pfn)
-{
-       struct page *page = pfn_to_page(pfn);
-
-       if (PagePinned(virt_to_page(mm->pgd))) {
-               SetPagePinned(page);
-
-               if (!PageHighMem(page))
-                       make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
-               else
-                       /* make sure there are no stray mappings of
-                          this page */
-                       kmap_flush_unused();
-       }
-}
-
-/* This should never happen until we're OK to use struct page */
-static void xen_release_pt(u32 pfn)
-{
-       struct page *page = pfn_to_page(pfn);
-
-       if (PagePinned(page)) {
-               if (!PageHighMem(page))
-                       make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
-       }
-}
-
-#ifdef CONFIG_HIGHPTE
-static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
-{
-       pgprot_t prot = PAGE_KERNEL;
-
-       if (PagePinned(page))
-               prot = PAGE_KERNEL_RO;
-
-       if (0 && PageHighMem(page))
-               printk("mapping highpte %lx type %d prot %s\n",
-                      page_to_pfn(page), type,
-                      (unsigned long)pgprot_val(prot) & _PAGE_RW ? "WRITE" : "READ");
-
-       return kmap_atomic_prot(page, type, prot);
-}
-#endif
-
-static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
-{
-       /* If there's an existing pte, then don't allow _PAGE_RW to be set */
-       if (pte_val_ma(*ptep) & _PAGE_PRESENT)
-               pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
-                              pte_val_ma(pte));
-
-       return pte;
-}
-
-/* Init-time set_pte while constructing initial pagetables, which
-   doesn't allow RO pagetable pages to be remapped RW */
-static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
-{
-       pte = mask_rw_pte(ptep, pte);
-
-       xen_set_pte(ptep, pte);
-}
-
-static __init void xen_pagetable_setup_start(pgd_t *base)
-{
-       pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base;
-
-       /* special set_pte for pagetable initialization */
-       paravirt_ops.set_pte = xen_set_pte_init;
-
-       init_mm.pgd = base;
-       /*
-        * copy top-level of Xen-supplied pagetable into place.  For
-        * !PAE we can use this as-is, but for PAE it is a stand-in
-        * while we copy the pmd pages.
-        */
-       memcpy(base, xen_pgd, PTRS_PER_PGD * sizeof(pgd_t));
-
-       if (PTRS_PER_PMD > 1) {
-               int i;
-               /*
-                * For PAE, need to allocate new pmds, rather than
-                * share Xen's, since Xen doesn't like pmd's being
-                * shared between address spaces.
-                */
-               for (i = 0; i < PTRS_PER_PGD; i++) {
-                       if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) {
-                               pmd_t *pmd = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
-
-                               memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]),
-                                      PAGE_SIZE);
-
-                               make_lowmem_page_readonly(pmd);
-
-                               set_pgd(&base[i], __pgd(1 + __pa(pmd)));
-                       } else
-                               pgd_clear(&base[i]);
-               }
-       }
-
-       /* make sure zero_page is mapped RO so we can use it in pagetables */
-       make_lowmem_page_readonly(empty_zero_page);
-       make_lowmem_page_readonly(base);
-       /*
-        * Switch to new pagetable.  This is done before
-        * pagetable_init has done anything so that the new pages
-        * added to the table can be prepared properly for Xen.
-        */
-       xen_write_cr3(__pa(base));
-}
-
-static __init void xen_pagetable_setup_done(pgd_t *base)
-{
-       /* This will work as long as patching hasn't happened yet
-          (which it hasn't) */
-       paravirt_ops.alloc_pt = xen_alloc_pt;
-       paravirt_ops.set_pte = xen_set_pte;
-
-       if (!xen_feature(XENFEAT_auto_translated_physmap)) {
-               /*
-                * Create a mapping for the shared info page.
-                * Should be set_fixmap(), but shared_info is a machine
-                * address with no corresponding pseudo-phys address.
-                */
-               set_pte_mfn(fix_to_virt(FIX_PARAVIRT_BOOTMAP),
-                           PFN_DOWN(xen_start_info->shared_info),
-                           PAGE_KERNEL);
-
-               HYPERVISOR_shared_info =
-                       (struct shared_info *)fix_to_virt(FIX_PARAVIRT_BOOTMAP);
-
-       } else
-               HYPERVISOR_shared_info =
-                       (struct shared_info *)__va(xen_start_info->shared_info);
-
-       /* Actually pin the pagetable down, but we can't set PG_pinned
-          yet because the page structures don't exist yet. */
-       {
-               struct mmuext_op op;
-#ifdef CONFIG_X86_PAE
-               op.cmd = MMUEXT_PIN_L3_TABLE;
-#else
-               op.cmd = MMUEXT_PIN_L2_TABLE;
-#endif
-               op.arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(base)));
-               if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
-                       BUG();
-       }
-}
-
-/* This is called once we have the cpu_possible_map */
-void __init xen_setup_vcpu_info_placement(void)
-{
-       int cpu;
-
-       for_each_possible_cpu(cpu)
-               xen_vcpu_setup(cpu);
-
-       /* xen_vcpu_setup managed to place the vcpu_info within the
-          percpu area for all cpus, so make use of it */
-       if (have_vcpu_info_placement) {
-               printk(KERN_INFO "Xen: using vcpu_info placement\n");
-
-               paravirt_ops.save_fl = xen_save_fl_direct;
-               paravirt_ops.restore_fl = xen_restore_fl_direct;
-               paravirt_ops.irq_disable = xen_irq_disable_direct;
-               paravirt_ops.irq_enable = xen_irq_enable_direct;
-               paravirt_ops.read_cr2 = xen_read_cr2_direct;
-               paravirt_ops.iret = xen_iret_direct;
-       }
-}
-
-static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
-                         unsigned long addr, unsigned len)
-{
-       char *start, *end, *reloc;
-       unsigned ret;
-
-       start = end = reloc = NULL;
-
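-       /* For each patchable site, substitute the hand-coded "direct"
-          variant (which accesses vcpu_info through the percpu area)
-          when vcpu_info placement is in use; otherwise start stays
-          NULL and we fall back to the default patching below. */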
-#define SITE(x)                                                                \
-       case PARAVIRT_PATCH(x):                                         \
-       if (have_vcpu_info_placement) {                                 \
-               start = (char *)xen_##x##_direct;                       \
-               end = xen_##x##_direct_end;                             \
-               reloc = xen_##x##_direct_reloc;                         \
-       }                                                               \
-       goto patch_site
-
-       switch (type) {
-               SITE(irq_enable);
-               SITE(irq_disable);
-               SITE(save_fl);
-               SITE(restore_fl);
-#undef SITE
-
-       patch_site:
-               if (start == NULL || (end-start) > len)
-                       goto default_patch;
-
-               ret = paravirt_patch_insns(insnbuf, len, start, end);
-
-               /* Note: because reloc is assigned from something that
-                  appears to be an array, gcc assumes it's non-null,
-                  but doesn't know its relationship with start and
-                  end. */
-               if (reloc > start && reloc < end) {
-                       int reloc_off = reloc - start;
-                       long *relocp = (long *)(insnbuf + reloc_off);
-                       long delta = start - (char *)addr;
-
-                       *relocp += delta;
-               }
-               break;
-
-       default_patch:
-       default:
-               ret = paravirt_patch_default(type, clobbers, insnbuf,
-                                            addr, len);
-               break;
-       }
-
-       return ret;
-}
-
-static const struct paravirt_ops xen_paravirt_ops __initdata = {
-       .paravirt_enabled = 1,
-       .shared_kernel_pmd = 0,
-
-       .name = "Xen",
-       .banner = xen_banner,
-
-       .patch = xen_patch,
-
-       .memory_setup = xen_memory_setup,
-       .arch_setup = xen_arch_setup,
-       .init_IRQ = xen_init_IRQ,
-       .post_allocator_init = xen_mark_init_mm_pinned,
-
-       .time_init = xen_time_init,
-       .set_wallclock = xen_set_wallclock,
-       .get_wallclock = xen_get_wallclock,
-       .get_cpu_khz = xen_cpu_khz,
-       .sched_clock = xen_sched_clock,
-
-       .cpuid = xen_cpuid,
-
-       .set_debugreg = xen_set_debugreg,
-       .get_debugreg = xen_get_debugreg,
-
-       .clts = native_clts,
-
-       .read_cr0 = native_read_cr0,
-       .write_cr0 = native_write_cr0,
-
-       .read_cr2 = xen_read_cr2,
-       .write_cr2 = xen_write_cr2,
-
-       .read_cr3 = xen_read_cr3,
-       .write_cr3 = xen_write_cr3,
-
-       .read_cr4 = native_read_cr4,
-       .read_cr4_safe = native_read_cr4_safe,
-       .write_cr4 = xen_write_cr4,
-
-       .save_fl = xen_save_fl,
-       .restore_fl = xen_restore_fl,
-       .irq_disable = xen_irq_disable,
-       .irq_enable = xen_irq_enable,
-       .safe_halt = xen_safe_halt,
-       .halt = xen_halt,
-       .wbinvd = native_wbinvd,
-
-       .read_msr = native_read_msr_safe,
-       .write_msr = native_write_msr_safe,
-       .read_tsc = native_read_tsc,
-       .read_pmc = native_read_pmc,
-
-       .iret = (void *)&hypercall_page[__HYPERVISOR_iret],
-       .irq_enable_sysexit = NULL,  /* never called */
-
-       .load_tr_desc = paravirt_nop,
-       .set_ldt = xen_set_ldt,
-       .load_gdt = xen_load_gdt,
-       .load_idt = xen_load_idt,
-       .load_tls = xen_load_tls,
-
-       .store_gdt = native_store_gdt,
-       .store_idt = native_store_idt,
-       .store_tr = xen_store_tr,
-
-       .write_ldt_entry = xen_write_ldt_entry,
-       .write_gdt_entry = xen_write_gdt_entry,
-       .write_idt_entry = xen_write_idt_entry,
-       .load_esp0 = xen_load_esp0,
-
-       .set_iopl_mask = xen_set_iopl_mask,
-       .io_delay = xen_io_delay,
-
-#ifdef CONFIG_X86_LOCAL_APIC
-       .apic_write = xen_apic_write,
-       .apic_write_atomic = xen_apic_write,
-       .apic_read = xen_apic_read,
-       .setup_boot_clock = paravirt_nop,
-       .setup_secondary_clock = paravirt_nop,
-       .startup_ipi_hook = paravirt_nop,
-#endif
-
-       .flush_tlb_user = xen_flush_tlb,
-       .flush_tlb_kernel = xen_flush_tlb,
-       .flush_tlb_single = xen_flush_tlb_single,
-       .flush_tlb_others = xen_flush_tlb_others,
-
-       .pte_update = paravirt_nop,
-       .pte_update_defer = paravirt_nop,
-
-       .pagetable_setup_start = xen_pagetable_setup_start,
-       .pagetable_setup_done = xen_pagetable_setup_done,
-
-       .alloc_pt = xen_alloc_pt_init,
-       .release_pt = xen_release_pt,
-       .alloc_pd = paravirt_nop,
-       .alloc_pd_clone = paravirt_nop,
-       .release_pd = paravirt_nop,
-
-#ifdef CONFIG_HIGHPTE
-       .kmap_atomic_pte = xen_kmap_atomic_pte,
-#endif
-
-       .set_pte = NULL,        /* see xen_pagetable_setup_* */
-       .set_pte_at = xen_set_pte_at,
-       .set_pmd = xen_set_pmd,
-
-       .pte_val = xen_pte_val,
-       .pgd_val = xen_pgd_val,
-
-       .make_pte = xen_make_pte,
-       .make_pgd = xen_make_pgd,
-
-#ifdef CONFIG_X86_PAE
-       .set_pte_atomic = xen_set_pte_atomic,
-       .set_pte_present = xen_set_pte_at,
-       .set_pud = xen_set_pud,
-       .pte_clear = xen_pte_clear,
-       .pmd_clear = xen_pmd_clear,
-
-       .make_pmd = xen_make_pmd,
-       .pmd_val = xen_pmd_val,
-#endif /* PAE */
-
-       .activate_mm = xen_activate_mm,
-       .dup_mmap = xen_dup_mmap,
-       .exit_mmap = xen_exit_mmap,
-
-       .set_lazy_mode = xen_set_lazy_mode,
-};
-
-#ifdef CONFIG_SMP
-static const struct smp_ops xen_smp_ops __initdata = {
-       .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
-       .smp_prepare_cpus = xen_smp_prepare_cpus,
-       .cpu_up = xen_cpu_up,
-       .smp_cpus_done = xen_smp_cpus_done,
-
-       .smp_send_stop = xen_smp_send_stop,
-       .smp_send_reschedule = xen_smp_send_reschedule,
-       .smp_call_function_mask = xen_smp_call_function_mask,
-};
-#endif /* CONFIG_SMP */
-
-static void xen_reboot(int reason)
-{
-#ifdef CONFIG_SMP
-       smp_send_stop();
-#endif
-
-       if (HYPERVISOR_sched_op(SCHEDOP_shutdown, reason))
-               BUG();
-}
-
-static void xen_restart(char *msg)
-{
-       xen_reboot(SHUTDOWN_reboot);
-}
-
-static void xen_emergency_restart(void)
-{
-       xen_reboot(SHUTDOWN_reboot);
-}
-
-static void xen_machine_halt(void)
-{
-       xen_reboot(SHUTDOWN_poweroff);
-}
-
-static void xen_crash_shutdown(struct pt_regs *regs)
-{
-       xen_reboot(SHUTDOWN_crash);
-}
-
-static const struct machine_ops __initdata xen_machine_ops = {
-       .restart = xen_restart,
-       .halt = xen_machine_halt,
-       .power_off = xen_machine_halt,
-       .shutdown = xen_machine_halt,
-       .crash_shutdown = xen_crash_shutdown,
-       .emergency_restart = xen_emergency_restart,
-};
-
-
-/* First C function to be called on Xen boot */
-asmlinkage void __init xen_start_kernel(void)
-{
-       pgd_t *pgd;
-
-       if (!xen_start_info)
-               return;
-
-       BUG_ON(memcmp(xen_start_info->magic, "xen-3.0", 7) != 0);
-
-       /* Install Xen paravirt ops */
-       paravirt_ops = xen_paravirt_ops;
-       machine_ops = xen_machine_ops;
-
-#ifdef CONFIG_SMP
-       smp_ops = xen_smp_ops;
-#endif
-
-       xen_setup_features();
-
-       /* Get mfn list */
-       if (!xen_feature(XENFEAT_auto_translated_physmap))
-               phys_to_machine_mapping = (unsigned long *)xen_start_info->mfn_list;
-
-       pgd = (pgd_t *)xen_start_info->pt_base;
-
-       init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE;
-
-       init_mm.pgd = pgd; /* use the Xen pagetables to start */
-
-       /* keep using Xen gdt for now; no urgent need to change it */
-
-       x86_write_percpu(xen_cr3, __pa(pgd));
-
-#ifdef CONFIG_SMP
-       /* Don't do the full vcpu_info placement stuff until we have a
-          possible map. */
-       per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];
-#else
-       /* May as well do it now, since there's no good time to call
-          it later on UP. */
-       xen_setup_vcpu_info_placement();
-#endif
-
-       paravirt_ops.kernel_rpl = 1;
-       if (xen_feature(XENFEAT_supervisor_mode_kernel))
-               paravirt_ops.kernel_rpl = 0;
-
-       /* set the limit of our address space */
-       reserve_top_address(-HYPERVISOR_VIRT_START + 2 * PAGE_SIZE);
-
-       /* set up basic CPUID stuff */
-       cpu_detect(&new_cpu_data);
-       new_cpu_data.hard_math = 1;
-       new_cpu_data.x86_capability[0] = cpuid_edx(1);
-
-       /* Poke various useful things into boot_params */
-       LOADER_TYPE = (9 << 4) | 0;
-       INITRD_START = xen_start_info->mod_start ? __pa(xen_start_info->mod_start) : 0;
-       INITRD_SIZE = xen_start_info->mod_len;
-
-       /* Start the world */
-       start_kernel();
-}
diff --git a/arch/i386/xen/events.c b/arch/i386/xen/events.c
deleted file mode 100644 (file)
index da1b173..0000000
+++ /dev/null
@@ -1,591 +0,0 @@
-/*
- * Xen event channels
- *
- * Xen models interrupts with abstract event channels.  Because each
- * domain gets 1024 event channels, but NR_IRQS is not that large, we
- * must dynamically map irqs<->event channels.  The event channels
- * interface with the rest of the kernel by defining a xen interrupt
- * chip.  When an event is received, it is mapped to an irq and sent
- * through the normal interrupt processing path.
- *
- * There are four kinds of events which can be mapped to an event
- * channel:
- *
- * 1. Inter-domain notifications.  This includes all the virtual
- *    device events, since they're driven by front-ends in another domain
- *    (typically dom0).
- * 2. VIRQs, typically used for timers.  These are per-cpu events.
- * 3. IPIs.
- * 4. Hardware interrupts. Not supported at present.
- *
- * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
- */
-
-#include <linux/linkage.h>
-#include <linux/interrupt.h>
-#include <linux/irq.h>
-#include <linux/module.h>
-#include <linux/string.h>
-
-#include <asm/ptrace.h>
-#include <asm/irq.h>
-#include <asm/sync_bitops.h>
-#include <asm/xen/hypercall.h>
-#include <asm/xen/hypervisor.h>
-
-#include <xen/events.h>
-#include <xen/interface/xen.h>
-#include <xen/interface/event_channel.h>
-
-#include "xen-ops.h"
-
-/*
- * This lock protects updates to the following mapping and reference-count
- * arrays. The lock does not need to be acquired to read the mapping tables.
- */
-static DEFINE_SPINLOCK(irq_mapping_update_lock);
-
-/* IRQ <-> VIRQ mapping. */
-static DEFINE_PER_CPU(int, virq_to_irq[NR_VIRQS]) = {[0 ... NR_VIRQS-1] = -1};
-
-/* IRQ <-> IPI mapping */
-static DEFINE_PER_CPU(int, ipi_to_irq[XEN_NR_IPIS]) = {[0 ... XEN_NR_IPIS-1] = -1};
-
-/* Packed IRQ information: binding type, sub-type index, and event channel. */
-struct packed_irq
-{
-       unsigned short evtchn;
-       unsigned char index;
-       unsigned char type;
-};
-
-static struct packed_irq irq_info[NR_IRQS];
-
-/* Binding types. */
-enum {
-       IRQT_UNBOUND,
-       IRQT_PIRQ,
-       IRQT_VIRQ,
-       IRQT_IPI,
-       IRQT_EVTCHN
-};
-
-/* Convenient shorthand for packed representation of an unbound IRQ. */
-#define IRQ_UNBOUND    mk_irq_info(IRQT_UNBOUND, 0, 0)
-
-static int evtchn_to_irq[NR_EVENT_CHANNELS] = {
-       [0 ... NR_EVENT_CHANNELS-1] = -1
-};
-static unsigned long cpu_evtchn_mask[NR_CPUS][NR_EVENT_CHANNELS/BITS_PER_LONG];
-static u8 cpu_evtchn[NR_EVENT_CHANNELS];
-
-/* Reference counts for bindings to IRQs. */
-static int irq_bindcount[NR_IRQS];
-
-/* Xen will never allocate port zero for any purpose. */
-#define VALID_EVTCHN(chn)      ((chn) != 0)
-
-/*
- * Force a proper event-channel callback from Xen after clearing the
- * callback mask. We do this in a very simple manner, by making a call
- * down into Xen. The pending flag will be checked by Xen on return.
- */
-void force_evtchn_callback(void)
-{
-       (void)HYPERVISOR_xen_version(0, NULL);
-}
-EXPORT_SYMBOL_GPL(force_evtchn_callback);
-
-static struct irq_chip xen_dynamic_chip;
-
-/* Constructor for packed IRQ information. */
-static inline struct packed_irq mk_irq_info(u32 type, u32 index, u32 evtchn)
-{
-       return (struct packed_irq) { evtchn, index, type };
-}
-
-/*
- * Accessors for packed IRQ information.
- */
-static inline unsigned int evtchn_from_irq(int irq)
-{
-       return irq_info[irq].evtchn;
-}
-
-static inline unsigned int index_from_irq(int irq)
-{
-       return irq_info[irq].index;
-}
-
-static inline unsigned int type_from_irq(int irq)
-{
-       return irq_info[irq].type;
-}
-
-static inline unsigned long active_evtchns(unsigned int cpu,
-                                          struct shared_info *sh,
-                                          unsigned int idx)
-{
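-       /* An event channel is deliverable on this CPU iff it is
-          pending, bound to this CPU and not masked. */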
-       return (sh->evtchn_pending[idx] &
-               cpu_evtchn_mask[cpu][idx] &
-               ~sh->evtchn_mask[idx]);
-}
-
-static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu)
-{
-       int irq = evtchn_to_irq[chn];
-
-       BUG_ON(irq == -1);
-#ifdef CONFIG_SMP
-       irq_desc[irq].affinity = cpumask_of_cpu(cpu);
-#endif
-
-       __clear_bit(chn, cpu_evtchn_mask[cpu_evtchn[chn]]);
-       __set_bit(chn, cpu_evtchn_mask[cpu]);
-
-       cpu_evtchn[chn] = cpu;
-}
-
-static void init_evtchn_cpu_bindings(void)
-{
-#ifdef CONFIG_SMP
-       int i;
-       /* By default all event channels notify CPU#0. */
-       for (i = 0; i < NR_IRQS; i++)
-               irq_desc[i].affinity = cpumask_of_cpu(0);
-#endif
-
-       memset(cpu_evtchn, 0, sizeof(cpu_evtchn));
-       memset(cpu_evtchn_mask[0], ~0, sizeof(cpu_evtchn_mask[0]));
-}
-
-static inline unsigned int cpu_from_evtchn(unsigned int evtchn)
-{
-       return cpu_evtchn[evtchn];
-}
-
-static inline void clear_evtchn(int port)
-{
-       struct shared_info *s = HYPERVISOR_shared_info;
-       sync_clear_bit(port, &s->evtchn_pending[0]);
-}
-
-static inline void set_evtchn(int port)
-{
-       struct shared_info *s = HYPERVISOR_shared_info;
-       sync_set_bit(port, &s->evtchn_pending[0]);
-}
-
-
-/**
- * notify_remote_via_irq - send event to remote end of event channel via irq
- * @irq: irq of event channel to send event to
- *
- * Unlike notify_remote_via_evtchn(), this is safe to use across
- * save/restore. Notifications on a broken connection are silently
- * dropped.
- */
-void notify_remote_via_irq(int irq)
-{
-       int evtchn = evtchn_from_irq(irq);
-
-       if (VALID_EVTCHN(evtchn))
-               notify_remote_via_evtchn(evtchn);
-}
-EXPORT_SYMBOL_GPL(notify_remote_via_irq);
-
-static void mask_evtchn(int port)
-{
-       struct shared_info *s = HYPERVISOR_shared_info;
-       sync_set_bit(port, &s->evtchn_mask[0]);
-}
-
-static void unmask_evtchn(int port)
-{
-       struct shared_info *s = HYPERVISOR_shared_info;
-       unsigned int cpu = get_cpu();
-
-       BUG_ON(!irqs_disabled());
-
-       /* Slow path (hypercall) if this is a non-local port. */
-       if (unlikely(cpu != cpu_from_evtchn(port))) {
-               struct evtchn_unmask unmask = { .port = port };
-               (void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask);
-       } else {
-               struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu);
-
-               sync_clear_bit(port, &s->evtchn_mask[0]);
-
-               /*
-                * The following is basically the equivalent of
-                * 'hw_resend_irq'. Just like a real IO-APIC we 'lose
-                * the interrupt edge' if the channel is masked.
-                */
-               if (sync_test_bit(port, &s->evtchn_pending[0]) &&
-                   !sync_test_and_set_bit(port / BITS_PER_LONG,
-                                          &vcpu_info->evtchn_pending_sel))
-                       vcpu_info->evtchn_upcall_pending = 1;
-       }
-
-       put_cpu();
-}
-
-static int find_unbound_irq(void)
-{
-       int irq;
-
-       /* Only allocate from dynirq range */
-       for (irq = 0; irq < NR_IRQS; irq++)
-               if (irq_bindcount[irq] == 0)
-                       break;
-
-       if (irq == NR_IRQS)
-               panic("No available IRQ to bind to: increase NR_IRQS!\n");
-
-       return irq;
-}
-
-int bind_evtchn_to_irq(unsigned int evtchn)
-{
-       int irq;
-
-       spin_lock(&irq_mapping_update_lock);
-
-       irq = evtchn_to_irq[evtchn];
-
-       if (irq == -1) {
-               irq = find_unbound_irq();
-
-               dynamic_irq_init(irq);
-               set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
-                                             handle_level_irq, "event");
-
-               evtchn_to_irq[evtchn] = irq;
-               irq_info[irq] = mk_irq_info(IRQT_EVTCHN, 0, evtchn);
-       }
-
-       irq_bindcount[irq]++;
-
-       spin_unlock(&irq_mapping_update_lock);
-
-       return irq;
-}
-EXPORT_SYMBOL_GPL(bind_evtchn_to_irq);
-
-static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu)
-{
-       struct evtchn_bind_ipi bind_ipi;
-       int evtchn, irq;
-
-       spin_lock(&irq_mapping_update_lock);
-
-       irq = per_cpu(ipi_to_irq, cpu)[ipi];
-       if (irq == -1) {
-               irq = find_unbound_irq();
-               if (irq < 0)
-                       goto out;
-
-               dynamic_irq_init(irq);
-               set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
-                                             handle_level_irq, "ipi");
-
-               bind_ipi.vcpu = cpu;
-               if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi,
-                                               &bind_ipi) != 0)
-                       BUG();
-               evtchn = bind_ipi.port;
-
-               evtchn_to_irq[evtchn] = irq;
-               irq_info[irq] = mk_irq_info(IRQT_IPI, ipi, evtchn);
-
-               per_cpu(ipi_to_irq, cpu)[ipi] = irq;
-
-               bind_evtchn_to_cpu(evtchn, cpu);
-       }
-
-       irq_bindcount[irq]++;
-
- out:
-       spin_unlock(&irq_mapping_update_lock);
-       return irq;
-}
-
-
-static int bind_virq_to_irq(unsigned int virq, unsigned int cpu)
-{
-       struct evtchn_bind_virq bind_virq;
-       int evtchn, irq;
-
-       spin_lock(&irq_mapping_update_lock);
-
-       irq = per_cpu(virq_to_irq, cpu)[virq];
-
-       if (irq == -1) {
-               bind_virq.virq = virq;
-               bind_virq.vcpu = cpu;
-               if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq,
-                                               &bind_virq) != 0)
-                       BUG();
-               evtchn = bind_virq.port;
-
-               irq = find_unbound_irq();
-
-               dynamic_irq_init(irq);
-               set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
-                                             handle_level_irq, "virq");
-
-               evtchn_to_irq[evtchn] = irq;
-               irq_info[irq] = mk_irq_info(IRQT_VIRQ, virq, evtchn);
-
-               per_cpu(virq_to_irq, cpu)[virq] = irq;
-
-               bind_evtchn_to_cpu(evtchn, cpu);
-       }
-
-       irq_bindcount[irq]++;
-
-       spin_unlock(&irq_mapping_update_lock);
-
-       return irq;
-}
-
-static void unbind_from_irq(unsigned int irq)
-{
-       struct evtchn_close close;
-       int evtchn = evtchn_from_irq(irq);
-
-       spin_lock(&irq_mapping_update_lock);
-
-       if (VALID_EVTCHN(evtchn) && (--irq_bindcount[irq] == 0)) {
-               close.port = evtchn;
-               if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0)
-                       BUG();
-
-               switch (type_from_irq(irq)) {
-               case IRQT_VIRQ:
-                       per_cpu(virq_to_irq, cpu_from_evtchn(evtchn))
-                               [index_from_irq(irq)] = -1;
-                       break;
-               default:
-                       break;
-               }
-
-               /* Closed ports are implicitly re-bound to VCPU0. */
-               bind_evtchn_to_cpu(evtchn, 0);
-
-               evtchn_to_irq[evtchn] = -1;
-               irq_info[irq] = IRQ_UNBOUND;
-
-               dynamic_irq_init(irq);
-       }
-
-       spin_unlock(&irq_mapping_update_lock);
-}
-
-int bind_evtchn_to_irqhandler(unsigned int evtchn,
-                             irqreturn_t (*handler)(int, void *),
-                             unsigned long irqflags,
-                             const char *devname, void *dev_id)
-{
-       unsigned int irq;
-       int retval;
-
-       irq = bind_evtchn_to_irq(evtchn);
-       retval = request_irq(irq, handler, irqflags, devname, dev_id);
-       if (retval != 0) {
-               unbind_from_irq(irq);
-               return retval;
-       }
-
-       return irq;
-}
-EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler);
-
-int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu,
-                           irqreturn_t (*handler)(int, void *),
-                           unsigned long irqflags, const char *devname, void *dev_id)
-{
-       unsigned int irq;
-       int retval;
-
-       irq = bind_virq_to_irq(virq, cpu);
-       retval = request_irq(irq, handler, irqflags, devname, dev_id);
-       if (retval != 0) {
-               unbind_from_irq(irq);
-               return retval;
-       }
-
-       return irq;
-}
-EXPORT_SYMBOL_GPL(bind_virq_to_irqhandler);
-
-int bind_ipi_to_irqhandler(enum ipi_vector ipi,
-                          unsigned int cpu,
-                          irq_handler_t handler,
-                          unsigned long irqflags,
-                          const char *devname,
-                          void *dev_id)
-{
-       int irq, retval;
-
-       irq = bind_ipi_to_irq(ipi, cpu);
-       if (irq < 0)
-               return irq;
-
-       retval = request_irq(irq, handler, irqflags, devname, dev_id);
-       if (retval != 0) {
-               unbind_from_irq(irq);
-               return retval;
-       }
-
-       return irq;
-}
-
-void unbind_from_irqhandler(unsigned int irq, void *dev_id)
-{
-       free_irq(irq, dev_id);
-       unbind_from_irq(irq);
-}
-EXPORT_SYMBOL_GPL(unbind_from_irqhandler);
-
-void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector)
-{
-       int irq = per_cpu(ipi_to_irq, cpu)[vector];
-       BUG_ON(irq < 0);
-       notify_remote_via_irq(irq);
-}
-
-
-/*
- * Search the CPU's pending-event bitmasks.  For each one found, map
- * the event number to an irq, and feed it into do_IRQ() for
- * handling.
- *
- * Xen uses a two-level bitmap to speed searching.  The first level is
- * a bitset of words which contain pending event bits.  The second
- * level is a bitset of pending events themselves.
- */
-fastcall void xen_evtchn_do_upcall(struct pt_regs *regs)
-{
-       int cpu = get_cpu();
-       struct shared_info *s = HYPERVISOR_shared_info;
-       struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu);
-       unsigned long pending_words;
-
-       vcpu_info->evtchn_upcall_pending = 0;
-
-       /* NB. No need for a barrier here -- XCHG is a barrier on x86. */
-       pending_words = xchg(&vcpu_info->evtchn_pending_sel, 0);
-       while (pending_words != 0) {
-               unsigned long pending_bits;
-               int word_idx = __ffs(pending_words);
-               pending_words &= ~(1UL << word_idx);
-
-               while ((pending_bits = active_evtchns(cpu, s, word_idx)) != 0) {
-                       int bit_idx = __ffs(pending_bits);
-                       int port = (word_idx * BITS_PER_LONG) + bit_idx;
-                       int irq = evtchn_to_irq[port];
-
-                       if (irq != -1) {
-                               regs->orig_eax = ~irq;
-                               do_IRQ(regs);
-                       }
-               }
-       }
-
-       put_cpu();
-}
-
-/* Rebind an evtchn so that it gets delivered to a specific cpu */
-static void rebind_irq_to_cpu(unsigned irq, unsigned tcpu)
-{
-       struct evtchn_bind_vcpu bind_vcpu;
-       int evtchn = evtchn_from_irq(irq);
-
-       if (!VALID_EVTCHN(evtchn))
-               return;
-
-       /* Send future instances of this interrupt to the target vcpu. */
-       bind_vcpu.port = evtchn;
-       bind_vcpu.vcpu = tcpu;
-
-       /*
-        * If this fails, it usually just indicates that we're dealing with a
-        * virq or IPI channel, which don't actually need to be rebound. Ignore
-        * it, but don't do the xenlinux-level rebind in that case.
-        */
-       if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) >= 0)
-               bind_evtchn_to_cpu(evtchn, tcpu);
-}
-
-
-static void set_affinity_irq(unsigned irq, cpumask_t dest)
-{
-       unsigned tcpu = first_cpu(dest);
-       rebind_irq_to_cpu(irq, tcpu);
-}
-
-static void enable_dynirq(unsigned int irq)
-{
-       int evtchn = evtchn_from_irq(irq);
-
-       if (VALID_EVTCHN(evtchn))
-               unmask_evtchn(evtchn);
-}
-
-static void disable_dynirq(unsigned int irq)
-{
-       int evtchn = evtchn_from_irq(irq);
-
-       if (VALID_EVTCHN(evtchn))
-               mask_evtchn(evtchn);
-}
-
-static void ack_dynirq(unsigned int irq)
-{
-       int evtchn = evtchn_from_irq(irq);
-
-       move_native_irq(irq);
-
-       if (VALID_EVTCHN(evtchn))
-               clear_evtchn(evtchn);
-}
-
-static int retrigger_dynirq(unsigned int irq)
-{
-       int evtchn = evtchn_from_irq(irq);
-       int ret = 0;
-
-       if (VALID_EVTCHN(evtchn)) {
-               set_evtchn(evtchn);
-               ret = 1;
-       }
-
-       return ret;
-}
-
-static struct irq_chip xen_dynamic_chip __read_mostly = {
-       .name           = "xen-dyn",
-       .mask           = disable_dynirq,
-       .unmask         = enable_dynirq,
-       .ack            = ack_dynirq,
-       .set_affinity   = set_affinity_irq,
-       .retrigger      = retrigger_dynirq,
-};
-
-void __init xen_init_IRQ(void)
-{
-       int i;
-
-       init_evtchn_cpu_bindings();
-
-       /* No event channels are 'live' right now. */
-       for (i = 0; i < NR_EVENT_CHANNELS; i++)
-               mask_evtchn(i);
-
-       /* Dynamic IRQ space is currently unbound. Zero the refcnts. */
-       for (i = 0; i < NR_IRQS; i++)
-               irq_bindcount[i] = 0;
-
-       irq_ctx_init(smp_processor_id());
-}
diff --git a/arch/i386/xen/features.c b/arch/i386/xen/features.c
deleted file mode 100644 (file)
index 0707714..0000000
+++ /dev/null
@@ -1,29 +0,0 @@
-/******************************************************************************
- * features.c
- *
- * Xen feature flags.
- *
- * Copyright (c) 2006, Ian Campbell, XenSource Inc.
- */
-#include <linux/types.h>
-#include <linux/cache.h>
-#include <linux/module.h>
-#include <asm/xen/hypervisor.h>
-#include <xen/features.h>
-
-u8 xen_features[XENFEAT_NR_SUBMAPS * 32] __read_mostly;
-EXPORT_SYMBOL_GPL(xen_features);
-
-void xen_setup_features(void)
-{
-       struct xen_feature_info fi;
-       int i, j;
-
-       for (i = 0; i < XENFEAT_NR_SUBMAPS; i++) {
-               fi.submap_idx = i;
-               if (HYPERVISOR_xen_version(XENVER_get_features, &fi) < 0)
-                       break;
-               for (j = 0; j < 32; j++)
-                       xen_features[i * 32 + j] = !!(fi.submap & 1<<j);
-       }
-}
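
The byte array filled in above is what the xen_feature() tests used elsewhere in this series (e.g. in mmu.c and setup.c) index into. A minimal sketch of such an accessor, assuming it matches the helper declared in xen/features.h (which is outside this diff):

        /* Illustrative sketch: non-zero iff the hypervisor advertised the flag. */
        static inline int xen_feature(int flag)
        {
                return xen_features[flag];
        }
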
diff --git a/arch/i386/xen/manage.c b/arch/i386/xen/manage.c
deleted file mode 100644 (file)
index aa7af9e..0000000
+++ /dev/null
@@ -1,143 +0,0 @@
-/*
- * Handle extern requests for shutdown, reboot and sysrq
- */
-#include <linux/kernel.h>
-#include <linux/err.h>
-#include <linux/reboot.h>
-#include <linux/sysrq.h>
-
-#include <xen/xenbus.h>
-
-#define SHUTDOWN_INVALID  -1
-#define SHUTDOWN_POWEROFF  0
-#define SHUTDOWN_SUSPEND   2
-/* Code 3 is SHUTDOWN_CRASH, which we don't use because the domain can only
- * report a crash, not be instructed to crash!
- * HALT is the same as POWEROFF, as far as we're concerned.  The tools use
- * the distinction when we return the reason code to them.
- */
-#define SHUTDOWN_HALT      4
-
-/* Ignore multiple shutdown requests. */
-static int shutting_down = SHUTDOWN_INVALID;
-
-static void shutdown_handler(struct xenbus_watch *watch,
-                            const char **vec, unsigned int len)
-{
-       char *str;
-       struct xenbus_transaction xbt;
-       int err;
-
-       if (shutting_down != SHUTDOWN_INVALID)
-               return;
-
- again:
-       err = xenbus_transaction_start(&xbt);
-       if (err)
-               return;
-
-       str = (char *)xenbus_read(xbt, "control", "shutdown", NULL);
-       /* Ignore read errors and empty reads. */
-       if (XENBUS_IS_ERR_READ(str)) {
-               xenbus_transaction_end(xbt, 1);
-               return;
-       }
-
-       xenbus_write(xbt, "control", "shutdown", "");
-
-       err = xenbus_transaction_end(xbt, 0);
-       if (err == -EAGAIN) {
-               kfree(str);
-               goto again;
-       }
-
-       if (strcmp(str, "poweroff") == 0 ||
-           strcmp(str, "halt") == 0)
-               orderly_poweroff(false);
-       else if (strcmp(str, "reboot") == 0)
-               ctrl_alt_del();
-       else {
-               printk(KERN_INFO "Ignoring shutdown request: %s\n", str);
-               shutting_down = SHUTDOWN_INVALID;
-       }
-
-       kfree(str);
-}
-
-static void sysrq_handler(struct xenbus_watch *watch, const char **vec,
-                         unsigned int len)
-{
-       char sysrq_key = '\0';
-       struct xenbus_transaction xbt;
-       int err;
-
- again:
-       err = xenbus_transaction_start(&xbt);
-       if (err)
-               return;
-       if (!xenbus_scanf(xbt, "control", "sysrq", "%c", &sysrq_key)) {
-               printk(KERN_ERR "Unable to read sysrq code in "
-                      "control/sysrq\n");
-               xenbus_transaction_end(xbt, 1);
-               return;
-       }
-
-       if (sysrq_key != '\0')
-               xenbus_printf(xbt, "control", "sysrq", "%c", '\0');
-
-       err = xenbus_transaction_end(xbt, 0);
-       if (err == -EAGAIN)
-               goto again;
-
-       if (sysrq_key != '\0')
-               handle_sysrq(sysrq_key, NULL);
-}
-
-static struct xenbus_watch shutdown_watch = {
-       .node = "control/shutdown",
-       .callback = shutdown_handler
-};
-
-static struct xenbus_watch sysrq_watch = {
-       .node = "control/sysrq",
-       .callback = sysrq_handler
-};
-
-static int setup_shutdown_watcher(void)
-{
-       int err;
-
-       err = register_xenbus_watch(&shutdown_watch);
-       if (err) {
-               printk(KERN_ERR "Failed to set shutdown watcher\n");
-               return err;
-       }
-
-       err = register_xenbus_watch(&sysrq_watch);
-       if (err) {
-               printk(KERN_ERR "Failed to set sysrq watcher\n");
-               return err;
-       }
-
-       return 0;
-}
-
-static int shutdown_event(struct notifier_block *notifier,
-                         unsigned long event,
-                         void *data)
-{
-       setup_shutdown_watcher();
-       return NOTIFY_DONE;
-}
-
-static int __init setup_shutdown_event(void)
-{
-       static struct notifier_block xenstore_notifier = {
-               .notifier_call = shutdown_event
-       };
-       register_xenstore_notifier(&xenstore_notifier);
-
-       return 0;
-}
-
-subsys_initcall(setup_shutdown_event);
diff --git a/arch/i386/xen/mmu.c b/arch/i386/xen/mmu.c
deleted file mode 100644 (file)
index 874db0c..0000000
+++ /dev/null
@@ -1,567 +0,0 @@
-/*
- * Xen mmu operations
- *
- * This file contains the various mmu fetch and update operations.
- * The most important job they must perform is the mapping between the
- * domain's pfns and the machine's mfns.
- *
- * Xen allows guests to directly update the pagetable, in a controlled
- * fashion.  In other words, the guest modifies the same pagetable
- * that the CPU actually uses, which eliminates the overhead of having
- * a separate shadow pagetable.
- *
- * In order to allow this, it falls on the guest domain to map its
- * notion of a "physical" pfn - which is just a domain-local linear
- * address - into a real "machine address" which the CPU's MMU can
- * use.
- *
- * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be
- * inserted directly into the pagetable.  When creating a new
- * pte/pmd/pgd, it converts the passed pfn into an mfn.  Conversely,
- * when reading the content back with __(pgd|pmd|pte)_val, it converts
- * the mfn back into a pfn.
- *
- * The other constraint is that all pages which make up a pagetable
- * must be mapped read-only in the guest.  This prevents uncontrolled
- * guest updates to the pagetable.  Xen strictly enforces this, and
- * will disallow any pagetable update which will end up mapping a
- * pagetable page RW, and will disallow using any writable page as a
- * pagetable.
- *
- * Naively, when loading %cr3 with the base of a new pagetable, Xen
- * would need to validate the whole pagetable before going on.
- * Naturally, this is quite slow.  The solution is to "pin" a
- * pagetable, which enforces all the constraints on the pagetable even
- * when it is not actively in use.  This means that Xen can be assured
- * that it is still valid when you do load it into %cr3, and doesn't
- * need to revalidate it.
- *
- * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
- */
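
As a concrete illustration of the pfn-to-mfn translation described above (a hedged sketch, not part of the original file; the function name is made up), a kernel mapping for a domain-local pfn is built by converting it to an mfn first, using the pfn_to_mfn() and mfn_pte() helpers this file already relies on:

        /* The hardware MMU only understands machine frames, so the pfn is
         * translated before it is written into the pte. */
        static pte_t example_kernel_pte(unsigned long pfn)
        {
                return mfn_pte(pfn_to_mfn(pfn), PAGE_KERNEL);
        }
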
-#include <linux/sched.h>
-#include <linux/highmem.h>
-#include <linux/bug.h>
-#include <linux/sched.h>
-
-#include <asm/pgtable.h>
-#include <asm/tlbflush.h>
-#include <asm/mmu_context.h>
-#include <asm/paravirt.h>
-
-#include <asm/xen/hypercall.h>
-#include <asm/xen/hypervisor.h>
-
-#include <xen/page.h>
-#include <xen/interface/xen.h>
-
-#include "multicalls.h"
-#include "mmu.h"
-
-xmaddr_t arbitrary_virt_to_machine(unsigned long address)
-{
-       pte_t *pte = lookup_address(address);
-       unsigned offset = address & PAGE_MASK;
-
-       BUG_ON(pte == NULL);
-
-       return XMADDR((pte_mfn(*pte) << PAGE_SHIFT) + offset);
-}
-
-void make_lowmem_page_readonly(void *vaddr)
-{
-       pte_t *pte, ptev;
-       unsigned long address = (unsigned long)vaddr;
-
-       pte = lookup_address(address);
-       BUG_ON(pte == NULL);
-
-       ptev = pte_wrprotect(*pte);
-
-       if (HYPERVISOR_update_va_mapping(address, ptev, 0))
-               BUG();
-}
-
-void make_lowmem_page_readwrite(void *vaddr)
-{
-       pte_t *pte, ptev;
-       unsigned long address = (unsigned long)vaddr;
-
-       pte = lookup_address(address);
-       BUG_ON(pte == NULL);
-
-       ptev = pte_mkwrite(*pte);
-
-       if (HYPERVISOR_update_va_mapping(address, ptev, 0))
-               BUG();
-}
-
-
-void xen_set_pmd(pmd_t *ptr, pmd_t val)
-{
-       struct multicall_space mcs;
-       struct mmu_update *u;
-
-       preempt_disable();
-
-       mcs = xen_mc_entry(sizeof(*u));
-       u = mcs.args;
-       u->ptr = virt_to_machine(ptr).maddr;
-       u->val = pmd_val_ma(val);
-       MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF);
-
-       xen_mc_issue(PARAVIRT_LAZY_MMU);
-
-       preempt_enable();
-}
-
-/*
- * Associate a virtual page frame with a given physical page frame
- * and protection flags for that frame.
- */
-void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
-{
-       pgd_t *pgd;
-       pud_t *pud;
-       pmd_t *pmd;
-       pte_t *pte;
-
-       pgd = swapper_pg_dir + pgd_index(vaddr);
-       if (pgd_none(*pgd)) {
-               BUG();
-               return;
-       }
-       pud = pud_offset(pgd, vaddr);
-       if (pud_none(*pud)) {
-               BUG();
-               return;
-       }
-       pmd = pmd_offset(pud, vaddr);
-       if (pmd_none(*pmd)) {
-               BUG();
-               return;
-       }
-       pte = pte_offset_kernel(pmd, vaddr);
-       /* <mfn,flags> stored as-is, to permit clearing entries */
-       xen_set_pte(pte, mfn_pte(mfn, flags));
-
-       /*
-        * It's enough to flush this one mapping.
-        * (PGE mappings get flushed as well)
-        */
-       __flush_tlb_one(vaddr);
-}
-
-void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
-                   pte_t *ptep, pte_t pteval)
-{
-       if (mm == current->mm || mm == &init_mm) {
-               if (xen_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
-                       struct multicall_space mcs;
-                       mcs = xen_mc_entry(0);
-
-                       MULTI_update_va_mapping(mcs.mc, addr, pteval, 0);
-                       xen_mc_issue(PARAVIRT_LAZY_MMU);
-                       return;
-               } else
-                       if (HYPERVISOR_update_va_mapping(addr, pteval, 0) == 0)
-                               return;
-       }
-       xen_set_pte(ptep, pteval);
-}
-
-#ifdef CONFIG_X86_PAE
-void xen_set_pud(pud_t *ptr, pud_t val)
-{
-       struct multicall_space mcs;
-       struct mmu_update *u;
-
-       preempt_disable();
-
-       mcs = xen_mc_entry(sizeof(*u));
-       u = mcs.args;
-       u->ptr = virt_to_machine(ptr).maddr;
-       u->val = pud_val_ma(val);
-       MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF);
-
-       xen_mc_issue(PARAVIRT_LAZY_MMU);
-
-       preempt_enable();
-}
-
-void xen_set_pte(pte_t *ptep, pte_t pte)
-{
-       ptep->pte_high = pte.pte_high;
-       smp_wmb();
-       ptep->pte_low = pte.pte_low;
-}
-
-void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
-{
-       set_64bit((u64 *)ptep, pte_val_ma(pte));
-}
-
-void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
-{
-       ptep->pte_low = 0;
-       smp_wmb();              /* make sure low gets written first */
-       ptep->pte_high = 0;
-}
-
-void xen_pmd_clear(pmd_t *pmdp)
-{
-       xen_set_pmd(pmdp, __pmd(0));
-}
-
-unsigned long long xen_pte_val(pte_t pte)
-{
-       unsigned long long ret = 0;
-
-       if (pte.pte_low) {
-               ret = ((unsigned long long)pte.pte_high << 32) | pte.pte_low;
-               ret = machine_to_phys(XMADDR(ret)).paddr | 1;
-       }
-
-       return ret;
-}
-
-unsigned long long xen_pmd_val(pmd_t pmd)
-{
-       unsigned long long ret = pmd.pmd;
-       if (ret)
-               ret = machine_to_phys(XMADDR(ret)).paddr | 1;
-       return ret;
-}
-
-unsigned long long xen_pgd_val(pgd_t pgd)
-{
-       unsigned long long ret = pgd.pgd;
-       if (ret)
-               ret = machine_to_phys(XMADDR(ret)).paddr | 1;
-       return ret;
-}
-
-pte_t xen_make_pte(unsigned long long pte)
-{
-       if (pte & 1)
-               pte = phys_to_machine(XPADDR(pte)).maddr;
-
-       return (pte_t){ pte, pte >> 32 };
-}
-
-pmd_t xen_make_pmd(unsigned long long pmd)
-{
-       if (pmd & 1)
-               pmd = phys_to_machine(XPADDR(pmd)).maddr;
-
-       return (pmd_t){ pmd };
-}
-
-pgd_t xen_make_pgd(unsigned long long pgd)
-{
-       if (pgd & _PAGE_PRESENT)
-               pgd = phys_to_machine(XPADDR(pgd)).maddr;
-
-       return (pgd_t){ pgd };
-}
-#else  /* !PAE */
-void xen_set_pte(pte_t *ptep, pte_t pte)
-{
-       *ptep = pte;
-}
-
-unsigned long xen_pte_val(pte_t pte)
-{
-       unsigned long ret = pte.pte_low;
-
-       if (ret & _PAGE_PRESENT)
-               ret = machine_to_phys(XMADDR(ret)).paddr;
-
-       return ret;
-}
-
-unsigned long xen_pgd_val(pgd_t pgd)
-{
-       unsigned long ret = pgd.pgd;
-       if (ret)
-               ret = machine_to_phys(XMADDR(ret)).paddr | 1;
-       return ret;
-}
-
-pte_t xen_make_pte(unsigned long pte)
-{
-       if (pte & _PAGE_PRESENT)
-               pte = phys_to_machine(XPADDR(pte)).maddr;
-
-       return (pte_t){ pte };
-}
-
-pgd_t xen_make_pgd(unsigned long pgd)
-{
-       if (pgd & _PAGE_PRESENT)
-               pgd = phys_to_machine(XPADDR(pgd)).maddr;
-
-       return (pgd_t){ pgd };
-}
-#endif /* CONFIG_X86_PAE */
-
-
-
-/*
-  (Yet another) pagetable walker.  This one is intended for pinning a
-  pagetable.  This means that it walks a pagetable and calls the
-  callback function on each page it finds making up the page table,
-  at every level.  It walks the entire pagetable, but it only bothers
-  pinning pte pages which are below pte_limit.  In the normal case
-  this will be TASK_SIZE, but at boot we need to pin up to
-  FIXADDR_TOP.  But the important bit is that we don't pin beyond
-  there, because then we start getting into Xen's ptes.
-*/
-static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned),
-                   unsigned long limit)
-{
-       pgd_t *pgd = pgd_base;
-       int flush = 0;
-       unsigned long addr = 0;
-       unsigned long pgd_next;
-
-       BUG_ON(limit > FIXADDR_TOP);
-
-       if (xen_feature(XENFEAT_auto_translated_physmap))
-               return 0;
-
-       for (; addr != FIXADDR_TOP; pgd++, addr = pgd_next) {
-               pud_t *pud;
-               unsigned long pud_limit, pud_next;
-
-               pgd_next = pud_limit = pgd_addr_end(addr, FIXADDR_TOP);
-
-               if (!pgd_val(*pgd))
-                       continue;
-
-               pud = pud_offset(pgd, 0);
-
-               if (PTRS_PER_PUD > 1) /* not folded */
-                       flush |= (*func)(virt_to_page(pud), 0);
-
-               for (; addr != pud_limit; pud++, addr = pud_next) {
-                       pmd_t *pmd;
-                       unsigned long pmd_limit;
-
-                       pud_next = pud_addr_end(addr, pud_limit);
-
-                       if (pud_next < limit)
-                               pmd_limit = pud_next;
-                       else
-                               pmd_limit = limit;
-
-                       if (pud_none(*pud))
-                               continue;
-
-                       pmd = pmd_offset(pud, 0);
-
-                       if (PTRS_PER_PMD > 1) /* not folded */
-                               flush |= (*func)(virt_to_page(pmd), 0);
-
-                       for (; addr != pmd_limit; pmd++) {
-                               addr += (PAGE_SIZE * PTRS_PER_PTE);
-                               if ((pmd_limit-1) < (addr-1)) {
-                                       addr = pmd_limit;
-                                       break;
-                               }
-
-                               if (pmd_none(*pmd))
-                                       continue;
-
-                               flush |= (*func)(pmd_page(*pmd), 0);
-                       }
-               }
-       }
-
-       flush |= (*func)(virt_to_page(pgd_base), UVMF_TLB_FLUSH);
-
-       return flush;
-}
-
-static int pin_page(struct page *page, unsigned flags)
-{
-       unsigned pgfl = test_and_set_bit(PG_pinned, &page->flags);
-       int flush;
-
-       if (pgfl)
-               flush = 0;              /* already pinned */
-       else if (PageHighMem(page))
-               /* kmaps need flushing if we found an unpinned
-                  highpage */
-               flush = 1;
-       else {
-               void *pt = lowmem_page_address(page);
-               unsigned long pfn = page_to_pfn(page);
-               struct multicall_space mcs = __xen_mc_entry(0);
-
-               flush = 0;
-
-               MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
-                                       pfn_pte(pfn, PAGE_KERNEL_RO),
-                                       flags);
-       }
-
-       return flush;
-}
-
-/* This is called just after a mm has been created, but it has not
-   been used yet.  We need to make sure that its pagetable is all
-   read-only, and can be pinned. */
-void xen_pgd_pin(pgd_t *pgd)
-{
-       struct multicall_space mcs;
-       struct mmuext_op *op;
-
-       xen_mc_batch();
-
-       if (pgd_walk(pgd, pin_page, TASK_SIZE)) {
-               /* re-enable interrupts for kmap_flush_unused */
-               xen_mc_issue(0);
-               kmap_flush_unused();
-               xen_mc_batch();
-       }
-
-       mcs = __xen_mc_entry(sizeof(*op));
-       op = mcs.args;
-
-#ifdef CONFIG_X86_PAE
-       op->cmd = MMUEXT_PIN_L3_TABLE;
-#else
-       op->cmd = MMUEXT_PIN_L2_TABLE;
-#endif
-       op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));
-       MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
-
-       xen_mc_issue(0);
-}
-
-/* The init_mm pagetable is really pinned as soon as it's created, but
-   that's before we have page structures to store the bits.  So do all
-   the book-keeping now. */
-static __init int mark_pinned(struct page *page, unsigned flags)
-{
-       SetPagePinned(page);
-       return 0;
-}
-
-void __init xen_mark_init_mm_pinned(void)
-{
-       pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP);
-}
-
-static int unpin_page(struct page *page, unsigned flags)
-{
-       unsigned pgfl = test_and_clear_bit(PG_pinned, &page->flags);
-
-       if (pgfl && !PageHighMem(page)) {
-               void *pt = lowmem_page_address(page);
-               unsigned long pfn = page_to_pfn(page);
-               struct multicall_space mcs = __xen_mc_entry(0);
-
-               MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
-                                       pfn_pte(pfn, PAGE_KERNEL),
-                                       flags);
-       }
-
-       return 0;               /* never need to flush on unpin */
-}
-
-/* Release a pagetable's pages back as normal RW */
-static void xen_pgd_unpin(pgd_t *pgd)
-{
-       struct mmuext_op *op;
-       struct multicall_space mcs;
-
-       xen_mc_batch();
-
-       mcs = __xen_mc_entry(sizeof(*op));
-
-       op = mcs.args;
-       op->cmd = MMUEXT_UNPIN_TABLE;
-       op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));
-
-       MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
-
-       pgd_walk(pgd, unpin_page, TASK_SIZE);
-
-       xen_mc_issue(0);
-}
-
-void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
-{
-       spin_lock(&next->page_table_lock);
-       xen_pgd_pin(next->pgd);
-       spin_unlock(&next->page_table_lock);
-}
-
-void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
-{
-       spin_lock(&mm->page_table_lock);
-       xen_pgd_pin(mm->pgd);
-       spin_unlock(&mm->page_table_lock);
-}
-
-
-#ifdef CONFIG_SMP
-/* Another cpu may still have its %cr3 pointing at the pagetable, so
-   we need to repoint it somewhere else before we can unpin it. */
-static void drop_other_mm_ref(void *info)
-{
-       struct mm_struct *mm = info;
-
-       if (__get_cpu_var(cpu_tlbstate).active_mm == mm)
-               leave_mm(smp_processor_id());
-}
-
-static void drop_mm_ref(struct mm_struct *mm)
-{
-       if (current->active_mm == mm) {
-               if (current->mm == mm)
-                       load_cr3(swapper_pg_dir);
-               else
-                       leave_mm(smp_processor_id());
-       }
-
-       if (!cpus_empty(mm->cpu_vm_mask))
-               xen_smp_call_function_mask(mm->cpu_vm_mask, drop_other_mm_ref,
-                                          mm, 1);
-}
-#else
-static void drop_mm_ref(struct mm_struct *mm)
-{
-       if (current->active_mm == mm)
-               load_cr3(swapper_pg_dir);
-}
-#endif
-
-/*
- * While a process runs, Xen pins its pagetables, which means that the
- * hypervisor forces it to be read-only, and it controls all updates
- * to it.  This means that all pagetable updates have to go via the
- * hypervisor, which is moderately expensive.
- *
- * Since we're pulling the pagetable down, we switch to use init_mm,
- * unpin the old process pagetable and mark it all read-write, which
- * allows further operations on it to be simple memory accesses.
- *
- * The only subtle point is that another CPU may be still using the
- * pagetable because of lazy tlb flushing.  This means we need to
- * switch all CPUs off this pagetable before we can unpin it.
- */
-void xen_exit_mmap(struct mm_struct *mm)
-{
-       get_cpu();              /* make sure we don't move around */
-       drop_mm_ref(mm);
-       put_cpu();
-
-       spin_lock(&mm->page_table_lock);
-
-       /* pgd may not be pinned in the error exit path of execve */
-       if (PagePinned(virt_to_page(mm->pgd)))
-               xen_pgd_unpin(mm->pgd);
-       spin_unlock(&mm->page_table_lock);
-}
diff --git a/arch/i386/xen/mmu.h b/arch/i386/xen/mmu.h
deleted file mode 100644 (file)
index c9ff27f..0000000
+++ /dev/null
@@ -1,60 +0,0 @@
-#ifndef _XEN_MMU_H
-
-#include <linux/linkage.h>
-#include <asm/page.h>
-
-/*
- * Page-directory addresses above 4GB do not fit into architectural %cr3.
- * When accessing %cr3, or equivalent field in vcpu_guest_context, guests
- * must use the following accessor macros to pack/unpack valid MFNs.
- *
- * Note that Xen is using the fact that the pagetable base is always
- * page-aligned, and putting the 12 MSB of the address into the 12 LSB
- * of cr3.
- */
-#define xen_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20))
-#define xen_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20))
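
The two macros amount to a 32-bit rotate by 12 bits. A short worked example with illustrative values, for an mfn whose frame lives above 4GB:

        /* mfn 0x123456 (machine address 0x123456000):
         *   xen_pfn_to_cr3(0x123456)   == 0x23456001  (rotate left by 12)
         *   xen_cr3_to_pfn(0x23456001) == 0x00123456  (rotate right by 12)
         * so the high bits of the mfn travel in the low 12 bits of %cr3. */
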
-
-
-void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
-
-void xen_set_pte(pte_t *ptep, pte_t pteval);
-void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
-                   pte_t *ptep, pte_t pteval);
-void xen_set_pmd(pmd_t *pmdp, pmd_t pmdval);
-
-void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next);
-void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
-void xen_exit_mmap(struct mm_struct *mm);
-
-void xen_pgd_pin(pgd_t *pgd);
-//void xen_pgd_unpin(pgd_t *pgd);
-
-#ifdef CONFIG_X86_PAE
-unsigned long long xen_pte_val(pte_t);
-unsigned long long xen_pmd_val(pmd_t);
-unsigned long long xen_pgd_val(pgd_t);
-
-pte_t xen_make_pte(unsigned long long);
-pmd_t xen_make_pmd(unsigned long long);
-pgd_t xen_make_pgd(unsigned long long);
-
-void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
-                   pte_t *ptep, pte_t pteval);
-void xen_set_pte_atomic(pte_t *ptep, pte_t pte);
-void xen_set_pud(pud_t *ptr, pud_t val);
-void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
-void xen_pmd_clear(pmd_t *pmdp);
-
-
-#else
-unsigned long xen_pte_val(pte_t);
-unsigned long xen_pmd_val(pmd_t);
-unsigned long xen_pgd_val(pgd_t);
-
-pte_t xen_make_pte(unsigned long);
-pmd_t xen_make_pmd(unsigned long);
-pgd_t xen_make_pgd(unsigned long);
-#endif
-
-#endif /* _XEN_MMU_H */
diff --git a/arch/i386/xen/multicalls.c b/arch/i386/xen/multicalls.c
deleted file mode 100644 (file)
index c837e8e..0000000
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Xen hypercall batching.
- *
- * Xen allows multiple hypercalls to be issued at once, using the
- * multicall interface.  This allows the cost of trapping into the
- * hypervisor to be amortized over several calls.
- *
- * This file implements a simple interface for multicalls.  There's a
- * per-cpu buffer of outstanding multicalls.  When you want to queue a
- * multicall for issuing, you can allocate a multicall slot for the
- * call and its arguments, along with storage for space which is
- * pointed to by the arguments (for passing pointers to structures,
- * etc).  When the multicall is actually issued, all the space for the
- * commands and allocated memory is freed for reuse.
- *
- * Multicalls are flushed whenever any of the buffers get full, or
- * when explicitly requested.  There's no way to get per-multicall
- * return results back.  It will BUG if any of the multicalls fail.
- *
- * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
- */
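
The pattern callers follow (compare xen_set_pmd() in mmu.c) looks roughly like the sketch below, under a hypothetical function name: open a batch, claim a slot with argument space, queue the hypercall with the matching MULTI_* wrapper, then issue, which flushes immediately unless the CPU is inside a lazy-mode section.

        /* Illustrative only; mirrors xen_set_pmd() in mmu.c. */
        static void example_queue_pmd_update(pmd_t *ptr, pmd_t val)
        {
                struct multicall_space mcs;
                struct mmu_update *u;

                preempt_disable();
                xen_mc_batch();                         /* disables interrupts */
                mcs = __xen_mc_entry(sizeof(*u));       /* room for one mmu_update */
                u = mcs.args;
                u->ptr = virt_to_machine(ptr).maddr;
                u->val = pmd_val_ma(val);
                MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF);
                xen_mc_issue(PARAVIRT_LAZY_MMU);        /* flushes unless lazily batching */
                preempt_enable();
        }
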
-#include <linux/percpu.h>
-#include <linux/hardirq.h>
-
-#include <asm/xen/hypercall.h>
-
-#include "multicalls.h"
-
-#define MC_BATCH       32
-#define MC_ARGS                (MC_BATCH * 16 / sizeof(u64))
-
-struct mc_buffer {
-       struct multicall_entry entries[MC_BATCH];
-       u64 args[MC_ARGS];
-       unsigned mcidx, argidx;
-};
-
-static DEFINE_PER_CPU(struct mc_buffer, mc_buffer);
-DEFINE_PER_CPU(unsigned long, xen_mc_irq_flags);
-
-void xen_mc_flush(void)
-{
-       struct mc_buffer *b = &__get_cpu_var(mc_buffer);
-       int ret = 0;
-       unsigned long flags;
-
-       BUG_ON(preemptible());
-
-       /* Disable interrupts in case someone comes in and queues
-          something in the middle */
-       local_irq_save(flags);
-
-       if (b->mcidx) {
-               int i;
-
-               if (HYPERVISOR_multicall(b->entries, b->mcidx) != 0)
-                       BUG();
-               for (i = 0; i < b->mcidx; i++)
-                       if (b->entries[i].result < 0)
-                               ret++;
-               b->mcidx = 0;
-               b->argidx = 0;
-       } else
-               BUG_ON(b->argidx != 0);
-
-       local_irq_restore(flags);
-
-       BUG_ON(ret);
-}
-
-struct multicall_space __xen_mc_entry(size_t args)
-{
-       struct mc_buffer *b = &__get_cpu_var(mc_buffer);
-       struct multicall_space ret;
-       unsigned argspace = (args + sizeof(u64) - 1) / sizeof(u64);
-
-       BUG_ON(preemptible());
-       BUG_ON(argspace > MC_ARGS);
-
-       if (b->mcidx == MC_BATCH ||
-           (b->argidx + argspace) > MC_ARGS)
-               xen_mc_flush();
-
-       ret.mc = &b->entries[b->mcidx];
-       b->mcidx++;
-       ret.args = &b->args[b->argidx];
-       b->argidx += argspace;
-
-       return ret;
-}
diff --git a/arch/i386/xen/multicalls.h b/arch/i386/xen/multicalls.h
deleted file mode 100644 (file)
index e6f7530..0000000
+++ /dev/null
@@ -1,45 +0,0 @@
-#ifndef _XEN_MULTICALLS_H
-#define _XEN_MULTICALLS_H
-
-#include "xen-ops.h"
-
-/* Multicalls */
-struct multicall_space
-{
-       struct multicall_entry *mc;
-       void *args;
-};
-
-/* Allocate room for a multicall and its args */
-struct multicall_space __xen_mc_entry(size_t args);
-
-DECLARE_PER_CPU(unsigned long, xen_mc_irq_flags);
-
-/* Call to start a batch of multiple __xen_mc_entry()s.  Must be
-   paired with xen_mc_issue() */
-static inline void xen_mc_batch(void)
-{
-       /* need to disable interrupts until this entry is complete */
-       local_irq_save(__get_cpu_var(xen_mc_irq_flags));
-}
-
-static inline struct multicall_space xen_mc_entry(size_t args)
-{
-       xen_mc_batch();
-       return __xen_mc_entry(args);
-}
-
-/* Flush all pending multicalls */
-void xen_mc_flush(void);
-
-/* Issue a multicall if we're not in a lazy mode */
-static inline void xen_mc_issue(unsigned mode)
-{
-       if ((xen_get_lazy_mode() & mode) == 0)
-               xen_mc_flush();
-
-       /* restore flags saved in xen_mc_batch */
-       local_irq_restore(x86_read_percpu(xen_mc_irq_flags));
-}
-
-#endif /* _XEN_MULTICALLS_H */
diff --git a/arch/i386/xen/setup.c b/arch/i386/xen/setup.c
deleted file mode 100644 (file)
index f84e772..0000000
+++ /dev/null
@@ -1,111 +0,0 @@
-/*
- * Machine-specific setup for Xen
- *
- * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
- */
-
-#include <linux/module.h>
-#include <linux/sched.h>
-#include <linux/mm.h>
-#include <linux/pm.h>
-
-#include <asm/elf.h>
-#include <asm/e820.h>
-#include <asm/setup.h>
-#include <asm/xen/hypervisor.h>
-#include <asm/xen/hypercall.h>
-
-#include <xen/interface/physdev.h>
-#include <xen/features.h>
-
-#include "xen-ops.h"
-#include "vdso.h"
-
-/* These are code, but not functions.  Defined in entry.S */
-extern const char xen_hypervisor_callback[];
-extern const char xen_failsafe_callback[];
-
-unsigned long *phys_to_machine_mapping;
-EXPORT_SYMBOL(phys_to_machine_mapping);
-
-/**
- * machine_specific_memory_setup - Hook for machine specific memory setup.
- **/
-
-char * __init xen_memory_setup(void)
-{
-       unsigned long max_pfn = xen_start_info->nr_pages;
-
-       e820.nr_map = 0;
-       add_memory_region(0, PFN_PHYS(max_pfn), E820_RAM);
-
-       return "Xen";
-}
-
-static void xen_idle(void)
-{
-       local_irq_disable();
-
-       if (need_resched())
-               local_irq_enable();
-       else {
-               current_thread_info()->status &= ~TS_POLLING;
-               smp_mb__after_clear_bit();
-               safe_halt();
-               current_thread_info()->status |= TS_POLLING;
-       }
-}
-
-/*
- * Set the bit indicating "nosegneg" library variants should be used.
- */
-static void fiddle_vdso(void)
-{
-       extern u32 VDSO_NOTE_MASK; /* See ../kernel/vsyscall-note.S.  */
-       extern char vsyscall_int80_start;
-       u32 *mask = (u32 *) ((unsigned long) &VDSO_NOTE_MASK - VDSO_PRELINK +
-                            &vsyscall_int80_start);
-       *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
-}
-
-void __init xen_arch_setup(void)
-{
-       struct physdev_set_iopl set_iopl;
-       int rc;
-
-       HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);
-       HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables);
-
-       if (!xen_feature(XENFEAT_auto_translated_physmap))
-               HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_pae_extended_cr3);
-
-       HYPERVISOR_set_callbacks(__KERNEL_CS, (unsigned long)xen_hypervisor_callback,
-                                __KERNEL_CS, (unsigned long)xen_failsafe_callback);
-
-       set_iopl.iopl = 1;
-       rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
-       if (rc != 0)
-               printk(KERN_INFO "physdev_op failed %d\n", rc);
-
-#ifdef CONFIG_ACPI
-       if (!(xen_start_info->flags & SIF_INITDOMAIN)) {
-               printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
-               disable_acpi();
-       }
-#endif
-
-       memcpy(boot_command_line, xen_start_info->cmd_line,
-              MAX_GUEST_CMDLINE > COMMAND_LINE_SIZE ?
-              COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE);
-
-       pm_idle = xen_idle;
-
-#ifdef CONFIG_SMP
-       /* fill cpus_possible with all available cpus */
-       xen_fill_possible_map();
-#endif
-
-       paravirt_disable_iospace();
-
-       fiddle_vdso();
-}
diff --git a/arch/i386/xen/smp.c b/arch/i386/xen/smp.c
deleted file mode 100644 (file)
index 557b8e2..0000000
+++ /dev/null
@@ -1,404 +0,0 @@
-/*
- * Xen SMP support
- *
- * This file implements the Xen versions of smp_ops.  SMP under Xen is
- * very straightforward.  Bringing a CPU up is simply a matter of
- * loading its initial context and setting it running.
- *
- * IPIs are handled through the Xen event mechanism.
- *
- * Because virtual CPUs can be scheduled onto any real CPU, there's no
- * useful topology information for the kernel to make use of.  As a
- * result, all CPUs are treated as if they're single-core and
- * single-threaded.
- *
- * This does not handle HOTPLUG_CPU yet.
- */
-#include <linux/sched.h>
-#include <linux/err.h>
-#include <linux/smp.h>
-
-#include <asm/paravirt.h>
-#include <asm/desc.h>
-#include <asm/pgtable.h>
-#include <asm/cpu.h>
-
-#include <xen/interface/xen.h>
-#include <xen/interface/vcpu.h>
-
-#include <asm/xen/interface.h>
-#include <asm/xen/hypercall.h>
-
-#include <xen/page.h>
-#include <xen/events.h>
-
-#include "xen-ops.h"
-#include "mmu.h"
-
-static cpumask_t cpu_initialized_map;
-static DEFINE_PER_CPU(int, resched_irq);
-static DEFINE_PER_CPU(int, callfunc_irq);
-
-/*
- * Structure and data for smp_call_function(). This is designed to minimise
- * static memory requirements. It also looks cleaner.
- */
-static DEFINE_SPINLOCK(call_lock);
-
-struct call_data_struct {
-       void (*func) (void *info);
-       void *info;
-       atomic_t started;
-       atomic_t finished;
-       int wait;
-};
-
-static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id);
-
-static struct call_data_struct *call_data;
-
-/*
- * Reschedule call back. Nothing to do,
- * all the work is done automatically when
- * we return from the interrupt.
- */
-static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id)
-{
-       return IRQ_HANDLED;
-}
-
-static __cpuinit void cpu_bringup_and_idle(void)
-{
-       int cpu = smp_processor_id();
-
-       cpu_init();
-
-       preempt_disable();
-       per_cpu(cpu_state, cpu) = CPU_ONLINE;
-
-       xen_setup_cpu_clockevents();
-
-       /* We can take interrupts now: we're officially "up". */
-       local_irq_enable();
-
-       wmb();                  /* make sure everything is out */
-       cpu_idle();
-}
-
-static int xen_smp_intr_init(unsigned int cpu)
-{
-       int rc;
-       const char *resched_name, *callfunc_name;
-
-       per_cpu(resched_irq, cpu) = per_cpu(callfunc_irq, cpu) = -1;
-
-       resched_name = kasprintf(GFP_KERNEL, "resched%d", cpu);
-       rc = bind_ipi_to_irqhandler(XEN_RESCHEDULE_VECTOR,
-                                   cpu,
-                                   xen_reschedule_interrupt,
-                                   IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
-                                   resched_name,
-                                   NULL);
-       if (rc < 0)
-               goto fail;
-       per_cpu(resched_irq, cpu) = rc;
-
-       callfunc_name = kasprintf(GFP_KERNEL, "callfunc%d", cpu);
-       rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_VECTOR,
-                                   cpu,
-                                   xen_call_function_interrupt,
-                                   IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
-                                   callfunc_name,
-                                   NULL);
-       if (rc < 0)
-               goto fail;
-       per_cpu(callfunc_irq, cpu) = rc;
-
-       return 0;
-
- fail:
-       if (per_cpu(resched_irq, cpu) >= 0)
-               unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
-       if (per_cpu(callfunc_irq, cpu) >= 0)
-               unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
-       return rc;
-}
-
-void __init xen_fill_possible_map(void)
-{
-       int i, rc;
-
-       for (i = 0; i < NR_CPUS; i++) {
-               rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
-               if (rc >= 0)
-                       cpu_set(i, cpu_possible_map);
-       }
-}
-
-void __init xen_smp_prepare_boot_cpu(void)
-{
-       int cpu;
-
-       BUG_ON(smp_processor_id() != 0);
-       native_smp_prepare_boot_cpu();
-
-       /* We've switched to the "real" per-cpu gdt, so make sure the
-          old memory can be recycled */
-       make_lowmem_page_readwrite(&per_cpu__gdt_page);
-
-       for (cpu = 0; cpu < NR_CPUS; cpu++) {
-               cpus_clear(cpu_sibling_map[cpu]);
-               cpus_clear(cpu_core_map[cpu]);
-       }
-
-       xen_setup_vcpu_info_placement();
-}
-
-void __init xen_smp_prepare_cpus(unsigned int max_cpus)
-{
-       unsigned cpu;
-
-       for (cpu = 0; cpu < NR_CPUS; cpu++) {
-               cpus_clear(cpu_sibling_map[cpu]);
-               cpus_clear(cpu_core_map[cpu]);
-       }
-
-       smp_store_cpu_info(0);
-       set_cpu_sibling_map(0);
-
-       if (xen_smp_intr_init(0))
-               BUG();
-
-       cpu_initialized_map = cpumask_of_cpu(0);
-
-       /* Restrict the possible_map according to max_cpus. */
-       while ((num_possible_cpus() > 1) && (num_possible_cpus() > max_cpus)) {
-               for (cpu = NR_CPUS-1; !cpu_isset(cpu, cpu_possible_map); cpu--)
-                       continue;
-               cpu_clear(cpu, cpu_possible_map);
-       }
-
-       for_each_possible_cpu (cpu) {
-               struct task_struct *idle;
-
-               if (cpu == 0)
-                       continue;
-
-               idle = fork_idle(cpu);
-               if (IS_ERR(idle))
-                       panic("failed fork for CPU %d", cpu);
-
-               cpu_set(cpu, cpu_present_map);
-       }
-
-       //init_xenbus_allowed_cpumask();
-}
-
-static __cpuinit int
-cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
-{
-       struct vcpu_guest_context *ctxt;
-       struct gdt_page *gdt = &per_cpu(gdt_page, cpu);
-
-       if (cpu_test_and_set(cpu, cpu_initialized_map))
-               return 0;
-
-       ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL);
-       if (ctxt == NULL)
-               return -ENOMEM;
-
-       ctxt->flags = VGCF_IN_KERNEL;
-       ctxt->user_regs.ds = __USER_DS;
-       ctxt->user_regs.es = __USER_DS;
-       ctxt->user_regs.fs = __KERNEL_PERCPU;
-       ctxt->user_regs.gs = 0;
-       ctxt->user_regs.ss = __KERNEL_DS;
-       ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
-       ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
-
-       memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt));
-
-       xen_copy_trap_info(ctxt->trap_ctxt);
-
-       ctxt->ldt_ents = 0;
-
-       BUG_ON((unsigned long)gdt->gdt & ~PAGE_MASK);
-       make_lowmem_page_readonly(gdt->gdt);
-
-       ctxt->gdt_frames[0] = virt_to_mfn(gdt->gdt);
-       ctxt->gdt_ents      = ARRAY_SIZE(gdt->gdt);
-
-       ctxt->user_regs.cs = __KERNEL_CS;
-       ctxt->user_regs.esp = idle->thread.esp0 - sizeof(struct pt_regs);
-
-       ctxt->kernel_ss = __KERNEL_DS;
-       ctxt->kernel_sp = idle->thread.esp0;
-
-       ctxt->event_callback_cs     = __KERNEL_CS;
-       ctxt->event_callback_eip    = (unsigned long)xen_hypervisor_callback;
-       ctxt->failsafe_callback_cs  = __KERNEL_CS;
-       ctxt->failsafe_callback_eip = (unsigned long)xen_failsafe_callback;
-
-       per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir);
-       ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir));
-
-       if (HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, ctxt))
-               BUG();
-
-       kfree(ctxt);
-       return 0;
-}
-
-int __cpuinit xen_cpu_up(unsigned int cpu)
-{
-       struct task_struct *idle = idle_task(cpu);
-       int rc;
-
-#if 0
-       rc = cpu_up_check(cpu);
-       if (rc)
-               return rc;
-#endif
-
-       init_gdt(cpu);
-       per_cpu(current_task, cpu) = idle;
-       irq_ctx_init(cpu);
-       xen_setup_timer(cpu);
-
-       /* make sure interrupts start blocked */
-       per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1;
-
-       rc = cpu_initialize_context(cpu, idle);
-       if (rc)
-               return rc;
-
-       if (num_online_cpus() == 1)
-               alternatives_smp_switch(1);
-
-       rc = xen_smp_intr_init(cpu);
-       if (rc)
-               return rc;
-
-       smp_store_cpu_info(cpu);
-       set_cpu_sibling_map(cpu);
-       /* This must be done before setting cpu_online_map */
-       wmb();
-
-       cpu_set(cpu, cpu_online_map);
-
-       rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL);
-       BUG_ON(rc);
-
-       return 0;
-}
-
-void xen_smp_cpus_done(unsigned int max_cpus)
-{
-}
-
-static void stop_self(void *v)
-{
-       int cpu = smp_processor_id();
-
-       /* make sure we're not pinning something down */
-       load_cr3(swapper_pg_dir);
-       /* should set up a minimal gdt */
-
-       HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL);
-       BUG();
-}
-
-void xen_smp_send_stop(void)
-{
-       smp_call_function(stop_self, NULL, 0, 0);
-}
-
-void xen_smp_send_reschedule(int cpu)
-{
-       xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR);
-}
-
-
-static void xen_send_IPI_mask(cpumask_t mask, enum ipi_vector vector)
-{
-       unsigned cpu;
-
-       cpus_and(mask, mask, cpu_online_map);
-
-       for_each_cpu_mask(cpu, mask)
-               xen_send_IPI_one(cpu, vector);
-}
-
-static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id)
-{
-       void (*func) (void *info) = call_data->func;
-       void *info = call_data->info;
-       int wait = call_data->wait;
-
-       /*
-        * Notify initiating CPU that I've grabbed the data and am
-        * about to execute the function
-        */
-       mb();
-       atomic_inc(&call_data->started);
-       /*
-        * At this point the info structure may be out of scope unless wait==1
-        */
-       irq_enter();
-       (*func)(info);
-       irq_exit();
-
-       if (wait) {
-               mb();           /* commit everything before setting finished */
-               atomic_inc(&call_data->finished);
-       }
-
-       return IRQ_HANDLED;
-}
-
-int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *),
-                              void *info, int wait)
-{
-       struct call_data_struct data;
-       int cpus;
-
-       /* Holding any lock stops cpus from going down. */
-       spin_lock(&call_lock);
-
-       cpu_clear(smp_processor_id(), mask);
-
-       cpus = cpus_weight(mask);
-       if (!cpus) {
-               spin_unlock(&call_lock);
-               return 0;
-       }
-
-       /* Can deadlock when called with interrupts disabled */
-       WARN_ON(irqs_disabled());
-
-       data.func = func;
-       data.info = info;
-       atomic_set(&data.started, 0);
-       data.wait = wait;
-       if (wait)
-               atomic_set(&data.finished, 0);
-
-       call_data = &data;
-       mb();                   /* write everything before IPI */
-
-       /* Send a message to other CPUs and wait for them to respond */
-       xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR);
-
-       /* Make sure other vcpus get a chance to run.
-          XXX too severe?  Maybe we should check the other CPU's states? */
-       HYPERVISOR_sched_op(SCHEDOP_yield, 0);
-
-       /* Wait for response */
-       while (atomic_read(&data.started) != cpus ||
-              (wait && atomic_read(&data.finished) != cpus))
-               cpu_relax();
-
-       spin_unlock(&call_lock);
-
-       return 0;
-}
diff --git a/arch/i386/xen/time.c b/arch/i386/xen/time.c
deleted file mode 100644 (file)
index dfd6db6..0000000
+++ /dev/null
@@ -1,593 +0,0 @@
-/*
- * Xen time implementation.
- *
- * This is implemented in terms of a clocksource driver which uses
- * the hypervisor clock as a nanosecond timebase, and a clockevent
- * driver which uses the hypervisor's timer mechanism.
- *
- * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
- */
-#include <linux/kernel.h>
-#include <linux/interrupt.h>
-#include <linux/clocksource.h>
-#include <linux/clockchips.h>
-#include <linux/kernel_stat.h>
-
-#include <asm/xen/hypervisor.h>
-#include <asm/xen/hypercall.h>
-
-#include <xen/events.h>
-#include <xen/interface/xen.h>
-#include <xen/interface/vcpu.h>
-
-#include "xen-ops.h"
-
-#define XEN_SHIFT 22
-
-/* Xen may fire a timer up to this many ns early */
-#define TIMER_SLOP     100000
-#define NS_PER_TICK    (1000000000LL / HZ)
-
-static cycle_t xen_clocksource_read(void);
-
-/* These are periodically updated in shared_info, and then copied here. */
-struct shadow_time_info {
-       u64 tsc_timestamp;     /* TSC at last update of time vals.  */
-       u64 system_timestamp;  /* Time, in nanosecs, since boot.    */
-       u32 tsc_to_nsec_mul;
-       int tsc_shift;
-       u32 version;
-};
-
-static DEFINE_PER_CPU(struct shadow_time_info, shadow_time);
-
-/* runstate info updated by Xen */
-static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
-
-/* snapshots of runstate info */
-static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate_snapshot);
-
-/* unused ns of stolen and blocked time */
-static DEFINE_PER_CPU(u64, residual_stolen);
-static DEFINE_PER_CPU(u64, residual_blocked);
-
-/* return a consistent snapshot of a 64-bit time/counter value */
-static u64 get64(const u64 *p)
-{
-       u64 ret;
-
-       if (BITS_PER_LONG < 64) {
-               u32 *p32 = (u32 *)p;
-               u32 h, l;
-
-               /*
-                * Read high then low, and then make sure high is
-                * still the same; this will only loop if low wraps
-                * and carries into high.
-                * XXX some clean way to make this endian-proof?
-                */
-               do {
-                       h = p32[1];
-                       barrier();
-                       l = p32[0];
-                       barrier();
-               } while (p32[1] != h);
-
-               ret = (((u64)h) << 32) | l;
-       } else
-               ret = *p;
-
-       return ret;
-}
-
-/*
- * Runstate accounting
- */
-static void get_runstate_snapshot(struct vcpu_runstate_info *res)
-{
-       u64 state_time;
-       struct vcpu_runstate_info *state;
-
-       BUG_ON(preemptible());
-
-       state = &__get_cpu_var(runstate);
-
-       /*
-        * The runstate info is always updated by the hypervisor on
-        * the current CPU, so there's no need to use anything
-        * stronger than a compiler barrier when fetching it.
-        */
-       do {
-               state_time = get64(&state->state_entry_time);
-               barrier();
-               *res = *state;
-               barrier();
-       } while (get64(&state->state_entry_time) != state_time);
-}
-
-static void setup_runstate_info(int cpu)
-{
-       struct vcpu_register_runstate_memory_area area;
-
-       area.addr.v = &per_cpu(runstate, cpu);
-
-       if (HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area,
-                              cpu, &area))
-               BUG();
-}
-
-static void do_stolen_accounting(void)
-{
-       struct vcpu_runstate_info state;
-       struct vcpu_runstate_info *snap;
-       s64 blocked, runnable, offline, stolen;
-       cputime_t ticks;
-
-       get_runstate_snapshot(&state);
-
-       WARN_ON(state.state != RUNSTATE_running);
-
-       snap = &__get_cpu_var(runstate_snapshot);
-
-       /* work out how much time the VCPU has not been runn*ing*  */
-       blocked = state.time[RUNSTATE_blocked] - snap->time[RUNSTATE_blocked];
-       runnable = state.time[RUNSTATE_runnable] - snap->time[RUNSTATE_runnable];
-       offline = state.time[RUNSTATE_offline] - snap->time[RUNSTATE_offline];
-
-       *snap = state;
-
-       /* Add the appropriate number of ticks of stolen time,
-          including any left-overs from last time.  Passing NULL to
-          account_steal_time accounts the time as stolen. */
-       stolen = runnable + offline + __get_cpu_var(residual_stolen);
-
-       if (stolen < 0)
-               stolen = 0;
-
-       ticks = 0;
-       while (stolen >= NS_PER_TICK) {
-               ticks++;
-               stolen -= NS_PER_TICK;
-       }
-       __get_cpu_var(residual_stolen) = stolen;
-       account_steal_time(NULL, ticks);
-
-       /* Add the appropriate number of ticks of blocked time,
-          including any left-overs from last time.  Passing idle to
-          account_steal_time accounts the time as idle/wait. */
-       blocked += __get_cpu_var(residual_blocked);
-
-       if (blocked < 0)
-               blocked = 0;
-
-       ticks = 0;
-       while (blocked >= NS_PER_TICK) {
-               ticks++;
-               blocked -= NS_PER_TICK;
-       }
-       __get_cpu_var(residual_blocked) = blocked;
-       account_steal_time(idle_task(smp_processor_id()), ticks);
-}
-
-/*
- * Xen sched_clock implementation.  Returns the number of unstolen
- * nanoseconds, i.e. the time the VCPU has spent in the RUNNING and
- * BLOCKED states.
- */
-unsigned long long xen_sched_clock(void)
-{
-       struct vcpu_runstate_info state;
-       cycle_t now;
-       u64 ret;
-       s64 offset;
-
-       /*
-        * Ideally sched_clock should be called on a per-cpu basis
-        * anyway, so preempt should already be disabled, but that's
-        * not current practice at the moment.
-        */
-       preempt_disable();
-
-       now = xen_clocksource_read();
-
-       get_runstate_snapshot(&state);
-
-       WARN_ON(state.state != RUNSTATE_running);
-
-       offset = now - state.state_entry_time;
-       if (offset < 0)
-               offset = 0;
-
-       ret = state.time[RUNSTATE_blocked] +
-               state.time[RUNSTATE_running] +
-               offset;
-
-       preempt_enable();
-
-       return ret;
-}
-
-
-/* Get the CPU speed from Xen */
-unsigned long xen_cpu_khz(void)
-{
-       u64 cpu_khz = 1000000ULL << 32;
-       const struct vcpu_time_info *info =
-               &HYPERVISOR_shared_info->vcpu_info[0].time;
-
-       do_div(cpu_khz, info->tsc_to_system_mul);
-       if (info->tsc_shift < 0)
-               cpu_khz <<= -info->tsc_shift;
-       else
-               cpu_khz >>= info->tsc_shift;
-
-       return cpu_khz;
-}
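
Xen converts TSC ticks to nanoseconds as roughly ((tsc << tsc_shift) * tsc_to_system_mul) >> 32 (the same relation scale_delta() below implements), so the function above simply inverts it. Illustrative numbers: for a 2.4 GHz TSC with tsc_shift == 0, tsc_to_system_mul is about 2^32 / 2.4, roughly 1789569707, and (1000000ULL << 32) / 1789569707 comes out near 2400000 kHz.
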
-
-/*
- * Reads a consistent set of time-base values from Xen, into a shadow data
- * area.
- */
-static unsigned get_time_values_from_xen(void)
-{
-       struct vcpu_time_info   *src;
-       struct shadow_time_info *dst;
-
-       /* src is shared memory with the hypervisor, so we need to
-          make sure we get a consistent snapshot, even in the face of
-          being preempted. */
-       src = &__get_cpu_var(xen_vcpu)->time;
-       dst = &__get_cpu_var(shadow_time);
-
-       do {
-               dst->version = src->version;
-               rmb();          /* fetch version before data */
-               dst->tsc_timestamp     = src->tsc_timestamp;
-               dst->system_timestamp  = src->system_time;
-               dst->tsc_to_nsec_mul   = src->tsc_to_system_mul;
-               dst->tsc_shift         = src->tsc_shift;
-               rmb();          /* test version after fetching data */
-       } while ((src->version & 1) | (dst->version ^ src->version));
-
-       return dst->version;
-}
-
-/*
- * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
- * yielding a 64-bit result.
- */
-static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
-{
-       u64 product;
-#ifdef __i386__
-       u32 tmp1, tmp2;
-#endif
-
-       if (shift < 0)
-               delta >>= -shift;
-       else
-               delta <<= shift;
-
-#ifdef __i386__
-       __asm__ (
-               "mul  %5       ; "
-               "mov  %4,%%eax ; "
-               "mov  %%edx,%4 ; "
-               "mul  %5       ; "
-               "xor  %5,%5    ; "
-               "add  %4,%%eax ; "
-               "adc  %5,%%edx ; "
-               : "=A" (product), "=r" (tmp1), "=r" (tmp2)
-               : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
-#elif __x86_64__
-       __asm__ (
-               "mul %%rdx ; shrd $32,%%rdx,%%rax"
-               : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
-#else
-#error implement me!
-#endif
-
-       return product;
-}
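
For reference, the computation the assembly performs is ((delta << shift) * mul_frac) >> 32 over a 96-bit intermediate product. A portable (if slower) equivalent, assuming a compiler that provides a 128-bit integer type, would be the hypothetical sketch below; the lack of such a type on i386 is why the asm exists.

        /* Hypothetical reference version, not part of the original file. */
        static inline u64 scale_delta_ref(u64 delta, u32 mul_frac, int shift)
        {
                if (shift < 0)
                        delta >>= -shift;
                else
                        delta <<= shift;
                return (u64)(((unsigned __int128)delta * mul_frac) >> 32);
        }
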
-
-static u64 get_nsec_offset(struct shadow_time_info *shadow)
-{
-       u64 now, delta;
-       now = native_read_tsc();
-       delta = now - shadow->tsc_timestamp;
-       return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
-}
-
-static cycle_t xen_clocksource_read(void)
-{
-       struct shadow_time_info *shadow = &get_cpu_var(shadow_time);
-       cycle_t ret;
-       unsigned version;
-
-       do {
-               version = get_time_values_from_xen();
-               barrier();
-               ret = shadow->system_timestamp + get_nsec_offset(shadow);
-               barrier();
-       } while (version != __get_cpu_var(xen_vcpu)->time.version);
-
-       put_cpu_var(shadow_time);
-
-       return ret;
-}
-
-static void xen_read_wallclock(struct timespec *ts)
-{
-       const struct shared_info *s = HYPERVISOR_shared_info;
-       u32 version;
-       u64 delta;
-       struct timespec now;
-
-       /* get wallclock at system boot */
-       do {
-               version = s->wc_version;
-               rmb();          /* fetch version before time */
-               now.tv_sec  = s->wc_sec;
-               now.tv_nsec = s->wc_nsec;
-               rmb();          /* fetch time before checking version */
-       } while ((s->wc_version & 1) | (version ^ s->wc_version));
-
-       delta = xen_clocksource_read(); /* time since system boot */
-       delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec;
-
-       now.tv_nsec = do_div(delta, NSEC_PER_SEC);
-       now.tv_sec = delta;
-
-       set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
-}
-
-unsigned long xen_get_wallclock(void)
-{
-       struct timespec ts;
-
-       xen_read_wallclock(&ts);
-
-       return ts.tv_sec;
-}
-
-int xen_set_wallclock(unsigned long now)
-{
-       /* do nothing for domU */
-       return -1;
-}
-
-static struct clocksource xen_clocksource __read_mostly = {
-       .name = "xen",
-       .rating = 400,
-       .read = xen_clocksource_read,
-       .mask = ~0,
-       .mult = 1<<XEN_SHIFT,           /* time directly in nanoseconds */
-       .shift = XEN_SHIFT,
-       .flags = CLOCK_SOURCE_IS_CONTINUOUS,
-};
-
-/*
-   Xen clockevent implementation
-
-   Xen has two clockevent implementations:
-
-   The old timer_op one works with all released versions of Xen prior
-   to version 3.0.4.  This version of the hypervisor provides a
-   single-shot timer with nanosecond resolution.  However, a 100Hz tick,
-   which shares the same event channel, is also delivered while the
-   vcpu is running.  We don't care about or use this tick, but it will
-   cause the core time code to think the timer fired too soon, and
-   will end up resetting it each time.  It could be filtered, but
-   doing so has complications when the ktime clocksource is not yet
-   the xen clocksource (ie, at boot time).
-
-   The new vcpu_op-based timer interface allows the tick timer period
-   to be changed or turned off.  The tick timer is not useful as a
-   periodic timer because events are only delivered to running vcpus.
-   The one-shot timer can report when a timeout is in the past, so
-   set_next_event is capable of returning -ETIME when appropriate.
-   This interface is used when available.
-*/
-
-
-/*
-  Get a hypervisor absolute time.  In theory we could maintain an
-  offset between the kernel's time and the hypervisor's time, and
-  apply that to a kernel's absolute timeout.  Unfortunately the
-  hypervisor and kernel times can drift even if the kernel is using
-  the Xen clocksource, because ntp can warp the kernel's clocksource.
-*/
-static s64 get_abs_timeout(unsigned long delta)
-{
-       return xen_clocksource_read() + delta;
-}
-
-static void xen_timerop_set_mode(enum clock_event_mode mode,
-                                struct clock_event_device *evt)
-{
-       switch (mode) {
-       case CLOCK_EVT_MODE_PERIODIC:
-               /* unsupported */
-               WARN_ON(1);
-               break;
-
-       case CLOCK_EVT_MODE_ONESHOT:
-       case CLOCK_EVT_MODE_RESUME:
-               break;
-
-       case CLOCK_EVT_MODE_UNUSED:
-       case CLOCK_EVT_MODE_SHUTDOWN:
-               HYPERVISOR_set_timer_op(0);  /* cancel timeout */
-               break;
-       }
-}
-
-static int xen_timerop_set_next_event(unsigned long delta,
-                                     struct clock_event_device *evt)
-{
-       WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
-
-       if (HYPERVISOR_set_timer_op(get_abs_timeout(delta)) < 0)
-               BUG();
-
-       /* We may have missed the deadline, but there's no real way of
-          knowing for sure.  If the event was in the past, then we'll
-          get an immediate interrupt. */
-
-       return 0;
-}
-
-static const struct clock_event_device xen_timerop_clockevent = {
-       .name = "xen",
-       .features = CLOCK_EVT_FEAT_ONESHOT,
-
-       .max_delta_ns = 0xffffffff,
-       .min_delta_ns = TIMER_SLOP,
-
-       .mult = 1,
-       .shift = 0,
-       .rating = 500,
-
-       .set_mode = xen_timerop_set_mode,
-       .set_next_event = xen_timerop_set_next_event,
-};
-
-
-
-static void xen_vcpuop_set_mode(enum clock_event_mode mode,
-                               struct clock_event_device *evt)
-{
-       int cpu = smp_processor_id();
-
-       switch (mode) {
-       case CLOCK_EVT_MODE_PERIODIC:
-               WARN_ON(1);     /* unsupported */
-               break;
-
-       case CLOCK_EVT_MODE_ONESHOT:
-               if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
-                       BUG();
-               break;
-
-       case CLOCK_EVT_MODE_UNUSED:
-       case CLOCK_EVT_MODE_SHUTDOWN:
-               if (HYPERVISOR_vcpu_op(VCPUOP_stop_singleshot_timer, cpu, NULL) ||
-                   HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
-                       BUG();
-               break;
-       case CLOCK_EVT_MODE_RESUME:
-               break;
-       }
-}
-
-static int xen_vcpuop_set_next_event(unsigned long delta,
-                                    struct clock_event_device *evt)
-{
-       int cpu = smp_processor_id();
-       struct vcpu_set_singleshot_timer single;
-       int ret;
-
-       WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
-
-       single.timeout_abs_ns = get_abs_timeout(delta);
-       single.flags = VCPU_SSHOTTMR_future;
-
-       ret = HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer, cpu, &single);
-
-       BUG_ON(ret != 0 && ret != -ETIME);
-
-       return ret;
-}
-
-static const struct clock_event_device xen_vcpuop_clockevent = {
-       .name = "xen",
-       .features = CLOCK_EVT_FEAT_ONESHOT,
-
-       .max_delta_ns = 0xffffffff,
-       .min_delta_ns = TIMER_SLOP,
-
-       .mult = 1,
-       .shift = 0,
-       .rating = 500,
-
-       .set_mode = xen_vcpuop_set_mode,
-       .set_next_event = xen_vcpuop_set_next_event,
-};
-
-static const struct clock_event_device *xen_clockevent =
-       &xen_timerop_clockevent;
-static DEFINE_PER_CPU(struct clock_event_device, xen_clock_events);
-
-static irqreturn_t xen_timer_interrupt(int irq, void *dev_id)
-{
-       struct clock_event_device *evt = &__get_cpu_var(xen_clock_events);
-       irqreturn_t ret;
-
-       ret = IRQ_NONE;
-       if (evt->event_handler) {
-               evt->event_handler(evt);
-               ret = IRQ_HANDLED;
-       }
-
-       do_stolen_accounting();
-
-       return ret;
-}
-
-void xen_setup_timer(int cpu)
-{
-       const char *name;
-       struct clock_event_device *evt;
-       int irq;
-
-       printk(KERN_INFO "installing Xen timer for CPU %d\n", cpu);
-
-       name = kasprintf(GFP_KERNEL, "timer%d", cpu);
-       if (!name)
-               name = "<timer kasprintf failed>";
-
-       irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt,
-                                     IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
-                                     name, NULL);
-
-       evt = &per_cpu(xen_clock_events, cpu);
-       memcpy(evt, xen_clockevent, sizeof(*evt));
-
-       evt->cpumask = cpumask_of_cpu(cpu);
-       evt->irq = irq;
-
-       setup_runstate_info(cpu);
-}
-
-void xen_setup_cpu_clockevents(void)
-{
-       BUG_ON(preemptible());
-
-       clockevents_register_device(&__get_cpu_var(xen_clock_events));
-}
-
-__init void xen_time_init(void)
-{
-       int cpu = smp_processor_id();
-
-       get_time_values_from_xen();
-
-       clocksource_register(&xen_clocksource);
-
-       if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) {
-               /* Successfully turned off 100Hz tick, so we have the
-                  vcpuop-based timer interface */
-               printk(KERN_DEBUG "Xen: using vcpuop timer interface\n");
-               xen_clockevent = &xen_vcpuop_clockevent;
-       }
-
-       /* Set initial system time with full resolution */
-       xen_read_wallclock(&xtime);
-       set_normalized_timespec(&wall_to_monotonic,
-                               -xtime.tv_sec, -xtime.tv_nsec);
-
-       tsc_disable = 0;
-
-       xen_setup_timer(cpu);
-       xen_setup_cpu_clockevents();
-}
diff --git a/arch/i386/xen/vdso.h b/arch/i386/xen/vdso.h
deleted file mode 100644 (file)
index 861fedf..0000000
+++ /dev/null
@@ -1,4 +0,0 @@
-/* Bit used for the pseudo-hwcap for non-negative segments.  We use
-   bit 1 to avoid bugs in some versions of glibc when bit 0 is
-   used; the choice is otherwise arbitrary. */
-#define VDSO_NOTE_NONEGSEG_BIT 1
diff --git a/arch/i386/xen/xen-asm.S b/arch/i386/xen/xen-asm.S
deleted file mode 100644 (file)
index 1a43b60..0000000
+++ /dev/null
@@ -1,291 +0,0 @@
-/*
-       Asm versions of Xen pv-ops, suitable for either direct use or inlining.
-       The inline versions are the same as the direct-use versions, with the
-       pre- and post-amble chopped off.
-
-       This code is encoded for size rather than absolute efficiency,
-       with a view to being able to inline as much as possible.
-
-       We only bother with direct forms (ie, vcpu in pda) of the operations
-       here; the indirect forms are better handled in C, since they're
-       generally too large to inline anyway.
- */
-
-#include <linux/linkage.h>
-
-#include <asm/asm-offsets.h>
-#include <asm/thread_info.h>
-#include <asm/percpu.h>
-#include <asm/processor-flags.h>
-#include <asm/segment.h>
-
-#include <xen/interface/xen.h>
-
-#define RELOC(x, v)    .globl x##_reloc; x##_reloc=v
-#define ENDPATCH(x)    .globl x##_end; x##_end=.
-
-/* Pseudo-flag used for virtual NMI, which we don't implement yet */
-#define XEN_EFLAGS_NMI 0x80000000
-
-/*
-       Enable events.  This clears the event mask and tests the pending
-       event status with one and operation.  If there are pending
-       events, then enter the hypervisor to get them handled.
- */
-ENTRY(xen_irq_enable_direct)
-       /* Clear mask and test pending */
-       andw $0x00ff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending
-       /* Preempt here doesn't matter because that will deal with
-          any pending interrupts.  The pending check may end up being
-          run on the wrong CPU, but that doesn't hurt. */
-       jz 1f
-2:     call check_events
-1:
-ENDPATCH(xen_irq_enable_direct)
-       ret
-       ENDPROC(xen_irq_enable_direct)
-       RELOC(xen_irq_enable_direct, 2b+1)
-
-
-/*
-       Disabling events is simply a matter of making the event mask
-       non-zero.
- */
-ENTRY(xen_irq_disable_direct)
-       movb $1, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
-ENDPATCH(xen_irq_disable_direct)
-       ret
-       ENDPROC(xen_irq_disable_direct)
-       RELOC(xen_irq_disable_direct, 0)
-
-/*
-       (xen_)save_fl is used to get the current interrupt enable status.
-       Callers expect the status to be in X86_EFLAGS_IF, and other bits
-       may be set in the return value.  We take advantage of this by
-       making sure that X86_EFLAGS_IF has the right value (and other bits
-       in that byte are 0), but other bits in the return value are
-       undefined.  We need to toggle the state of the bit, because
-       Xen and x86 use opposite senses (mask vs enable).
- */
-ENTRY(xen_save_fl_direct)
-       testb $0xff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
-       setz %ah
-       addb %ah,%ah
-ENDPATCH(xen_save_fl_direct)
-       ret
-       ENDPROC(xen_save_fl_direct)
-       RELOC(xen_save_fl_direct, 0)
-
-
-/*
-	In principle the caller should be passing us a value returned
-       from xen_save_fl_direct, but for robustness sake we test only
-       the X86_EFLAGS_IF flag rather than the whole byte. After
-       setting the interrupt mask state, it checks for unmasked
-       pending events and enters the hypervisor to get them delivered
-       if so.
- */
-ENTRY(xen_restore_fl_direct)
-       testb $X86_EFLAGS_IF>>8, %ah
-       setz PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
-       /* Preempt here doesn't matter because that will deal with
-          any pending interrupts.  The pending check may end up being
-          run on the wrong CPU, but that doesn't hurt. */
-
-       /* check for unmasked and pending */
-       cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending
-       jz 1f
-2:     call check_events
-1:
-ENDPATCH(xen_restore_fl_direct)
-       ret
-       ENDPROC(xen_restore_fl_direct)
-       RELOC(xen_restore_fl_direct, 2b+1)
-
-/*
-       This is run where a normal iret would be run, with the same stack setup:
-             8: eflags
-             4: cs
-       esp-> 0: eip
-
-       This attempts to make sure that any pending events are dealt
-       with on return to usermode, but there is a small window in
-       which an event can happen just before entering usermode.  If
-       the nested interrupt ends up setting one of the TIF_WORK_MASK
-       pending work flags, they will not be tested again before
-       returning to usermode. This means that a process can end up
-       with pending work, which will be unprocessed until the process
-       enters and leaves the kernel again, which could be an
-       unbounded amount of time.  This means that a pending signal or
-       reschedule event could be indefinitely delayed.
-
-       The fix is to notice a nested interrupt in the critical
-       window, and if one occurs, then fold the nested interrupt into
-       the current interrupt stack frame, and re-process it
-       iteratively rather than recursively.  This means that it will
-       exit via the normal path, and all pending work will be dealt
-       with appropriately.
-
-       Because the nested interrupt handler needs to deal with the
-	current stack state in whatever form it's in, we keep things
-       simple by only using a single register which is pushed/popped
-       on the stack.
-
-       Non-direct iret could be done in the same way, but it would
-       require an annoying amount of code duplication.  We'll assume
-       that direct mode will be the common case once the hypervisor
-       support becomes commonplace.
- */
-ENTRY(xen_iret_direct)
-       /* test eflags for special cases */
-       testl $(X86_EFLAGS_VM | XEN_EFLAGS_NMI), 8(%esp)
-       jnz hyper_iret
-
-       push %eax
-       ESP_OFFSET=4    # bytes pushed onto stack
-
-       /* Store vcpu_info pointer for easy access.  Do it this
-          way to avoid having to reload %fs */
-#ifdef CONFIG_SMP
-       GET_THREAD_INFO(%eax)
-       movl TI_cpu(%eax),%eax
-       movl __per_cpu_offset(,%eax,4),%eax
-       lea per_cpu__xen_vcpu_info(%eax),%eax
-#else
-       movl $per_cpu__xen_vcpu_info, %eax
-#endif
-
-       /* check IF state we're restoring */
-       testb $X86_EFLAGS_IF>>8, 8+1+ESP_OFFSET(%esp)
-
-       /* Maybe enable events.  Once this happens we could get a
-          recursive event, so the critical region starts immediately
-          afterwards.  However, if that happens we don't end up
-          resuming the code, so we don't have to be worried about
-          being preempted to another CPU. */
-       setz XEN_vcpu_info_mask(%eax)
-xen_iret_start_crit:
-
-       /* check for unmasked and pending */
-       cmpw $0x0001, XEN_vcpu_info_pending(%eax)
-
-       /* If there's something pending, mask events again so we
-          can jump back into xen_hypervisor_callback */
-       sete XEN_vcpu_info_mask(%eax)
-
-       popl %eax
-
-       /* From this point on the registers are restored and the stack
-          updated, so we don't need to worry about it if we're preempted */
-iret_restore_end:
-
-       /* Jump to hypervisor_callback after fixing up the stack.
-          Events are masked, so jumping out of the critical
-          region is OK. */
-       je xen_hypervisor_callback
-
-       iret
-xen_iret_end_crit:
-
-hyper_iret:
-	/* put this out of line since it's very rarely used */
-       jmp hypercall_page + __HYPERVISOR_iret * 32
-
-       .globl xen_iret_start_crit, xen_iret_end_crit
-
-/*
-   This is called by xen_hypervisor_callback in entry.S when it sees
-   that the EIP at the time of interrupt was between xen_iret_start_crit
-   and xen_iret_end_crit.  We're passed the EIP in %eax so we can do
-   a more refined determination of what to do.
-
-   The stack format at this point is:
-       ----------------
-        ss             : (ss/esp may be present if we came from usermode)
-        esp            :
-        eflags         }  outer exception info
-        cs             }
-        eip            }
-       ---------------- <- edi (copy dest)
-        eax            :  outer eax if it hasn't been restored
-       ----------------
-        eflags         }  nested exception info
-        cs             }   (no ss/esp because we're nested
-        eip            }    from the same ring)
-        orig_eax       }<- esi (copy src)
-        - - - - - - - -
-        fs             }
-        es             }
-        ds             }  SAVE_ALL state
-        eax            }
-         :             :
-        ebx            }
-       ----------------
-        return addr     <- esp
-       ----------------
-
-   In order to deliver the nested exception properly, we need to shift
-   everything from the return addr up to the error code so it
-   sits just under the outer exception info.  This means that when we
-   handle the exception, we do it in the context of the outer exception
-   rather than starting a new one.
-
-   The only caveat is that if the outer eax hasn't been
-   restored yet (ie, it's still on stack), we need to insert
-   its value into the SAVE_ALL state before going on, since
-   it's usermode state which we eventually need to restore.
- */
-ENTRY(xen_iret_crit_fixup)
-       /* offsets +4 for return address */
-
-       /*
-          Paranoia: Make sure we're really coming from userspace.
-          One could imagine a case where userspace jumps into the
-          critical range address, but just before the CPU delivers a GP,
-          it decides to deliver an interrupt instead.  Unlikely?
-          Definitely.  Easy to avoid?  Yes.  The Intel documents
-          explicitly say that the reported EIP for a bad jump is the
-          jump instruction itself, not the destination, but some virtual
-          environments get this wrong.
-        */
-       movl PT_CS+4(%esp), %ecx
-       andl $SEGMENT_RPL_MASK, %ecx
-       cmpl $USER_RPL, %ecx
-       je 2f
-
-       lea PT_ORIG_EAX+4(%esp), %esi
-       lea PT_EFLAGS+4(%esp), %edi
-
-       /* If eip is before iret_restore_end then stack
-          hasn't been restored yet. */
-       cmp $iret_restore_end, %eax
-       jae 1f
-
-       movl 0+4(%edi),%eax             /* copy EAX */
-       movl %eax, PT_EAX+4(%esp)
-
-       lea ESP_OFFSET(%edi),%edi       /* move dest up over saved regs */
-
-       /* set up the copy */
-1:     std
-       mov $(PT_EIP+4) / 4, %ecx       /* copy ret+saved regs up to orig_eax */
-       rep movsl
-       cld
-
-       lea 4(%edi),%esp                /* point esp to new frame */
-2:     ret
-
-
-/*
-       Force an event check by making a hypercall,
-       but preserve regs before making the call.
- */
-check_events:
-       push %eax
-       push %ecx
-       push %edx
-       call force_evtchn_callback
-       pop %edx
-       pop %ecx
-       pop %eax
-       ret
diff --git a/arch/i386/xen/xen-head.S b/arch/i386/xen/xen-head.S
deleted file mode 100644 (file)
index f8d6937..0000000
+++ /dev/null
@@ -1,38 +0,0 @@
-/* Xen-specific pieces of head.S, intended to be included in the right
-       place in head.S */
-
-#ifdef CONFIG_XEN
-
-#include <linux/elfnote.h>
-#include <asm/boot.h>
-#include <xen/interface/elfnote.h>
-
-.pushsection .init.text
-ENTRY(startup_xen)
-       movl %esi,xen_start_info
-       cld
-       movl $(init_thread_union+THREAD_SIZE),%esp
-       jmp xen_start_kernel
-.popsection
-
-.pushsection .bss.page_aligned
-       .align PAGE_SIZE_asm
-ENTRY(hypercall_page)
-       .skip 0x1000
-.popsection
-
-       ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS,       .asciz "linux")
-       ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION,  .asciz "2.6")
-       ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION,    .asciz "xen-3.0")
-       ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE,      .long  __PAGE_OFFSET)
-       ELFNOTE(Xen, XEN_ELFNOTE_ENTRY,          .long  startup_xen)
-       ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long  hypercall_page)
-       ELFNOTE(Xen, XEN_ELFNOTE_FEATURES,       .asciz "!writable_page_tables|pae_pgdir_above_4gb")
-#ifdef CONFIG_X86_PAE
-       ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE,       .asciz "yes")
-#else
-       ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE,       .asciz "no")
-#endif
-       ELFNOTE(Xen, XEN_ELFNOTE_LOADER,         .asciz "generic")
-
-#endif /*CONFIG_XEN */
diff --git a/arch/i386/xen/xen-ops.h b/arch/i386/xen/xen-ops.h
deleted file mode 100644 (file)
index b9aaea4..0000000
+++ /dev/null
@@ -1,71 +0,0 @@
-#ifndef XEN_OPS_H
-#define XEN_OPS_H
-
-#include <linux/init.h>
-
-/* These are code, but not functions.  Defined in entry.S */
-extern const char xen_hypervisor_callback[];
-extern const char xen_failsafe_callback[];
-
-void xen_copy_trap_info(struct trap_info *traps);
-
-DECLARE_PER_CPU(struct vcpu_info *, xen_vcpu);
-DECLARE_PER_CPU(unsigned long, xen_cr3);
-
-extern struct start_info *xen_start_info;
-extern struct shared_info *HYPERVISOR_shared_info;
-
-char * __init xen_memory_setup(void);
-void __init xen_arch_setup(void);
-void __init xen_init_IRQ(void);
-
-void xen_setup_timer(int cpu);
-void xen_setup_cpu_clockevents(void);
-unsigned long xen_cpu_khz(void);
-void __init xen_time_init(void);
-unsigned long xen_get_wallclock(void);
-int xen_set_wallclock(unsigned long time);
-unsigned long long xen_sched_clock(void);
-
-void xen_mark_init_mm_pinned(void);
-
-DECLARE_PER_CPU(enum paravirt_lazy_mode, xen_lazy_mode);
-
-static inline unsigned xen_get_lazy_mode(void)
-{
-       return x86_read_percpu(xen_lazy_mode);
-}
-
-void __init xen_fill_possible_map(void);
-
-void __init xen_setup_vcpu_info_placement(void);
-void xen_smp_prepare_boot_cpu(void);
-void xen_smp_prepare_cpus(unsigned int max_cpus);
-int xen_cpu_up(unsigned int cpu);
-void xen_smp_cpus_done(unsigned int max_cpus);
-
-void xen_smp_send_stop(void);
-void xen_smp_send_reschedule(int cpu);
-int xen_smp_call_function (void (*func) (void *info), void *info, int nonatomic,
-                          int wait);
-int xen_smp_call_function_single(int cpu, void (*func) (void *info), void *info,
-                                int nonatomic, int wait);
-
-int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *),
-                              void *info, int wait);
-
-
-/* Declare an asm function, along with symbols needed to make it
-   inlineable */
-#define DECL_ASM(ret, name, ...)               \
-       ret name(__VA_ARGS__);                  \
-       extern char name##_end[];               \
-       extern char name##_reloc[]              \
-
-DECL_ASM(void, xen_irq_enable_direct, void);
-DECL_ASM(void, xen_irq_disable_direct, void);
-DECL_ASM(unsigned long, xen_save_fl_direct, void);
-DECL_ASM(void, xen_restore_fl_direct, unsigned long);
-
-void xen_iret_direct(void);
-#endif /* XEN_OPS_H */
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
new file mode 100644 (file)
index 0000000..9df99e1
--- /dev/null
@@ -0,0 +1,11 @@
+#
+# This Kconfig describes xen options
+#
+
+config XEN
+       bool "Enable support for Xen hypervisor"
+       depends on PARAVIRT && X86_CMPXCHG && X86_TSC && !NEED_MULTIPLE_NODES
+       help
+         This is the Linux Xen port.  Enabling this will allow the
+         kernel to boot in a paravirtualized environment under the
+         Xen hypervisor.
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
new file mode 100644 (file)
index 0000000..343df24
--- /dev/null
@@ -0,0 +1,4 @@
+obj-y          := enlighten.o setup.o features.o multicalls.o mmu.o \
+                       events.o time.o manage.o xen-asm.o
+
+obj-$(CONFIG_SMP)      += smp.o
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
new file mode 100644 (file)
index 0000000..f01bfcd
--- /dev/null
@@ -0,0 +1,1146 @@
+/*
+ * Core of Xen paravirt_ops implementation.
+ *
+ * This file contains the xen_paravirt_ops structure itself, and the
+ * implementations for:
+ * - privileged instructions
+ * - interrupt flags
+ * - segment operations
+ * - booting and setup
+ *
+ * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/smp.h>
+#include <linux/preempt.h>
+#include <linux/hardirq.h>
+#include <linux/percpu.h>
+#include <linux/delay.h>
+#include <linux/start_kernel.h>
+#include <linux/sched.h>
+#include <linux/bootmem.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/page-flags.h>
+#include <linux/highmem.h>
+#include <linux/smp.h>
+
+#include <xen/interface/xen.h>
+#include <xen/interface/physdev.h>
+#include <xen/interface/vcpu.h>
+#include <xen/interface/sched.h>
+#include <xen/features.h>
+#include <xen/page.h>
+
+#include <asm/paravirt.h>
+#include <asm/page.h>
+#include <asm/xen/hypercall.h>
+#include <asm/xen/hypervisor.h>
+#include <asm/fixmap.h>
+#include <asm/processor.h>
+#include <asm/setup.h>
+#include <asm/desc.h>
+#include <asm/pgtable.h>
+#include <asm/tlbflush.h>
+#include <asm/reboot.h>
+
+#include "xen-ops.h"
+#include "mmu.h"
+#include "multicalls.h"
+
+EXPORT_SYMBOL_GPL(hypercall_page);
+
+DEFINE_PER_CPU(enum paravirt_lazy_mode, xen_lazy_mode);
+
+DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu);
+DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
+DEFINE_PER_CPU(unsigned long, xen_cr3);
+
+struct start_info *xen_start_info;
+EXPORT_SYMBOL_GPL(xen_start_info);
+
+static /* __initdata */ struct shared_info dummy_shared_info;
+
+/*
+ * Point at some empty memory to start with. We map the real shared_info
+ * page as soon as fixmap is up and running.
+ */
+struct shared_info *HYPERVISOR_shared_info = (void *)&dummy_shared_info;
+
+/*
+ * Flag to determine whether vcpu info placement is available on all
+ * VCPUs.  We assume it is to start with, and then set it to zero on
+ * the first failure.  This is because it can succeed on some VCPUs
+ * and not others, since it can involve hypervisor memory allocation,
+ * or because the guest failed to guarantee all the appropriate
+ * constraints on all VCPUs (ie buffer can't cross a page boundary).
+ *
+ * Note that any particular CPU may be using a placed vcpu structure,
+ * but we can only optimise if they all are.
+ *
+ * 0: not available, 1: available
+ */
+static int have_vcpu_info_placement = 1;
+
+static void __init xen_vcpu_setup(int cpu)
+{
+       struct vcpu_register_vcpu_info info;
+       int err;
+       struct vcpu_info *vcpup;
+
+       per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
+
+       if (!have_vcpu_info_placement)
+               return;         /* already tested, not available */
+
+       vcpup = &per_cpu(xen_vcpu_info, cpu);
+
+       info.mfn = virt_to_mfn(vcpup);
+       info.offset = offset_in_page(vcpup);
+
+       printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %x, offset %d\n",
+              cpu, vcpup, info.mfn, info.offset);
+
+       /* Check to see if the hypervisor will put the vcpu_info
+          structure where we want it, which allows direct access via
+          a percpu-variable. */
+       err = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_info, cpu, &info);
+
+       if (err) {
+               printk(KERN_DEBUG "register_vcpu_info failed: err=%d\n", err);
+               have_vcpu_info_placement = 0;
+       } else {
+               /* This cpu is using the registered vcpu info, even if
+                  later ones fail to. */
+               per_cpu(xen_vcpu, cpu) = vcpup;
+
+               printk(KERN_DEBUG "cpu %d using vcpu_info at %p\n",
+                      cpu, vcpup);
+       }
+}
+
+static void __init xen_banner(void)
+{
+       printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
+              paravirt_ops.name);
+       printk(KERN_INFO "Hypervisor signature: %s\n", xen_start_info->magic);
+}
+
+static void xen_cpuid(unsigned int *eax, unsigned int *ebx,
+                     unsigned int *ecx, unsigned int *edx)
+{
+       unsigned maskedx = ~0;
+
+       /*
+        * Mask out inconvenient features, to try and disable as many
+        * unsupported kernel subsystems as possible.
+        */
+       if (*eax == 1)
+               maskedx = ~((1 << X86_FEATURE_APIC) |  /* disable APIC */
+                           (1 << X86_FEATURE_ACPI) |  /* disable ACPI */
+                           (1 << X86_FEATURE_ACC));   /* thermal monitoring */
+
+       asm(XEN_EMULATE_PREFIX "cpuid"
+               : "=a" (*eax),
+                 "=b" (*ebx),
+                 "=c" (*ecx),
+                 "=d" (*edx)
+               : "0" (*eax), "2" (*ecx));
+       *edx &= maskedx;
+}
+
+static void xen_set_debugreg(int reg, unsigned long val)
+{
+       HYPERVISOR_set_debugreg(reg, val);
+}
+
+static unsigned long xen_get_debugreg(int reg)
+{
+       return HYPERVISOR_get_debugreg(reg);
+}
+
+static unsigned long xen_save_fl(void)
+{
+       struct vcpu_info *vcpu;
+       unsigned long flags;
+
+       vcpu = x86_read_percpu(xen_vcpu);
+
+       /* flag has opposite sense of mask */
+       flags = !vcpu->evtchn_upcall_mask;
+
+       /* convert to IF type flag
+          -0 -> 0x00000000
+          -1 -> 0xffffffff
+       */
+       return (-flags) & X86_EFLAGS_IF;
+}
+
+static void xen_restore_fl(unsigned long flags)
+{
+       struct vcpu_info *vcpu;
+
+       /* convert from IF type flag */
+       flags = !(flags & X86_EFLAGS_IF);
+
+       /* There's a one instruction preempt window here.  We need to
+	   make sure we don't switch CPUs between getting the vcpu
+          pointer and updating the mask. */
+       preempt_disable();
+       vcpu = x86_read_percpu(xen_vcpu);
+       vcpu->evtchn_upcall_mask = flags;
+       preempt_enable_no_resched();
+
+       /* Doesn't matter if we get preempted here, because any
+          pending event will get dealt with anyway. */
+
+       if (flags == 0) {
+               preempt_check_resched();
+               barrier(); /* unmask then check (avoid races) */
+               if (unlikely(vcpu->evtchn_upcall_pending))
+                       force_evtchn_callback();
+       }
+}
+
+static void xen_irq_disable(void)
+{
+       /* There's a one instruction preempt window here.  We need to
+	   make sure we don't switch CPUs between getting the vcpu
+          pointer and updating the mask. */
+       preempt_disable();
+       x86_read_percpu(xen_vcpu)->evtchn_upcall_mask = 1;
+       preempt_enable_no_resched();
+}
+
+static void xen_irq_enable(void)
+{
+       struct vcpu_info *vcpu;
+
+       /* There's a one instruction preempt window here.  We need to
+	   make sure we don't switch CPUs between getting the vcpu
+          pointer and updating the mask. */
+       preempt_disable();
+       vcpu = x86_read_percpu(xen_vcpu);
+       vcpu->evtchn_upcall_mask = 0;
+       preempt_enable_no_resched();
+
+       /* Doesn't matter if we get preempted here, because any
+          pending event will get dealt with anyway. */
+
+       barrier(); /* unmask then check (avoid races) */
+       if (unlikely(vcpu->evtchn_upcall_pending))
+               force_evtchn_callback();
+}
+
+static void xen_safe_halt(void)
+{
+       /* Blocking includes an implicit local_irq_enable(). */
+       if (HYPERVISOR_sched_op(SCHEDOP_block, 0) != 0)
+               BUG();
+}
+
+static void xen_halt(void)
+{
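+	/* With interrupts disabled nothing can wake a blocked vcpu, so
+	   take the vcpu offline entirely; otherwise just block until
+	   the next event arrives. */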
+       if (irqs_disabled())
+               HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
+       else
+               xen_safe_halt();
+}
+
+static void xen_set_lazy_mode(enum paravirt_lazy_mode mode)
+{
+       BUG_ON(preemptible());
+
+       switch (mode) {
+       case PARAVIRT_LAZY_NONE:
+               BUG_ON(x86_read_percpu(xen_lazy_mode) == PARAVIRT_LAZY_NONE);
+               break;
+
+       case PARAVIRT_LAZY_MMU:
+       case PARAVIRT_LAZY_CPU:
+               BUG_ON(x86_read_percpu(xen_lazy_mode) != PARAVIRT_LAZY_NONE);
+               break;
+
+       case PARAVIRT_LAZY_FLUSH:
+               /* flush if necessary, but don't change state */
+               if (x86_read_percpu(xen_lazy_mode) != PARAVIRT_LAZY_NONE)
+                       xen_mc_flush();
+               return;
+       }
+
+       xen_mc_flush();
+       x86_write_percpu(xen_lazy_mode, mode);
+}
+
+static unsigned long xen_store_tr(void)
+{
+       return 0;
+}
+
+static void xen_set_ldt(const void *addr, unsigned entries)
+{
+       unsigned long linear_addr = (unsigned long)addr;
+       struct mmuext_op *op;
+       struct multicall_space mcs = xen_mc_entry(sizeof(*op));
+
+       op = mcs.args;
+       op->cmd = MMUEXT_SET_LDT;
+       if (linear_addr) {
+		/* ldt may be vmalloced, use arbitrary_virt_to_machine */
+               xmaddr_t maddr;
+               maddr = arbitrary_virt_to_machine((unsigned long)addr);
+               linear_addr = (unsigned long)maddr.maddr;
+       }
+       op->arg1.linear_addr = linear_addr;
+       op->arg2.nr_ents = entries;
+
+       MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+
+       xen_mc_issue(PARAVIRT_LAZY_CPU);
+}
+
+static void xen_load_gdt(const struct Xgt_desc_struct *dtr)
+{
+       unsigned long *frames;
+       unsigned long va = dtr->address;
+       unsigned int size = dtr->size + 1;
+       unsigned pages = (size + PAGE_SIZE - 1) / PAGE_SIZE;
+       int f;
+       struct multicall_space mcs;
+
+       /* A GDT can be up to 64k in size, which corresponds to 8192
+	   8-byte entries, or 16 4k pages. */
+
+       BUG_ON(size > 65536);
+       BUG_ON(va & ~PAGE_MASK);
+
+       mcs = xen_mc_entry(sizeof(*frames) * pages);
+       frames = mcs.args;
+
+       for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) {
+               frames[f] = virt_to_mfn(va);
+               make_lowmem_page_readonly((void *)va);
+       }
+
+       MULTI_set_gdt(mcs.mc, frames, size / sizeof(struct desc_struct));
+
+       xen_mc_issue(PARAVIRT_LAZY_CPU);
+}
+
+static void load_TLS_descriptor(struct thread_struct *t,
+                               unsigned int cpu, unsigned int i)
+{
+       struct desc_struct *gdt = get_cpu_gdt_table(cpu);
+       xmaddr_t maddr = virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
+       struct multicall_space mc = __xen_mc_entry(0);
+
+       MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]);
+}
+
+static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
+{
+       xen_mc_batch();
+
+       load_TLS_descriptor(t, cpu, 0);
+       load_TLS_descriptor(t, cpu, 1);
+       load_TLS_descriptor(t, cpu, 2);
+
+       xen_mc_issue(PARAVIRT_LAZY_CPU);
+
+       /*
+        * XXX sleazy hack: If we're being called in a lazy-cpu zone,
+        * it means we're in a context switch, and %gs has just been
+        * saved.  This means we can zero it out to prevent faults on
+        * exit from the hypervisor if the next process has no %gs.
+        * Either way, it has been saved, and the new value will get
+        * loaded properly.  This will go away as soon as Xen has been
+        * modified to not save/restore %gs for normal hypercalls.
+        */
+       if (xen_get_lazy_mode() == PARAVIRT_LAZY_CPU)
+               loadsegment(gs, 0);
+}
+
+static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
+                               u32 low, u32 high)
+{
+       unsigned long lp = (unsigned long)&dt[entrynum];
+       xmaddr_t mach_lp = virt_to_machine(lp);
+       u64 entry = (u64)high << 32 | low;
+
+       preempt_disable();
+
+       xen_mc_flush();
+       if (HYPERVISOR_update_descriptor(mach_lp.maddr, entry))
+               BUG();
+
+       preempt_enable();
+}
+
+static int cvt_gate_to_trap(int vector, u32 low, u32 high,
+                           struct trap_info *info)
+{
+       u8 type, dpl;
+
+       type = (high >> 8) & 0x1f;
+       dpl = (high >> 13) & 3;
+
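+	/* Only 32-bit interrupt gates (type 0xe) and trap gates (0xf)
+	   can be expressed as Xen trap_info entries; everything else
+	   is skipped. */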
+       if (type != 0xf && type != 0xe)
+               return 0;
+
+       info->vector = vector;
+       info->address = (high & 0xffff0000) | (low & 0x0000ffff);
+       info->cs = low >> 16;
+       info->flags = dpl;
+       /* interrupt gates clear IF */
+       if (type == 0xe)
+               info->flags |= 4;
+
+       return 1;
+}
+
+/* Locations of each CPU's IDT */
+static DEFINE_PER_CPU(struct Xgt_desc_struct, idt_desc);
+
+/* Set an IDT entry.  If the entry is part of the current IDT, then
+   also update Xen. */
+static void xen_write_idt_entry(struct desc_struct *dt, int entrynum,
+                               u32 low, u32 high)
+{
+       unsigned long p = (unsigned long)&dt[entrynum];
+       unsigned long start, end;
+
+       preempt_disable();
+
+       start = __get_cpu_var(idt_desc).address;
+       end = start + __get_cpu_var(idt_desc).size + 1;
+
+       xen_mc_flush();
+
+       write_dt_entry(dt, entrynum, low, high);
+
+       if (p >= start && (p + 8) <= end) {
+               struct trap_info info[2];
+
+               info[1].address = 0;
+
+               if (cvt_gate_to_trap(entrynum, low, high, &info[0]))
+                       if (HYPERVISOR_set_trap_table(info))
+                               BUG();
+       }
+
+       preempt_enable();
+}
+
+static void xen_convert_trap_info(const struct Xgt_desc_struct *desc,
+                                 struct trap_info *traps)
+{
+       unsigned in, out, count;
+
+       count = (desc->size+1) / 8;
+       BUG_ON(count > 256);
+
+       for (in = out = 0; in < count; in++) {
+               const u32 *entry = (u32 *)(desc->address + in * 8);
+
+               if (cvt_gate_to_trap(in, entry[0], entry[1], &traps[out]))
+                       out++;
+       }
+       traps[out].address = 0;
+}
+
+void xen_copy_trap_info(struct trap_info *traps)
+{
+       const struct Xgt_desc_struct *desc = &__get_cpu_var(idt_desc);
+
+       xen_convert_trap_info(desc, traps);
+}
+
+/* Load a new IDT into Xen.  In principle this can be per-CPU, so we
+   hold a spinlock to protect the static traps[] array (static because
+   it avoids allocation, and saves stack space). */
+static void xen_load_idt(const struct Xgt_desc_struct *desc)
+{
+       static DEFINE_SPINLOCK(lock);
+       static struct trap_info traps[257];
+
+       spin_lock(&lock);
+
+       __get_cpu_var(idt_desc) = *desc;
+
+       xen_convert_trap_info(desc, traps);
+
+       xen_mc_flush();
+       if (HYPERVISOR_set_trap_table(traps))
+               BUG();
+
+       spin_unlock(&lock);
+}
+
+/* Write a GDT descriptor entry.  Ignore LDT descriptors, since
+   they're handled differently. */
+static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
+                               u32 low, u32 high)
+{
+       preempt_disable();
+
+       switch ((high >> 8) & 0xff) {
+       case DESCTYPE_LDT:
+       case DESCTYPE_TSS:
+               /* ignore */
+               break;
+
+       default: {
+               xmaddr_t maddr = virt_to_machine(&dt[entry]);
+               u64 desc = (u64)high << 32 | low;
+
+               xen_mc_flush();
+               if (HYPERVISOR_update_descriptor(maddr.maddr, desc))
+                       BUG();
+       }
+
+       }
+
+       preempt_enable();
+}
+
+static void xen_load_esp0(struct tss_struct *tss,
+                         struct thread_struct *thread)
+{
+       struct multicall_space mcs = xen_mc_entry(0);
+       MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->esp0);
+       xen_mc_issue(PARAVIRT_LAZY_CPU);
+}
+
+static void xen_set_iopl_mask(unsigned mask)
+{
+       struct physdev_set_iopl set_iopl;
+
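+	/* @mask is in eflags format: bits 12-13 carry the requested IOPL. */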
+       /* Force the change at ring 0. */
+       set_iopl.iopl = (mask == 0) ? 1 : (mask >> 12) & 3;
+       HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
+}
+
+static void xen_io_delay(void)
+{
+}
+
+#ifdef CONFIG_X86_LOCAL_APIC
+static unsigned long xen_apic_read(unsigned long reg)
+{
+       return 0;
+}
+
+static void xen_apic_write(unsigned long reg, unsigned long val)
+{
+       /* Warn to see if there's any stray references */
+       WARN_ON(1);
+}
+#endif
+
+static void xen_flush_tlb(void)
+{
+       struct mmuext_op *op;
+       struct multicall_space mcs = xen_mc_entry(sizeof(*op));
+
+       op = mcs.args;
+       op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
+       MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+
+       xen_mc_issue(PARAVIRT_LAZY_MMU);
+}
+
+static void xen_flush_tlb_single(unsigned long addr)
+{
+       struct mmuext_op *op;
+       struct multicall_space mcs = xen_mc_entry(sizeof(*op));
+
+       op = mcs.args;
+       op->cmd = MMUEXT_INVLPG_LOCAL;
+       op->arg1.linear_addr = addr & PAGE_MASK;
+       MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+
+       xen_mc_issue(PARAVIRT_LAZY_MMU);
+}
+
+static void xen_flush_tlb_others(const cpumask_t *cpus, struct mm_struct *mm,
+                                unsigned long va)
+{
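+	/* The hypercall argument and the vcpu mask it refers to are
+	   allocated together from multicall scratch space. */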
+       struct {
+               struct mmuext_op op;
+               cpumask_t mask;
+       } *args;
+       cpumask_t cpumask = *cpus;
+       struct multicall_space mcs;
+
+       /*
+        * A couple of (to be removed) sanity checks:
+        *
+        * - current CPU must not be in mask
+        * - mask must exist :)
+        */
+       BUG_ON(cpus_empty(cpumask));
+       BUG_ON(cpu_isset(smp_processor_id(), cpumask));
+       BUG_ON(!mm);
+
+       /* If a CPU which we ran on has gone down, OK. */
+       cpus_and(cpumask, cpumask, cpu_online_map);
+       if (cpus_empty(cpumask))
+               return;
+
+       mcs = xen_mc_entry(sizeof(*args));
+       args = mcs.args;
+       args->mask = cpumask;
+       args->op.arg2.vcpumask = &args->mask;
+
+       if (va == TLB_FLUSH_ALL) {
+               args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
+       } else {
+               args->op.cmd = MMUEXT_INVLPG_MULTI;
+               args->op.arg1.linear_addr = va;
+       }
+
+       MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
+
+       xen_mc_issue(PARAVIRT_LAZY_MMU);
+}
+
+static void xen_write_cr2(unsigned long cr2)
+{
+       x86_read_percpu(xen_vcpu)->arch.cr2 = cr2;
+}
+
+static unsigned long xen_read_cr2(void)
+{
+       return x86_read_percpu(xen_vcpu)->arch.cr2;
+}
+
+static unsigned long xen_read_cr2_direct(void)
+{
+       return x86_read_percpu(xen_vcpu_info.arch.cr2);
+}
+
+static void xen_write_cr4(unsigned long cr4)
+{
+       /* Just ignore cr4 changes; Xen doesn't allow us to do
+          anything anyway. */
+}
+
+static unsigned long xen_read_cr3(void)
+{
+       return x86_read_percpu(xen_cr3);
+}
+
+static void xen_write_cr3(unsigned long cr3)
+{
+       BUG_ON(preemptible());
+
+       if (cr3 == x86_read_percpu(xen_cr3)) {
+               /* just a simple tlb flush */
+               xen_flush_tlb();
+               return;
+       }
+
+       x86_write_percpu(xen_cr3, cr3);
+
+
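+	/* Switch the base pointer via a multicall so it can be batched
+	   with any other pending lazy-CPU updates. */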
+       {
+               struct mmuext_op *op;
+               struct multicall_space mcs = xen_mc_entry(sizeof(*op));
+               unsigned long mfn = pfn_to_mfn(PFN_DOWN(cr3));
+
+               op = mcs.args;
+               op->cmd = MMUEXT_NEW_BASEPTR;
+               op->arg1.mfn = mfn;
+
+               MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+
+               xen_mc_issue(PARAVIRT_LAZY_CPU);
+       }
+}
+
+/* Early in boot, while setting up the initial pagetable, assume
+   everything is pinned. */
+static __init void xen_alloc_pt_init(struct mm_struct *mm, u32 pfn)
+{
+       BUG_ON(mem_map);        /* should only be used early */
+       make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
+}
+
+/* This needs to make sure the new pte page is pinned iff it's being
+   attached to a pinned pagetable. */
+static void xen_alloc_pt(struct mm_struct *mm, u32 pfn)
+{
+       struct page *page = pfn_to_page(pfn);
+
+       if (PagePinned(virt_to_page(mm->pgd))) {
+               SetPagePinned(page);
+
+               if (!PageHighMem(page))
+                       make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
+               else
+                       /* make sure there are no stray mappings of
+                          this page */
+                       kmap_flush_unused();
+       }
+}
+
+/* This should never happen until we're OK to use struct page */
+static void xen_release_pt(u32 pfn)
+{
+       struct page *page = pfn_to_page(pfn);
+
+       if (PagePinned(page)) {
+               if (!PageHighMem(page))
+                       make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
+       }
+}
+
+#ifdef CONFIG_HIGHPTE
+static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
+{
+       pgprot_t prot = PAGE_KERNEL;
+
+       if (PagePinned(page))
+               prot = PAGE_KERNEL_RO;
+
+       if (0 && PageHighMem(page))
+               printk("mapping highpte %lx type %d prot %s\n",
+                      page_to_pfn(page), type,
+                      (unsigned long)pgprot_val(prot) & _PAGE_RW ? "WRITE" : "READ");
+
+       return kmap_atomic_prot(page, type, prot);
+}
+#endif
+
+static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
+{
+       /* If there's an existing pte, then don't allow _PAGE_RW to be set */
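+	/* (old & _PAGE_RW) | ~_PAGE_RW gives a mask which clears _PAGE_RW
+	   from the new pte unless the old pte already had it set. */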
+       if (pte_val_ma(*ptep) & _PAGE_PRESENT)
+               pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
+                              pte_val_ma(pte));
+
+       return pte;
+}
+
+/* Init-time set_pte while constructing initial pagetables, which
+   doesn't allow RO pagetable pages to be remapped RW */
+static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
+{
+       pte = mask_rw_pte(ptep, pte);
+
+       xen_set_pte(ptep, pte);
+}
+
+static __init void xen_pagetable_setup_start(pgd_t *base)
+{
+       pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base;
+
+       /* special set_pte for pagetable initialization */
+       paravirt_ops.set_pte = xen_set_pte_init;
+
+       init_mm.pgd = base;
+       /*
+        * copy top-level of Xen-supplied pagetable into place.  For
+        * !PAE we can use this as-is, but for PAE it is a stand-in
+        * while we copy the pmd pages.
+        */
+       memcpy(base, xen_pgd, PTRS_PER_PGD * sizeof(pgd_t));
+
+       if (PTRS_PER_PMD > 1) {
+               int i;
+               /*
+                * For PAE, need to allocate new pmds, rather than
+                * share Xen's, since Xen doesn't like pmd's being
+		 * share Xen's, since Xen doesn't like pmds being
+                */
+               for (i = 0; i < PTRS_PER_PGD; i++) {
+                       if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) {
+                               pmd_t *pmd = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
+
+                               memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]),
+                                      PAGE_SIZE);
+
+                               make_lowmem_page_readonly(pmd);
+
+                               set_pgd(&base[i], __pgd(1 + __pa(pmd)));
+                       } else
+                               pgd_clear(&base[i]);
+               }
+       }
+
+       /* make sure zero_page is mapped RO so we can use it in pagetables */
+       make_lowmem_page_readonly(empty_zero_page);
+       make_lowmem_page_readonly(base);
+       /*
+        * Switch to new pagetable.  This is done before
+        * pagetable_init has done anything so that the new pages
+        * added to the table can be prepared properly for Xen.
+        */
+       xen_write_cr3(__pa(base));
+}
+
+static __init void xen_pagetable_setup_done(pgd_t *base)
+{
+       /* This will work as long as patching hasn't happened yet
+          (which it hasn't) */
+       paravirt_ops.alloc_pt = xen_alloc_pt;
+       paravirt_ops.set_pte = xen_set_pte;
+
+       if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+               /*
+                * Create a mapping for the shared info page.
+                * Should be set_fixmap(), but shared_info is a machine
+                * address with no corresponding pseudo-phys address.
+                */
+               set_pte_mfn(fix_to_virt(FIX_PARAVIRT_BOOTMAP),
+                           PFN_DOWN(xen_start_info->shared_info),
+                           PAGE_KERNEL);
+
+               HYPERVISOR_shared_info =
+                       (struct shared_info *)fix_to_virt(FIX_PARAVIRT_BOOTMAP);
+
+       } else
+               HYPERVISOR_shared_info =
+                       (struct shared_info *)__va(xen_start_info->shared_info);
+
+       /* Actually pin the pagetable down, but we can't set PG_pinned
+          yet because the page structures don't exist yet. */
+       {
+               struct mmuext_op op;
+#ifdef CONFIG_X86_PAE
+               op.cmd = MMUEXT_PIN_L3_TABLE;
+#else
+		op.cmd = MMUEXT_PIN_L2_TABLE;
+#endif
+               op.arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(base)));
+               if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
+                       BUG();
+       }
+}
+
+/* This is called once we have the cpu_possible_map */
+void __init xen_setup_vcpu_info_placement(void)
+{
+       int cpu;
+
+       for_each_possible_cpu(cpu)
+               xen_vcpu_setup(cpu);
+
+       /* xen_vcpu_setup managed to place the vcpu_info within the
+          percpu area for all cpus, so make use of it */
+       if (have_vcpu_info_placement) {
+               printk(KERN_INFO "Xen: using vcpu_info placement\n");
+
+               paravirt_ops.save_fl = xen_save_fl_direct;
+               paravirt_ops.restore_fl = xen_restore_fl_direct;
+               paravirt_ops.irq_disable = xen_irq_disable_direct;
+               paravirt_ops.irq_enable = xen_irq_enable_direct;
+               paravirt_ops.read_cr2 = xen_read_cr2_direct;
+               paravirt_ops.iret = xen_iret_direct;
+       }
+}
+
+static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
+                         unsigned long addr, unsigned len)
+{
+       char *start, *end, *reloc;
+       unsigned ret;
+
+       start = end = reloc = NULL;
+
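+	/* For these irq ops, patch the call site with the inline
+	   xen_*_direct asm body (adjusting its embedded relocation)
+	   when vcpu_info is placed in percpu space; otherwise fall
+	   through to the default patching. */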
+#define SITE(x)                                                                \
+       case PARAVIRT_PATCH(x):                                         \
+       if (have_vcpu_info_placement) {                                 \
+               start = (char *)xen_##x##_direct;                       \
+               end = xen_##x##_direct_end;                             \
+               reloc = xen_##x##_direct_reloc;                         \
+       }                                                               \
+       goto patch_site
+
+       switch (type) {
+               SITE(irq_enable);
+               SITE(irq_disable);
+               SITE(save_fl);
+               SITE(restore_fl);
+#undef SITE
+
+       patch_site:
+               if (start == NULL || (end-start) > len)
+                       goto default_patch;
+
+               ret = paravirt_patch_insns(insnbuf, len, start, end);
+
+               /* Note: because reloc is assigned from something that
+                  appears to be an array, gcc assumes it's non-null,
+                  but doesn't know its relationship with start and
+                  end. */
+               if (reloc > start && reloc < end) {
+                       int reloc_off = reloc - start;
+                       long *relocp = (long *)(insnbuf + reloc_off);
+                       long delta = start - (char *)addr;
+
+                       *relocp += delta;
+               }
+               break;
+
+       default_patch:
+       default:
+               ret = paravirt_patch_default(type, clobbers, insnbuf,
+                                            addr, len);
+               break;
+       }
+
+       return ret;
+}
+
+static const struct paravirt_ops xen_paravirt_ops __initdata = {
+       .paravirt_enabled = 1,
+       .shared_kernel_pmd = 0,
+
+       .name = "Xen",
+       .banner = xen_banner,
+
+       .patch = xen_patch,
+
+       .memory_setup = xen_memory_setup,
+       .arch_setup = xen_arch_setup,
+       .init_IRQ = xen_init_IRQ,
+       .post_allocator_init = xen_mark_init_mm_pinned,
+
+       .time_init = xen_time_init,
+       .set_wallclock = xen_set_wallclock,
+       .get_wallclock = xen_get_wallclock,
+       .get_cpu_khz = xen_cpu_khz,
+       .sched_clock = xen_sched_clock,
+
+       .cpuid = xen_cpuid,
+
+       .set_debugreg = xen_set_debugreg,
+       .get_debugreg = xen_get_debugreg,
+
+       .clts = native_clts,
+
+       .read_cr0 = native_read_cr0,
+       .write_cr0 = native_write_cr0,
+
+       .read_cr2 = xen_read_cr2,
+       .write_cr2 = xen_write_cr2,
+
+       .read_cr3 = xen_read_cr3,
+       .write_cr3 = xen_write_cr3,
+
+       .read_cr4 = native_read_cr4,
+       .read_cr4_safe = native_read_cr4_safe,
+       .write_cr4 = xen_write_cr4,
+
+       .save_fl = xen_save_fl,
+       .restore_fl = xen_restore_fl,
+       .irq_disable = xen_irq_disable,
+       .irq_enable = xen_irq_enable,
+       .safe_halt = xen_safe_halt,
+       .halt = xen_halt,
+       .wbinvd = native_wbinvd,
+
+       .read_msr = native_read_msr_safe,
+       .write_msr = native_write_msr_safe,
+       .read_tsc = native_read_tsc,
+       .read_pmc = native_read_pmc,
+
+       .iret = (void *)&hypercall_page[__HYPERVISOR_iret],
+       .irq_enable_sysexit = NULL,  /* never called */
+
+       .load_tr_desc = paravirt_nop,
+       .set_ldt = xen_set_ldt,
+       .load_gdt = xen_load_gdt,
+       .load_idt = xen_load_idt,
+       .load_tls = xen_load_tls,
+
+       .store_gdt = native_store_gdt,
+       .store_idt = native_store_idt,
+       .store_tr = xen_store_tr,
+
+       .write_ldt_entry = xen_write_ldt_entry,
+       .write_gdt_entry = xen_write_gdt_entry,
+       .write_idt_entry = xen_write_idt_entry,
+       .load_esp0 = xen_load_esp0,
+
+       .set_iopl_mask = xen_set_iopl_mask,
+       .io_delay = xen_io_delay,
+
+#ifdef CONFIG_X86_LOCAL_APIC
+       .apic_write = xen_apic_write,
+       .apic_write_atomic = xen_apic_write,
+       .apic_read = xen_apic_read,
+       .setup_boot_clock = paravirt_nop,
+       .setup_secondary_clock = paravirt_nop,
+       .startup_ipi_hook = paravirt_nop,
+#endif
+
+       .flush_tlb_user = xen_flush_tlb,
+       .flush_tlb_kernel = xen_flush_tlb,
+       .flush_tlb_single = xen_flush_tlb_single,
+       .flush_tlb_others = xen_flush_tlb_others,
+
+       .pte_update = paravirt_nop,
+       .pte_update_defer = paravirt_nop,
+
+       .pagetable_setup_start = xen_pagetable_setup_start,
+       .pagetable_setup_done = xen_pagetable_setup_done,
+
+       .alloc_pt = xen_alloc_pt_init,
+       .release_pt = xen_release_pt,
+       .alloc_pd = paravirt_nop,
+       .alloc_pd_clone = paravirt_nop,
+       .release_pd = paravirt_nop,
+
+#ifdef CONFIG_HIGHPTE
+       .kmap_atomic_pte = xen_kmap_atomic_pte,
+#endif
+
+       .set_pte = NULL,        /* see xen_pagetable_setup_* */
+       .set_pte_at = xen_set_pte_at,
+       .set_pmd = xen_set_pmd,
+
+       .pte_val = xen_pte_val,
+       .pgd_val = xen_pgd_val,
+
+       .make_pte = xen_make_pte,
+       .make_pgd = xen_make_pgd,
+
+#ifdef CONFIG_X86_PAE
+       .set_pte_atomic = xen_set_pte_atomic,
+       .set_pte_present = xen_set_pte_at,
+       .set_pud = xen_set_pud,
+       .pte_clear = xen_pte_clear,
+       .pmd_clear = xen_pmd_clear,
+
+       .make_pmd = xen_make_pmd,
+       .pmd_val = xen_pmd_val,
+#endif /* PAE */
+
+       .activate_mm = xen_activate_mm,
+       .dup_mmap = xen_dup_mmap,
+       .exit_mmap = xen_exit_mmap,
+
+       .set_lazy_mode = xen_set_lazy_mode,
+};
+
+#ifdef CONFIG_SMP
+static const struct smp_ops xen_smp_ops __initdata = {
+       .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
+       .smp_prepare_cpus = xen_smp_prepare_cpus,
+       .cpu_up = xen_cpu_up,
+       .smp_cpus_done = xen_smp_cpus_done,
+
+       .smp_send_stop = xen_smp_send_stop,
+       .smp_send_reschedule = xen_smp_send_reschedule,
+       .smp_call_function_mask = xen_smp_call_function_mask,
+};
+#endif /* CONFIG_SMP */
+
+static void xen_reboot(int reason)
+{
+#ifdef CONFIG_SMP
+       smp_send_stop();
+#endif
+
+       if (HYPERVISOR_sched_op(SCHEDOP_shutdown, reason))
+               BUG();
+}
+
+static void xen_restart(char *msg)
+{
+       xen_reboot(SHUTDOWN_reboot);
+}
+
+static void xen_emergency_restart(void)
+{
+       xen_reboot(SHUTDOWN_reboot);
+}
+
+static void xen_machine_halt(void)
+{
+       xen_reboot(SHUTDOWN_poweroff);
+}
+
+static void xen_crash_shutdown(struct pt_regs *regs)
+{
+       xen_reboot(SHUTDOWN_crash);
+}
+
+static const struct machine_ops __initdata xen_machine_ops = {
+       .restart = xen_restart,
+       .halt = xen_machine_halt,
+       .power_off = xen_machine_halt,
+       .shutdown = xen_machine_halt,
+       .crash_shutdown = xen_crash_shutdown,
+       .emergency_restart = xen_emergency_restart,
+};
+
+
+/* First C function to be called on Xen boot */
+asmlinkage void __init xen_start_kernel(void)
+{
+       pgd_t *pgd;
+
+       if (!xen_start_info)
+               return;
+
+       BUG_ON(memcmp(xen_start_info->magic, "xen-3.0", 7) != 0);
+
+       /* Install Xen paravirt ops */
+       paravirt_ops = xen_paravirt_ops;
+       machine_ops = xen_machine_ops;
+
+#ifdef CONFIG_SMP
+       smp_ops = xen_smp_ops;
+#endif
+
+       xen_setup_features();
+
+       /* Get mfn list */
+       if (!xen_feature(XENFEAT_auto_translated_physmap))
+               phys_to_machine_mapping = (unsigned long *)xen_start_info->mfn_list;
+
+       pgd = (pgd_t *)xen_start_info->pt_base;
+
+       init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE;
+
+       init_mm.pgd = pgd; /* use the Xen pagetables to start */
+
+       /* keep using Xen gdt for now; no urgent need to change it */
+
+       x86_write_percpu(xen_cr3, __pa(pgd));
+
+#ifdef CONFIG_SMP
+       /* Don't do the full vcpu_info placement stuff until we have a
+          possible map. */
+       per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];
+#else
+       /* May as well do it now, since there's no good time to call
+          it later on UP. */
+       xen_setup_vcpu_info_placement();
+#endif
+
+       paravirt_ops.kernel_rpl = 1;
+       if (xen_feature(XENFEAT_supervisor_mode_kernel))
+               paravirt_ops.kernel_rpl = 0;
+
+       /* set the limit of our address space */
+       reserve_top_address(-HYPERVISOR_VIRT_START + 2 * PAGE_SIZE);
+
+       /* set up basic CPUID stuff */
+       cpu_detect(&new_cpu_data);
+       new_cpu_data.hard_math = 1;
+       new_cpu_data.x86_capability[0] = cpuid_edx(1);
+
+       /* Poke various useful things into boot_params */
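+	/* loader type 9 is the bootloader ID assigned to Xen */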
+       LOADER_TYPE = (9 << 4) | 0;
+       INITRD_START = xen_start_info->mod_start ? __pa(xen_start_info->mod_start) : 0;
+       INITRD_SIZE = xen_start_info->mod_len;
+
+       /* Start the world */
+       start_kernel();
+}
diff --git a/arch/x86/xen/events.c b/arch/x86/xen/events.c
new file mode 100644 (file)
index 0000000..da1b173
--- /dev/null
@@ -0,0 +1,591 @@
+/*
+ * Xen event channels
+ *
+ * Xen models interrupts with abstract event channels.  Because each
+ * domain gets 1024 event channels but NR_IRQS is not that large, we
+ * must dynamically map irqs<->event channels.  The event channels
+ * interface with the rest of the kernel by defining a xen interrupt
+ * chip.  When an event is received, it is mapped to an irq and sent
+ * through the normal interrupt processing path.
+ *
+ * There are four kinds of events which can be mapped to an event
+ * channel:
+ *
+ * 1. Inter-domain notifications.  This includes all the virtual
+ *    device events, since they're driven by front-ends in another domain
+ *    (typically dom0).
+ * 2. VIRQs, typically used for timers.  These are per-cpu events.
+ * 3. IPIs.
+ * 4. Hardware interrupts. Not supported at present.
+ *
+ * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
+ */
+
+#include <linux/linkage.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/module.h>
+#include <linux/string.h>
+
+#include <asm/ptrace.h>
+#include <asm/irq.h>
+#include <asm/sync_bitops.h>
+#include <asm/xen/hypercall.h>
+#include <asm/xen/hypervisor.h>
+
+#include <xen/events.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/event_channel.h>
+
+#include "xen-ops.h"
+
+/*
+ * This lock protects updates to the following mapping and reference-count
+ * arrays. The lock does not need to be acquired to read the mapping tables.
+ */
+static DEFINE_SPINLOCK(irq_mapping_update_lock);
+
+/* IRQ <-> VIRQ mapping. */
+static DEFINE_PER_CPU(int, virq_to_irq[NR_VIRQS]) = {[0 ... NR_VIRQS-1] = -1};
+
+/* IRQ <-> IPI mapping */
+static DEFINE_PER_CPU(int, ipi_to_irq[XEN_NR_IPIS]) = {[0 ... XEN_NR_IPIS-1] = -1};
+
+/* Packed IRQ information: binding type, sub-type index, and event channel. */
+struct packed_irq
+{
+       unsigned short evtchn;
+       unsigned char index;
+       unsigned char type;
+};
+
+static struct packed_irq irq_info[NR_IRQS];
+
+/* Binding types. */
+enum {
+       IRQT_UNBOUND,
+       IRQT_PIRQ,
+       IRQT_VIRQ,
+       IRQT_IPI,
+       IRQT_EVTCHN
+};
+
+/* Convenient shorthand for packed representation of an unbound IRQ. */
+#define IRQ_UNBOUND    mk_irq_info(IRQT_UNBOUND, 0, 0)
+
+static int evtchn_to_irq[NR_EVENT_CHANNELS] = {
+       [0 ... NR_EVENT_CHANNELS-1] = -1
+};
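+/* Which cpu each event channel is bound to, and the per-cpu bitmaps
+   used to filter event delivery. */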
+static unsigned long cpu_evtchn_mask[NR_CPUS][NR_EVENT_CHANNELS/BITS_PER_LONG];
+static u8 cpu_evtchn[NR_EVENT_CHANNELS];
+
+/* Reference counts for bindings to IRQs. */
+static int irq_bindcount[NR_IRQS];
+
+/* Xen will never allocate port zero for any purpose. */
+#define VALID_EVTCHN(chn)      ((chn) != 0)
+
+/*
+ * Force a proper event-channel callback from Xen after clearing the
+ * callback mask. We do this in a very simple manner, by making a call
+ * down into Xen. The pending flag will be checked by Xen on return.
+ */
+void force_evtchn_callback(void)
+{
+       (void)HYPERVISOR_xen_version(0, NULL);
+}
+EXPORT_SYMBOL_GPL(force_evtchn_callback);
+
+static struct irq_chip xen_dynamic_chip;
+
+/* Constructor for packed IRQ information. */
+static inline struct packed_irq mk_irq_info(u32 type, u32 index, u32 evtchn)
+{
+       return (struct packed_irq) { evtchn, index, type };
+}
+
+/*
+ * Accessors for packed IRQ information.
+ */
+static inline unsigned int evtchn_from_irq(int irq)
+{
+       return irq_info[irq].evtchn;
+}
+
+static inline unsigned int index_from_irq(int irq)
+{
+       return irq_info[irq].index;
+}
+
+static inline unsigned int type_from_irq(int irq)
+{
+       return irq_info[irq].type;
+}
+
+static inline unsigned long active_evtchns(unsigned int cpu,
+                                          struct shared_info *sh,
+                                          unsigned int idx)
+{
+       return (sh->evtchn_pending[idx] &
+               cpu_evtchn_mask[cpu][idx] &
+               ~sh->evtchn_mask[idx]);
+}
+
+static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu)
+{
+       int irq = evtchn_to_irq[chn];
+
+       BUG_ON(irq == -1);
+#ifdef CONFIG_SMP
+       irq_desc[irq].affinity = cpumask_of_cpu(cpu);
+#endif
+
+       __clear_bit(chn, cpu_evtchn_mask[cpu_evtchn[chn]]);
+       __set_bit(chn, cpu_evtchn_mask[cpu]);
+
+       cpu_evtchn[chn] = cpu;
+}
+
+static void init_evtchn_cpu_bindings(void)
+{
+#ifdef CONFIG_SMP
+       int i;
+       /* By default all event channels notify CPU#0. */
+       for (i = 0; i < NR_IRQS; i++)
+               irq_desc[i].affinity = cpumask_of_cpu(0);
+#endif
+
+       memset(cpu_evtchn, 0, sizeof(cpu_evtchn));
+       memset(cpu_evtchn_mask[0], ~0, sizeof(cpu_evtchn_mask[0]));
+}
+
+static inline unsigned int cpu_from_evtchn(unsigned int evtchn)
+{
+       return cpu_evtchn[evtchn];
+}
+
+static inline void clear_evtchn(int port)
+{
+       struct shared_info *s = HYPERVISOR_shared_info;
+       sync_clear_bit(port, &s->evtchn_pending[0]);
+}
+
+static inline void set_evtchn(int port)
+{
+       struct shared_info *s = HYPERVISOR_shared_info;
+       sync_set_bit(port, &s->evtchn_pending[0]);
+}
+
+
+/**
+ * notify_remote_via_irq - send event to remote end of event channel via irq
+ * @irq: irq of event channel to send event to
+ *
+ * Unlike notify_remote_via_evtchn(), this is safe to use across
+ * save/restore. Notifications on a broken connection are silently
+ * dropped.
+ */
+void notify_remote_via_irq(int irq)
+{
+       int evtchn = evtchn_from_irq(irq);
+
+       if (VALID_EVTCHN(evtchn))
+               notify_remote_via_evtchn(evtchn);
+}
+EXPORT_SYMBOL_GPL(notify_remote_via_irq);
+
+static void mask_evtchn(int port)
+{
+       struct shared_info *s = HYPERVISOR_shared_info;
+       sync_set_bit(port, &s->evtchn_mask[0]);
+}
+
+static void unmask_evtchn(int port)
+{
+       struct shared_info *s = HYPERVISOR_shared_info;
+       unsigned int cpu = get_cpu();
+
+       BUG_ON(!irqs_disabled());
+
+       /* Slow path (hypercall) if this is a non-local port. */
+       if (unlikely(cpu != cpu_from_evtchn(port))) {
+               struct evtchn_unmask unmask = { .port = port };
+               (void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask);
+       } else {
+               struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu);
+
+               sync_clear_bit(port, &s->evtchn_mask[0]);
+
+               /*
+                * The following is basically the equivalent of
+                * 'hw_resend_irq'. Just like a real IO-APIC we 'lose
+                * the interrupt edge' if the channel is masked.
+                */
+               if (sync_test_bit(port, &s->evtchn_pending[0]) &&
+                   !sync_test_and_set_bit(port / BITS_PER_LONG,
+                                          &vcpu_info->evtchn_pending_sel))
+                       vcpu_info->evtchn_upcall_pending = 1;
+       }
+
+       put_cpu();
+}
+
+static int find_unbound_irq(void)
+{
+       int irq;
+
+       /* Only allocate from dynirq range */
+       for (irq = 0; irq < NR_IRQS; irq++)
+               if (irq_bindcount[irq] == 0)
+                       break;
+
+       if (irq == NR_IRQS)
+               panic("No available IRQ to bind to: increase NR_IRQS!\n");
+
+       return irq;
+}
+
+int bind_evtchn_to_irq(unsigned int evtchn)
+{
+       int irq;
+
+       spin_lock(&irq_mapping_update_lock);
+
+       irq = evtchn_to_irq[evtchn];
+
+       if (irq == -1) {
+               irq = find_unbound_irq();
+
+               dynamic_irq_init(irq);
+               set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
+                                             handle_level_irq, "event");
+
+               evtchn_to_irq[evtchn] = irq;
+               irq_info[irq] = mk_irq_info(IRQT_EVTCHN, 0, evtchn);
+       }
+
+       irq_bindcount[irq]++;
+
+       spin_unlock(&irq_mapping_update_lock);
+
+       return irq;
+}
+EXPORT_SYMBOL_GPL(bind_evtchn_to_irq);
+
+static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu)
+{
+       struct evtchn_bind_ipi bind_ipi;
+       int evtchn, irq;
+
+       spin_lock(&irq_mapping_update_lock);
+
+       irq = per_cpu(ipi_to_irq, cpu)[ipi];
+       if (irq == -1) {
+               irq = find_unbound_irq();
+               if (irq < 0)
+                       goto out;
+
+               dynamic_irq_init(irq);
+               set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
+                                             handle_level_irq, "ipi");
+
+               bind_ipi.vcpu = cpu;
+               if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi,
+                                               &bind_ipi) != 0)
+                       BUG();
+               evtchn = bind_ipi.port;
+
+               evtchn_to_irq[evtchn] = irq;
+               irq_info[irq] = mk_irq_info(IRQT_IPI, ipi, evtchn);
+
+               per_cpu(ipi_to_irq, cpu)[ipi] = irq;
+
+               bind_evtchn_to_cpu(evtchn, cpu);
+       }
+
+       irq_bindcount[irq]++;
+
+ out:
+       spin_unlock(&irq_mapping_update_lock);
+       return irq;
+}
+
+
+static int bind_virq_to_irq(unsigned int virq, unsigned int cpu)
+{
+       struct evtchn_bind_virq bind_virq;
+       int evtchn, irq;
+
+       spin_lock(&irq_mapping_update_lock);
+
+       irq = per_cpu(virq_to_irq, cpu)[virq];
+
+       if (irq == -1) {
+               bind_virq.virq = virq;
+               bind_virq.vcpu = cpu;
+               if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq,
+                                               &bind_virq) != 0)
+                       BUG();
+               evtchn = bind_virq.port;
+
+               irq = find_unbound_irq();
+
+               dynamic_irq_init(irq);
+               set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
+                                             handle_level_irq, "virq");
+
+               evtchn_to_irq[evtchn] = irq;
+               irq_info[irq] = mk_irq_info(IRQT_VIRQ, virq, evtchn);
+
+               per_cpu(virq_to_irq, cpu)[virq] = irq;
+
+               bind_evtchn_to_cpu(evtchn, cpu);
+       }
+
+       irq_bindcount[irq]++;
+
+       spin_unlock(&irq_mapping_update_lock);
+
+       return irq;
+}
+
+static void unbind_from_irq(unsigned int irq)
+{
+       struct evtchn_close close;
+       int evtchn = evtchn_from_irq(irq);
+
+       spin_lock(&irq_mapping_update_lock);
+
+       if (VALID_EVTCHN(evtchn) && (--irq_bindcount[irq] == 0)) {
+               close.port = evtchn;
+               if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0)
+                       BUG();
+
+               switch (type_from_irq(irq)) {
+               case IRQT_VIRQ:
+                       per_cpu(virq_to_irq, cpu_from_evtchn(evtchn))
+                               [index_from_irq(irq)] = -1;
+                       break;
+               default:
+                       break;
+               }
+
+               /* Closed ports are implicitly re-bound to VCPU0. */
+               bind_evtchn_to_cpu(evtchn, 0);
+
+               evtchn_to_irq[evtchn] = -1;
+               irq_info[irq] = IRQ_UNBOUND;
+
+               dynamic_irq_init(irq);
+       }
+
+       spin_unlock(&irq_mapping_update_lock);
+}
+
+int bind_evtchn_to_irqhandler(unsigned int evtchn,
+                             irqreturn_t (*handler)(int, void *),
+                             unsigned long irqflags,
+                             const char *devname, void *dev_id)
+{
+       unsigned int irq;
+       int retval;
+
+       irq = bind_evtchn_to_irq(evtchn);
+       retval = request_irq(irq, handler, irqflags, devname, dev_id);
+       if (retval != 0) {
+               unbind_from_irq(irq);
+               return retval;
+       }
+
+       return irq;
+}
+EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler);
+
+int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu,
+                           irqreturn_t (*handler)(int, void *),
+                           unsigned long irqflags, const char *devname, void *dev_id)
+{
+       unsigned int irq;
+       int retval;
+
+       irq = bind_virq_to_irq(virq, cpu);
+       retval = request_irq(irq, handler, irqflags, devname, dev_id);
+       if (retval != 0) {
+               unbind_from_irq(irq);
+               return retval;
+       }
+
+       return irq;
+}
+EXPORT_SYMBOL_GPL(bind_virq_to_irqhandler);
+
+int bind_ipi_to_irqhandler(enum ipi_vector ipi,
+                          unsigned int cpu,
+                          irq_handler_t handler,
+                          unsigned long irqflags,
+                          const char *devname,
+                          void *dev_id)
+{
+       int irq, retval;
+
+       irq = bind_ipi_to_irq(ipi, cpu);
+       if (irq < 0)
+               return irq;
+
+       retval = request_irq(irq, handler, irqflags, devname, dev_id);
+       if (retval != 0) {
+               unbind_from_irq(irq);
+               return retval;
+       }
+
+       return irq;
+}
+
+void unbind_from_irqhandler(unsigned int irq, void *dev_id)
+{
+       free_irq(irq, dev_id);
+       unbind_from_irq(irq);
+}
+EXPORT_SYMBOL_GPL(unbind_from_irqhandler);
+
+void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector)
+{
+       int irq = per_cpu(ipi_to_irq, cpu)[vector];
+       BUG_ON(irq < 0);
+       notify_remote_via_irq(irq);
+}
+
+
+/*
+ * Search the CPUs pending events bitmasks.  For each one found, map
+ * the event number to an irq, and feed it into do_IRQ() for
+ * handling.
+ *
+ * Xen uses a two-level bitmap to speed searching.  The first level is
+ * a bitset of words which contain pending event bits.  The second
+ * level is a bitset of pending events themselves.
+ */
+fastcall void xen_evtchn_do_upcall(struct pt_regs *regs)
+{
+       int cpu = get_cpu();
+       struct shared_info *s = HYPERVISOR_shared_info;
+       struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu);
+       unsigned long pending_words;
+
+       vcpu_info->evtchn_upcall_pending = 0;
+
+       /* NB. No need for a barrier here -- XCHG is a barrier on x86. */
+       pending_words = xchg(&vcpu_info->evtchn_pending_sel, 0);
+       while (pending_words != 0) {
+               unsigned long pending_bits;
+               int word_idx = __ffs(pending_words);
+               pending_words &= ~(1UL << word_idx);
+
+               while ((pending_bits = active_evtchns(cpu, s, word_idx)) != 0) {
+                       int bit_idx = __ffs(pending_bits);
+                       int port = (word_idx * BITS_PER_LONG) + bit_idx;
+                       int irq = evtchn_to_irq[port];
+
+                       if (irq != -1) {
+                               regs->orig_eax = ~irq;
+                               do_IRQ(regs);
+                       }
+               }
+       }
+
+       put_cpu();
+}
+
+/* Rebind an evtchn so that it gets delivered to a specific cpu */
+static void rebind_irq_to_cpu(unsigned irq, unsigned tcpu)
+{
+       struct evtchn_bind_vcpu bind_vcpu;
+       int evtchn = evtchn_from_irq(irq);
+
+       if (!VALID_EVTCHN(evtchn))
+               return;
+
+       /* Send future instances of this interrupt to other vcpu. */
+       bind_vcpu.port = evtchn;
+       bind_vcpu.vcpu = tcpu;
+
+       /*
+        * If this fails, it usually just indicates that we're dealing with a
+        * virq or IPI channel, which don't actually need to be rebound. Ignore
+        * it, but don't do the xenlinux-level rebind in that case.
+        */
+       if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) >= 0)
+               bind_evtchn_to_cpu(evtchn, tcpu);
+}
+
+
+static void set_affinity_irq(unsigned irq, cpumask_t dest)
+{
+       unsigned tcpu = first_cpu(dest);
+       rebind_irq_to_cpu(irq, tcpu);
+}
+
+static void enable_dynirq(unsigned int irq)
+{
+       int evtchn = evtchn_from_irq(irq);
+
+       if (VALID_EVTCHN(evtchn))
+               unmask_evtchn(evtchn);
+}
+
+static void disable_dynirq(unsigned int irq)
+{
+       int evtchn = evtchn_from_irq(irq);
+
+       if (VALID_EVTCHN(evtchn))
+               mask_evtchn(evtchn);
+}
+
+static void ack_dynirq(unsigned int irq)
+{
+       int evtchn = evtchn_from_irq(irq);
+
+       move_native_irq(irq);
+
+       if (VALID_EVTCHN(evtchn))
+               clear_evtchn(evtchn);
+}
+
+static int retrigger_dynirq(unsigned int irq)
+{
+       int evtchn = evtchn_from_irq(irq);
+       int ret = 0;
+
+       if (VALID_EVTCHN(evtchn)) {
+               set_evtchn(evtchn);
+               ret = 1;
+       }
+
+       return ret;
+}
+
+static struct irq_chip xen_dynamic_chip __read_mostly = {
+       .name           = "xen-dyn",
+       .mask           = disable_dynirq,
+       .unmask         = enable_dynirq,
+       .ack            = ack_dynirq,
+       .set_affinity   = set_affinity_irq,
+       .retrigger      = retrigger_dynirq,
+};
+
+void __init xen_init_IRQ(void)
+{
+       int i;
+
+       init_evtchn_cpu_bindings();
+
+       /* No event channels are 'live' right now. */
+       for (i = 0; i < NR_EVENT_CHANNELS; i++)
+               mask_evtchn(i);
+
+       /* Dynamic IRQ space is currently unbound. Zero the refcnts. */
+       for (i = 0; i < NR_IRQS; i++)
+               irq_bindcount[i] = 0;
+
+       irq_ctx_init(smp_processor_id());
+}
diff --git a/arch/x86/xen/features.c b/arch/x86/xen/features.c
new file mode 100644 (file)
index 0000000..0707714
--- /dev/null
@@ -0,0 +1,29 @@
+/******************************************************************************
+ * features.c
+ *
+ * Xen feature flags.
+ *
+ * Copyright (c) 2006, Ian Campbell, XenSource Inc.
+ */
+#include <linux/types.h>
+#include <linux/cache.h>
+#include <linux/module.h>
+#include <asm/xen/hypervisor.h>
+#include <xen/features.h>
+
+u8 xen_features[XENFEAT_NR_SUBMAPS * 32] __read_mostly;
+EXPORT_SYMBOL_GPL(xen_features);
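+
+/*
+ * The rest of the Xen code tests these flags with xen_feature() (declared
+ * alongside this array in <xen/features.h>), e.g.
+ * xen_feature(XENFEAT_auto_translated_physmap) in mmu.c, which simply
+ * reads the corresponding entry of this array.
+ */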
+
+void xen_setup_features(void)
+{
+       struct xen_feature_info fi;
+       int i, j;
+
+       for (i = 0; i < XENFEAT_NR_SUBMAPS; i++) {
+               fi.submap_idx = i;
+               if (HYPERVISOR_xen_version(XENVER_get_features, &fi) < 0)
+                       break;
+               for (j = 0; j < 32; j++)
+                       xen_features[i * 32 + j] = !!(fi.submap & 1<<j);
+       }
+}
diff --git a/arch/x86/xen/manage.c b/arch/x86/xen/manage.c
new file mode 100644 (file)
index 0000000..aa7af9e
--- /dev/null
@@ -0,0 +1,143 @@
+/*
+ * Handle external requests for shutdown, reboot and sysrq
+ */
+#include <linux/kernel.h>
+#include <linux/err.h>
+#include <linux/reboot.h>
+#include <linux/sysrq.h>
+
+#include <xen/xenbus.h>
+
+#define SHUTDOWN_INVALID  -1
+#define SHUTDOWN_POWEROFF  0
+#define SHUTDOWN_SUSPEND   2
+/* Code 3 is SHUTDOWN_CRASH, which we don't use because the domain can only
+ * report a crash, not be instructed to crash!
+ * HALT is the same as POWEROFF, as far as we're concerned.  The tools use
+ * the distinction when we return the reason code to them.
+ */
+#define SHUTDOWN_HALT      4
+
+/* Ignore multiple shutdown requests. */
+static int shutting_down = SHUTDOWN_INVALID;
+
+static void shutdown_handler(struct xenbus_watch *watch,
+                            const char **vec, unsigned int len)
+{
+       char *str;
+       struct xenbus_transaction xbt;
+       int err;
+
+       if (shutting_down != SHUTDOWN_INVALID)
+               return;
+
+ again:
+       err = xenbus_transaction_start(&xbt);
+       if (err)
+               return;
+
+       str = (char *)xenbus_read(xbt, "control", "shutdown", NULL);
+       /* Ignore read errors and empty reads. */
+       if (XENBUS_IS_ERR_READ(str)) {
+               xenbus_transaction_end(xbt, 1);
+               return;
+       }
+
+       xenbus_write(xbt, "control", "shutdown", "");
+
+       err = xenbus_transaction_end(xbt, 0);
+       if (err == -EAGAIN) {
+               kfree(str);
+               goto again;
+       }
+
+       if (strcmp(str, "poweroff") == 0 ||
+           strcmp(str, "halt") == 0)
+               orderly_poweroff(false);
+       else if (strcmp(str, "reboot") == 0)
+               ctrl_alt_del();
+       else {
+               printk(KERN_INFO "Ignoring shutdown request: %s\n", str);
+               shutting_down = SHUTDOWN_INVALID;
+       }
+
+       kfree(str);
+}
+
+static void sysrq_handler(struct xenbus_watch *watch, const char **vec,
+                         unsigned int len)
+{
+       char sysrq_key = '\0';
+       struct xenbus_transaction xbt;
+       int err;
+
+ again:
+       err = xenbus_transaction_start(&xbt);
+       if (err)
+               return;
+       if (!xenbus_scanf(xbt, "control", "sysrq", "%c", &sysrq_key)) {
+               printk(KERN_ERR "Unable to read sysrq code in "
+                      "control/sysrq\n");
+               xenbus_transaction_end(xbt, 1);
+               return;
+       }
+
+       if (sysrq_key != '\0')
+               xenbus_printf(xbt, "control", "sysrq", "%c", '\0');
+
+       err = xenbus_transaction_end(xbt, 0);
+       if (err == -EAGAIN)
+               goto again;
+
+       if (sysrq_key != '\0')
+               handle_sysrq(sysrq_key, NULL);
+}
+
+static struct xenbus_watch shutdown_watch = {
+       .node = "control/shutdown",
+       .callback = shutdown_handler
+};
+
+static struct xenbus_watch sysrq_watch = {
+       .node = "control/sysrq",
+       .callback = sysrq_handler
+};
+
+static int setup_shutdown_watcher(void)
+{
+       int err;
+
+       err = register_xenbus_watch(&shutdown_watch);
+       if (err) {
+               printk(KERN_ERR "Failed to set shutdown watcher\n");
+               return err;
+       }
+
+       err = register_xenbus_watch(&sysrq_watch);
+       if (err) {
+               printk(KERN_ERR "Failed to set sysrq watcher\n");
+               return err;
+       }
+
+       return 0;
+}
+
+static int shutdown_event(struct notifier_block *notifier,
+                         unsigned long event,
+                         void *data)
+{
+       setup_shutdown_watcher();
+       return NOTIFY_DONE;
+}
+
+static int __init setup_shutdown_event(void)
+{
+       static struct notifier_block xenstore_notifier = {
+               .notifier_call = shutdown_event
+       };
+       register_xenstore_notifier(&xenstore_notifier);
+
+       return 0;
+}
+
+subsys_initcall(setup_shutdown_event);
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
new file mode 100644 (file)
index 0000000..874db0c
--- /dev/null
@@ -0,0 +1,567 @@
+/*
+ * Xen mmu operations
+ *
+ * This file contains the various mmu fetch and update operations.
+ * The most important job they must perform is the mapping between the
+ * domain's pfn and the overall machine mfns.
+ *
+ * Xen allows guests to directly update the pagetable, in a controlled
+ * fashion.  In other words, the guest modifies the same pagetable
+ * that the CPU actually uses, which eliminates the overhead of having
+ * a separate shadow pagetable.
+ *
+ * In order to allow this, it falls on the guest domain to map its
+ * notion of a "physical" pfn - which is just a domain-local linear
+ * address - into a real "machine address" which the CPU's MMU can
+ * use.
+ *
+ * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be
+ * inserted directly into the pagetable.  When creating a new
+ * pte/pmd/pgd, it converts the passed pfn into an mfn.  Conversely,
+ * when reading the content back with __(pgd|pmd|pte)_val, it converts
+ * the mfn back into a pfn.
+ *
+ * The other constraint is that all pages which make up a pagetable
+ * must be mapped read-only in the guest.  This prevents uncontrolled
+ * guest updates to the pagetable.  Xen strictly enforces this, and
+ * will disallow any pagetable update which will end up mapping a
+ * pagetable page RW, and will disallow using any writable page as a
+ * pagetable.
+ *
+ * Naively, when loading %cr3 with the base of a new pagetable, Xen
+ * would need to validate the whole pagetable before going on.
+ * Naturally, this is quite slow.  The solution is to "pin" a
+ * pagetable, which enforces all the constraints on the pagetable even
+ * when it is not actively in use.  This means that Xen can be assured
+ * that it is still valid when you do load it into %cr3, and doesn't
+ * need to revalidate it.
+ *
+ * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
+ */
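+
+/*
+ * Concretely, the conversion runs through the helpers below:
+ * xen_make_pte()/xen_make_pmd()/xen_make_pgd() translate pfn-based
+ * entries into mfn-based ones (via phys_to_machine()) when an entry is
+ * constructed, and xen_pte_val()/xen_pmd_val()/xen_pgd_val() translate
+ * the mfn back into a pfn (via machine_to_phys()) when it is read back.
+ */
+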
+#include <linux/sched.h>
+#include <linux/highmem.h>
+#include <linux/bug.h>
+
+#include <asm/pgtable.h>
+#include <asm/tlbflush.h>
+#include <asm/mmu_context.h>
+#include <asm/paravirt.h>
+
+#include <asm/xen/hypercall.h>
+#include <asm/xen/hypervisor.h>
+
+#include <xen/page.h>
+#include <xen/interface/xen.h>
+
+#include "multicalls.h"
+#include "mmu.h"
+
+xmaddr_t arbitrary_virt_to_machine(unsigned long address)
+{
+       pte_t *pte = lookup_address(address);
+       unsigned offset = address & ~PAGE_MASK;  /* offset within the page */
+
+       BUG_ON(pte == NULL);
+
+       return XMADDR((pte_mfn(*pte) << PAGE_SHIFT) + offset);
+}
+
+void make_lowmem_page_readonly(void *vaddr)
+{
+       pte_t *pte, ptev;
+       unsigned long address = (unsigned long)vaddr;
+
+       pte = lookup_address(address);
+       BUG_ON(pte == NULL);
+
+       ptev = pte_wrprotect(*pte);
+
+       if (HYPERVISOR_update_va_mapping(address, ptev, 0))
+               BUG();
+}
+
+void make_lowmem_page_readwrite(void *vaddr)
+{
+       pte_t *pte, ptev;
+       unsigned long address = (unsigned long)vaddr;
+
+       pte = lookup_address(address);
+       BUG_ON(pte == NULL);
+
+       ptev = pte_mkwrite(*pte);
+
+       if (HYPERVISOR_update_va_mapping(address, ptev, 0))
+               BUG();
+}
+
+
+void xen_set_pmd(pmd_t *ptr, pmd_t val)
+{
+       struct multicall_space mcs;
+       struct mmu_update *u;
+
+       preempt_disable();
+
+       mcs = xen_mc_entry(sizeof(*u));
+       u = mcs.args;
+       u->ptr = virt_to_machine(ptr).maddr;
+       u->val = pmd_val_ma(val);
+       MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF);
+
+       xen_mc_issue(PARAVIRT_LAZY_MMU);
+
+       preempt_enable();
+}
+
+/*
+ * Associate a virtual page frame with a given physical page frame
+ * and protection flags for that frame.
+ */
+void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
+{
+       pgd_t *pgd;
+       pud_t *pud;
+       pmd_t *pmd;
+       pte_t *pte;
+
+       pgd = swapper_pg_dir + pgd_index(vaddr);
+       if (pgd_none(*pgd)) {
+               BUG();
+               return;
+       }
+       pud = pud_offset(pgd, vaddr);
+       if (pud_none(*pud)) {
+               BUG();
+               return;
+       }
+       pmd = pmd_offset(pud, vaddr);
+       if (pmd_none(*pmd)) {
+               BUG();
+               return;
+       }
+       pte = pte_offset_kernel(pmd, vaddr);
+       /* <mfn,flags> stored as-is, to permit clearing entries */
+       xen_set_pte(pte, mfn_pte(mfn, flags));
+
+       /*
+        * It's enough to flush this one mapping.
+        * (PGE mappings get flushed as well)
+        */
+       __flush_tlb_one(vaddr);
+}
+
+void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
+                   pte_t *ptep, pte_t pteval)
+{
+       if (mm == current->mm || mm == &init_mm) {
+               if (xen_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
+                       struct multicall_space mcs;
+                       mcs = xen_mc_entry(0);
+
+                       MULTI_update_va_mapping(mcs.mc, addr, pteval, 0);
+                       xen_mc_issue(PARAVIRT_LAZY_MMU);
+                       return;
+               } else
+                       if (HYPERVISOR_update_va_mapping(addr, pteval, 0) == 0)
+                               return;
+       }
+       xen_set_pte(ptep, pteval);
+}
+
+#ifdef CONFIG_X86_PAE
+void xen_set_pud(pud_t *ptr, pud_t val)
+{
+       struct multicall_space mcs;
+       struct mmu_update *u;
+
+       preempt_disable();
+
+       mcs = xen_mc_entry(sizeof(*u));
+       u = mcs.args;
+       u->ptr = virt_to_machine(ptr).maddr;
+       u->val = pud_val_ma(val);
+       MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF);
+
+       xen_mc_issue(PARAVIRT_LAZY_MMU);
+
+       preempt_enable();
+}
+
+void xen_set_pte(pte_t *ptep, pte_t pte)
+{
+       ptep->pte_high = pte.pte_high;
+       smp_wmb();
+       ptep->pte_low = pte.pte_low;
+}
+
+void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
+{
+       set_64bit((u64 *)ptep, pte_val_ma(pte));
+}
+
+void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+{
+       ptep->pte_low = 0;
+       smp_wmb();              /* make sure low gets written first */
+       ptep->pte_high = 0;
+}
+
+void xen_pmd_clear(pmd_t *pmdp)
+{
+       xen_set_pmd(pmdp, __pmd(0));
+}
+
+unsigned long long xen_pte_val(pte_t pte)
+{
+       unsigned long long ret = 0;
+
+       if (pte.pte_low) {
+               ret = ((unsigned long long)pte.pte_high << 32) | pte.pte_low;
+               ret = machine_to_phys(XMADDR(ret)).paddr | 1;
+       }
+
+       return ret;
+}
+
+unsigned long long xen_pmd_val(pmd_t pmd)
+{
+       unsigned long long ret = pmd.pmd;
+       if (ret)
+               ret = machine_to_phys(XMADDR(ret)).paddr | 1;
+       return ret;
+}
+
+unsigned long long xen_pgd_val(pgd_t pgd)
+{
+       unsigned long long ret = pgd.pgd;
+       if (ret)
+               ret = machine_to_phys(XMADDR(ret)).paddr | 1;
+       return ret;
+}
+
+pte_t xen_make_pte(unsigned long long pte)
+{
+       if (pte & 1)
+               pte = phys_to_machine(XPADDR(pte)).maddr;
+
+       return (pte_t){ pte, pte >> 32 };
+}
+
+pmd_t xen_make_pmd(unsigned long long pmd)
+{
+       if (pmd & 1)
+               pmd = phys_to_machine(XPADDR(pmd)).maddr;
+
+       return (pmd_t){ pmd };
+}
+
+pgd_t xen_make_pgd(unsigned long long pgd)
+{
+       if (pgd & _PAGE_PRESENT)
+               pgd = phys_to_machine(XPADDR(pgd)).maddr;
+
+       return (pgd_t){ pgd };
+}
+#else  /* !PAE */
+void xen_set_pte(pte_t *ptep, pte_t pte)
+{
+       *ptep = pte;
+}
+
+unsigned long xen_pte_val(pte_t pte)
+{
+       unsigned long ret = pte.pte_low;
+
+       if (ret & _PAGE_PRESENT)
+               ret = machine_to_phys(XMADDR(ret)).paddr;
+
+       return ret;
+}
+
+unsigned long xen_pgd_val(pgd_t pgd)
+{
+       unsigned long ret = pgd.pgd;
+       if (ret)
+               ret = machine_to_phys(XMADDR(ret)).paddr | 1;
+       return ret;
+}
+
+pte_t xen_make_pte(unsigned long pte)
+{
+       if (pte & _PAGE_PRESENT)
+               pte = phys_to_machine(XPADDR(pte)).maddr;
+
+       return (pte_t){ pte };
+}
+
+pgd_t xen_make_pgd(unsigned long pgd)
+{
+       if (pgd & _PAGE_PRESENT)
+               pgd = phys_to_machine(XPADDR(pgd)).maddr;
+
+       return (pgd_t){ pgd };
+}
+#endif /* CONFIG_X86_PAE */
+
+
+
+/*
+  (Yet another) pagetable walker.  This one is intended for pinning a
+  pagetable.  This means that it walks a pagetable and calls the
+  callback function on each page it finds making up the page table,
+  at every level.  It walks the entire pagetable, but it only bothers
+  pinning pte pages which are below pte_limit.  In the normal case
+  this will be TASK_SIZE, but at boot we need to pin up to
+  FIXADDR_TOP.  But the important bit is that we don't pin beyond
+  there, because then we start getting into Xen's ptes.
+*/
+static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned),
+                   unsigned long limit)
+{
+       pgd_t *pgd = pgd_base;
+       int flush = 0;
+       unsigned long addr = 0;
+       unsigned long pgd_next;
+
+       BUG_ON(limit > FIXADDR_TOP);
+
+       if (xen_feature(XENFEAT_auto_translated_physmap))
+               return 0;
+
+       for (; addr != FIXADDR_TOP; pgd++, addr = pgd_next) {
+               pud_t *pud;
+               unsigned long pud_limit, pud_next;
+
+               pgd_next = pud_limit = pgd_addr_end(addr, FIXADDR_TOP);
+
+               if (!pgd_val(*pgd))
+                       continue;
+
+               pud = pud_offset(pgd, 0);
+
+               if (PTRS_PER_PUD > 1) /* not folded */
+                       flush |= (*func)(virt_to_page(pud), 0);
+
+               for (; addr != pud_limit; pud++, addr = pud_next) {
+                       pmd_t *pmd;
+                       unsigned long pmd_limit;
+
+                       pud_next = pud_addr_end(addr, pud_limit);
+
+                       if (pud_next < limit)
+                               pmd_limit = pud_next;
+                       else
+                               pmd_limit = limit;
+
+                       if (pud_none(*pud))
+                               continue;
+
+                       pmd = pmd_offset(pud, 0);
+
+                       if (PTRS_PER_PMD > 1) /* not folded */
+                               flush |= (*func)(virt_to_page(pmd), 0);
+
+                       for (; addr != pmd_limit; pmd++) {
+                               addr += (PAGE_SIZE * PTRS_PER_PTE);
+                               if ((pmd_limit-1) < (addr-1)) {
+                                       addr = pmd_limit;
+                                       break;
+                               }
+
+                               if (pmd_none(*pmd))
+                                       continue;
+
+                               flush |= (*func)(pmd_page(*pmd), 0);
+                       }
+               }
+       }
+
+       flush |= (*func)(virt_to_page(pgd_base), UVMF_TLB_FLUSH);
+
+       return flush;
+}
+
+static int pin_page(struct page *page, unsigned flags)
+{
+       unsigned pgfl = test_and_set_bit(PG_pinned, &page->flags);
+       int flush;
+
+       if (pgfl)
+               flush = 0;              /* already pinned */
+       else if (PageHighMem(page))
+               /* kmaps need flushing if we found an unpinned
+                  highpage */
+               flush = 1;
+       else {
+               void *pt = lowmem_page_address(page);
+               unsigned long pfn = page_to_pfn(page);
+               struct multicall_space mcs = __xen_mc_entry(0);
+
+               flush = 0;
+
+               MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
+                                       pfn_pte(pfn, PAGE_KERNEL_RO),
+                                       flags);
+       }
+
+       return flush;
+}
+
+/* This is called just after a mm has been created, but it has not
+   been used yet.  We need to make sure that its pagetable is all
+   read-only, and can be pinned. */
+void xen_pgd_pin(pgd_t *pgd)
+{
+       struct multicall_space mcs;
+       struct mmuext_op *op;
+
+       xen_mc_batch();
+
+       if (pgd_walk(pgd, pin_page, TASK_SIZE)) {
+               /* re-enable interrupts for kmap_flush_unused */
+               xen_mc_issue(0);
+               kmap_flush_unused();
+               xen_mc_batch();
+       }
+
+       mcs = __xen_mc_entry(sizeof(*op));
+       op = mcs.args;
+
+#ifdef CONFIG_X86_PAE
+       op->cmd = MMUEXT_PIN_L3_TABLE;
+#else
+       op->cmd = MMUEXT_PIN_L2_TABLE;
+#endif
+       op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));
+       MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+
+       xen_mc_issue(0);
+}
+
+/* The init_mm pagetable is really pinned as soon as it's created, but
+   that's before we have page structures to store the bits.  So do all
+   the book-keeping now. */
+static __init int mark_pinned(struct page *page, unsigned flags)
+{
+       SetPagePinned(page);
+       return 0;
+}
+
+void __init xen_mark_init_mm_pinned(void)
+{
+       pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP);
+}
+
+static int unpin_page(struct page *page, unsigned flags)
+{
+       unsigned pgfl = test_and_clear_bit(PG_pinned, &page->flags);
+
+       if (pgfl && !PageHighMem(page)) {
+               void *pt = lowmem_page_address(page);
+               unsigned long pfn = page_to_pfn(page);
+               struct multicall_space mcs = __xen_mc_entry(0);
+
+               MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
+                                       pfn_pte(pfn, PAGE_KERNEL),
+                                       flags);
+       }
+
+       return 0;               /* never need to flush on unpin */
+}
+
+/* Release a pagetable's pages back as normal RW */
+static void xen_pgd_unpin(pgd_t *pgd)
+{
+       struct mmuext_op *op;
+       struct multicall_space mcs;
+
+       xen_mc_batch();
+
+       mcs = __xen_mc_entry(sizeof(*op));
+
+       op = mcs.args;
+       op->cmd = MMUEXT_UNPIN_TABLE;
+       op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));
+
+       MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+
+       pgd_walk(pgd, unpin_page, TASK_SIZE);
+
+       xen_mc_issue(0);
+}
+
+void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
+{
+       spin_lock(&next->page_table_lock);
+       xen_pgd_pin(next->pgd);
+       spin_unlock(&next->page_table_lock);
+}
+
+void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
+{
+       spin_lock(&mm->page_table_lock);
+       xen_pgd_pin(mm->pgd);
+       spin_unlock(&mm->page_table_lock);
+}
+
+
+#ifdef CONFIG_SMP
+/* Another cpu may still have its %cr3 pointing at the pagetable, so
+   we need to repoint it somewhere else before we can unpin it. */
+static void drop_other_mm_ref(void *info)
+{
+       struct mm_struct *mm = info;
+
+       if (__get_cpu_var(cpu_tlbstate).active_mm == mm)
+               leave_mm(smp_processor_id());
+}
+
+static void drop_mm_ref(struct mm_struct *mm)
+{
+       if (current->active_mm == mm) {
+               if (current->mm == mm)
+                       load_cr3(swapper_pg_dir);
+               else
+                       leave_mm(smp_processor_id());
+       }
+
+       if (!cpus_empty(mm->cpu_vm_mask))
+               xen_smp_call_function_mask(mm->cpu_vm_mask, drop_other_mm_ref,
+                                          mm, 1);
+}
+#else
+static void drop_mm_ref(struct mm_struct *mm)
+{
+       if (current->active_mm == mm)
+               load_cr3(swapper_pg_dir);
+}
+#endif
+
+/*
+ * While a process runs, Xen pins its pagetable, which means that the
+ * hypervisor forces it to be read-only and controls all updates to it.
+ * This means that all pagetable updates have to go via the hypervisor,
+ * which is moderately expensive.
+ *
+ * Since we're pulling the pagetable down, we switch to init_mm, unpin
+ * the old process's pagetable and mark it all read-write, which allows
+ * further operations on it to be simple memory accesses.
+ *
+ * The only subtle point is that another CPU may still be using the
+ * pagetable because of lazy tlb flushing.  This means we need to
+ * switch all CPUs off this pagetable before we can unpin it.
+ */
+void xen_exit_mmap(struct mm_struct *mm)
+{
+       get_cpu();              /* make sure we don't move around */
+       drop_mm_ref(mm);
+       put_cpu();
+
+       spin_lock(&mm->page_table_lock);
+
+       /* pgd may not be pinned in the error exit path of execve */
+       if (PagePinned(virt_to_page(mm->pgd)))
+               xen_pgd_unpin(mm->pgd);
+       spin_unlock(&mm->page_table_lock);
+}
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h
new file mode 100644 (file)
index 0000000..c9ff27f
--- /dev/null
@@ -0,0 +1,60 @@
+#ifndef _XEN_MMU_H
+
+#include <linux/linkage.h>
+#include <asm/page.h>
+
+/*
+ * Page-directory addresses above 4GB do not fit into architectural %cr3.
+ * When accessing %cr3, or equivalent field in vcpu_guest_context, guests
+ * must use the following accessor macros to pack/unpack valid MFNs.
+ *
+ * Note that Xen is using the fact that the pagetable base is always
+ * page-aligned, and putting the 12 MSB of the address into the 12 LSB
+ * of cr3.
+ */
+#define xen_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20))
+#define xen_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20))
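+
+/*
+ * For example (illustrative values only): an MFN of 0x00123456 packs as
+ * xen_pfn_to_cr3(0x00123456) == 0x23456001, and unpacks again as
+ * xen_cr3_to_pfn(0x23456001) == 0x00123456.
+ */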
+
+
+void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags);
+
+void xen_set_pte(pte_t *ptep, pte_t pteval);
+void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
+                   pte_t *ptep, pte_t pteval);
+void xen_set_pmd(pmd_t *pmdp, pmd_t pmdval);
+
+void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next);
+void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
+void xen_exit_mmap(struct mm_struct *mm);
+
+void xen_pgd_pin(pgd_t *pgd);
+//void xen_pgd_unpin(pgd_t *pgd);
+
+#ifdef CONFIG_X86_PAE
+unsigned long long xen_pte_val(pte_t);
+unsigned long long xen_pmd_val(pmd_t);
+unsigned long long xen_pgd_val(pgd_t);
+
+pte_t xen_make_pte(unsigned long long);
+pmd_t xen_make_pmd(unsigned long long);
+pgd_t xen_make_pgd(unsigned long long);
+
+void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
+                   pte_t *ptep, pte_t pteval);
+void xen_set_pte_atomic(pte_t *ptep, pte_t pte);
+void xen_set_pud(pud_t *ptr, pud_t val);
+void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
+void xen_pmd_clear(pmd_t *pmdp);
+
+
+#else
+unsigned long xen_pte_val(pte_t);
+unsigned long xen_pmd_val(pmd_t);
+unsigned long xen_pgd_val(pgd_t);
+
+pte_t xen_make_pte(unsigned long);
+pmd_t xen_make_pmd(unsigned long);
+pgd_t xen_make_pgd(unsigned long);
+#endif
+
+#endif /* _XEN_MMU_H */
diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c
new file mode 100644 (file)
index 0000000..c837e8e
--- /dev/null
@@ -0,0 +1,90 @@
+/*
+ * Xen hypercall batching.
+ *
+ * Xen allows multiple hypercalls to be issued at once, using the
+ * multicall interface.  This allows the cost of trapping into the
+ * hypervisor to be amortized over several calls.
+ *
+ * This file implements a simple interface for multicalls.  There's a
+ * per-cpu buffer of outstanding multicalls.  When you want to queue a
+ * multicall for issuing, you can allocate a multicall slot for the
+ * call and its arguments, along with storage for space which is
+ * pointed to by the arguments (for passing pointers to structures,
+ * etc).  When the multicall is actually issued, all the space for the
+ * commands and allocated memory is freed for reuse.
+ *
+ * Multicalls are flushed whenever any of the buffers get full, or
+ * when explicitly requested.  There's no way to get per-multicall
+ * return results back.  It will BUG if any of the multicalls fail.
+ *
+ * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
+ */
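+
+/*
+ * A typical user of this interface (see xen_set_pmd() in mmu.c) queues
+ * up a hypercall roughly as follows:
+ *
+ *     mcs = xen_mc_entry(sizeof(*u));
+ *     u = mcs.args;
+ *     u->ptr = virt_to_machine(ptr).maddr;
+ *     u->val = pmd_val_ma(val);
+ *     MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF);
+ *     xen_mc_issue(PARAVIRT_LAZY_MMU);
+ */
+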
+#include <linux/percpu.h>
+#include <linux/hardirq.h>
+
+#include <asm/xen/hypercall.h>
+
+#include "multicalls.h"
+
+#define MC_BATCH       32
+#define MC_ARGS                (MC_BATCH * 16 / sizeof(u64))
+
+struct mc_buffer {
+       struct multicall_entry entries[MC_BATCH];
+       u64 args[MC_ARGS];
+       unsigned mcidx, argidx;
+};
+
+static DEFINE_PER_CPU(struct mc_buffer, mc_buffer);
+DEFINE_PER_CPU(unsigned long, xen_mc_irq_flags);
+
+void xen_mc_flush(void)
+{
+       struct mc_buffer *b = &__get_cpu_var(mc_buffer);
+       int ret = 0;
+       unsigned long flags;
+
+       BUG_ON(preemptible());
+
+       /* Disable interrupts in case someone comes in and queues
+          something in the middle */
+       local_irq_save(flags);
+
+       if (b->mcidx) {
+               int i;
+
+               if (HYPERVISOR_multicall(b->entries, b->mcidx) != 0)
+                       BUG();
+               for (i = 0; i < b->mcidx; i++)
+                       if (b->entries[i].result < 0)
+                               ret++;
+               b->mcidx = 0;
+               b->argidx = 0;
+       } else
+               BUG_ON(b->argidx != 0);
+
+       local_irq_restore(flags);
+
+       BUG_ON(ret);
+}
+
+struct multicall_space __xen_mc_entry(size_t args)
+{
+       struct mc_buffer *b = &__get_cpu_var(mc_buffer);
+       struct multicall_space ret;
+       unsigned argspace = (args + sizeof(u64) - 1) / sizeof(u64);
+
+       BUG_ON(preemptible());
+       BUG_ON(argspace > MC_ARGS);
+
+       if (b->mcidx == MC_BATCH ||
+           (b->argidx + argspace) > MC_ARGS)
+               xen_mc_flush();
+
+       ret.mc = &b->entries[b->mcidx];
+       b->mcidx++;
+       ret.args = &b->args[b->argidx];
+       b->argidx += argspace;
+
+       return ret;
+}
diff --git a/arch/x86/xen/multicalls.h b/arch/x86/xen/multicalls.h
new file mode 100644 (file)
index 0000000..e6f7530
--- /dev/null
@@ -0,0 +1,45 @@
+#ifndef _XEN_MULTICALLS_H
+#define _XEN_MULTICALLS_H
+
+#include "xen-ops.h"
+
+/* Multicalls */
+struct multicall_space
+{
+       struct multicall_entry *mc;
+       void *args;
+};
+
+/* Allocate room for a multicall and its args */
+struct multicall_space __xen_mc_entry(size_t args);
+
+DECLARE_PER_CPU(unsigned long, xen_mc_irq_flags);
+
+/* Call to start a batch of multiple __xen_mc_entry()s.  Must be
+   paired with xen_mc_issue() */
+static inline void xen_mc_batch(void)
+{
+       /* need to disable interrupts until this entry is complete */
+       local_irq_save(__get_cpu_var(xen_mc_irq_flags));
+}
+
+static inline struct multicall_space xen_mc_entry(size_t args)
+{
+       xen_mc_batch();
+       return __xen_mc_entry(args);
+}
+
+/* Flush all pending multicalls */
+void xen_mc_flush(void);
+
+/* Issue a multicall if we're not in a lazy mode */
+static inline void xen_mc_issue(unsigned mode)
+{
+       if ((xen_get_lazy_mode() & mode) == 0)
+               xen_mc_flush();
+
+       /* restore flags saved in xen_mc_batch */
+       local_irq_restore(x86_read_percpu(xen_mc_irq_flags));
+}
+
+#endif /* _XEN_MULTICALLS_H */
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
new file mode 100644 (file)
index 0000000..f84e772
--- /dev/null
@@ -0,0 +1,111 @@
+/*
+ * Machine specific setup for xen
+ *
+ * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
+ */
+
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/pm.h>
+
+#include <asm/elf.h>
+#include <asm/e820.h>
+#include <asm/setup.h>
+#include <asm/xen/hypervisor.h>
+#include <asm/xen/hypercall.h>
+
+#include <xen/interface/physdev.h>
+#include <xen/features.h>
+
+#include "xen-ops.h"
+#include "vdso.h"
+
+/* These are code, but not functions.  Defined in entry.S */
+extern const char xen_hypervisor_callback[];
+extern const char xen_failsafe_callback[];
+
+unsigned long *phys_to_machine_mapping;
+EXPORT_SYMBOL(phys_to_machine_mapping);
+
+/**
+ * xen_memory_setup - Hook for machine specific memory setup.
+ **/
+
+char * __init xen_memory_setup(void)
+{
+       unsigned long max_pfn = xen_start_info->nr_pages;
+
+       e820.nr_map = 0;
+       add_memory_region(0, PFN_PHYS(max_pfn), E820_RAM);
+
+       return "Xen";
+}
+
+static void xen_idle(void)
+{
+       local_irq_disable();
+
+       if (need_resched())
+               local_irq_enable();
+       else {
+               current_thread_info()->status &= ~TS_POLLING;
+               smp_mb__after_clear_bit();
+               safe_halt();
+               current_thread_info()->status |= TS_POLLING;
+       }
+}
+
+/*
+ * Set the bit indicating "nosegneg" library variants should be used.
+ */
+static void fiddle_vdso(void)
+{
+       extern u32 VDSO_NOTE_MASK; /* See ../kernel/vsyscall-note.S.  */
+       extern char vsyscall_int80_start;
+       u32 *mask = (u32 *) ((unsigned long) &VDSO_NOTE_MASK - VDSO_PRELINK +
+                            &vsyscall_int80_start);
+       *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
+}
+
+void __init xen_arch_setup(void)
+{
+       struct physdev_set_iopl set_iopl;
+       int rc;
+
+       HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);
+       HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables);
+
+       if (!xen_feature(XENFEAT_auto_translated_physmap))
+               HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_pae_extended_cr3);
+
+       HYPERVISOR_set_callbacks(__KERNEL_CS, (unsigned long)xen_hypervisor_callback,
+                                __KERNEL_CS, (unsigned long)xen_failsafe_callback);
+
+       set_iopl.iopl = 1;
+       rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
+       if (rc != 0)
+               printk(KERN_INFO "physdev_op failed %d\n", rc);
+
+#ifdef CONFIG_ACPI
+       if (!(xen_start_info->flags & SIF_INITDOMAIN)) {
+               printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
+               disable_acpi();
+       }
+#endif
+
+       memcpy(boot_command_line, xen_start_info->cmd_line,
+              MAX_GUEST_CMDLINE > COMMAND_LINE_SIZE ?
+              COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE);
+
+       pm_idle = xen_idle;
+
+#ifdef CONFIG_SMP
+       /* fill cpus_possible with all available cpus */
+       xen_fill_possible_map();
+#endif
+
+       paravirt_disable_iospace();
+
+       fiddle_vdso();
+}
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
new file mode 100644 (file)
index 0000000..557b8e2
--- /dev/null
@@ -0,0 +1,404 @@
+/*
+ * Xen SMP support
+ *
+ * This file implements the Xen versions of smp_ops.  SMP under Xen is
+ * very straightforward.  Bringing a CPU up is simply a matter of
+ * loading its initial context and setting it running.
+ *
+ * IPIs are handled through the Xen event mechanism.
+ *
+ * Because virtual CPUs can be scheduled onto any real CPU, there's no
+ * useful topology information for the kernel to make use of.  As a
+ * result, all CPUs are treated as if they're single-core and
+ * single-threaded.
+ *
+ * This does not handle HOTPLUG_CPU yet.
+ */
+#include <linux/sched.h>
+#include <linux/err.h>
+#include <linux/smp.h>
+
+#include <asm/paravirt.h>
+#include <asm/desc.h>
+#include <asm/pgtable.h>
+#include <asm/cpu.h>
+
+#include <xen/interface/xen.h>
+#include <xen/interface/vcpu.h>
+
+#include <asm/xen/interface.h>
+#include <asm/xen/hypercall.h>
+
+#include <xen/page.h>
+#include <xen/events.h>
+
+#include "xen-ops.h"
+#include "mmu.h"
+
+static cpumask_t cpu_initialized_map;
+static DEFINE_PER_CPU(int, resched_irq);
+static DEFINE_PER_CPU(int, callfunc_irq);
+
+/*
+ * Structure and data for smp_call_function(). This is designed to minimise
+ * static memory requirements. It also looks cleaner.
+ */
+static DEFINE_SPINLOCK(call_lock);
+
+struct call_data_struct {
+       void (*func) (void *info);
+       void *info;
+       atomic_t started;
+       atomic_t finished;
+       int wait;
+};
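+
+/*
+ * Protocol (see xen_smp_call_function_mask() below): the initiator fills
+ * in call_data, sends XEN_CALL_FUNCTION_VECTOR to the target cpus, and
+ * spins until 'started' (and, if waiting, 'finished') reaches the number
+ * of targeted cpus; each target bumps the counters from
+ * xen_call_function_interrupt().
+ */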
+
+static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id);
+
+static struct call_data_struct *call_data;
+
+/*
+ * Reschedule call back. Nothing to do,
+ * all the work is done automatically when
+ * we return from the interrupt.
+ */
+static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id)
+{
+       return IRQ_HANDLED;
+}
+
+static __cpuinit void cpu_bringup_and_idle(void)
+{
+       int cpu = smp_processor_id();
+
+       cpu_init();
+
+       preempt_disable();
+       per_cpu(cpu_state, cpu) = CPU_ONLINE;
+
+       xen_setup_cpu_clockevents();
+
+       /* We can take interrupts now: we're officially "up". */
+       local_irq_enable();
+
+       wmb();                  /* make sure everything is out */
+       cpu_idle();
+}
+
+static int xen_smp_intr_init(unsigned int cpu)
+{
+       int rc;
+       const char *resched_name, *callfunc_name;
+
+       per_cpu(resched_irq, cpu) = per_cpu(callfunc_irq, cpu) = -1;
+
+       resched_name = kasprintf(GFP_KERNEL, "resched%d", cpu);
+       rc = bind_ipi_to_irqhandler(XEN_RESCHEDULE_VECTOR,
+                                   cpu,
+                                   xen_reschedule_interrupt,
+                                   IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
+                                   resched_name,
+                                   NULL);
+       if (rc < 0)
+               goto fail;
+       per_cpu(resched_irq, cpu) = rc;
+
+       callfunc_name = kasprintf(GFP_KERNEL, "callfunc%d", cpu);
+       rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_VECTOR,
+                                   cpu,
+                                   xen_call_function_interrupt,
+                                   IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
+                                   callfunc_name,
+                                   NULL);
+       if (rc < 0)
+               goto fail;
+       per_cpu(callfunc_irq, cpu) = rc;
+
+       return 0;
+
+ fail:
+       if (per_cpu(resched_irq, cpu) >= 0)
+               unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
+       if (per_cpu(callfunc_irq, cpu) >= 0)
+               unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
+       return rc;
+}
+
+void __init xen_fill_possible_map(void)
+{
+       int i, rc;
+
+       for (i = 0; i < NR_CPUS; i++) {
+               rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
+               if (rc >= 0)
+                       cpu_set(i, cpu_possible_map);
+       }
+}
+
+void __init xen_smp_prepare_boot_cpu(void)
+{
+       int cpu;
+
+       BUG_ON(smp_processor_id() != 0);
+       native_smp_prepare_boot_cpu();
+
+       /* We've switched to the "real" per-cpu gdt, so make sure the
+          old memory can be recycled */
+       make_lowmem_page_readwrite(&per_cpu__gdt_page);
+
+       for (cpu = 0; cpu < NR_CPUS; cpu++) {
+               cpus_clear(cpu_sibling_map[cpu]);
+               cpus_clear(cpu_core_map[cpu]);
+       }
+
+       xen_setup_vcpu_info_placement();
+}
+
+void __init xen_smp_prepare_cpus(unsigned int max_cpus)
+{
+       unsigned cpu;
+
+       for (cpu = 0; cpu < NR_CPUS; cpu++) {
+               cpus_clear(cpu_sibling_map[cpu]);
+               cpus_clear(cpu_core_map[cpu]);
+       }
+
+       smp_store_cpu_info(0);
+       set_cpu_sibling_map(0);
+
+       if (xen_smp_intr_init(0))
+               BUG();
+
+       cpu_initialized_map = cpumask_of_cpu(0);
+
+       /* Restrict the possible_map according to max_cpus. */
+       while ((num_possible_cpus() > 1) && (num_possible_cpus() > max_cpus)) {
+               for (cpu = NR_CPUS-1; !cpu_isset(cpu, cpu_possible_map); cpu--)
+                       continue;
+               cpu_clear(cpu, cpu_possible_map);
+       }
+
+       for_each_possible_cpu (cpu) {
+               struct task_struct *idle;
+
+               if (cpu == 0)
+                       continue;
+
+               idle = fork_idle(cpu);
+               if (IS_ERR(idle))
+                       panic("failed fork for CPU %d", cpu);
+
+               cpu_set(cpu, cpu_present_map);
+       }
+
+       //init_xenbus_allowed_cpumask();
+}
+
+static __cpuinit int
+cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
+{
+       struct vcpu_guest_context *ctxt;
+       struct gdt_page *gdt = &per_cpu(gdt_page, cpu);
+
+       if (cpu_test_and_set(cpu, cpu_initialized_map))
+               return 0;
+
+       ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL);
+       if (ctxt == NULL)
+               return -ENOMEM;
+
+       ctxt->flags = VGCF_IN_KERNEL;
+       ctxt->user_regs.ds = __USER_DS;
+       ctxt->user_regs.es = __USER_DS;
+       ctxt->user_regs.fs = __KERNEL_PERCPU;
+       ctxt->user_regs.gs = 0;
+       ctxt->user_regs.ss = __KERNEL_DS;
+       ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
+       ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
+
+       memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt));
+
+       xen_copy_trap_info(ctxt->trap_ctxt);
+
+       ctxt->ldt_ents = 0;
+
+       BUG_ON((unsigned long)gdt->gdt & ~PAGE_MASK);
+       make_lowmem_page_readonly(gdt->gdt);
+
+       ctxt->gdt_frames[0] = virt_to_mfn(gdt->gdt);
+       ctxt->gdt_ents      = ARRAY_SIZE(gdt->gdt);
+
+       ctxt->user_regs.cs = __KERNEL_CS;
+       ctxt->user_regs.esp = idle->thread.esp0 - sizeof(struct pt_regs);
+
+       ctxt->kernel_ss = __KERNEL_DS;
+       ctxt->kernel_sp = idle->thread.esp0;
+
+       ctxt->event_callback_cs     = __KERNEL_CS;
+       ctxt->event_callback_eip    = (unsigned long)xen_hypervisor_callback;
+       ctxt->failsafe_callback_cs  = __KERNEL_CS;
+       ctxt->failsafe_callback_eip = (unsigned long)xen_failsafe_callback;
+
+       per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir);
+       ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir));
+
+       if (HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, ctxt))
+               BUG();
+
+       kfree(ctxt);
+       return 0;
+}
+
+int __cpuinit xen_cpu_up(unsigned int cpu)
+{
+       struct task_struct *idle = idle_task(cpu);
+       int rc;
+
+#if 0
+       rc = cpu_up_check(cpu);
+       if (rc)
+               return rc;
+#endif
+
+       init_gdt(cpu);
+       per_cpu(current_task, cpu) = idle;
+       irq_ctx_init(cpu);
+       xen_setup_timer(cpu);
+
+       /* make sure interrupts start blocked */
+       per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1;
+
+       rc = cpu_initialize_context(cpu, idle);
+       if (rc)
+               return rc;
+
+       if (num_online_cpus() == 1)
+               alternatives_smp_switch(1);
+
+       rc = xen_smp_intr_init(cpu);
+       if (rc)
+               return rc;
+
+       smp_store_cpu_info(cpu);
+       set_cpu_sibling_map(cpu);
+       /* This must be done before setting cpu_online_map */
+       wmb();
+
+       cpu_set(cpu, cpu_online_map);
+
+       rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL);
+       BUG_ON(rc);
+
+       return 0;
+}
+
+void xen_smp_cpus_done(unsigned int max_cpus)
+{
+}
+
+static void stop_self(void *v)
+{
+       int cpu = smp_processor_id();
+
+       /* make sure we're not pinning something down */
+       load_cr3(swapper_pg_dir);
+       /* should set up a minimal gdt */
+
+       HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL);
+       BUG();
+}
+
+void xen_smp_send_stop(void)
+{
+       smp_call_function(stop_self, NULL, 0, 0);
+}
+
+void xen_smp_send_reschedule(int cpu)
+{
+       xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR);
+}
+
+
+static void xen_send_IPI_mask(cpumask_t mask, enum ipi_vector vector)
+{
+       unsigned cpu;
+
+       cpus_and(mask, mask, cpu_online_map);
+
+       for_each_cpu_mask(cpu, mask)
+               xen_send_IPI_one(cpu, vector);
+}
+
+static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id)
+{
+       void (*func) (void *info) = call_data->func;
+       void *info = call_data->info;
+       int wait = call_data->wait;
+
+       /*
+        * Notify initiating CPU that I've grabbed the data and am
+        * about to execute the function
+        */
+       mb();
+       atomic_inc(&call_data->started);
+       /*
+        * At this point the info structure may be out of scope unless wait==1
+        */
+       irq_enter();
+       (*func)(info);
+       irq_exit();
+
+       if (wait) {
+               mb();           /* commit everything before setting finished */
+               atomic_inc(&call_data->finished);
+       }
+
+       return IRQ_HANDLED;
+}
+
+int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *),
+                              void *info, int wait)
+{
+       struct call_data_struct data;
+       int cpus;
+
+       /* Holding any lock stops cpus from going down. */
+       spin_lock(&call_lock);
+
+       cpu_clear(smp_processor_id(), mask);
+
+       cpus = cpus_weight(mask);
+       if (!cpus) {
+               spin_unlock(&call_lock);
+               return 0;
+       }
+
+       /* Can deadlock when called with interrupts disabled */
+       WARN_ON(irqs_disabled());
+
+       data.func = func;
+       data.info = info;
+       atomic_set(&data.started, 0);
+       data.wait = wait;
+       if (wait)
+               atomic_set(&data.finished, 0);
+
+       call_data = &data;
+       mb();                   /* write everything before IPI */
+
+       /* Send a message to other CPUs and wait for them to respond */
+       xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR);
+
+       /* Make sure other vcpus get a chance to run.
+          XXX too severe?  Maybe we should check the other CPU's states? */
+       HYPERVISOR_sched_op(SCHEDOP_yield, 0);
+
+       /* Wait for response */
+       while (atomic_read(&data.started) != cpus ||
+              (wait && atomic_read(&data.finished) != cpus))
+               cpu_relax();
+
+       spin_unlock(&call_lock);
+
+       return 0;
+}
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
new file mode 100644 (file)
index 0000000..dfd6db6
--- /dev/null
@@ -0,0 +1,593 @@
+/*
+ * Xen time implementation.
+ *
+ * This is implemented in terms of a clocksource driver which uses
+ * the hypervisor clock as a nanosecond timebase, and a clockevent
+ * driver which uses the hypervisor's timer mechanism.
+ *
+ * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
+ */
+#include <linux/kernel.h>
+#include <linux/interrupt.h>
+#include <linux/clocksource.h>
+#include <linux/clockchips.h>
+#include <linux/kernel_stat.h>
+
+#include <asm/xen/hypervisor.h>
+#include <asm/xen/hypercall.h>
+
+#include <xen/events.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/vcpu.h>
+
+#include "xen-ops.h"
+
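+/* With .mult = 1 << XEN_SHIFT and .shift = XEN_SHIFT, the clocksource's
+   cycle->ns conversion below is an identity: xen_clocksource_read()
+   already returns nanoseconds. */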
+#define XEN_SHIFT 22
+
+/* Xen may fire a timer up to this many ns early */
+#define TIMER_SLOP     100000
+#define NS_PER_TICK    (1000000000LL / HZ)
+
+static cycle_t xen_clocksource_read(void);
+
+/* These are periodically updated in shared_info, and then copied here. */
+struct shadow_time_info {
+       u64 tsc_timestamp;     /* TSC at last update of time vals.  */
+       u64 system_timestamp;  /* Time, in nanosecs, since boot.    */
+       u32 tsc_to_nsec_mul;
+       int tsc_shift;
+       u32 version;
+};
+
+static DEFINE_PER_CPU(struct shadow_time_info, shadow_time);
+
+/* runstate info updated by Xen */
+static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
+
+/* snapshots of runstate info */
+static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate_snapshot);
+
+/* unused ns of stolen and blocked time */
+static DEFINE_PER_CPU(u64, residual_stolen);
+static DEFINE_PER_CPU(u64, residual_blocked);
+
+/* return a consistent snapshot of a 64-bit time/counter value */
+static u64 get64(const u64 *p)
+{
+       u64 ret;
+
+       if (BITS_PER_LONG < 64) {
+               u32 *p32 = (u32 *)p;
+               u32 h, l;
+
+               /*
+                * Read high then low, and then make sure high is
+                * still the same; this will only loop if low wraps
+                * and carries into high.
+                * XXX some clean way to make this endian-proof?
+                */
+               do {
+                       h = p32[1];
+                       barrier();
+                       l = p32[0];
+                       barrier();
+               } while (p32[1] != h);
+
+               ret = (((u64)h) << 32) | l;
+       } else
+               ret = *p;
+
+       return ret;
+}
+
+/*
+ * Runstate accounting
+ */
+static void get_runstate_snapshot(struct vcpu_runstate_info *res)
+{
+       u64 state_time;
+       struct vcpu_runstate_info *state;
+
+       BUG_ON(preemptible());
+
+       state = &__get_cpu_var(runstate);
+
+       /*
+        * The runstate info is always updated by the hypervisor on
+        * the current CPU, so there's no need to use anything
+        * stronger than a compiler barrier when fetching it.
+        */
+       do {
+               state_time = get64(&state->state_entry_time);
+               barrier();
+               *res = *state;
+               barrier();
+       } while (get64(&state->state_entry_time) != state_time);
+}
+
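+/* Register our per-cpu area with Xen so the hypervisor keeps this vcpu's
+   runstate info up to date in it. */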
+static void setup_runstate_info(int cpu)
+{
+       struct vcpu_register_runstate_memory_area area;
+
+       area.addr.v = &per_cpu(runstate, cpu);
+
+       if (HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area,
+                              cpu, &area))
+               BUG();
+}
+
+static void do_stolen_accounting(void)
+{
+       struct vcpu_runstate_info state;
+       struct vcpu_runstate_info *snap;
+       s64 blocked, runnable, offline, stolen;
+       cputime_t ticks;
+
+       get_runstate_snapshot(&state);
+
+       WARN_ON(state.state != RUNSTATE_running);
+
+       snap = &__get_cpu_var(runstate_snapshot);
+
+       /* work out how much time the VCPU spent in states other than
+          running (runnable, blocked, offline) */
+       blocked = state.time[RUNSTATE_blocked] - snap->time[RUNSTATE_blocked];
+       runnable = state.time[RUNSTATE_runnable] - snap->time[RUNSTATE_runnable];
+       offline = state.time[RUNSTATE_offline] - snap->time[RUNSTATE_offline];
+
+       *snap = state;
+
+       /* Add the appropriate number of ticks of stolen time,
+          including any left-overs from last time.  Passing NULL to
+          account_steal_time accounts the time as stolen. */
+       stolen = runnable + offline + __get_cpu_var(residual_stolen);
+
+       if (stolen < 0)
+               stolen = 0;
+
+       ticks = 0;
+       while (stolen >= NS_PER_TICK) {
+               ticks++;
+               stolen -= NS_PER_TICK;
+       }
+       __get_cpu_var(residual_stolen) = stolen;
+       account_steal_time(NULL, ticks);
+
+       /* Add the appropriate number of ticks of blocked time,
+          including any left-overs from last time.  Passing idle to
+          account_steal_time accounts the time as idle/wait. */
+       blocked += __get_cpu_var(residual_blocked);
+
+       if (blocked < 0)
+               blocked = 0;
+
+       ticks = 0;
+       while (blocked >= NS_PER_TICK) {
+               ticks++;
+               blocked -= NS_PER_TICK;
+       }
+       __get_cpu_var(residual_blocked) = blocked;
+       account_steal_time(idle_task(smp_processor_id()), ticks);
+}
+
+/*
+ * Xen sched_clock implementation.  Returns the number of unstolen
+ * nanoseconds, i.e. the nanoseconds the VCPU has spent in the
+ * RUNNING and BLOCKED states.
+ */
+unsigned long long xen_sched_clock(void)
+{
+       struct vcpu_runstate_info state;
+       cycle_t now;
+       u64 ret;
+       s64 offset;
+
+       /*
+        * Ideally sched_clock should be called on a per-cpu basis
+        * anyway, so preempt should already be disabled, but that's
+        * not current practice at the moment.
+        */
+       preempt_disable();
+
+       now = xen_clocksource_read();
+
+       get_runstate_snapshot(&state);
+
+       WARN_ON(state.state != RUNSTATE_running);
+
+       offset = now - state.state_entry_time;
+       if (offset < 0)
+               offset = 0;
+
+       ret = state.time[RUNSTATE_blocked] +
+               state.time[RUNSTATE_running] +
+               offset;
+
+       preempt_enable();
+
+       return ret;
+}
+
+
+/* Get the CPU speed from Xen */
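+/* Xen scales TSC ticks to ns as ((tsc << tsc_shift) * tsc_to_system_mul) >> 32,
+   so khz = ((10^6 << 32) / tsc_to_system_mul), adjusted by tsc_shift below. */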
+unsigned long xen_cpu_khz(void)
+{
+       u64 cpu_khz = 1000000ULL << 32;
+       const struct vcpu_time_info *info =
+               &HYPERVISOR_shared_info->vcpu_info[0].time;
+
+       do_div(cpu_khz, info->tsc_to_system_mul);
+       if (info->tsc_shift < 0)
+               cpu_khz <<= -info->tsc_shift;
+       else
+               cpu_khz >>= info->tsc_shift;
+
+       return cpu_khz;
+}
+
+/*
+ * Reads a consistent set of time-base values from Xen, into a shadow data
+ * area.
+ */
+static unsigned get_time_values_from_xen(void)
+{
+       struct vcpu_time_info   *src;
+       struct shadow_time_info *dst;
+
+       /* src is shared memory with the hypervisor, so we need to
+          make sure we get a consistent snapshot, even in the face of
+          being preempted. */
+       src = &__get_cpu_var(xen_vcpu)->time;
+       dst = &__get_cpu_var(shadow_time);
+
+       do {
+               dst->version = src->version;
+               rmb();          /* fetch version before data */
+               dst->tsc_timestamp     = src->tsc_timestamp;
+               dst->system_timestamp  = src->system_time;
+               dst->tsc_to_nsec_mul   = src->tsc_to_system_mul;
+               dst->tsc_shift         = src->tsc_shift;
+               rmb();          /* test version after fetching data */
+       } while ((src->version & 1) | (dst->version ^ src->version));
+
+       return dst->version;
+}
+
+/*
+ * Scale a 64-bit delta by shifting it and then multiplying by a 32-bit
+ * fraction, yielding a 64-bit result.
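+ *
+ * In effect this computes ((delta << shift) * mul_frac) >> 32, using
+ * only 32x32->64 multiplies on i386 so no 64x64 multiply (or 128-bit
+ * intermediate) is needed.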
+ */
+static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
+{
+       u64 product;
+#ifdef __i386__
+       u32 tmp1, tmp2;
+#endif
+
+       if (shift < 0)
+               delta >>= -shift;
+       else
+               delta <<= shift;
+
+#ifdef __i386__
+       __asm__ (
+               "mul  %5       ; "
+               "mov  %4,%%eax ; "
+               "mov  %%edx,%4 ; "
+               "mul  %5       ; "
+               "xor  %5,%5    ; "
+               "add  %4,%%eax ; "
+               "adc  %5,%%edx ; "
+               : "=A" (product), "=r" (tmp1), "=r" (tmp2)
+               : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
+#elif __x86_64__
+       __asm__ (
+               "mul %%rdx ; shrd $32,%%rdx,%%rax"
+               : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
+#else
+#error implement me!
+#endif
+
+       return product;
+}
+
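+/* Nanoseconds elapsed since the shadow snapshot was taken: the TSC delta
+   scaled by the tsc->ns parameters Xen gave us. */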
+static u64 get_nsec_offset(struct shadow_time_info *shadow)
+{
+       u64 now, delta;
+       now = native_read_tsc();
+       delta = now - shadow->tsc_timestamp;
+       return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
+}
+
+static cycle_t xen_clocksource_read(void)
+{
+       struct shadow_time_info *shadow = &get_cpu_var(shadow_time);
+       cycle_t ret;
+       unsigned version;
+
+       do {
+               version = get_time_values_from_xen();
+               barrier();
+               ret = shadow->system_timestamp + get_nsec_offset(shadow);
+               barrier();
+       } while (version != __get_cpu_var(xen_vcpu)->time.version);
+
+       put_cpu_var(shadow_time);
+
+       return ret;
+}
+
+static void xen_read_wallclock(struct timespec *ts)
+{
+       const struct shared_info *s = HYPERVISOR_shared_info;
+       u32 version;
+       u64 delta;
+       struct timespec now;
+
+       /* get wallclock at system boot */
+       do {
+               version = s->wc_version;
+               rmb();          /* fetch version before time */
+               now.tv_sec  = s->wc_sec;
+               now.tv_nsec = s->wc_nsec;
+               rmb();          /* fetch time before checking version */
+       } while ((s->wc_version & 1) | (version ^ s->wc_version));
+
+       delta = xen_clocksource_read(); /* time since system boot */
+       delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec;
+
+       now.tv_nsec = do_div(delta, NSEC_PER_SEC);
+       now.tv_sec = delta;
+
+       set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
+}
+
+unsigned long xen_get_wallclock(void)
+{
+       struct timespec ts;
+
+       xen_read_wallclock(&ts);
+
+       return ts.tv_sec;
+}
+
+int xen_set_wallclock(unsigned long now)
+{
+       /* do nothing for domU */
+       return -1;
+}
+
+static struct clocksource xen_clocksource __read_mostly = {
+       .name = "xen",
+       .rating = 400,
+       .read = xen_clocksource_read,
+       .mask = ~0,
+       .mult = 1<<XEN_SHIFT,           /* time directly in nanoseconds */
+       .shift = XEN_SHIFT,
+       .flags = CLOCK_SOURCE_IS_CONTINUOUS,
+};
+
+/*
+   Xen clockevent implementation
+
+   Xen has two clockevent implementations:
+
+   The old timer_op one works with all released versions of Xen prior
+   to version 3.0.4.  This version of the hypervisor provides a
+   single-shot timer with nanosecond resolution.  However, a 100Hz
+   tick, delivered while the vcpu is running, shares the same event
+   channel.  We don't care about or use this tick, but it will
+   cause the core time code to think the timer fired too soon, and
+   will end up resetting it each time.  It could be filtered, but
+   doing so has complications when the ktime clocksource is not yet
+   the xen clocksource (ie, at boot time).
+
+   The new vcpu_op-based timer interface allows the tick timer period
+   to be changed or turned off.  The tick timer is not useful as a
+   periodic timer because events are only delivered to running vcpus.
+   The one-shot timer can report when a timeout is in the past, so
+   set_next_event is capable of returning -ETIME when appropriate.
+   This interface is used when available.
+*/
+
+
+/*
+  Convert a delta into an absolute timeout in hypervisor time.  In
+  theory we could maintain an offset between the kernel's time and the
+  hypervisor's time and apply it to kernel absolute timeouts.
+  Unfortunately the hypervisor and kernel times can drift even when the
+  kernel is using the Xen clocksource, because ntp can warp the
+  kernel's clocksource.
+*/
+static s64 get_abs_timeout(unsigned long delta)
+{
+       return xen_clocksource_read() + delta;
+}
+
+static void xen_timerop_set_mode(enum clock_event_mode mode,
+                                struct clock_event_device *evt)
+{
+       switch (mode) {
+       case CLOCK_EVT_MODE_PERIODIC:
+               /* unsupported */
+               WARN_ON(1);
+               break;
+
+       case CLOCK_EVT_MODE_ONESHOT:
+       case CLOCK_EVT_MODE_RESUME:
+               break;
+
+       case CLOCK_EVT_MODE_UNUSED:
+       case CLOCK_EVT_MODE_SHUTDOWN:
+               HYPERVISOR_set_timer_op(0);  /* cancel timeout */
+               break;
+       }
+}
+
+static int xen_timerop_set_next_event(unsigned long delta,
+                                     struct clock_event_device *evt)
+{
+       WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
+
+       if (HYPERVISOR_set_timer_op(get_abs_timeout(delta)) < 0)
+               BUG();
+
+       /* We may have missed the deadline, but there's no real way of
+          knowing for sure.  If the event was in the past, then we'll
+          get an immediate interrupt. */
+
+       return 0;
+}
+
+static const struct clock_event_device xen_timerop_clockevent = {
+       .name = "xen",
+       .features = CLOCK_EVT_FEAT_ONESHOT,
+
+       .max_delta_ns = 0xffffffff,
+       .min_delta_ns = TIMER_SLOP,
+
+       .mult = 1,
+       .shift = 0,
+       .rating = 500,
+
+       .set_mode = xen_timerop_set_mode,
+       .set_next_event = xen_timerop_set_next_event,
+};
+
+
+
+static void xen_vcpuop_set_mode(enum clock_event_mode mode,
+                               struct clock_event_device *evt)
+{
+       int cpu = smp_processor_id();
+
+       switch (mode) {
+       case CLOCK_EVT_MODE_PERIODIC:
+               WARN_ON(1);     /* unsupported */
+               break;
+
+       case CLOCK_EVT_MODE_ONESHOT:
+               if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
+                       BUG();
+               break;
+
+       case CLOCK_EVT_MODE_UNUSED:
+       case CLOCK_EVT_MODE_SHUTDOWN:
+               if (HYPERVISOR_vcpu_op(VCPUOP_stop_singleshot_timer, cpu, NULL) ||
+                   HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
+                       BUG();
+               break;
+       case CLOCK_EVT_MODE_RESUME:
+               break;
+       }
+}
+
+static int xen_vcpuop_set_next_event(unsigned long delta,
+                                    struct clock_event_device *evt)
+{
+       int cpu = smp_processor_id();
+       struct vcpu_set_singleshot_timer single;
+       int ret;
+
+       WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
+
+       single.timeout_abs_ns = get_abs_timeout(delta);
+       single.flags = VCPU_SSHOTTMR_future;
+
+       ret = HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer, cpu, &single);
+
+       BUG_ON(ret != 0 && ret != -ETIME);
+
+       return ret;
+}
+
+static const struct clock_event_device xen_vcpuop_clockevent = {
+       .name = "xen",
+       .features = CLOCK_EVT_FEAT_ONESHOT,
+
+       .max_delta_ns = 0xffffffff,
+       .min_delta_ns = TIMER_SLOP,
+
+       .mult = 1,
+       .shift = 0,
+       .rating = 500,
+
+       .set_mode = xen_vcpuop_set_mode,
+       .set_next_event = xen_vcpuop_set_next_event,
+};
+
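+/* Default to the timer_op interface; xen_time_init() switches this to the
+   vcpuop-based clockevent if the hypervisor supports it. */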
+static const struct clock_event_device *xen_clockevent =
+       &xen_timerop_clockevent;
+static DEFINE_PER_CPU(struct clock_event_device, xen_clock_events);
+
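+/* Per-cpu VIRQ_TIMER handler: run the registered clockevent handler, if
+   any, and account stolen/blocked time since the last tick. */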
+static irqreturn_t xen_timer_interrupt(int irq, void *dev_id)
+{
+       struct clock_event_device *evt = &__get_cpu_var(xen_clock_events);
+       irqreturn_t ret;
+
+       ret = IRQ_NONE;
+       if (evt->event_handler) {
+               evt->event_handler(evt);
+               ret = IRQ_HANDLED;
+       }
+
+       do_stolen_accounting();
+
+       return ret;
+}
+
+void xen_setup_timer(int cpu)
+{
+       const char *name;
+       struct clock_event_device *evt;
+       int irq;
+
+       printk(KERN_INFO "installing Xen timer for CPU %d\n", cpu);
+
+       name = kasprintf(GFP_KERNEL, "timer%d", cpu);
+       if (!name)
+               name = "<timer kasprintf failed>";
+
+       irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt,
+                                     IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
+                                     name, NULL);
+
+       evt = &per_cpu(xen_clock_events, cpu);
+       memcpy(evt, xen_clockevent, sizeof(*evt));
+
+       evt->cpumask = cpumask_of_cpu(cpu);
+       evt->irq = irq;
+
+       setup_runstate_info(cpu);
+}
+
+void xen_setup_cpu_clockevents(void)
+{
+       BUG_ON(preemptible());
+
+       clockevents_register_device(&__get_cpu_var(xen_clock_events));
+}
+
+__init void xen_time_init(void)
+{
+       int cpu = smp_processor_id();
+
+       get_time_values_from_xen();
+
+       clocksource_register(&xen_clocksource);
+
+       if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) {
+               /* Successfully turned off 100Hz tick, so we have the
+                  vcpuop-based timer interface */
+               printk(KERN_DEBUG "Xen: using vcpuop timer interface\n");
+               xen_clockevent = &xen_vcpuop_clockevent;
+       }
+
+       /* Set initial system time with full resolution */
+       xen_read_wallclock(&xtime);
+       set_normalized_timespec(&wall_to_monotonic,
+                               -xtime.tv_sec, -xtime.tv_nsec);
+
+       tsc_disable = 0;
+
+       xen_setup_timer(cpu);
+       xen_setup_cpu_clockevents();
+}
diff --git a/arch/x86/xen/vdso.h b/arch/x86/xen/vdso.h
new file mode 100644 (file)
index 0000000..861fedf
--- /dev/null
@@ -0,0 +1,4 @@
+/* Bit used for the pseudo-hwcap for non-negative segments.  We use
+   bit 1 to avoid bugs in some versions of glibc when bit 0 is
+   used; the choice is otherwise arbitrary. */
+#define VDSO_NOTE_NONEGSEG_BIT 1
diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S
new file mode 100644 (file)
index 0000000..1a43b60
--- /dev/null
@@ -0,0 +1,291 @@
+/*
+       Asm versions of Xen pv-ops, suitable for either direct use or inlining.
+       The inline versions are the same as the direct-use versions, with the
+       pre- and post-amble chopped off.
+
+       This code is encoded for size rather than absolute efficiency,
+       with a view to being able to inline as much as possible.
+
+       We only bother with direct forms (ie, vcpu in pda) of the operations
+       here; the indirect forms are better handled in C, since they're
+       generally too large to inline anyway.
+ */
+
+#include <linux/linkage.h>
+
+#include <asm/asm-offsets.h>
+#include <asm/thread_info.h>
+#include <asm/percpu.h>
+#include <asm/processor-flags.h>
+#include <asm/segment.h>
+
+#include <xen/interface/xen.h>
+
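+/* RELOC() and ENDPATCH() emit <name>_reloc and <name>_end symbols: _end
+   marks the end of the patchable body (excluding the ret), and _reloc the
+   location of any call displacement that must be fixed up when the
+   snippet is copied inline (0 if there is none). */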
+#define RELOC(x, v)    .globl x##_reloc; x##_reloc=v
+#define ENDPATCH(x)    .globl x##_end; x##_end=.
+
+/* Pseudo-flag used for virtual NMI, which we don't implement yet */
+#define XEN_EFLAGS_NMI 0x80000000
+
+/*
+       Enable events.  This clears the event mask and tests the pending
+       event status in a single 'and' instruction.  If there are pending
+       events, then enter the hypervisor to get them handled.
+ */
+ENTRY(xen_irq_enable_direct)
+       /* Clear mask and test pending */
+       andw $0x00ff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending
+       /* Being preempted here doesn't matter, because preemption will
+          itself deal with any pending interrupts.  The pending check may
+          end up being run on the wrong CPU, but that doesn't hurt. */
+       jz 1f
+2:     call check_events
+1:
+ENDPATCH(xen_irq_enable_direct)
+       ret
+       ENDPROC(xen_irq_enable_direct)
+       RELOC(xen_irq_enable_direct, 2b+1)
+
+
+/*
+       Disabling events is simply a matter of making the event mask
+       non-zero.
+ */
+ENTRY(xen_irq_disable_direct)
+       movb $1, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
+ENDPATCH(xen_irq_disable_direct)
+       ret
+       ENDPROC(xen_irq_disable_direct)
+       RELOC(xen_irq_disable_direct, 0)
+
+/*
+       (xen_)save_fl is used to get the current interrupt enable status.
+       Callers expect the status to be in X86_EFLAGS_IF, and other bits
+       may be set in the return value.  We take advantage of this by
+       making sure that X86_EFLAGS_IF has the right value (and other bits
+       in that byte are 0), but other bits in the return value are
+       undefined.  We need to toggle the state of the bit, because
+       Xen and x86 use opposite senses (mask vs enable).
+ */
+ENTRY(xen_save_fl_direct)
+       testb $0xff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
+       setz %ah
+       addb %ah,%ah
+ENDPATCH(xen_save_fl_direct)
+       ret
+       ENDPROC(xen_save_fl_direct)
+       RELOC(xen_save_fl_direct, 0)
+
+
+/*
+       In principle the caller should be passing us a value returned
+       from xen_save_fl_direct, but for robustness' sake we test only
+       the X86_EFLAGS_IF flag rather than the whole byte. After
+       setting the interrupt mask state, it checks for unmasked
+       pending events and enters the hypervisor to get them delivered
+       if so.
+ */
+ENTRY(xen_restore_fl_direct)
+       testb $X86_EFLAGS_IF>>8, %ah
+       setz PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
+       /* Being preempted here doesn't matter, because preemption will
+          itself deal with any pending interrupts.  The pending check may
+          end up being run on the wrong CPU, but that doesn't hurt. */
+
+       /* check for unmasked and pending */
+       cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending
+       jnz 1f
+2:     call check_events
+1:
+ENDPATCH(xen_restore_fl_direct)
+       ret
+       ENDPROC(xen_restore_fl_direct)
+       RELOC(xen_restore_fl_direct, 2b+1)
+
+/*
+       This is run where a normal iret would be run, with the same stack setup:
+             8: eflags
+             4: cs
+       esp-> 0: eip
+
+       This attempts to make sure that any pending events are dealt
+       with on return to usermode, but there is a small window in
+       which an event can happen just before entering usermode.  If
+       the nested interrupt ends up setting one of the TIF_WORK_MASK
+       pending work flags, they will not be tested again before
+       returning to usermode. This means that a process can end up
+       with pending work, which will be unprocessed until the process
+       enters and leaves the kernel again, which could be an
+       unbounded amount of time.  This means that a pending signal or
+       reschedule event could be indefinitely delayed.
+
+       The fix is to notice a nested interrupt in the critical
+       window, and if one occurs, then fold the nested interrupt into
+       the current interrupt stack frame, and re-process it
+       iteratively rather than recursively.  This means that it will
+       exit via the normal path, and all pending work will be dealt
+       with appropriately.
+
+       Because the nested interrupt handler needs to deal with the
+       current stack state in whatever form it's in, we keep things
+       simple by only using a single register which is pushed/popped
+       on the stack.
+
+       Non-direct iret could be done in the same way, but it would
+       require an annoying amount of code duplication.  We'll assume
+       that direct mode will be the common case once the hypervisor
+       support becomes commonplace.
+ */
+ENTRY(xen_iret_direct)
+       /* test eflags for special cases */
+       testl $(X86_EFLAGS_VM | XEN_EFLAGS_NMI), 8(%esp)
+       jnz hyper_iret
+
+       push %eax
+       ESP_OFFSET=4    # bytes pushed onto stack
+
+       /* Store vcpu_info pointer for easy access.  Do it this
+          way to avoid having to reload %fs */
+#ifdef CONFIG_SMP
+       GET_THREAD_INFO(%eax)
+       movl TI_cpu(%eax),%eax
+       movl __per_cpu_offset(,%eax,4),%eax
+       lea per_cpu__xen_vcpu_info(%eax),%eax
+#else
+       movl $per_cpu__xen_vcpu_info, %eax
+#endif
+
+       /* check IF state we're restoring */
+       testb $X86_EFLAGS_IF>>8, 8+1+ESP_OFFSET(%esp)
+
+       /* Maybe enable events.  Once this happens we could get a
+          recursive event, so the critical region starts immediately
+          afterwards.  However, if that happens we don't end up
+          resuming the code, so we don't have to worry about
+          being preempted to another CPU. */
+       setz XEN_vcpu_info_mask(%eax)
+xen_iret_start_crit:
+
+       /* check for unmasked and pending */
+       cmpw $0x0001, XEN_vcpu_info_pending(%eax)
+
+       /* If there's something pending, mask events again so we
+          can jump back into xen_hypervisor_callback */
+       sete XEN_vcpu_info_mask(%eax)
+
+       popl %eax
+
+       /* From this point on the registers are restored and the stack
+          updated, so we don't need to worry about it if we're preempted */
+iret_restore_end:
+
+       /* Jump to hypervisor_callback after fixing up the stack.
+          Events are masked, so jumping out of the critical
+          region is OK. */
+       je xen_hypervisor_callback
+
+       iret
+xen_iret_end_crit:
+
+hyper_iret:
+       /* put this out of line since it's very rarely used */
+       jmp hypercall_page + __HYPERVISOR_iret * 32
+
+       .globl xen_iret_start_crit, xen_iret_end_crit
+
+/*
+   This is called by xen_hypervisor_callback in entry.S when it sees
+   that the EIP at the time of interrupt was between xen_iret_start_crit
+   and xen_iret_end_crit.  We're passed the EIP in %eax so we can do
+   a more refined determination of what to do.
+
+   The stack format at this point is:
+       ----------------
+        ss             : (ss/esp may be present if we came from usermode)
+        esp            :
+        eflags         }  outer exception info
+        cs             }
+        eip            }
+       ---------------- <- edi (copy dest)
+        eax            :  outer eax if it hasn't been restored
+       ----------------
+        eflags         }  nested exception info
+        cs             }   (no ss/esp because we're nested
+        eip            }    from the same ring)
+        orig_eax       }<- esi (copy src)
+        - - - - - - - -
+        fs             }
+        es             }
+        ds             }  SAVE_ALL state
+        eax            }
+         :             :
+        ebx            }
+       ----------------
+        return addr     <- esp
+       ----------------
+
+   In order to deliver the nested exception properly, we need to shift
+   everything from the return addr up to the error code so it
+   sits just under the outer exception info.  This means that when we
+   handle the exception, we do it in the context of the outer exception
+   rather than starting a new one.
+
+   The only caveat is that if the outer eax hasn't been
+   restored yet (ie, it's still on stack), we need to insert
+   its value into the SAVE_ALL state before going on, since
+   it's usermode state which we eventually need to restore.
+ */
+ENTRY(xen_iret_crit_fixup)
+       /* offsets +4 for return address */
+
+       /*
+          Paranoia: Make sure we're really coming from kernel space.
+          One could imagine a case where userspace jumps into the
+          critical range address, but just before the CPU delivers a GP,
+          it decides to deliver an interrupt instead.  Unlikely?
+          Definitely.  Easy to avoid?  Yes.  The Intel documents
+          explicitly say that the reported EIP for a bad jump is the
+          jump instruction itself, not the destination, but some virtual
+          environments get this wrong.
+        */
+       movl PT_CS+4(%esp), %ecx
+       andl $SEGMENT_RPL_MASK, %ecx
+       cmpl $USER_RPL, %ecx
+       je 2f
+
+       lea PT_ORIG_EAX+4(%esp), %esi
+       lea PT_EFLAGS+4(%esp), %edi
+
+       /* If eip is before iret_restore_end then stack
+          hasn't been restored yet. */
+       cmp $iret_restore_end, %eax
+       jae 1f
+
+       movl 0+4(%edi),%eax             /* copy EAX */
+       movl %eax, PT_EAX+4(%esp)
+
+       lea ESP_OFFSET(%edi),%edi       /* move dest up over saved regs */
+
+       /* set up the copy */
+1:     std
+       mov $(PT_EIP+4) / 4, %ecx       /* copy ret+saved regs up to orig_eax */
+       rep movsl
+       cld
+
+       lea 4(%edi),%esp                /* point esp to new frame */
+2:     ret
+
+
+/*
+       Force an event check by making a hypercall,
+       but preserve regs before making the call.
+ */
+check_events:
+       push %eax
+       push %ecx
+       push %edx
+       call force_evtchn_callback
+       pop %edx
+       pop %ecx
+       pop %eax
+       ret
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
new file mode 100644 (file)
index 0000000..f8d6937
--- /dev/null
@@ -0,0 +1,38 @@
+/* Xen-specific pieces of head.S, intended to be included in the right
+   place in head.S */
+
+#ifdef CONFIG_XEN
+
+#include <linux/elfnote.h>
+#include <asm/boot.h>
+#include <xen/interface/elfnote.h>
+
+.pushsection .init.text
+ENTRY(startup_xen)
+       movl %esi,xen_start_info
+       cld
+       movl $(init_thread_union+THREAD_SIZE),%esp
+       jmp xen_start_kernel
+.popsection
+
+.pushsection .bss.page_aligned
+       .align PAGE_SIZE_asm
+ENTRY(hypercall_page)
+       .skip 0x1000
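+       /* Xen fills this page with hypercall stubs when the guest is
+          loaded, using the XEN_ELFNOTE_HYPERCALL_PAGE note below to
+          locate it. */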
+.popsection
+
+       ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS,       .asciz "linux")
+       ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION,  .asciz "2.6")
+       ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION,    .asciz "xen-3.0")
+       ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE,      .long  __PAGE_OFFSET)
+       ELFNOTE(Xen, XEN_ELFNOTE_ENTRY,          .long  startup_xen)
+       ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long  hypercall_page)
+       ELFNOTE(Xen, XEN_ELFNOTE_FEATURES,       .asciz "!writable_page_tables|pae_pgdir_above_4gb")
+#ifdef CONFIG_X86_PAE
+       ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE,       .asciz "yes")
+#else
+       ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE,       .asciz "no")
+#endif
+       ELFNOTE(Xen, XEN_ELFNOTE_LOADER,         .asciz "generic")
+
+#endif /*CONFIG_XEN */
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
new file mode 100644 (file)
index 0000000..b9aaea4
--- /dev/null
@@ -0,0 +1,71 @@
+#ifndef XEN_OPS_H
+#define XEN_OPS_H
+
+#include <linux/init.h>
+
+/* These are code, but not functions.  Defined in entry.S */
+extern const char xen_hypervisor_callback[];
+extern const char xen_failsafe_callback[];
+
+void xen_copy_trap_info(struct trap_info *traps);
+
+DECLARE_PER_CPU(struct vcpu_info *, xen_vcpu);
+DECLARE_PER_CPU(unsigned long, xen_cr3);
+
+extern struct start_info *xen_start_info;
+extern struct shared_info *HYPERVISOR_shared_info;
+
+char * __init xen_memory_setup(void);
+void __init xen_arch_setup(void);
+void __init xen_init_IRQ(void);
+
+void xen_setup_timer(int cpu);
+void xen_setup_cpu_clockevents(void);
+unsigned long xen_cpu_khz(void);
+void __init xen_time_init(void);
+unsigned long xen_get_wallclock(void);
+int xen_set_wallclock(unsigned long time);
+unsigned long long xen_sched_clock(void);
+
+void xen_mark_init_mm_pinned(void);
+
+DECLARE_PER_CPU(enum paravirt_lazy_mode, xen_lazy_mode);
+
+static inline unsigned xen_get_lazy_mode(void)
+{
+       return x86_read_percpu(xen_lazy_mode);
+}
+
+void __init xen_fill_possible_map(void);
+
+void __init xen_setup_vcpu_info_placement(void);
+void xen_smp_prepare_boot_cpu(void);
+void xen_smp_prepare_cpus(unsigned int max_cpus);
+int xen_cpu_up(unsigned int cpu);
+void xen_smp_cpus_done(unsigned int max_cpus);
+
+void xen_smp_send_stop(void);
+void xen_smp_send_reschedule(int cpu);
+int xen_smp_call_function (void (*func) (void *info), void *info, int nonatomic,
+                          int wait);
+int xen_smp_call_function_single(int cpu, void (*func) (void *info), void *info,
+                                int nonatomic, int wait);
+
+int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *),
+                              void *info, int wait);
+
+
+/* Declare an asm function, along with symbols needed to make it
+   inlineable */
+#define DECL_ASM(ret, name, ...)               \
+       ret name(__VA_ARGS__);                  \
+       extern char name##_end[];               \
+       extern char name##_reloc[]              \
+
+DECL_ASM(void, xen_irq_enable_direct, void);
+DECL_ASM(void, xen_irq_disable_direct, void);
+DECL_ASM(unsigned long, xen_save_fl_direct, void);
+DECL_ASM(void, xen_restore_fl_direct, unsigned long);
+
+void xen_iret_direct(void);
+#endif /* XEN_OPS_H */