Merge branch 'linus' into x86/x2apic
authorIngo Molnar <mingo@elte.hu>
Fri, 18 Jul 2008 20:50:34 +0000 (22:50 +0200)
committerIngo Molnar <mingo@elte.hu>
Fri, 18 Jul 2008 20:50:34 +0000 (22:50 +0200)
17 files changed:
1  2 
Documentation/kernel-parameters.txt
arch/x86/Kconfig
arch/x86/kernel/Makefile
arch/x86/kernel/acpi/boot.c
arch/x86/kernel/apic_32.c
arch/x86/kernel/cpu/common_64.c
arch/x86/kernel/io_apic_32.c
arch/x86/kernel/io_apic_64.c
arch/x86/kernel/setup.c
arch/x86/kernel/smpboot.c
arch/x86/xen/enlighten.c
drivers/pci/Makefile
include/asm-x86/genapic_64.h
include/asm-x86/hw_irq.h
include/asm-x86/smp.h
include/linux/irq.h
kernel/irq/manage.c

index 88b08acf2ff20ce7e874eaa0d52fc8f5500d7feb,09ad7450647bc81dff32a3eaf7ea3c0858f4a896..556b4187d016bb7eba08de99d4ffa173f44313b1
@@@ -147,10 -147,14 +147,14 @@@ and is between 256 and 4096 characters
                        default: 0
  
        acpi_sleep=     [HW,ACPI] Sleep options
-                       Format: { s3_bios, s3_mode, s3_beep }
+                       Format: { s3_bios, s3_mode, s3_beep, old_ordering }
                        See Documentation/power/video.txt for s3_bios and s3_mode.
                        s3_beep is for debugging; it makes the PC's speaker beep
                        as soon as the kernel's real-mode entry point is called.
+                       old_ordering causes the ACPI 1.0 ordering of the _PTS
+                       control method, wrt putting devices into low power
+                       states, to be enforced (the ACPI 2.0 ordering of _PTS is
+                       used by default).
  
        acpi_sci=       [HW,ACPI] ACPI System Control Interrupt trigger mode
                        Format: { level | edge | high | low }
  
        debug_objects   [KNL] Enable object debugging
  
+       debugpat        [X86] Enable PAT debugging
        decnet.addr=    [HW,NET]
                        Format: <area>[,<node>]
                        See also Documentation/networking/decnet.txt.
        hd=             [EIDE] (E)IDE hard drive subsystem geometry
                        Format: <cyl>,<head>,<sect>
  
-       hd?=            [HW] (E)IDE subsystem
-       hd?lun=         See Documentation/ide/ide.txt.
        highmem=nn[KMG] [KNL,BOOT] forces the highmem zone to have an exact
                        size of <nn>. This works even on boxes that have no
                        highmem otherwise. This also works to reduce highmem
                        See Documentation/ide/ide.txt.
  
        idle=           [X86]
-                       Format: idle=poll or idle=mwait
+                       Format: idle=poll or idle=mwait, idle=halt, idle=nomwait
                        Poll forces a polling idle loop that can slightly improves the performance
                        of waking up a idle CPU, but will use a lot of power and make the system
                        run hot. Not recommended.
                        to not use it because it doesn't save as much power as a normal idle
                        loop use the MONITOR/MWAIT idle loop anyways. Performance should be the same
                        as idle=poll.
+                       idle=halt. Halt is forced to be used for CPU idle.
+                       In such case C2/C3 won't be used again.
+                       idle=nomwait. Disable mwait for CPU C-states
  
        ide-pci-generic.all-generic-ide [HW] (E)IDE subsystem
                        Claim all unknown PCI IDE storage controllers.
        mtdparts=       [MTD]
                        See drivers/mtd/cmdlinepart.c.
  
+       mtdset=         [ARM]
+                       ARM/S3C2412 JIVE boot control
+                       See arch/arm/mach-s3c2412/mach-jive.c
        mtouchusb.raw_coordinates=
                        [HW] Make the MicroTouch USB driver use raw coordinates
                        ('y', default) or cooked coordinates ('n')
  
        nolapic_timer   [X86-32,APIC] Do not use the local APIC timer.
  
 +      nox2apic        [X86-64,APIC] Do not enable x2APIC mode.
 +
 +      x2apic_phys     [X86-64,APIC] Use x2apic physical mode instead of
 +                      default x2apic cluster mode on platforms
 +                      supporting x2apic.
 +
        noltlbs         [PPC] Do not use large page/tlb entries for kernel
                        lowmem mapping on PPC40x.
  
                                Use with caution as certain devices share
                                address decoders between ROMs and other
                                resources.
+               norom           [X86-32,X86_64] Do not assign address space to
+                               expansion ROMs that do not already have
+                               BIOS assigned address ranges.
                irqmask=0xMMMM  [X86-32] Set a bit mask of IRQs allowed to be
                                assigned automatically to PCI devices. You can
                                make the kernel exclude IRQs of your ISA cards
                        Format: { parport<nr> | timid | 0 }
                        See also Documentation/parport.txt.
  
+       pmtmr=          [X86] Manual setup of pmtmr I/O Port. 
+                       Override pmtimer IOPort with a hex value.
+                       e.g. pmtmr=0x508
        pnpacpi=        [ACPI]
                        { off }
  
diff --combined arch/x86/Kconfig
index 0bf8391a7f2daaebf074eed219584c97b47c6f1b,96e0c2ebc3885713a5d6290f5e8eb959d0d0d36e..baca5545500548d4f1e65c7083d9f5e0b9f89c4c
@@@ -23,6 -23,8 +23,8 @@@ config X8
        select HAVE_OPROFILE
        select HAVE_KPROBES
        select HAVE_KRETPROBES
+       select HAVE_DYNAMIC_FTRACE
+       select HAVE_FTRACE
        select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64)
        select HAVE_ARCH_KGDB if !X86_VOYAGER
  
@@@ -168,6 -170,7 +170,7 @@@ config GENERIC_PENDING_IR
  config X86_SMP
        bool
        depends on SMP && ((X86_32 && !X86_VOYAGER) || X86_64)
+       select USE_GENERIC_SMP_HELPERS
        default y
  
  config X86_32_SMP
@@@ -445,7 -448,6 +448,6 @@@ config PARAVIRT_DEBU
  config MEMTEST
        bool "Memtest"
        depends on X86_64
-       default y
        help
          This option adds a kernel parameter 'memtest', which allows memtest
          to be set.
                memtest=1, mean do 1 test pattern;
                ...
                memtest=4, mean do 4 test patterns.
-         If you are unsure how to answer this question, answer Y.
+         If you are unsure how to answer this question, answer N.
  
  config X86_SUMMIT_NUMA
        def_bool y
@@@ -1133,21 -1135,18 +1135,18 @@@ config MTR
          See <file:Documentation/mtrr.txt> for more information.
  
  config MTRR_SANITIZER
-       def_bool y
+       bool
        prompt "MTRR cleanup support"
        depends on MTRR
        help
-         Convert MTRR layout from continuous to discrete, so some X driver
-         could add WB entries.
+         Convert MTRR layout from continuous to discrete, so X drivers can
+         add writeback entries.
  
-         Say N here if you see bootup problems (boot crash, boot hang,
-         spontaneous reboots).
+         Can be disabled with disable_mtrr_cleanup on the kernel command line.
+         The largest mtrr entry size for a continous block can be set with
+         mtrr_chunk_size.
  
-         Could be disabled with disable_mtrr_cleanup. Also mtrr_chunk_size
-         could be used to send largest mtrr entry size for continuous block
-         to hold holes (aka. UC entries)
-         If unsure, say Y.
+         If unsure, say N.
  
  config MTRR_SANITIZER_ENABLE_DEFAULT
        int "MTRR cleanup enable value (0-1)"
@@@ -1164,7 -1163,7 +1163,7 @@@ config MTRR_SANITIZER_SPARE_REG_NR_DEFA
        depends on MTRR_SANITIZER
        help
          mtrr cleanup spare entries default, it can be changed via
-         mtrr_spare_reg_nr=
+         mtrr_spare_reg_nr=N on the kernel command line.
  
  config X86_PAT
        bool
@@@ -1651,14 -1650,6 +1650,14 @@@ config DMAR_FLOPPY_W
         workaround will setup a 1:1 mapping for the first
         16M to make floppy (an ISA device) work.
  
 +config INTR_REMAP
 +      bool "Support for Interrupt Remapping (EXPERIMENTAL)"
 +      depends on X86_64 && X86_IO_APIC && PCI_MSI && ACPI && EXPERIMENTAL
 +      help
 +       Supports Interrupt remapping for IO-APIC and MSI devices.
 +       To use x2apic mode in the CPU's which support x2APIC enhancements or
 +       to support platforms with CPU's having > 8 bit APIC ID, say Y.
 +
  source "drivers/pci/pcie/Kconfig"
  
  source "drivers/pci/Kconfig"
diff --combined arch/x86/kernel/Makefile
index 81280e93e792db2596f88aabc1ecdbb230fa3859,da140611bb57593ed401a5de6ee63a84cc408349..673f1d12b420bfc85a52e42258d44365d6c45fa9
@@@ -6,6 -6,12 +6,12 @@@ extra-y                := head_$(BITS).
  
  CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE)
  
+ ifdef CONFIG_FTRACE
+ # Do not profile debug utilities
+ CFLAGS_REMOVE_tsc.o = -pg
+ CFLAGS_REMOVE_rtc.o = -pg
+ endif
  #
  # vsyscalls (which work on the user stack) should have
  # no stack-protector checks:
@@@ -57,6 -63,7 +63,7 @@@ obj-$(CONFIG_X86_MPPARSE)     += mpparse.
  obj-$(CONFIG_X86_LOCAL_APIC)  += apic_$(BITS).o nmi.o
  obj-$(CONFIG_X86_IO_APIC)     += io_apic_$(BITS).o
  obj-$(CONFIG_X86_REBOOTFIXUPS)        += reboot_fixups_32.o
+ obj-$(CONFIG_DYNAMIC_FTRACE)  += ftrace.o
  obj-$(CONFIG_KEXEC)           += machine_kexec_$(BITS).o
  obj-$(CONFIG_KEXEC)           += relocate_kernel_$(BITS).o crash.o
  obj-$(CONFIG_CRASH_DUMP)      += crash_dump_$(BITS).o
@@@ -95,8 -102,6 +102,8 @@@ obj-$(CONFIG_OLPC)          += olpc.
  # 64 bit specific files
  ifeq ($(CONFIG_X86_64),y)
          obj-y                         += genapic_64.o genapic_flat_64.o genx2apic_uv_x.o tlb_uv.o
 +        obj-y                         += genx2apic_cluster.o
 +        obj-y                         += genx2apic_phys.o
          obj-$(CONFIG_X86_PM_TIMER)    += pmtimer_64.o
          obj-$(CONFIG_AUDIT)           += audit_64.o
  
index b314bcd084067ef4a968f2cf3ac240023a8d8cc4,f489d7a9be92ffd00c662c4b18554637e4311406..b41b27af33e6cb9cf31e8196397f2ab9f17fc1f4
@@@ -761,7 -761,7 +761,7 @@@ static void __init acpi_register_lapic_
  
        set_fixmap_nocache(FIX_APIC_BASE, address);
        if (boot_cpu_physical_apicid == -1U) {
 -              boot_cpu_physical_apicid  = GET_APIC_ID(read_apic_id());
 +              boot_cpu_physical_apicid  = read_apic_id();
  #ifdef CONFIG_X86_32
                apic_version[boot_cpu_physical_apicid] =
                         GET_APIC_VERSION(apic_read(APIC_LVR));
@@@ -1337,9 -1337,7 +1337,9 @@@ static void __init acpi_process_madt(vo
                                acpi_ioapic = 1;
  
                                smp_found_config = 1;
 +#ifdef CONFIG_X86_32
                                setup_apic_routing();
 +#endif
                        }
                }
                if (error == -EINVAL) {
@@@ -1411,7 -1409,6 +1411,6 @@@ static int __init dmi_ignore_irq0_timer
  {
        pr_notice("%s detected: Ignoring BIOS IRQ0 pin2 override\n", d->ident);
        acpi_skip_timer_override = 1;
-       force_mask_ioapic_irq_2();
        return 0;
  }
  
index cb54d9e20f94682f1a5153ac0c8371f857c250a8,a437d027f20b6d8d7ba3dc88400220e796afe41e..34101962fb0ed02828acfdfe3d18a29eddd135e2
@@@ -145,18 -145,13 +145,18 @@@ static int modern_apic(void
        return lapic_get_version() >= 0x14;
  }
  
 -void apic_wait_icr_idle(void)
 +/*
 + * Paravirt kernels also might be using these below ops. So we still
 + * use generic apic_read()/apic_write(), which might be pointing to different
 + * ops in PARAVIRT case.
 + */
 +void xapic_wait_icr_idle(void)
  {
        while (apic_read(APIC_ICR) & APIC_ICR_BUSY)
                cpu_relax();
  }
  
 -u32 safe_apic_wait_icr_idle(void)
 +u32 safe_xapic_wait_icr_idle(void)
  {
        u32 send_status;
        int timeout;
        return send_status;
  }
  
 +void xapic_icr_write(u32 low, u32 id)
 +{
 +      apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(id));
 +      apic_write_around(APIC_ICR, low);
 +}
 +
 +u64 xapic_icr_read(void)
 +{
 +      u32 icr1, icr2;
 +
 +      icr2 = apic_read(APIC_ICR2);
 +      icr1 = apic_read(APIC_ICR);
 +
 +      return icr1 | ((u64)icr2 << 32);
 +}
 +
 +static struct apic_ops xapic_ops = {
 +      .read = native_apic_mem_read,
 +      .write = native_apic_mem_write,
 +      .write_atomic = native_apic_mem_write_atomic,
 +      .icr_read = xapic_icr_read,
 +      .icr_write = xapic_icr_write,
 +      .wait_icr_idle = xapic_wait_icr_idle,
 +      .safe_wait_icr_idle = safe_xapic_wait_icr_idle,
 +};
 +
 +struct apic_ops __read_mostly *apic_ops = &xapic_ops;
 +EXPORT_SYMBOL_GPL(apic_ops);
 +
  /**
   * enable_NMI_through_LVT0 - enable NMI through local vector table 0
   */
@@@ -1235,7 -1201,7 +1235,7 @@@ void __init init_apic_mappings(void
         * default configuration (or the MP table is broken).
         */
        if (boot_cpu_physical_apicid == -1U)
 -              boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id());
 +              boot_cpu_physical_apicid = read_apic_id();
  
  }
  
@@@ -1275,7 -1241,7 +1275,7 @@@ int __init APIC_init_uniprocessor(void
         * might be zero if read from MP tables. Get it from LAPIC.
         */
  #ifdef CONFIG_CRASH_DUMP
 -      boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id());
 +      boot_cpu_physical_apicid = read_apic_id();
  #endif
        physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map);
  
@@@ -1374,6 -1340,10 +1374,10 @@@ void __init smp_intr_init(void
  
        /* IPI for generic function call */
        alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
+       /* IPI for single call function */
+       set_intr_gate(CALL_FUNCTION_SINGLE_VECTOR,
+                               call_function_single_interrupt);
  }
  #endif
  
index e7bf3c2dc5fe60a717b0ce0e991bf3d550c55698,7b8cc72feb40e3ed8bfd02437fe3b6324b024f67..c6bee77ca9e6797fbd87f292aefb2f9dc4c0c892
@@@ -98,7 -98,7 +98,7 @@@ int __cpuinit get_model_name(struct cpu
  
  void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
  {
-       unsigned int n, dummy, eax, ebx, ecx, edx;
+       unsigned int n, dummy, ebx, ecx, edx;
  
        n = c->extended_cpuid_level;
  
                printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
                c->x86_cache_size, ecx & 0xFF);
        }
-       if (n >= 0x80000008) {
-               cpuid(0x80000008, &eax, &dummy, &dummy, &dummy);
-               c->x86_virt_bits = (eax >> 8) & 0xff;
-               c->x86_phys_bits = eax & 0xff;
-       }
  }
  
  void __cpuinit detect_ht(struct cpuinfo_x86 *c)
@@@ -314,6 -309,13 +309,13 @@@ static void __cpuinit early_identify_cp
        if (c->extended_cpuid_level >= 0x80000007)
                c->x86_power = cpuid_edx(0x80000007);
  
+       if (c->extended_cpuid_level >= 0x80000008) {
+               u32 eax = cpuid_eax(0x80000008);
+               c->x86_virt_bits = (eax >> 8) & 0xff;
+               c->x86_phys_bits = eax & 0xff;
+       }
        /* Assume all 64-bit CPUs support 32-bit syscall */
        set_cpu_cap(c, X86_FEATURE_SYSCALL32);
  
@@@ -606,8 -608,6 +608,8 @@@ void __cpuinit cpu_init(void
        barrier();
  
        check_efer();
 +      if (cpu != 0 && x2apic)
 +              enable_x2apic();
  
        /*
         * set up and load the per-CPU TSS
index 382208d11f8dc7629c3e9df944cb98c111b86486,558abf4c796afa0d7dd7ad2622e3bd42f28e8d39..a82065b0699ec167cc8b75c72e58ffa65f41b94d
@@@ -59,13 -59,6 +59,6 @@@ static struct { int pin, apic; } ioapic
  static DEFINE_SPINLOCK(ioapic_lock);
  static DEFINE_SPINLOCK(vector_lock);
  
- static bool mask_ioapic_irq_2 __initdata;
- void __init force_mask_ioapic_irq_2(void)
- {
-       mask_ioapic_irq_2 = true;
- }
  int timer_through_8259 __initdata;
  
  /*
@@@ -1501,7 -1494,7 +1494,7 @@@ void /*__init*/ print_local_APIC(void *
                smp_processor_id(), hard_smp_processor_id());
        v = apic_read(APIC_ID);
        printk(KERN_INFO "... APIC ID:      %08x (%01x)\n", v,
 -                      GET_APIC_ID(read_apic_id()));
 +                      GET_APIC_ID(v));
        v = apic_read(APIC_LVR);
        printk(KERN_INFO "... APIC VERSION: %08x\n", v);
        ver = GET_APIC_VERSION(v);
  
  void print_all_local_APICs(void)
  {
-       on_each_cpu(print_local_APIC, NULL, 1, 1);
+       on_each_cpu(print_local_APIC, NULL, 1);
  }
  
  void /*__init*/ print_PIC(void)
@@@ -1709,7 -1702,8 +1702,7 @@@ void disable_IO_APIC(void
                entry.dest_mode       = 0; /* Physical */
                entry.delivery_mode   = dest_ExtINT; /* ExtInt */
                entry.vector          = 0;
 -              entry.dest.physical.physical_dest =
 -                                      GET_APIC_ID(read_apic_id());
 +              entry.dest.physical.physical_dest = read_apic_id();
  
                /*
                 * Add it to the IO-APIC irq-routing table:
@@@ -2186,9 -2180,6 +2179,6 @@@ static inline void __init check_timer(v
        printk(KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
                vector, apic1, pin1, apic2, pin2);
  
-       if (mask_ioapic_irq_2)
-               mask_IO_APIC_irq(2);
        /*
         * Some BIOS writers are clueless and report the ExtINTA
         * I/O APIC input from the cascaded 8259A as the timer
index 2db0f98e2af5456104137efc9a49f3d7322939f1,6510cde36b3549149eabefa4aaf724c72e5a0959..39f0be37e9a16b7f0b10df3af743b6338f0532fa
@@@ -37,7 -37,6 +37,7 @@@
  #include <acpi/acpi_bus.h>
  #endif
  #include <linux/bootmem.h>
 +#include <linux/dmar.h>
  
  #include <asm/idle.h>
  #include <asm/io.h>
@@@ -49,7 -48,6 +49,7 @@@
  #include <asm/nmi.h>
  #include <asm/msidef.h>
  #include <asm/hypertransport.h>
 +#include <asm/irq_remapping.h>
  
  #include <mach_ipi.h>
  #include <mach_apic.h>
@@@ -96,13 -94,6 +96,6 @@@ static int no_timer_check
  
  static int disable_timer_pin_1 __initdata;
  
- static bool mask_ioapic_irq_2 __initdata;
- void __init force_mask_ioapic_irq_2(void)
- {
-       mask_ioapic_irq_2 = true;
- }
  int timer_through_8259 __initdata;
  
  /* Where if anywhere is the i8259 connect in external int mode */
@@@ -116,9 -107,6 +109,9 @@@ DEFINE_SPINLOCK(vector_lock)
   */
  int nr_ioapic_registers[MAX_IO_APICS];
  
 +/* I/O APIC RTE contents at the OS boot up */
 +struct IO_APIC_route_entry *early_ioapic_entries[MAX_IO_APICS];
 +
  /* I/O APIC entries */
  struct mp_config_ioapic mp_ioapics[MAX_IO_APICS];
  int nr_ioapics;
@@@ -314,12 -302,7 +307,12 @@@ static void __target_IO_APIC_irq(unsign
                pin = entry->pin;
                if (pin == -1)
                        break;
 -              io_apic_write(apic, 0x11 + pin*2, dest);
 +              /*
 +               * With interrupt-remapping, destination information comes
 +               * from interrupt-remapping table entry.
 +               */
 +              if (!irq_remapped(irq))
 +                      io_apic_write(apic, 0x11 + pin*2, dest);
                reg = io_apic_read(apic, 0x10 + pin*2);
                reg &= ~IO_APIC_REDIR_VECTOR_MASK;
                reg |= vector;
@@@ -456,69 -439,6 +449,69 @@@ static void clear_IO_APIC (void
                        clear_IO_APIC_pin(apic, pin);
  }
  
 +/*
 + * Saves and masks all the unmasked IO-APIC RTE's
 + */
 +int save_mask_IO_APIC_setup(void)
 +{
 +      union IO_APIC_reg_01 reg_01;
 +      unsigned long flags;
 +      int apic, pin;
 +
 +      /*
 +       * The number of IO-APIC IRQ registers (== #pins):
 +       */
 +      for (apic = 0; apic < nr_ioapics; apic++) {
 +              spin_lock_irqsave(&ioapic_lock, flags);
 +              reg_01.raw = io_apic_read(apic, 1);
 +              spin_unlock_irqrestore(&ioapic_lock, flags);
 +              nr_ioapic_registers[apic] = reg_01.bits.entries+1;
 +      }
 +
 +      for (apic = 0; apic < nr_ioapics; apic++) {
 +              early_ioapic_entries[apic] =
 +                      kzalloc(sizeof(struct IO_APIC_route_entry) *
 +                              nr_ioapic_registers[apic], GFP_KERNEL);
 +              if (!early_ioapic_entries[apic])
 +                      return -ENOMEM;
 +      }
 +
 +      for (apic = 0; apic < nr_ioapics; apic++)
 +              for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
 +                      struct IO_APIC_route_entry entry;
 +
 +                      entry = early_ioapic_entries[apic][pin] =
 +                              ioapic_read_entry(apic, pin);
 +                      if (!entry.mask) {
 +                              entry.mask = 1;
 +                              ioapic_write_entry(apic, pin, entry);
 +                      }
 +              }
 +      return 0;
 +}
 +
 +void restore_IO_APIC_setup(void)
 +{
 +      int apic, pin;
 +
 +      for (apic = 0; apic < nr_ioapics; apic++)
 +              for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
 +                      ioapic_write_entry(apic, pin,
 +                                         early_ioapic_entries[apic][pin]);
 +}
 +
 +void reinit_intr_remapped_IO_APIC(int intr_remapping)
 +{
 +      /*
 +       * for now plain restore of previous settings.
 +       * TBD: In the case of OS enabling interrupt-remapping,
 +       * IO-APIC RTE's need to be setup to point to interrupt-remapping
 +       * table entries. for now, do a plain restore, and wait for
 +       * the setup_IO_APIC_irqs() to do proper initialization.
 +       */
 +      restore_IO_APIC_setup();
 +}
 +
  int skip_ioapic_setup;
  int ioapic_force;
  
@@@ -913,98 -833,18 +906,98 @@@ void setup_vector_irq(int cpu
  
  
  static struct irq_chip ioapic_chip;
 +#ifdef CONFIG_INTR_REMAP
 +static struct irq_chip ir_ioapic_chip;
 +#endif
  
  static void ioapic_register_intr(int irq, unsigned long trigger)
  {
 -      if (trigger) {
 +      if (trigger)
                irq_desc[irq].status |= IRQ_LEVEL;
 -              set_irq_chip_and_handler_name(irq, &ioapic_chip,
 -                                            handle_fasteoi_irq, "fasteoi");
 -      } else {
 +      else
                irq_desc[irq].status &= ~IRQ_LEVEL;
 +
 +#ifdef CONFIG_INTR_REMAP
 +      if (irq_remapped(irq)) {
 +              irq_desc[irq].status |= IRQ_MOVE_PCNTXT;
 +              if (trigger)
 +                      set_irq_chip_and_handler_name(irq, &ir_ioapic_chip,
 +                                                    handle_fasteoi_irq,
 +                                                   "fasteoi");
 +              else
 +                      set_irq_chip_and_handler_name(irq, &ir_ioapic_chip,
 +                                                    handle_edge_irq, "edge");
 +              return;
 +      }
 +#endif
 +      if (trigger)
 +              set_irq_chip_and_handler_name(irq, &ioapic_chip,
 +                                            handle_fasteoi_irq,
 +                                            "fasteoi");
 +      else
                set_irq_chip_and_handler_name(irq, &ioapic_chip,
                                              handle_edge_irq, "edge");
 +}
 +
 +static int setup_ioapic_entry(int apic, int irq,
 +                            struct IO_APIC_route_entry *entry,
 +                            unsigned int destination, int trigger,
 +                            int polarity, int vector)
 +{
 +      /*
 +       * add it to the IO-APIC irq-routing table:
 +       */
 +      memset(entry,0,sizeof(*entry));
 +
 +#ifdef CONFIG_INTR_REMAP
 +      if (intr_remapping_enabled) {
 +              struct intel_iommu *iommu = map_ioapic_to_ir(apic);
 +              struct irte irte;
 +              struct IR_IO_APIC_route_entry *ir_entry =
 +                      (struct IR_IO_APIC_route_entry *) entry;
 +              int index;
 +
 +              if (!iommu)
 +                      panic("No mapping iommu for ioapic %d\n", apic);
 +
 +              index = alloc_irte(iommu, irq, 1);
 +              if (index < 0)
 +                      panic("Failed to allocate IRTE for ioapic %d\n", apic);
 +
 +              memset(&irte, 0, sizeof(irte));
 +
 +              irte.present = 1;
 +              irte.dst_mode = INT_DEST_MODE;
 +              irte.trigger_mode = trigger;
 +              irte.dlvry_mode = INT_DELIVERY_MODE;
 +              irte.vector = vector;
 +              irte.dest_id = IRTE_DEST(destination);
 +
 +              modify_irte(irq, &irte);
 +
 +              ir_entry->index2 = (index >> 15) & 0x1;
 +              ir_entry->zero = 0;
 +              ir_entry->format = 1;
 +              ir_entry->index = (index & 0x7fff);
 +      } else
 +#endif
 +      {
 +              entry->delivery_mode = INT_DELIVERY_MODE;
 +              entry->dest_mode = INT_DEST_MODE;
 +              entry->dest = destination;
        }
 +
 +      entry->mask = 0;                                /* enable IRQ */
 +      entry->trigger = trigger;
 +      entry->polarity = polarity;
 +      entry->vector = vector;
 +
 +      /* Mask level triggered irqs.
 +       * Use IRQ_DELAYED_DISABLE for edge triggered irqs.
 +       */
 +      if (trigger)
 +              entry->mask = 1;
 +      return 0;
  }
  
  static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq,
                    apic, mp_ioapics[apic].mp_apicid, pin, cfg->vector,
                    irq, trigger, polarity);
  
 -      /*
 -       * add it to the IO-APIC irq-routing table:
 -       */
 -      memset(&entry,0,sizeof(entry));
 -
 -      entry.delivery_mode = INT_DELIVERY_MODE;
 -      entry.dest_mode = INT_DEST_MODE;
 -      entry.dest = cpu_mask_to_apicid(mask);
 -      entry.mask = 0;                         /* enable IRQ */
 -      entry.trigger = trigger;
 -      entry.polarity = polarity;
 -      entry.vector = cfg->vector;
  
 -      /* Mask level triggered irqs.
 -       * Use IRQ_DELAYED_DISABLE for edge triggered irqs.
 -       */
 -      if (trigger)
 -              entry.mask = 1;
 +      if (setup_ioapic_entry(mp_ioapics[apic].mp_apicid, irq, &entry,
 +                             cpu_mask_to_apicid(mask), trigger, polarity,
 +                             cfg->vector)) {
 +              printk("Failed to setup ioapic entry for ioapic  %d, pin %d\n",
 +                     mp_ioapics[apic].mp_apicid, pin);
 +              __clear_irq_vector(irq);
 +              return;
 +      }
  
        ioapic_register_intr(irq, trigger);
        if (irq < 16)
@@@ -1089,9 -938,6 +1082,9 @@@ static void __init setup_timer_IRQ0_pin
  {
        struct IO_APIC_route_entry entry;
  
 +      if (intr_remapping_enabled)
 +              return;
 +
        memset(&entry, 0, sizeof(entry));
  
        /*
@@@ -1238,7 -1084,6 +1231,7 @@@ static __apicdebuginit void print_APIC_
  void __apicdebuginit print_local_APIC(void * dummy)
  {
        unsigned int v, ver, maxlvt;
 +      unsigned long icr;
  
        if (apic_verbosity == APIC_QUIET)
                return;
        printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
                smp_processor_id(), hard_smp_processor_id());
        v = apic_read(APIC_ID);
 -      printk(KERN_INFO "... APIC ID:      %08x (%01x)\n", v, GET_APIC_ID(read_apic_id()));
 +      printk(KERN_INFO "... APIC ID:      %08x (%01x)\n", v, read_apic_id());
        v = apic_read(APIC_LVR);
        printk(KERN_INFO "... APIC VERSION: %08x\n", v);
        ver = GET_APIC_VERSION(v);
        v = apic_read(APIC_ESR);
        printk(KERN_DEBUG "... APIC ESR: %08x\n", v);
  
 -      v = apic_read(APIC_ICR);
 -      printk(KERN_DEBUG "... APIC ICR: %08x\n", v);
 -      v = apic_read(APIC_ICR2);
 -      printk(KERN_DEBUG "... APIC ICR2: %08x\n", v);
 +      icr = apic_icr_read();
 +      printk(KERN_DEBUG "... APIC ICR: %08x\n", icr);
 +      printk(KERN_DEBUG "... APIC ICR2: %08x\n", icr >> 32);
  
        v = apic_read(APIC_LVTT);
        printk(KERN_DEBUG "... APIC LVTT: %08x\n", v);
  
  void print_all_local_APICs (void)
  {
-       on_each_cpu(print_local_APIC, NULL, 1, 1);
+       on_each_cpu(print_local_APIC, NULL, 1);
  }
  
  void __apicdebuginit print_PIC(void)
@@@ -1439,7 -1285,7 +1432,7 @@@ void disable_IO_APIC(void
                entry.dest_mode       = 0; /* Physical */
                entry.delivery_mode   = dest_ExtINT; /* ExtInt */
                entry.vector          = 0;
 -              entry.dest          = GET_APIC_ID(read_apic_id());
 +              entry.dest            = read_apic_id();
  
                /*
                 * Add it to the IO-APIC irq-routing table:
@@@ -1547,147 -1393,6 +1540,147 @@@ static int ioapic_retrigger_irq(unsigne
   */
  
  #ifdef CONFIG_SMP
 +
 +#ifdef CONFIG_INTR_REMAP
 +static void ir_irq_migration(struct work_struct *work);
 +
 +static DECLARE_DELAYED_WORK(ir_migration_work, ir_irq_migration);
 +
 +/*
 + * Migrate the IO-APIC irq in the presence of intr-remapping.
 + *
 + * For edge triggered, irq migration is a simple atomic update(of vector
 + * and cpu destination) of IRTE and flush the hardware cache.
 + *
 + * For level triggered, we need to modify the io-apic RTE aswell with the update
 + * vector information, along with modifying IRTE with vector and destination.
 + * So irq migration for level triggered is little  bit more complex compared to
 + * edge triggered migration. But the good news is, we use the same algorithm
 + * for level triggered migration as we have today, only difference being,
 + * we now initiate the irq migration from process context instead of the
 + * interrupt context.
 + *
 + * In future, when we do a directed EOI (combined with cpu EOI broadcast
 + * suppression) to the IO-APIC, level triggered irq migration will also be
 + * as simple as edge triggered migration and we can do the irq migration
 + * with a simple atomic update to IO-APIC RTE.
 + */
 +static void migrate_ioapic_irq(int irq, cpumask_t mask)
 +{
 +      struct irq_cfg *cfg = irq_cfg + irq;
 +      struct irq_desc *desc = irq_desc + irq;
 +      cpumask_t tmp, cleanup_mask;
 +      struct irte irte;
 +      int modify_ioapic_rte = desc->status & IRQ_LEVEL;
 +      unsigned int dest;
 +      unsigned long flags;
 +
 +      cpus_and(tmp, mask, cpu_online_map);
 +      if (cpus_empty(tmp))
 +              return;
 +
 +      if (get_irte(irq, &irte))
 +              return;
 +
 +      if (assign_irq_vector(irq, mask))
 +              return;
 +
 +      cpus_and(tmp, cfg->domain, mask);
 +      dest = cpu_mask_to_apicid(tmp);
 +
 +      if (modify_ioapic_rte) {
 +              spin_lock_irqsave(&ioapic_lock, flags);
 +              __target_IO_APIC_irq(irq, dest, cfg->vector);
 +              spin_unlock_irqrestore(&ioapic_lock, flags);
 +      }
 +
 +      irte.vector = cfg->vector;
 +      irte.dest_id = IRTE_DEST(dest);
 +
 +      /*
 +       * Modified the IRTE and flushes the Interrupt entry cache.
 +       */
 +      modify_irte(irq, &irte);
 +
 +      if (cfg->move_in_progress) {
 +              cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map);
 +              cfg->move_cleanup_count = cpus_weight(cleanup_mask);
 +              send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
 +              cfg->move_in_progress = 0;
 +      }
 +
 +      irq_desc[irq].affinity = mask;
 +}
 +
 +static int migrate_irq_remapped_level(int irq)
 +{
 +      int ret = -1;
 +
 +      mask_IO_APIC_irq(irq);
 +
 +      if (io_apic_level_ack_pending(irq)) {
 +              /*
 +               * Interrupt in progress. Migrating irq now will change the
 +               * vector information in the IO-APIC RTE and that will confuse
 +               * the EOI broadcast performed by cpu.
 +               * So, delay the irq migration to the next instance.
 +               */
 +              schedule_delayed_work(&ir_migration_work, 1);
 +              goto unmask;
 +      }
 +
 +      /* everthing is clear. we have right of way */
 +      migrate_ioapic_irq(irq, irq_desc[irq].pending_mask);
 +
 +      ret = 0;
 +      irq_desc[irq].status &= ~IRQ_MOVE_PENDING;
 +      cpus_clear(irq_desc[irq].pending_mask);
 +
 +unmask:
 +      unmask_IO_APIC_irq(irq);
 +      return ret;
 +}
 +
 +static void ir_irq_migration(struct work_struct *work)
 +{
 +      int irq;
 +
 +      for (irq = 0; irq < NR_IRQS; irq++) {
 +              struct irq_desc *desc = irq_desc + irq;
 +              if (desc->status & IRQ_MOVE_PENDING) {
 +                      unsigned long flags;
 +
 +                      spin_lock_irqsave(&desc->lock, flags);
 +                      if (!desc->chip->set_affinity ||
 +                          !(desc->status & IRQ_MOVE_PENDING)) {
 +                              desc->status &= ~IRQ_MOVE_PENDING;
 +                              spin_unlock_irqrestore(&desc->lock, flags);
 +                              continue;
 +                      }
 +
 +                      desc->chip->set_affinity(irq,
 +                                               irq_desc[irq].pending_mask);
 +                      spin_unlock_irqrestore(&desc->lock, flags);
 +              }
 +      }
 +}
 +
 +/*
 + * Migrates the IRQ destination in the process context.
 + */
 +static void set_ir_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
 +{
 +      if (irq_desc[irq].status & IRQ_LEVEL) {
 +              irq_desc[irq].status |= IRQ_MOVE_PENDING;
 +              irq_desc[irq].pending_mask = mask;
 +              migrate_irq_remapped_level(irq);
 +              return;
 +      }
 +
 +      migrate_ioapic_irq(irq, mask);
 +}
 +#endif
 +
  asmlinkage void smp_irq_move_cleanup_interrupt(void)
  {
        unsigned vector, me;
@@@ -1744,17 -1449,6 +1737,17 @@@ static void irq_complete_move(unsigned 
  #else
  static inline void irq_complete_move(unsigned int irq) {}
  #endif
 +#ifdef CONFIG_INTR_REMAP
 +static void ack_x2apic_level(unsigned int irq)
 +{
 +      ack_x2APIC_irq();
 +}
 +
 +static void ack_x2apic_edge(unsigned int irq)
 +{
 +      ack_x2APIC_irq();
 +}
 +#endif
  
  static void ack_apic_edge(unsigned int irq)
  {
@@@ -1829,21 -1523,6 +1822,21 @@@ static struct irq_chip ioapic_chip __re
        .retrigger      = ioapic_retrigger_irq,
  };
  
 +#ifdef CONFIG_INTR_REMAP
 +static struct irq_chip ir_ioapic_chip __read_mostly = {
 +      .name           = "IR-IO-APIC",
 +      .startup        = startup_ioapic_irq,
 +      .mask           = mask_IO_APIC_irq,
 +      .unmask         = unmask_IO_APIC_irq,
 +      .ack            = ack_x2apic_edge,
 +      .eoi            = ack_x2apic_level,
 +#ifdef CONFIG_SMP
 +      .set_affinity   = set_ir_ioapic_affinity_irq,
 +#endif
 +      .retrigger      = ioapic_retrigger_irq,
 +};
 +#endif
 +
  static inline void init_IO_APIC_traps(void)
  {
        int irq;
@@@ -2020,9 -1699,6 +2013,6 @@@ static inline void __init check_timer(v
        apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
                cfg->vector, apic1, pin1, apic2, pin2);
  
-       if (mask_ioapic_irq_2)
-               mask_IO_APIC_irq(2);
        /*
         * Some BIOS writers are clueless and report the ExtINTA
         * I/O APIC input from the cascaded 8259A as the timer
         * 8259A.
         */
        if (pin1 == -1) {
 +              if (intr_remapping_enabled)
 +                      panic("BIOS bug: timer not connected to IO-APIC");
                pin1 = pin2;
                apic1 = apic2;
                no_pin1 = 1;
                                clear_IO_APIC_pin(0, pin1);
                        goto out;
                }
 +              if (intr_remapping_enabled)
 +                      panic("timer doesn't work through Interrupt-remapped IO-APIC");
                clear_IO_APIC_pin(apic1, pin1);
                if (!no_pin1)
                        apic_printk(APIC_QUIET,KERN_ERR "..MP-BIOS bug: "
@@@ -2297,9 -1969,6 +2287,9 @@@ void destroy_irq(unsigned int irq
  
        dynamic_irq_cleanup(irq);
  
 +#ifdef CONFIG_INTR_REMAP
 +      free_irte(irq);
 +#endif
        spin_lock_irqsave(&vector_lock, flags);
        __clear_irq_vector(irq);
        spin_unlock_irqrestore(&vector_lock, flags);
@@@ -2318,41 -1987,10 +2308,41 @@@ static int msi_compose_msg(struct pci_d
  
        tmp = TARGET_CPUS;
        err = assign_irq_vector(irq, tmp);
 -      if (!err) {
 -              cpus_and(tmp, cfg->domain, tmp);
 -              dest = cpu_mask_to_apicid(tmp);
 +      if (err)
 +              return err;
 +
 +      cpus_and(tmp, cfg->domain, tmp);
 +      dest = cpu_mask_to_apicid(tmp);
 +
 +#ifdef CONFIG_INTR_REMAP
 +      if (irq_remapped(irq)) {
 +              struct irte irte;
 +              int ir_index;
 +              u16 sub_handle;
 +
 +              ir_index = map_irq_to_irte_handle(irq, &sub_handle);
 +              BUG_ON(ir_index == -1);
 +
 +              memset (&irte, 0, sizeof(irte));
 +
 +              irte.present = 1;
 +              irte.dst_mode = INT_DEST_MODE;
 +              irte.trigger_mode = 0; /* edge */
 +              irte.dlvry_mode = INT_DELIVERY_MODE;
 +              irte.vector = cfg->vector;
 +              irte.dest_id = IRTE_DEST(dest);
 +
 +              modify_irte(irq, &irte);
  
 +              msg->address_hi = MSI_ADDR_BASE_HI;
 +              msg->data = sub_handle;
 +              msg->address_lo = MSI_ADDR_BASE_LO | MSI_ADDR_IR_EXT_INT |
 +                                MSI_ADDR_IR_SHV |
 +                                MSI_ADDR_IR_INDEX1(ir_index) |
 +                                MSI_ADDR_IR_INDEX2(ir_index);
 +      } else
 +#endif
 +      {
                msg->address_hi = MSI_ADDR_BASE_HI;
                msg->address_lo =
                        MSI_ADDR_BASE_LO |
@@@ -2403,55 -2041,6 +2393,55 @@@ static void set_msi_irq_affinity(unsign
        write_msi_msg(irq, &msg);
        irq_desc[irq].affinity = mask;
  }
 +
 +#ifdef CONFIG_INTR_REMAP
 +/*
 + * Migrate the MSI irq to another cpumask. This migration is
 + * done in the process context using interrupt-remapping hardware.
 + */
 +static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
 +{
 +      struct irq_cfg *cfg = irq_cfg + irq;
 +      unsigned int dest;
 +      cpumask_t tmp, cleanup_mask;
 +      struct irte irte;
 +
 +      cpus_and(tmp, mask, cpu_online_map);
 +      if (cpus_empty(tmp))
 +              return;
 +
 +      if (get_irte(irq, &irte))
 +              return;
 +
 +      if (assign_irq_vector(irq, mask))
 +              return;
 +
 +      cpus_and(tmp, cfg->domain, mask);
 +      dest = cpu_mask_to_apicid(tmp);
 +
 +      irte.vector = cfg->vector;
 +      irte.dest_id = IRTE_DEST(dest);
 +
 +      /*
 +       * atomically update the IRTE with the new destination and vector.
 +       */
 +      modify_irte(irq, &irte);
 +
 +      /*
 +       * After this point, all the interrupts will start arriving
 +       * at the new destination. So, time to cleanup the previous
 +       * vector allocation.
 +       */
 +      if (cfg->move_in_progress) {
 +              cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map);
 +              cfg->move_cleanup_count = cpus_weight(cleanup_mask);
 +              send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
 +              cfg->move_in_progress = 0;
 +      }
 +
 +      irq_desc[irq].affinity = mask;
 +}
 +#endif
  #endif /* CONFIG_SMP */
  
  /*
@@@ -2469,157 -2058,26 +2459,157 @@@ static struct irq_chip msi_chip = 
        .retrigger      = ioapic_retrigger_irq,
  };
  
 -int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
 +#ifdef CONFIG_INTR_REMAP
 +static struct irq_chip msi_ir_chip = {
 +      .name           = "IR-PCI-MSI",
 +      .unmask         = unmask_msi_irq,
 +      .mask           = mask_msi_irq,
 +      .ack            = ack_x2apic_edge,
 +#ifdef CONFIG_SMP
 +      .set_affinity   = ir_set_msi_irq_affinity,
 +#endif
 +      .retrigger      = ioapic_retrigger_irq,
 +};
 +
 +/*
 + * Map the PCI dev to the corresponding remapping hardware unit
 + * and allocate 'nvec' consecutive interrupt-remapping table entries
 + * in it.
 + */
 +static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec)
  {
 +      struct intel_iommu *iommu;
 +      int index;
 +
 +      iommu = map_dev_to_ir(dev);
 +      if (!iommu) {
 +              printk(KERN_ERR
 +                     "Unable to map PCI %s to iommu\n", pci_name(dev));
 +              return -ENOENT;
 +      }
 +
 +      index = alloc_irte(iommu, irq, nvec);
 +      if (index < 0) {
 +              printk(KERN_ERR
 +                     "Unable to allocate %d IRTE for PCI %s\n", nvec,
 +                      pci_name(dev));
 +              return -ENOSPC;
 +      }
 +      return index;
 +}
 +#endif
 +
 +static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq)
 +{
 +      int ret;
        struct msi_msg msg;
 +
 +      ret = msi_compose_msg(dev, irq, &msg);
 +      if (ret < 0)
 +              return ret;
 +
 +      set_irq_msi(irq, desc);
 +      write_msi_msg(irq, &msg);
 +
 +#ifdef CONFIG_INTR_REMAP
 +      if (irq_remapped(irq)) {
 +              struct irq_desc *desc = irq_desc + irq;
 +              /*
 +               * irq migration in process context
 +               */
 +              desc->status |= IRQ_MOVE_PCNTXT;
 +              set_irq_chip_and_handler_name(irq, &msi_ir_chip, handle_edge_irq, "edge");
 +      } else
 +#endif
 +              set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge");
 +
 +      return 0;
 +}
 +
 +int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
 +{
        int irq, ret;
 +
        irq = create_irq();
        if (irq < 0)
                return irq;
  
 -      ret = msi_compose_msg(dev, irq, &msg);
 +#ifdef CONFIG_INTR_REMAP
 +      if (!intr_remapping_enabled)
 +              goto no_ir;
 +
 +      ret = msi_alloc_irte(dev, irq, 1);
 +      if (ret < 0)
 +              goto error;
 +no_ir:
 +#endif
 +      ret = setup_msi_irq(dev, desc, irq);
        if (ret < 0) {
                destroy_irq(irq);
                return ret;
        }
 +      return 0;
  
 -      set_irq_msi(irq, desc);
 -      write_msi_msg(irq, &msg);
 +#ifdef CONFIG_INTR_REMAP
 +error:
 +      destroy_irq(irq);
 +      return ret;
 +#endif
 +}
  
 -      set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge");
 +int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 +{
 +      int irq, ret, sub_handle;
 +      struct msi_desc *desc;
 +#ifdef CONFIG_INTR_REMAP
 +      struct intel_iommu *iommu = 0;
 +      int index = 0;
 +#endif
 +
 +      sub_handle = 0;
 +      list_for_each_entry(desc, &dev->msi_list, list) {
 +              irq = create_irq();
 +              if (irq < 0)
 +                      return irq;
 +#ifdef CONFIG_INTR_REMAP
 +              if (!intr_remapping_enabled)
 +                      goto no_ir;
  
 +              if (!sub_handle) {
 +                      /*
 +                       * allocate the consecutive block of IRTEs
 +                       * for 'nvec'
 +                       */
 +                      index = msi_alloc_irte(dev, irq, nvec);
 +                      if (index < 0) {
 +                              ret = index;
 +                              goto error;
 +                      }
 +              } else {
 +                      iommu = map_dev_to_ir(dev);
 +                      if (!iommu) {
 +                              ret = -ENOENT;
 +                              goto error;
 +                      }
 +                      /*
 +                       * setup the mapping between the irq and the IRTE
 +                       * base index, the sub_handle pointing to the
 +                       * appropriate interrupt remap table entry.
 +                       */
 +                      set_irte_irq(irq, iommu, index, sub_handle);
 +              }
 +no_ir:
 +#endif
 +              ret = setup_msi_irq(dev, desc, irq);
 +              if (ret < 0)
 +                      goto error;
 +              sub_handle++;
 +      }
        return 0;
 +
 +error:
 +      destroy_irq(irq);
 +      return ret;
  }
  
  void arch_teardown_msi_irq(unsigned int irq)
@@@ -2867,10 -2325,6 +2857,10 @@@ void __init setup_ioapic_dest(void
                                setup_IO_APIC_irq(ioapic, pin, irq,
                                                  irq_trigger(irq_entry),
                                                  irq_polarity(irq_entry));
 +#ifdef CONFIG_INTR_REMAP
 +                      else if (intr_remapping_enabled)
 +                              set_ir_ioapic_affinity_irq(irq, TARGET_CPUS);
 +#endif
                        else
                                set_ioapic_affinity_irq(irq, TARGET_CPUS);
                }
diff --combined arch/x86/kernel/setup.c
index 2e78a143dec3ee60a4e490eca593c694cc28720e,531b55b8e81a1de1827eac5691d5f8aef1d8d10c..6121ffd46b9eb587ebbd7e380eadb29f698ec638
@@@ -684,6 -684,11 +684,11 @@@ void __init setup_arch(char **cmdline_p
                clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
        }
  
+ #ifdef CONFIG_PCI
+       if (pci_early_dump_regs)
+               early_dump_pci_devices();
+ #endif
        finish_e820_parsing();
  
  #ifdef CONFIG_X86_32
        num_physpages = max_pfn;
  
        check_efer();
 +      if (cpu_has_x2apic)
 +              check_x2apic();
  
        /* How many end-of-memory variables you have, grandma! */
        /* need this before calling reserve_initrd */
        init_cpu_to_node();
  #endif
  
+ #ifdef CONFIG_X86_NUMAQ
+       /*
+        * need to check online nodes num, call it
+        * here before time_init/tsc_init
+        */
+       numaq_tsc_disable();
+ #endif
        init_apic_mappings();
        ioapic_init_mappings();
  
index 6cd002f3e20efcc5b1d34dcff0f34f5edd24cf6f,687376ab07e82ece4ab1eeb609d738d76245d226..23c3b3d1f4ccf55fcd7c4aa6c087b88565a3331d
@@@ -123,6 -123,7 +123,6 @@@ EXPORT_PER_CPU_SYMBOL(cpu_info)
  
  static atomic_t init_deasserted;
  
 -static int boot_cpu_logical_apicid;
  
  /* representing cpus for which sibling maps can be computed */
  static cpumask_t cpu_sibling_setup_map;
@@@ -164,8 -165,6 +164,8 @@@ static void unmap_cpu_to_node(int cpu
  #endif
  
  #ifdef CONFIG_X86_32
 +static int boot_cpu_logical_apicid;
 +
  u8 cpu_2_logical_apicid[NR_CPUS] __read_mostly =
                                        { [0 ... NR_CPUS-1] = BAD_APICID };
  
@@@ -211,7 -210,7 +211,7 @@@ static void __cpuinit smp_callin(void
        /*
         * (This works even if the APIC is not enabled.)
         */
 -      phys_id = GET_APIC_ID(read_apic_id());
 +      phys_id = read_apic_id();
        cpuid = smp_processor_id();
        if (cpu_isset(cpuid, cpu_callin_map)) {
                panic("%s: phys CPU#%d, CPU#%d already present??\n", __func__,
@@@ -328,12 -327,12 +328,12 @@@ static void __cpuinit start_secondary(v
         * lock helps us to not include this cpu in a currently in progress
         * smp_call_function().
         */
-       lock_ipi_call_lock();
+       ipi_call_lock_irq();
  #ifdef CONFIG_X86_IO_APIC
        setup_vector_irq(smp_processor_id());
  #endif
        cpu_set(smp_processor_id(), cpu_online_map);
-       unlock_ipi_call_lock();
+       ipi_call_unlock_irq();
        per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
  
        setup_secondary_clock();
@@@ -547,7 -546,8 +547,7 @@@ static inline void __inquire_remote_api
                        printk(KERN_CONT
                               "a previous APIC delivery may have failed\n");
  
 -              apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(apicid));
 -              apic_write_around(APIC_ICR, APIC_DM_REMRD | regs[i]);
 +              apic_icr_write(APIC_DM_REMRD | regs[i], apicid);
  
                timeout = 0;
                do {
@@@ -579,9 -579,11 +579,9 @@@ wakeup_secondary_cpu(int logical_apicid
        int maxlvt;
  
        /* Target chip */
 -      apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(logical_apicid));
 -
        /* Boot on the stack */
        /* Kick the second */
 -      apic_write_around(APIC_ICR, APIC_DM_NMI | APIC_DEST_LOGICAL);
 +      apic_icr_write(APIC_DM_NMI | APIC_DEST_LOGICAL, logical_apicid);
  
        Dprintk("Waiting for send to finish...\n");
        send_status = safe_apic_wait_icr_idle();
@@@ -637,11 -639,13 +637,11 @@@ wakeup_secondary_cpu(int phys_apicid, u
        /*
         * Turn INIT on target chip
         */
 -      apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
 -
        /*
         * Send IPI
         */
 -      apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_INT_ASSERT
 -                              | APIC_DM_INIT);
 +      apic_icr_write(APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT,
 +                     phys_apicid);
  
        Dprintk("Waiting for send to finish...\n");
        send_status = safe_apic_wait_icr_idle();
        Dprintk("Deasserting INIT.\n");
  
        /* Target chip */
 -      apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
 -
        /* Send IPI */
 -      apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT);
 +      apic_icr_write(APIC_INT_LEVELTRIG | APIC_DM_INIT, phys_apicid);
  
        Dprintk("Waiting for send to finish...\n");
        send_status = safe_apic_wait_icr_idle();
                 */
  
                /* Target chip */
 -              apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
 -
                /* Boot on the stack */
                /* Kick the second */
 -              apic_write_around(APIC_ICR, APIC_DM_STARTUP
 -                                      | (start_eip >> 12));
 +              apic_icr_write(APIC_DM_STARTUP | (start_eip >> 12),
 +                             phys_apicid);
  
                /*
                 * Give the other CPU some time to accept the IPI.
@@@ -1139,17 -1147,10 +1139,17 @@@ void __init native_smp_prepare_cpus(uns
         * Setup boot CPU information
         */
        smp_store_cpu_info(0); /* Final full version of the data */
 +#ifdef CONFIG_X86_32
        boot_cpu_logical_apicid = logical_smp_processor_id();
 +#endif
        current_thread_info()->cpu = 0;  /* needed? */
        set_cpu_sibling_map(0);
  
 +#ifdef CONFIG_X86_64
 +      enable_IR_x2apic();
 +      setup_apic_routing();
 +#endif
 +
        if (smp_sanity_check(max_cpus) < 0) {
                printk(KERN_INFO "SMP disabled\n");
                disable_smp();
        }
  
        preempt_disable();
 -      if (GET_APIC_ID(read_apic_id()) != boot_cpu_physical_apicid) {
 +      if (read_apic_id() != boot_cpu_physical_apicid) {
                panic("Boot APIC ID in local APIC unexpected (%d vs %d)",
 -                   GET_APIC_ID(read_apic_id()), boot_cpu_physical_apicid);
 +                   read_apic_id(), boot_cpu_physical_apicid);
                /* Or can we switch back to PIC here? */
        }
        preempt_enable();
diff --combined arch/x86/xen/enlighten.c
index d11dda7ebd7a874c1d8742220e950a309d97f5a3,bb508456ef523e1fa50f77a2993bf481b06f03f0..402f3e2c7bee727ebdea7ce8d03309b708cdda43
@@@ -548,48 -548,16 +548,48 @@@ static void xen_io_delay(void
  }
  
  #ifdef CONFIG_X86_LOCAL_APIC
 -static u32 xen_apic_read(unsigned long reg)
 +static u32 xen_apic_read(u32 reg)
  {
        return 0;
  }
  
 -static void xen_apic_write(unsigned long reg, u32 val)
 +static void xen_apic_write(u32 reg, u32 val)
  {
        /* Warn to see if there's any stray references */
        WARN_ON(1);
  }
 +
 +static u64 xen_apic_icr_read(void)
 +{
 +      return 0;
 +}
 +
 +static void xen_apic_icr_write(u32 low, u32 id)
 +{
 +      /* Warn to see if there's any stray references */
 +      WARN_ON(1);
 +}
 +
 +static void xen_apic_wait_icr_idle(void)
 +{
 +        return;
 +}
 +
 +static u32 xen_safe_apic_wait_icr_idle(void)
 +{
 +        return 0;
 +}
 +
 +static struct apic_ops xen_basic_apic_ops = {
 +      .read = xen_apic_read,
 +      .write = xen_apic_write,
 +      .write_atomic = xen_apic_write,
 +      .icr_read = xen_apic_icr_read,
 +      .icr_write = xen_apic_icr_write,
 +      .wait_icr_idle = xen_apic_wait_icr_idle,
 +      .safe_wait_icr_idle = xen_safe_apic_wait_icr_idle,
 +};
 +
  #endif
  
  static void xen_flush_tlb(void)
@@@ -1162,6 -1130,9 +1162,6 @@@ static const struct pv_irq_ops xen_irq_
  
  static const struct pv_apic_ops xen_apic_ops __initdata = {
  #ifdef CONFIG_X86_LOCAL_APIC
 -      .apic_write = xen_apic_write,
 -      .apic_write_atomic = xen_apic_write,
 -      .apic_read = xen_apic_read,
        .setup_boot_clock = paravirt_nop,
        .setup_secondary_clock = paravirt_nop,
        .startup_ipi_hook = paravirt_nop,
@@@ -1243,7 -1214,9 +1243,9 @@@ static const struct smp_ops xen_smp_op
  
        .smp_send_stop = xen_smp_send_stop,
        .smp_send_reschedule = xen_smp_send_reschedule,
-       .smp_call_function_mask = xen_smp_call_function_mask,
+       .send_call_func_ipi = xen_smp_send_call_function_ipi,
+       .send_call_func_single_ipi = xen_smp_send_call_function_single_ipi,
  };
  #endif        /* CONFIG_SMP */
  
@@@ -1321,13 -1294,6 +1323,13 @@@ asmlinkage void __init xen_start_kernel
        pv_apic_ops = xen_apic_ops;
        pv_mmu_ops = xen_mmu_ops;
  
 +#ifdef CONFIG_X86_LOCAL_APIC
 +      /*
 +       * set up the basic apic ops.
 +       */
 +      apic_ops = &xen_basic_apic_ops;
 +#endif
 +
        if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) {
                pv_mmu_ops.ptep_modify_prot_start = xen_ptep_modify_prot_start;
                pv_mmu_ops.ptep_modify_prot_commit = xen_ptep_modify_prot_commit;
diff --combined drivers/pci/Makefile
index 1c409c7b0d567b5bf909a60de6d8db9161b0fd74,7d63f8ced24b12568fc49ebfa06fd588e49b6fc9..4b47f4ece5b70904c9ef68c06f468489ec70939a
@@@ -2,7 -2,7 +2,7 @@@
  # Makefile for the PCI bus specific drivers.
  #
  
- obj-y         += access.o bus.o probe.o remove.o pci.o quirks.o \
+ obj-y         += access.o bus.o probe.o remove.o pci.o quirks.o slot.o \
                        pci-driver.o search.o pci-sysfs.o rom.o setup-res.o
  obj-$(CONFIG_PROC_FS) += proc.o
  
@@@ -26,8 -26,6 +26,8 @@@ obj-$(CONFIG_HT_IRQ) += htirq.
  # Build Intel IOMMU support
  obj-$(CONFIG_DMAR) += dmar.o iova.o intel-iommu.o
  
 +obj-$(CONFIG_INTR_REMAP) += dmar.o intr_remapping.o
 +
  #
  # Some architectures use the generic PCI setup functions
  #
index 8ff2589da93bb00a58adede7b36cd7d17c3f46eb,0f8504627c41ce8c6d9f168e8ff038a8d15c3d12..2871b3fccb21333590c601e0a312cbd67c859be8
@@@ -24,24 -24,17 +24,24 @@@ struct genapic 
        void (*send_IPI_mask)(cpumask_t mask, int vector);
        void (*send_IPI_allbutself)(int vector);
        void (*send_IPI_all)(int vector);
 +      void (*send_IPI_self)(int vector);
        /* */
        unsigned int (*cpu_mask_to_apicid)(cpumask_t cpumask);
        unsigned int (*phys_pkg_id)(int index_msb);
 +      unsigned int (*get_apic_id)(unsigned long x);
 +      unsigned long (*set_apic_id)(unsigned int id);
 +      unsigned long apic_id_mask;
  };
  
  extern struct genapic *genapic;
  
  extern struct genapic apic_flat;
  extern struct genapic apic_physflat;
 +extern struct genapic apic_x2apic_cluster;
 +extern struct genapic apic_x2apic_phys;
  extern int acpi_madt_oem_check(char *, char *);
  
 +extern void apic_send_IPI_self(int vector);
  enum uv_system_type {UV_NONE, UV_LEGACY_APIC, UV_X2APIC, UV_NON_UNIQUE_APIC};
  extern enum uv_system_type get_uv_system_type(void);
  extern int is_uv_system(void);
@@@ -53,10 -46,4 +53,4 @@@ extern int uv_wakeup_secondary(int phys
  
  extern void setup_apic_routing(void);
  
- #ifdef CONFIG_X86_IO_APIC
- extern void force_mask_ioapic_irq_2(void);
- #else
- static inline void force_mask_ioapic_irq_2(void) { }
- #endif
  #endif
diff --combined include/asm-x86/hw_irq.h
index 2ae47e7c106378e0f74db849bc751be302986ac0,77ba51df56680fcd9e28b4529eb647c0eea07292..ef7a995ee81fe320a650bd32cddcd5e522d581c5
@@@ -48,6 -48,7 +48,7 @@@ extern void irq_move_cleanup_interrupt(
  extern void threshold_interrupt(void);
  
  extern void call_function_interrupt(void);
+ extern void call_function_single_interrupt(void);
  
  /* PIC specific functions */
  extern void disable_8259A_irq(unsigned int irq);
@@@ -72,9 -73,7 +73,9 @@@ extern void enable_IO_APIC(void)
  #endif
  
  /* IPI functions */
 +#ifdef CONFIG_X86_32
  extern void send_IPI_self(int vector);
 +#endif
  extern void send_IPI(int dest, int vector);
  
  /* Statistics */
diff --combined include/asm-x86/smp.h
index 3b43ca202c3b8fee3d0189e66b43a74c4ef62639,c2784b3e0b77e23269a1c61407aaeb2db39cdc9d..1896cdb0076a4f90391c40fac796f826bf5896dc
@@@ -50,9 -50,9 +50,9 @@@ struct smp_ops 
  
        void (*smp_send_stop)(void);
        void (*smp_send_reschedule)(int cpu);
-       int (*smp_call_function_mask)(cpumask_t mask,
-                                     void (*func)(void *info), void *info,
-                                     int wait);
+       void (*send_call_func_ipi)(cpumask_t mask);
+       void (*send_call_func_single_ipi)(int cpu);
  };
  
  /* Globals due to paravirt */
@@@ -94,17 -94,22 +94,22 @@@ static inline void smp_send_reschedule(
        smp_ops.smp_send_reschedule(cpu);
  }
  
- static inline int smp_call_function_mask(cpumask_t mask,
-                                        void (*func) (void *info), void *info,
-                                        int wait)
+ static inline void arch_send_call_function_single_ipi(int cpu)
+ {
+       smp_ops.send_call_func_single_ipi(cpu);
+ }
+ static inline void arch_send_call_function_ipi(cpumask_t mask)
  {
-       return smp_ops.smp_call_function_mask(mask, func, info, wait);
+       smp_ops.send_call_func_ipi(mask);
  }
  
  void native_smp_prepare_boot_cpu(void);
  void native_smp_prepare_cpus(unsigned int max_cpus);
  void native_smp_cpus_done(unsigned int max_cpus);
  int native_cpu_up(unsigned int cpunum);
+ void native_send_call_func_ipi(cpumask_t mask);
+ void native_send_call_func_single_ipi(int cpu);
  
  extern int __cpu_disable(void);
  extern void __cpu_die(unsigned int cpu);
@@@ -158,33 -163,30 +163,33 @@@ extern int safe_smp_processor_id(void)
  
  #ifdef CONFIG_X86_LOCAL_APIC
  
 +#ifndef CONFIG_X86_64
  static inline int logical_smp_processor_id(void)
  {
        /* we don't want to mark this access volatile - bad code generation */
        return GET_APIC_LOGICAL_ID(*(u32 *)(APIC_BASE + APIC_LDR));
  }
  
 -#ifndef CONFIG_X86_64
 +#include <mach_apicdef.h>
  static inline unsigned int read_apic_id(void)
  {
 -      return *(u32 *)(APIC_BASE + APIC_ID);
 +      unsigned int reg;
 +
 +      reg = *(u32 *)(APIC_BASE + APIC_ID);
 +
 +      return GET_APIC_ID(reg);
  }
 -#else
 -extern unsigned int read_apic_id(void);
  #endif
  
  
 -# ifdef APIC_DEFINITION
 +# if defined(APIC_DEFINITION) || defined(CONFIG_X86_64)
  extern int hard_smp_processor_id(void);
  # else
 -#  include <mach_apicdef.h>
 +#include <mach_apicdef.h>
  static inline int hard_smp_processor_id(void)
  {
        /* we don't want to mark this access volatile - bad code generation */
 -      return GET_APIC_ID(read_apic_id());
 +      return read_apic_id();
  }
  # endif /* APIC_DEFINITION */
  
  extern void cpu_uninit(void);
  #endif
  
- extern void lock_ipi_call_lock(void);
- extern void unlock_ipi_call_lock(void);
  #endif /* __ASSEMBLY__ */
  #endif
diff --combined include/linux/irq.h
index c211984b55e548477ddcecd972a04317295af6de,8ccb462ea42c4cb3c4813c51ad285fb168efc2f9..8d9411bc60f6f9356e0237cc601330576283d22a
@@@ -62,7 -62,6 +62,7 @@@ typedef       void (*irq_flow_handler_t)(unsi
  #define IRQ_MOVE_PENDING      0x00200000      /* need to re-target IRQ destination */
  #define IRQ_NO_BALANCING      0x00400000      /* IRQ is excluded from balancing */
  #define IRQ_SPURIOUS_DISABLED 0x00800000      /* IRQ was disabled by the spurious trap */
 +#define IRQ_MOVE_PCNTXT       0x01000000      /* IRQ migration from process context */
  
  #ifdef CONFIG_IRQ_PER_CPU
  # define CHECK_IRQ_PER_CPU(var) ((var) & IRQ_PER_CPU)
@@@ -245,15 -244,6 +245,6 @@@ static inline void set_balance_irq_affi
  }
  #endif
  
- #ifdef CONFIG_AUTO_IRQ_AFFINITY
- extern int select_smp_affinity(unsigned int irq);
- #else
- static inline int select_smp_affinity(unsigned int irq)
- {
-       return 1;
- }
- #endif
  extern int no_irq_affinity;
  
  static inline int irq_balancing_disabled(unsigned int irq)
diff --combined kernel/irq/manage.c
index 628b5572a7c2debb55a44390339a66bad4604eee,77a51be360103c98c8ce3a165668ea423f69d119..909b2231fa93cd82e69f4432e1eb2f99c5146a99
@@@ -17,6 -17,8 +17,8 @@@
  
  #ifdef CONFIG_SMP
  
+ cpumask_t irq_default_affinity = CPU_MASK_ALL;
  /**
   *    synchronize_irq - wait for pending IRQ handlers (on other CPUs)
   *    @irq: interrupt number to wait for
@@@ -87,14 -89,7 +89,14 @@@ int irq_set_affinity(unsigned int irq, 
        set_balance_irq_affinity(irq, cpumask);
  
  #ifdef CONFIG_GENERIC_PENDING_IRQ
 -      set_pending_irq(irq, cpumask);
 +      if (desc->status & IRQ_MOVE_PCNTXT) {
 +              unsigned long flags;
 +
 +              spin_lock_irqsave(&desc->lock, flags);
 +              desc->chip->set_affinity(irq, cpumask);
 +              spin_unlock_irqrestore(&desc->lock, flags);
 +      } else
 +              set_pending_irq(irq, cpumask);
  #else
        desc->affinity = cpumask;
        desc->chip->set_affinity(irq, cpumask);
        return 0;
  }
  
+ #ifndef CONFIG_AUTO_IRQ_AFFINITY
+ /*
+  * Generic version of the affinity autoselector.
+  */
+ int irq_select_affinity(unsigned int irq)
+ {
+       cpumask_t mask;
+       if (!irq_can_set_affinity(irq))
+               return 0;
+       cpus_and(mask, cpu_online_map, irq_default_affinity);
+       irq_desc[irq].affinity = mask;
+       irq_desc[irq].chip->set_affinity(irq, mask);
+       set_balance_irq_affinity(irq, mask);
+       return 0;
+ }
+ #endif
  #endif
  
  /**
@@@ -361,7 -377,7 +384,7 @@@ int setup_irq(unsigned int irq, struct 
  
                /* Setup the type (level, edge polarity) if configured: */
                if (new->flags & IRQF_TRIGGER_MASK) {
-                       if (desc->chip && desc->chip->set_type)
+                       if (desc->chip->set_type)
                                desc->chip->set_type(irq,
                                                new->flags & IRQF_TRIGGER_MASK);
                        else
                                 */
                                printk(KERN_WARNING "No IRQF_TRIGGER set_type "
                                       "function for IRQ %d (%s)\n", irq,
-                                      desc->chip ? desc->chip->name :
-                                      "unknown");
+                                      desc->chip->name);
                } else
                        compat_irq_chip_set_default_handler(desc);
  
                } else
                        /* Undo nested disables: */
                        desc->depth = 1;
+               /* Set default affinity mask once everything is setup */
+               irq_select_affinity(irq);
        }
        /* Reset broken irq detection when installing new handler */
        desc->irq_count = 0;
@@@ -578,8 -596,6 +603,6 @@@ int request_irq(unsigned int irq, irq_h
        action->next = NULL;
        action->dev_id = dev_id;
  
-       select_smp_affinity(irq);
  #ifdef CONFIG_DEBUG_SHIRQ
        if (irqflags & IRQF_SHARED) {
                /*