Merge remote-tracking branch 'origin/x86/boot' into x86/mm2

author H. Peter Anvin <hpa@linux.intel.com>

Tue, 29 Jan 2013 22:59:09 +0000 (14:59 -0800)

committer H. Peter Anvin <hpa@linux.intel.com>

Tue, 29 Jan 2013 23:10:15 +0000 (15:10 -0800)
author H. Peter Anvin <hpa@linux.intel.com>
Tue, 29 Jan 2013 22:59:09 +0000 (14:59 -0800)
committer H. Peter Anvin <hpa@linux.intel.com>
Tue, 29 Jan 2013 23:10:15 +0000 (15:10 -0800)
diff --combined arch/sparc/mm/init_64.c

index b24bac238e345050de6edee6fef6d347ae5357f9,c3b72423c846465373e6c06ab961ac99efce4ea6..fc5a7c4bd9e8dab06e452589df3ab8b950d0ca66
--- 1/arch/sparc/mm/init_64.c
--- 2/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@@ -87,8 -87,8 +87,8 @@@ static unsigned long cpu_pgsz_mask
   
   #define MAX_BANKS     32
   
- static struct linux_prom64_registers pavail[MAX_BANKS] __devinitdata;
- static int pavail_ents __devinitdata;
+ static struct linux_prom64_registers pavail[MAX_BANKS];
+ static int pavail_ents;
   
   static int cmp_p64(const void *a, const void *b)
   {
@@@ -624,7 -624,7 +624,7 @@@ static void __init inherit_prom_mapping
   void prom_world(int enter)
   {
         if (!enter)
-               set_fs((mm_segment_t) { get_thread_current_ds() });
+               set_fs(get_fs());
   
         __asm__ __volatile__("flushw");
   }
@@@ -1931,7 -1931,7 +1931,7 @@@ void __init paging_init(void
         printk("Booting Linux...\n");
   }
   
- int __devinit page_in_phys_avail(unsigned long paddr)
+ int page_in_phys_avail(unsigned long paddr)
   {
         int i;
   
@@@ -2021,16 -2021,6 +2021,16 @@@ static void __init patch_tlb_miss_handl
         flushi(&valid_addr_bitmap_insn[0]);
   }
   
+ +static void __init register_page_bootmem_info(void)
+ +{
+ +#ifdef CONFIG_NEED_MULTIPLE_NODES
+ +      int i;
+ +
+ +      for_each_online_node(i)
+ +              if (NODE_DATA(i)->node_spanned_pages)
+ +                      register_page_bootmem_info_node(NODE_DATA(i));
+ +#endif
+ +}
   void __init mem_init(void)
   {
         unsigned long codepages, datapages, initpages;
@@@ -2048,8 -2038,20 +2048,8 @@@
   
         high_memory = __va(last_valid_pfn << PAGE_SHIFT);
   
- -#ifdef CONFIG_NEED_MULTIPLE_NODES
- -      {
- -              int i;
- -              for_each_online_node(i) {
- -                      if (NODE_DATA(i)->node_spanned_pages != 0) {
- -                              totalram_pages +=
- -                                      free_all_bootmem_node(NODE_DATA(i));
- -                      }
- -              }
- -              totalram_pages += free_low_memory_core_early(MAX_NUMNODES);
- -      }
- -#else
+ +      register_page_bootmem_info();
         totalram_pages = free_all_bootmem();
- -#endif
   
         /* We subtract one to account for the mem_map_zero page
          * allocated below.
diff --combined arch/x86/include/asm/pgtable.h

index 6991a3e1bf81ae1ff0809f3acb1ec93e3194da15,5199db2923d31ff88b94c54397daae2b279a7bc7..3c7c6985045d1c84109f1bb59432cee143793f24
--- 1/arch/x86/include/asm/pgtable.h
--- 2/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@@ -404,7 -404,14 +404,14 @@@ static inline int pte_same(pte_t a, pte
   
   static inline int pte_present(pte_t a)
   {
-       return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE);
+       return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE |
+                              _PAGE_NUMA);
+ }
+ 
+ #define pte_accessible pte_accessible
+ static inline int pte_accessible(pte_t a)
+ {
+       return pte_flags(a) & _PAGE_PRESENT;
   }
   
   static inline int pte_hidden(pte_t pte)
@@@ -420,7 -427,8 +427,8 @@@ static inline int pmd_present(pmd_t pmd
          * the _PAGE_PSE flag will remain set at all times while the
          * _PAGE_PRESENT bit is clear).
          */
-       return pmd_flags(pmd) & (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_PSE);
+       return pmd_flags(pmd) & (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_PSE |
+                                _PAGE_NUMA);
   }
   
   static inline int pmd_none(pmd_t pmd)
@@@ -479,6 -487,11 +487,11 @@@ static inline pte_t *pte_offset_kernel(
   
   static inline int pmd_bad(pmd_t pmd)
   {
+ #ifdef CONFIG_NUMA_BALANCING
+       /* pmd_numa check */
+       if ((pmd_flags(pmd) & (_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA)
+               return 0;
+ #endif
         return (pmd_flags(pmd) & ~_PAGE_USER) != _KERNPG_TABLE;
   }
   
@@@ -602,8 -615,6 +615,8 @@@ static inline int pgd_none(pgd_t pgd
   #ifndef __ASSEMBLY__
   
   extern int direct_gbpages;
+ +void init_mem_mapping(void);
+ +void early_alloc_pgt_buf(void);
   
   /* local pte updates need not use xchg for locking */
   static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep)
diff --combined arch/x86/include/asm/pgtable_types.h

index 79738f20aaf5b3ac977e22e3eb26d2c72649f3ba,3c32db8c539d7d037ac93afc679da31e1808c402..696fa7eafb1d8873bceea72ba4dde5840b974b09
--- 1/arch/x86/include/asm/pgtable_types.h
--- 2/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@@ -64,6 -64,26 +64,26 @@@
   #define _PAGE_FILE    (_AT(pteval_t, 1) << _PAGE_BIT_FILE)
   #define _PAGE_PROTNONE        (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
   
+ /*
+  * _PAGE_NUMA indicates that this page will trigger a numa hinting
+  * minor page fault to gather numa placement statistics (see
+  * pte_numa()). The bit picked (8) is within the range between
+  * _PAGE_FILE (6) and _PAGE_PROTNONE (8) bits. Therefore, it doesn't
+  * require changes to the swp entry format because that bit is always
+  * zero when the pte is not present.
+  *
+  * The bit picked must be always zero when the pmd is present and not
+  * present, so that we don't lose information when we set it while
+  * atomically clearing the present bit.
+  *
+  * Because we shared the same bit (8) with _PAGE_PROTNONE this can be
+  * interpreted as _PAGE_NUMA only in places that _PAGE_PROTNONE
+  * couldn't reach, like handle_mm_fault() (see access_error in
+  * arch/x86/mm/fault.c, the vma protection must not be PROT_NONE for
+  * handle_mm_fault() to be invoked).
+  */
+ #define _PAGE_NUMA    _PAGE_PROTNONE
+ 
   #define _PAGE_TABLE   (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |        \
                          _PAGE_ACCESSED | _PAGE_DIRTY)
   #define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED |    \
@@@ -301,6 -321,7 +321,6 @@@ int phys_mem_access_prot_allowed(struc
   /* Install a pte for a particular vaddr in kernel space. */
   void set_pte_vaddr(unsigned long vaddr, pte_t pte);
   
- -extern void native_pagetable_reserve(u64 start, u64 end);
   #ifdef CONFIG_X86_32
   extern void native_pagetable_init(void);
   #else
diff --combined arch/x86/kernel/acpi/boot.c

index 4b23aa18518d2f5a2d8644c65f7f4a73305e75d2,bacf4b0d91f4e0e01c4b7bc7cbcfebe6e1e3a680..cfc755dc1607b0c8d51bef3b2b12a9ffa94b2c25
--- 1/arch/x86/kernel/acpi/boot.c
--- 2/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@@ -51,6 -51,7 +51,6 @@@ EXPORT_SYMBOL(acpi_disabled)
   
   #ifdef        CONFIG_X86_64
   # include <asm/proto.h>
- -# include <asm/numa_64.h>
   #endif                                /* X86 */
   
   #define BAD_MADT_ENTRY(entry, end) (                                      \
@@@ -573,6 -574,12 +573,12 @@@ int acpi_register_gsi(struct device *de
   
         return irq;
   }
+ EXPORT_SYMBOL_GPL(acpi_register_gsi);
+ 
+ void acpi_unregister_gsi(u32 gsi)
+ {
+ }
+ EXPORT_SYMBOL_GPL(acpi_unregister_gsi);
   
   void __init acpi_set_irq_model_pic(void)
   {
@@@ -1699,3 -1706,9 +1705,9 @@@ int __acpi_release_global_lock(unsigne
         } while (unlikely (val != old));
         return old & 0x1;
   }
+ 
+ void __init arch_reserve_mem_area(acpi_physical_address addr, size_t size)
+ {
+       e820_add_region(addr, size, E820_ACPI);
+       update_e820();
+ }
diff --combined arch/x86/kernel/cpu/amd.c

index 913f94f9e8d9fc121949e3f7050e43faa59ec0fc,15239fffd6fee747913a1f0e493c512885239379..eafb084e80f87e5904fbdd3fff36aecec19d6326
--- 1/arch/x86/kernel/cpu/amd.c
--- 2/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@@ -12,6 -12,7 +12,6 @@@
   #include <asm/pci-direct.h>
   
   #ifdef CONFIG_X86_64
- -# include <asm/numa_64.h>
   # include <asm/mmconfig.h>
   # include <asm/cacheflush.h>
   #endif
@@@ -303,7 -304,7 +303,7 @@@ static void __cpuinit amd_get_topology(
         int cpu = smp_processor_id();
   
         /* get information required for multi-node processors */
-       if (cpu_has(c, X86_FEATURE_TOPOEXT)) {
+       if (cpu_has_topoext) {
                 u32 eax, ebx, ecx, edx;
   
                 cpuid(0x8000001e, &eax, &ebx, &ecx, &edx);
@@@ -630,6 -631,20 +630,20 @@@ static void __cpuinit init_amd(struct c
                 }
         }
   
+       /*
+        * The way access filter has a performance penalty on some workloads.
+        * Disable it on the affected CPUs.
+        */
+       if ((c->x86 == 0x15) &&
+           (c->x86_model >= 0x02) && (c->x86_model < 0x20)) {
+               u64 val;
+ 
+               if (!rdmsrl_safe(0xc0011021, &val) && !(val & 0x1E)) {
+                       val |= 0x1E;
+                       wrmsrl_safe(0xc0011021, val);
+               }
+       }
+ 
         cpu_detect_cache_sizes(c);
   
         /* Multi core CPU? */
@@@ -642,12 -657,7 +656,7 @@@
         detect_ht(c);
   #endif
   
-       if (c->extended_cpuid_level >= 0x80000006) {
-               if (cpuid_edx(0x80000006) & 0xf000)
-                       num_cache_leaves = 4;
-               else
-                       num_cache_leaves = 3;
-       }
+       init_amd_cacheinfo(c);
   
         if (c->x86 >= 0xf)
                 set_cpu_cap(c, X86_FEATURE_K8);
@@@ -675,10 -685,12 +684,10 @@@
                  * benefit in doing so.
                  */
                 if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) {
+ +                      unsigned long pfn = tseg >> PAGE_SHIFT;
+ +
                         printk(KERN_DEBUG "tseg: %010llx\n", tseg);
- -                      if ((tseg>>PMD_SHIFT) <
- -                              (max_low_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) ||
- -                              ((tseg>>PMD_SHIFT) <
- -                              (max_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) &&
- -                              (tseg>>PMD_SHIFT) >= (1ULL<<(32 - PMD_SHIFT))))
+ +                      if (pfn_range_is_mapped(pfn, pfn + 1))
                                 set_memory_4k((unsigned long)__va(tseg), 1);
                 }
         }
@@@ -736,9 -748,6 +745,6 @@@ static unsigned int __cpuinit amd_size_
   
   static void __cpuinit cpu_set_tlb_flushall_shift(struct cpuinfo_x86 *c)
   {
-       if (!cpu_has_invlpg)
-               return;
- 
         tlb_flushall_shift = 5;
   
         if (c->x86 <= 0x11)
diff --combined arch/x86/kernel/cpu/intel.c

index 3b547cc4bd03f1750cdd1c623babdba9b8eb433a,fcaabd0432c5dda0fa5e3c8f8b37473ee177c1d8..a24c462888f0722915ff40c2ba1a6ccad1e58d66
--- 1/arch/x86/kernel/cpu/intel.c
--- 2/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@@ -17,6 -17,7 +17,6 @@@
   
   #ifdef CONFIG_X86_64
   #include <linux/topology.h>
- -#include <asm/numa_64.h>
   #endif
   
   #include "cpu.h"
@@@ -611,10 -612,6 +611,6 @@@ static void __cpuinit intel_tlb_lookup(
   
   static void __cpuinit intel_tlb_flushall_shift_set(struct cpuinfo_x86 *c)
   {
-       if (!cpu_has_invlpg) {
-               tlb_flushall_shift = -1;
-               return;
-       }
         switch ((c->x86 << 8) + c->x86_model) {
         case 0x60f: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */
         case 0x616: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */
diff --combined arch/x86/kernel/setup.c

index 6d29d1fcf068cf82250668e0990e207413bdd72c,00f6c1472b850472e5f9759dd5ad9613f6c026be..268193746cd86efb66b1845021d1688388502102
--- 1/arch/x86/kernel/setup.c
--- 2/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@@ -108,16 -108,17 +108,16 @@@
   #include <asm/topology.h>
   #include <asm/apicdef.h>
   #include <asm/amd_nb.h>
- -#ifdef CONFIG_X86_64
- -#include <asm/numa_64.h>
- -#endif
   #include <asm/mce.h>
   #include <asm/alternative.h>
   #include <asm/prom.h>
   
   /*
- - * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
- - * The direct mapping extends to max_pfn_mapped, so that we can directly access
- - * apertures, ACPI and other tables without having to play with fixmaps.
+ + * max_low_pfn_mapped: highest direct mapped pfn under 4GB
+ + * max_pfn_mapped:     highest direct mapped pfn over 4GB
+ + *
+ + * The direct mapping only covers E820_RAM regions, so the ranges and gaps are
+ + * represented by pfn_mapped
    */
   unsigned long max_low_pfn_mapped;
   unsigned long max_pfn_mapped;
@@@ -142,11 -143,7 +142,7 @@@ int default_check_phys_apicid_present(i
   }
   #endif
   
- #ifndef CONFIG_DEBUG_BOOT_PARAMS
- struct boot_params __initdata boot_params;
- #else
   struct boot_params boot_params;
- #endif
   
   /*
    * Machine setup..
@@@ -279,7 -276,18 +275,7 @@@ void * __init extend_brk(size_t size, s
         return ret;
   }
   
- -#ifdef CONFIG_X86_64
- -static void __init init_gbpages(void)
- -{
- -      if (direct_gbpages && cpu_has_gbpages)
- -              printk(KERN_INFO "Using GB pages for direct mapping\n");
- -      else
- -              direct_gbpages = 0;
- -}
- -#else
- -static inline void init_gbpages(void)
- -{
- -}
+ +#ifdef CONFIG_X86_32
   static void __init cleanup_highmap(void)
   {
   }
@@@ -305,19 -313,20 +301,19 @@@ static void __init relocate_initrd(void
         u64 ramdisk_image = boot_params.hdr.ramdisk_image;
         u64 ramdisk_size  = boot_params.hdr.ramdisk_size;
         u64 area_size     = PAGE_ALIGN(ramdisk_size);
- -      u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT;
         u64 ramdisk_here;
         unsigned long slop, clen, mapaddr;
         char *p, *q;
   
- -      /* We need to move the initrd down into lowmem */
- -      ramdisk_here = memblock_find_in_range(0, end_of_lowmem, area_size,
- -                                       PAGE_SIZE);
+ +      /* We need to move the initrd down into directly mapped mem */
+ +      ramdisk_here = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
+ +                                               area_size, PAGE_SIZE);
   
         if (!ramdisk_here)
                 panic("Cannot find place for new RAMDISK of size %lld\n",
                          ramdisk_size);
   
- -      /* Note: this includes all the lowmem currently occupied by
+ +      /* Note: this includes all the mem currently occupied by
            the initrd, we rely on that fact to keep the data intact. */
         memblock_reserve(ramdisk_here, area_size);
         initrd_start = ramdisk_here + PAGE_OFFSET;
@@@ -327,7 -336,17 +323,7 @@@
   
         q = (char *)initrd_start;
   
- -      /* Copy any lowmem portion of the initrd */
- -      if (ramdisk_image < end_of_lowmem) {
- -              clen = end_of_lowmem - ramdisk_image;
- -              p = (char *)__va(ramdisk_image);
- -              memcpy(q, p, clen);
- -              q += clen;
- -              ramdisk_image += clen;
- -              ramdisk_size  -= clen;
- -      }
- -
- -      /* Copy the highmem portion of the initrd */
+ +      /* Copy the initrd */
         while (ramdisk_size) {
                 slop = ramdisk_image & ~PAGE_MASK;
                 clen = ramdisk_size;
@@@ -341,7 -360,7 +337,7 @@@
                 ramdisk_image += clen;
                 ramdisk_size  -= clen;
         }
- -      /* high pages is not converted by early_res_to_bootmem */
+ +
         ramdisk_image = boot_params.hdr.ramdisk_image;
         ramdisk_size  = boot_params.hdr.ramdisk_size;
         printk(KERN_INFO "Move RAMDISK from [mem %#010llx-%#010llx] to"
@@@ -350,27 -369,13 +346,27 @@@
                 ramdisk_here, ramdisk_here + ramdisk_size - 1);
   }
   
+ +static u64 __init get_mem_size(unsigned long limit_pfn)
+ +{
+ +      int i;
+ +      u64 mapped_pages = 0;
+ +      unsigned long start_pfn, end_pfn;
+ +
+ +      for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) {
+ +              start_pfn = min_t(unsigned long, start_pfn, limit_pfn);
+ +              end_pfn = min_t(unsigned long, end_pfn, limit_pfn);
+ +              mapped_pages += end_pfn - start_pfn;
+ +      }
+ +
+ +      return mapped_pages << PAGE_SHIFT;
+ +}
   static void __init reserve_initrd(void)
   {
         /* Assume only end is not page aligned */
         u64 ramdisk_image = boot_params.hdr.ramdisk_image;
         u64 ramdisk_size  = boot_params.hdr.ramdisk_size;
         u64 ramdisk_end   = PAGE_ALIGN(ramdisk_image + ramdisk_size);
- -      u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT;
+ +      u64 mapped_size;
   
         if (!boot_params.hdr.type_of_loader ||
             !ramdisk_image || !ramdisk_size)
@@@ -378,18 -383,18 +374,18 @@@
   
         initrd_start = 0;
   
- -      if (ramdisk_size >= (end_of_lowmem>>1)) {
+ +      mapped_size = get_mem_size(max_pfn_mapped);
+ +      if (ramdisk_size >= (mapped_size>>1))
                 panic("initrd too large to handle, "
                        "disabling initrd (%lld needed, %lld available)\n",
- -                     ramdisk_size, end_of_lowmem>>1);
- -      }
+ +                     ramdisk_size, mapped_size>>1);
   
         printk(KERN_INFO "RAMDISK: [mem %#010llx-%#010llx]\n", ramdisk_image,
                         ramdisk_end - 1);
   
- -
- -      if (ramdisk_end <= end_of_lowmem) {
- -              /* All in lowmem, easy case */
+ +      if (pfn_range_is_mapped(PFN_DOWN(ramdisk_image),
+ +                              PFN_DOWN(ramdisk_end))) {
+ +              /* All are mapped, easy case */
                 /*
                  * don't need to reserve again, already reserved early
                  * in i386_start_kernel
@@@ -605,6 -610,83 +601,83 @@@ static __init void reserve_ibft_region(
   
   static unsigned reserve_low = CONFIG_X86_RESERVE_LOW << 10;
   
+ static bool __init snb_gfx_workaround_needed(void)
+ {
+ #ifdef CONFIG_PCI
+       int i;
+       u16 vendor, devid;
+       static const __initconst u16 snb_ids[] = {
+               0x0102,
+               0x0112,
+               0x0122,
+               0x0106,
+               0x0116,
+               0x0126,
+               0x010a,
+       };
+ 
+       /* Assume no if something weird is going on with PCI */
+       if (!early_pci_allowed())
+               return false;
+ 
+       vendor = read_pci_config_16(0, 2, 0, PCI_VENDOR_ID);
+       if (vendor != 0x8086)
+               return false;
+ 
+       devid = read_pci_config_16(0, 2, 0, PCI_DEVICE_ID);
+       for (i = 0; i < ARRAY_SIZE(snb_ids); i++)
+               if (devid == snb_ids[i])
+                       return true;
+ #endif
+ 
+       return false;
+ }
+ 
+ /*
+  * Sandy Bridge graphics has trouble with certain ranges, exclude
+  * them from allocation.
+  */
+ static void __init trim_snb_memory(void)
+ {
+       static const __initconst unsigned long bad_pages[] = {
+               0x20050000,
+               0x20110000,
+               0x20130000,
+               0x20138000,
+               0x40004000,
+       };
+       int i;
+ 
+       if (!snb_gfx_workaround_needed())
+               return;
+ 
+       printk(KERN_DEBUG "reserving inaccessible SNB gfx pages\n");
+ 
+       /*
+        * Reserve all memory below the 1 MB mark that has not
+        * already been reserved.
+        */
+       memblock_reserve(0, 1<<20);
+       
+       for (i = 0; i < ARRAY_SIZE(bad_pages); i++) {
+               if (memblock_reserve(bad_pages[i], PAGE_SIZE))
+                       printk(KERN_WARNING "failed to reserve 0x%08lx\n",
+                              bad_pages[i]);
+       }
+ }
+ 
+ /*
+  * Here we put platform-specific memory range workarounds, i.e.
+  * memory known to be corrupt or otherwise in need of reservation on
+  * specific platforms.
+  *
+  * If this gets used more widely it could use a real dispatch mechanism.
+  */
+ static void __init trim_platform_memory_ranges(void)
+ {
+       trim_snb_memory();
+ }
+ 
   static void __init trim_bios_range(void)
   {
         /*
@@@ -625,6 -707,7 +698,7 @@@
          * take them out.
          */
         e820_remove_range(BIOS_BEGIN, BIOS_END - BIOS_BEGIN, E820_RAM, 1);
+ 
         sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
   }
   
@@@ -823,20 -906,6 +897,20 @@@ void __init setup_arch(char **cmdline_p
         insert_resource(&iomem_resource, &data_resource);
         insert_resource(&iomem_resource, &bss_resource);
   
+ +      /*
+ +       * Complain if .text .data and .bss are not marked as E820_RAM and
+ +       * attempt to fix it by adding the range. We may have a confused BIOS,
+ +       * or the user may have incorrectly supplied it via memmap=exactmap. If
+ +       * we really are running on top of non-RAM, we will crash later anyways.
+ +       */
+ +      if (!e820_all_mapped(code_resource.start, __pa(__brk_limit), E820_RAM)) {
+ +              pr_warn(".text .data .bss are not marked as E820_RAM!\n");
+ +
+ +              e820_add_region(code_resource.start,
+ +                              __pa(__brk_limit) - code_resource.start + 1,
+ +                              E820_RAM);
+ +      }
+ +
         trim_bios_range();
   #ifdef CONFIG_X86_32
         if (ppro_with_ram_bug()) {
@@@ -886,8 -955,6 +960,8 @@@
   
         reserve_ibft_region();
   
+ +      early_alloc_pgt_buf();
+ +
         /*
          * Need to conclude brk, before memblock_x86_fill()
          *  it could use memblock_find_in_range, could overlap with
@@@ -897,7 -964,7 +971,7 @@@
   
         cleanup_highmap();
   
- -      memblock.current_limit = get_max_mapped();
+ +      memblock.current_limit = ISA_END_ADDRESS;
         memblock_x86_fill();
   
         /*
@@@ -919,8 -986,36 +993,10 @@@
   
         setup_real_mode();
   
- -      init_gbpages();
+       trim_platform_memory_ranges();
+ 
+ +      init_mem_mapping();
   
- -      /* max_pfn_mapped is updated here */
- -      max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT);
- -      max_pfn_mapped = max_low_pfn_mapped;
- -
- -#ifdef CONFIG_X86_64
- -      if (max_pfn > max_low_pfn) {
- -              int i;
- -              unsigned long start, end;
- -              unsigned long start_pfn, end_pfn;
- -
- -              for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn,
- -                                                       NULL) {
- -
- -                      end = PFN_PHYS(end_pfn);
- -                      if (end <= (1UL<<32))
- -                              continue;
- -
- -                      start = PFN_PHYS(start_pfn);
- -                      max_pfn_mapped = init_memory_mapping(
- -                                              max((1UL<<32), start), end);
- -              }
- -
- -              /* can we preseve max_low_pfn ?*/
- -              max_low_pfn = max_pfn;
- -      }
- -#endif
         memblock.current_limit = get_max_mapped();
         dma_contiguous_reserve(0);
   
@@@ -937,6 -1032,10 +1013,10 @@@
   
         reserve_initrd();
   
+ #if defined(CONFIG_ACPI) && defined(CONFIG_BLK_DEV_INITRD)
+       acpi_initrd_override((void *)initrd_start, initrd_end - initrd_start);
+ #endif
+ 
         reserve_crashkernel();
   
         vsmp_init();
diff --combined arch/x86/mm/init_32.c

index f4fc4a28393aae1724c6d0c790ad417a377320da,745d66b843c84241f1d849e865dc2796a00b6f1f..b299724f6e34e9ace068a4c7f39d504a7cc5c7b4
--- 1/arch/x86/mm/init_32.c
--- 2/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@@ -53,14 -53,25 +53,14 @@@
   #include <asm/page_types.h>
   #include <asm/init.h>
   
+ +#include "mm_internal.h"
+ +
   unsigned long highstart_pfn, highend_pfn;
   
   static noinline int do_test_wp_bit(void);
   
   bool __read_mostly __vmalloc_start_set = false;
   
- -static __init void *alloc_low_page(void)
- -{
- -      unsigned long pfn = pgt_buf_end++;
- -      void *adr;
- -
- -      if (pfn >= pgt_buf_top)
- -              panic("alloc_low_page: ran out of memory");
- -
- -      adr = __va(pfn * PAGE_SIZE);
- -      clear_page(adr);
- -      return adr;
- -}
- -
   /*
    * Creates a middle page table and puts a pointer to it in the
    * given global directory entry. This only returns the gd entry
@@@ -73,7 -84,10 +73,7 @@@ static pmd_t * __init one_md_table_init
   
   #ifdef CONFIG_X86_PAE
         if (!(pgd_val(*pgd) & _PAGE_PRESENT)) {
- -              if (after_bootmem)
- -                      pmd_table = (pmd_t *)alloc_bootmem_pages(PAGE_SIZE);
- -              else
- -                      pmd_table = (pmd_t *)alloc_low_page();
+ +              pmd_table = (pmd_t *)alloc_low_page();
                 paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
                 set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
                 pud = pud_offset(pgd, 0);
@@@ -95,7 -109,17 +95,7 @@@
   static pte_t * __init one_page_table_init(pmd_t *pmd)
   {
         if (!(pmd_val(*pmd) & _PAGE_PRESENT)) {
- -              pte_t *page_table = NULL;
- -
- -              if (after_bootmem) {
- -#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK)
- -                      page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
- -#endif
- -                      if (!page_table)
- -                              page_table =
- -                              (pte_t *)alloc_bootmem_pages(PAGE_SIZE);
- -              } else
- -                      page_table = (pte_t *)alloc_low_page();
+ +              pte_t *page_table = (pte_t *)alloc_low_page();
   
                 paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT);
                 set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
@@@ -122,39 -146,8 +122,39 @@@ pte_t * __init populate_extra_pte(unsig
         return one_page_table_init(pmd) + pte_idx;
   }
   
+ +/*
+ + * Count the page-table pages that page_table_range_init() has to set
+ + * aside for [start, end): one for every PMD-sized step whose PMD index
+ + * lies inside the kmap fixmap window.  Returns 0 when CONFIG_HIGHMEM is
+ + * off or when the whole kmap window sits under a single PMD.
+ + */
+ +static unsigned long __init
+ +page_table_range_init_count(unsigned long start, unsigned long end)
+ +{
+ +      unsigned long count = 0;
+ +#ifdef CONFIG_HIGHMEM
+ +      int pmd_idx_kmap_begin = fix_to_virt(FIX_KMAP_END) >> PMD_SHIFT;
+ +      int pmd_idx_kmap_end = fix_to_virt(FIX_KMAP_BEGIN) >> PMD_SHIFT;
+ +      int pgd_idx, pmd_idx;
+ +      unsigned long vaddr;
+ +
+ +      if (pmd_idx_kmap_begin == pmd_idx_kmap_end)
+ +              return 0;
+ +
+ +      vaddr = start;
+ +      pgd_idx = pgd_index(vaddr);
+ +      /*
+ +       * Seed the PMD index for the first pgd; it was previously read
+ +       * uninitialized by the inner loop.  Later pgds restart at 0.
+ +       */
+ +      pmd_idx = pmd_index(vaddr);
+ +
+ +      for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd_idx++) {
+ +              for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end);
+ +                                                      pmd_idx++) {
+ +                      if ((vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin &&
+ +                          (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end)
+ +                              count++;
+ +                      vaddr += PMD_SIZE;
+ +              }
+ +              pmd_idx = 0;
+ +      }
+ +#endif
+ +      return count;
+ +}
+ +
+ +
   static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
- -                                         unsigned long vaddr, pte_t *lastpte)
+ +                                         unsigned long vaddr, pte_t *lastpte,
+ +                                         void **adr)
   {
   #ifdef CONFIG_HIGHMEM
         /*
@@@ -168,15 -161,16 +168,15 @@@
   
         if (pmd_idx_kmap_begin != pmd_idx_kmap_end
             && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin
- -          && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end
- -          && ((__pa(pte) >> PAGE_SHIFT) < pgt_buf_start
- -              || (__pa(pte) >> PAGE_SHIFT) >= pgt_buf_end)) {
+ +          && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end) {
                 pte_t *newpte;
                 int i;
   
                 BUG_ON(after_bootmem);
- -              newpte = alloc_low_page();
+ +              newpte = *adr;
                 for (i = 0; i < PTRS_PER_PTE; i++)
                         set_pte(newpte + i, pte[i]);
+ +              *adr = (void *)(((unsigned long)(*adr)) + PAGE_SIZE);
   
                 paravirt_alloc_pte(&init_mm, __pa(newpte) >> PAGE_SHIFT);
                 set_pmd(pmd, __pmd(__pa(newpte)|_PAGE_TABLE));
@@@ -210,11 -204,6 +210,11 @@@ page_table_range_init(unsigned long sta
         pgd_t *pgd;
         pmd_t *pmd;
         pte_t *pte = NULL;
+ +      unsigned long count = page_table_range_init_count(start, end);
+ +      void *adr = NULL;
+ +
+ +      if (count)
+ +              adr = alloc_low_pages(count);
   
         vaddr = start;
         pgd_idx = pgd_index(vaddr);
@@@ -227,7 -216,7 +227,7 @@@
                 for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end);
                                                         pmd++, pmd_idx++) {
                         pte = page_table_kmap_check(one_page_table_init(pmd),
- -                                                  pmd, vaddr, pte);
+ +                                                  pmd, vaddr, pte, &adr);
   
                         vaddr += PMD_SIZE;
                 }
@@@ -321,7 -310,6 +321,7 @@@ repeat
                                         __pgprot(PTE_IDENT_ATTR |
                                                  _PAGE_PSE);
   
+ +                              pfn &= PMD_MASK >> PAGE_SHIFT;
                                 addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE +
                                         PAGE_OFFSET + PAGE_SIZE-1;
   
@@@ -467,14 -455,9 +467,14 @@@ void __init native_pagetable_init(void
   
         /*
          * Remove any mappings which extend past the end of physical
- -       * memory from the boot time page table:
+ +       * memory from the boot time page table.
+ +       * In the virtual address space there should be at least two pages
+ +       * between VMALLOC_END and pkmap or fixmap, per the definition of
+ +       * VMALLOC_END, and max_low_pfn is set to the physical address of
+ +       * VMALLOC_END. If the initial memory mapping did its job, we should
+ +       * find either a pte in use near max_low_pfn or a pmd that is not present.
          */
- -      for (pfn = max_low_pfn + 1; pfn < 1<<(32-PAGE_SHIFT); pfn++) {
+ +      for (pfn = max_low_pfn; pfn < 1<<(32-PAGE_SHIFT); pfn++) {
                 va = PAGE_OFFSET + (pfn<<PAGE_SHIFT);
                 pgd = base + pgd_index(va);
                 if (!pgd_present(*pgd))
@@@ -485,19 -468,10 +485,19 @@@
                 if (!pmd_present(*pmd))
                         break;
   
+ +              /* should not be large page here */
+ +              if (pmd_large(*pmd)) {
+ +                      pr_warn("try to clear pte for ram above max_low_pfn: pfn: %lx pmd: %p pmd phys: %lx, but pmd is big page and is not using pte !\n",
+ +                              pfn, pmd, __pa(pmd));
+ +                      BUG_ON(1);
+ +              }
+ +
                 pte = pte_offset_kernel(pmd, va);
                 if (!pte_present(*pte))
                         break;
   
+ +              printk(KERN_DEBUG "clearing pte for ram above max_low_pfn: pfn: %lx pmd: %p pmd phys: %lx pte: %p pte phys: %lx\n",
+ +                              pfn, pmd, __pa(pmd), pte, __pa(pte));
                 pte_clear(NULL, va, pte);
         }
         paravirt_alloc_pmd(&init_mm, __pa(base) >> PAGE_SHIFT);
@@@ -576,7 -550,7 +576,7 @@@ early_param("highmem", parse_highmem)
    * artificially via the highmem=x boot parameter then create
    * it:
    */
- -void __init lowmem_pfn_init(void)
+ +static void __init lowmem_pfn_init(void)
   {
         /* max_low_pfn is 0, we already have early_res support */
         max_low_pfn = max_pfn;
@@@ -612,7 -586,7 +612,7 @@@
    * We have more RAM than fits into lowmem - we try to put it into
    * highmem, also taking the highmem=x boot parameter into account:
    */
- -void __init highmem_pfn_init(void)
+ +static void __init highmem_pfn_init(void)
   {
         max_low_pfn = MAXMEM_PFN;
   
@@@ -695,6 -669,8 +695,6 @@@ void __init setup_bootmem_allocator(voi
         printk(KERN_INFO "  mapped low ram: 0 - %08lx\n",
                  max_pfn_mapped<<PAGE_SHIFT);
         printk(KERN_INFO "  low ram: 0 - %08lx\n", max_low_pfn<<PAGE_SHIFT);
- -
- -      after_bootmem = 1;
   }
   
   /*
@@@ -739,10 -715,7 +739,7 @@@ static void __init test_wp_bit(void
   
         if (!boot_cpu_data.wp_works_ok) {
                 printk(KERN_CONT "No.\n");
- #ifdef CONFIG_X86_WP_WORKS_OK
-               panic(
-   "This kernel doesn't support CPU's with broken WP. Recompile it for a 386!");
- #endif
+               panic("Linux doesn't support CPUs with broken WP.");
         } else {
                 printk(KERN_CONT "Ok.\n");
         }
@@@ -780,8 -753,6 +777,8 @@@ void __init mem_init(void
                 if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
                         reservedpages++;
   
+ +      after_bootmem = 1;
+ +
         codesize =  (unsigned long) &_etext - (unsigned long) &_text;
         datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
         initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;
diff --combined arch/x86/mm/init_64.c

index 41785305f645d5621b90e87aded80c648f9683e5,2ead3c8a4c8419da92a61fbba35eb695e5205dde..191ab12f5ff37b0dc93f74efa01208f92a0d461b
--- 1/arch/x86/mm/init_64.c
--- 2/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@@ -54,8 -54,6 +54,8 @@@
   #include <asm/uv/uv.h>
   #include <asm/setup.h>
   
+ +#include "mm_internal.h"
+ +
   static int __init parse_direct_gbpages_off(char *arg)
   {
         direct_gbpages = 0;
@@@ -316,24 -314,69 +316,24 @@@ void __init cleanup_highmap(void
         }
   }
   
- -static __ref void *alloc_low_page(unsigned long *phys)
- -{
- -      unsigned long pfn = pgt_buf_end++;
- -      void *adr;
- -
- -      if (after_bootmem) {
- -              adr = (void *)get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK);
- -              *phys = __pa(adr);
- -
- -              return adr;
- -      }
- -
- -      if (pfn >= pgt_buf_top)
- -              panic("alloc_low_page: ran out of memory");
- -
- -      adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE);
- -      clear_page(adr);
- -      *phys  = pfn * PAGE_SIZE;
- -      return adr;
- -}
- -
- -static __ref void *map_low_page(void *virt)
- -{
- -      void *adr;
- -      unsigned long phys, left;
- -
- -      if (after_bootmem)
- -              return virt;
- -
- -      phys = __pa(virt);
- -      left = phys & (PAGE_SIZE - 1);
- -      adr = early_memremap(phys & PAGE_MASK, PAGE_SIZE);
- -      adr = (void *)(((unsigned long)adr) | left);
- -
- -      return adr;
- -}
- -
- -static __ref void unmap_low_page(void *adr)
- -{
- -      if (after_bootmem)
- -              return;
- -
- -      early_iounmap((void *)((unsigned long)adr & PAGE_MASK), PAGE_SIZE);
- -}
- -
   static unsigned long __meminit
   phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end,
               pgprot_t prot)
   {
- -      unsigned pages = 0;
+ +      unsigned long pages = 0, next;
         unsigned long last_map_addr = end;
         int i;
   
         pte_t *pte = pte_page + pte_index(addr);
   
- -      for(i = pte_index(addr); i < PTRS_PER_PTE; i++, addr += PAGE_SIZE, pte++) {
- -
+ +      for (i = pte_index(addr); i < PTRS_PER_PTE; i++, addr = next, pte++) {
+ +              next = (addr & PAGE_MASK) + PAGE_SIZE;
                 if (addr >= end) {
- -                      if (!after_bootmem) {
- -                              for(; i < PTRS_PER_PTE; i++, pte++)
- -                                      set_pte(pte, __pte(0));
- -                      }
- -                      break;
+ +                      if (!after_bootmem &&
+ +                          !e820_any_mapped(addr & PAGE_MASK, next, E820_RAM) &&
+ +                          !e820_any_mapped(addr & PAGE_MASK, next, E820_RESERVED_KERN))
+ +                              set_pte(pte, __pte(0));
+ +                      continue;
                 }
   
                 /*
@@@ -371,25 -414,28 +371,25 @@@ phys_pmd_init(pmd_t *pmd_page, unsigne
         int i = pmd_index(address);
   
         for (; i < PTRS_PER_PMD; i++, address = next) {
- -              unsigned long pte_phys;
                 pmd_t *pmd = pmd_page + pmd_index(address);
                 pte_t *pte;
                 pgprot_t new_prot = prot;
   
+ +              next = (address & PMD_MASK) + PMD_SIZE;
                 if (address >= end) {
- -                      if (!after_bootmem) {
- -                              for (; i < PTRS_PER_PMD; i++, pmd++)
- -                                      set_pmd(pmd, __pmd(0));
- -                      }
- -                      break;
+ +                      if (!after_bootmem &&
+ +                          !e820_any_mapped(address & PMD_MASK, next, E820_RAM) &&
+ +                          !e820_any_mapped(address & PMD_MASK, next, E820_RESERVED_KERN))
+ +                              set_pmd(pmd, __pmd(0));
+ +                      continue;
                 }
   
- -              next = (address & PMD_MASK) + PMD_SIZE;
- -
                 if (pmd_val(*pmd)) {
                         if (!pmd_large(*pmd)) {
                                 spin_lock(&init_mm.page_table_lock);
- -                              pte = map_low_page((pte_t *)pmd_page_vaddr(*pmd));
+ +                              pte = (pte_t *)pmd_page_vaddr(*pmd);
                                 last_map_addr = phys_pte_init(pte, address,
                                                                 end, prot);
- -                              unmap_low_page(pte);
                                 spin_unlock(&init_mm.page_table_lock);
                                 continue;
                         }
@@@ -418,18 -464,19 +418,18 @@@
                         pages++;
                         spin_lock(&init_mm.page_table_lock);
                         set_pte((pte_t *)pmd,
- -                              pfn_pte(address >> PAGE_SHIFT,
+ +                              pfn_pte((address & PMD_MASK) >> PAGE_SHIFT,
                                         __pgprot(pgprot_val(prot) | _PAGE_PSE)));
                         spin_unlock(&init_mm.page_table_lock);
                         last_map_addr = next;
                         continue;
                 }
   
- -              pte = alloc_low_page(&pte_phys);
+ +              pte = alloc_low_page();
                 last_map_addr = phys_pte_init(pte, address, end, new_prot);
- -              unmap_low_page(pte);
   
                 spin_lock(&init_mm.page_table_lock);
- -              pmd_populate_kernel(&init_mm, pmd, __va(pte_phys));
+ +              pmd_populate_kernel(&init_mm, pmd, pte);
                 spin_unlock(&init_mm.page_table_lock);
         }
         update_page_count(PG_LEVEL_2M, pages);
@@@ -445,24 -492,27 +445,24 @@@ phys_pud_init(pud_t *pud_page, unsigne
         int i = pud_index(addr);
   
         for (; i < PTRS_PER_PUD; i++, addr = next) {
- -              unsigned long pmd_phys;
                 pud_t *pud = pud_page + pud_index(addr);
                 pmd_t *pmd;
                 pgprot_t prot = PAGE_KERNEL;
   
- -              if (addr >= end)
- -                      break;
- -
                 next = (addr & PUD_MASK) + PUD_SIZE;
- -
- -              if (!after_bootmem && !e820_any_mapped(addr, next, 0)) {
- -                      set_pud(pud, __pud(0));
+ +              if (addr >= end) {
+ +                      if (!after_bootmem &&
+ +                          !e820_any_mapped(addr & PUD_MASK, next, E820_RAM) &&
+ +                          !e820_any_mapped(addr & PUD_MASK, next, E820_RESERVED_KERN))
+ +                              set_pud(pud, __pud(0));
                         continue;
                 }
   
                 if (pud_val(*pud)) {
                         if (!pud_large(*pud)) {
- -                              pmd = map_low_page(pmd_offset(pud, 0));
+ +                              pmd = pmd_offset(pud, 0);
                                 last_map_addr = phys_pmd_init(pmd, addr, end,
                                                          page_size_mask, prot);
- -                              unmap_low_page(pmd);
                                 __flush_tlb_all();
                                 continue;
                         }
@@@ -491,19 -541,19 +491,19 @@@
                         pages++;
                         spin_lock(&init_mm.page_table_lock);
                         set_pte((pte_t *)pud,
- -                              pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
+ +                              pfn_pte((addr & PUD_MASK) >> PAGE_SHIFT,
+ +                                      PAGE_KERNEL_LARGE));
                         spin_unlock(&init_mm.page_table_lock);
                         last_map_addr = next;
                         continue;
                 }
   
- -              pmd = alloc_low_page(&pmd_phys);
+ +              pmd = alloc_low_page();
                 last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask,
                                               prot);
- -              unmap_low_page(pmd);
   
                 spin_lock(&init_mm.page_table_lock);
- -              pud_populate(&init_mm, pud, __va(pmd_phys));
+ +              pud_populate(&init_mm, pud, pmd);
                 spin_unlock(&init_mm.page_table_lock);
         }
         __flush_tlb_all();
@@@ -528,6 -578,7 +528,6 @@@ kernel_physical_mapping_init(unsigned l
   
         for (; start < end; start = next) {
                 pgd_t *pgd = pgd_offset_k(start);
- -              unsigned long pud_phys;
                 pud_t *pud;
   
                 next = (start + PGDIR_SIZE) & PGDIR_MASK;
@@@ -535,18 -586,20 +535,18 @@@
                         next = end;
   
                 if (pgd_val(*pgd)) {
- -                      pud = map_low_page((pud_t *)pgd_page_vaddr(*pgd));
+ +                      pud = (pud_t *)pgd_page_vaddr(*pgd);
                         last_map_addr = phys_pud_init(pud, __pa(start),
                                                  __pa(end), page_size_mask);
- -                      unmap_low_page(pud);
                         continue;
                 }
   
- -              pud = alloc_low_page(&pud_phys);
+ +              pud = alloc_low_page();
                 last_map_addr = phys_pud_init(pud, __pa(start), __pa(next),
                                                  page_size_mask);
- -              unmap_low_page(pud);
   
                 spin_lock(&init_mm.page_table_lock);
- -              pgd_populate(&init_mm, pgd, __va(pud_phys));
+ +              pgd_populate(&init_mm, pgd, pud);
                 spin_unlock(&init_mm.page_table_lock);
                 pgd_changed = true;
         }
@@@ -577,7 -630,9 +577,9 @@@ void __init paging_init(void
          *       numa support is not compiled in, and later node_set_state
          *       will not set it back.
          */
-       node_clear_state(0, N_NORMAL_MEMORY);
+       node_clear_state(0, N_MEMORY);
+       if (N_MEMORY != N_NORMAL_MEMORY)
+               node_clear_state(0, N_NORMAL_MEMORY);
   
         zone_sizes_init();
   }
@@@ -609,11 -664,13 +611,11 @@@ int arch_add_memory(int nid, u64 start
   {
         struct pglist_data *pgdat = NODE_DATA(nid);
         struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
- -      unsigned long last_mapped_pfn, start_pfn = start >> PAGE_SHIFT;
+ +      unsigned long start_pfn = start >> PAGE_SHIFT;
         unsigned long nr_pages = size >> PAGE_SHIFT;
         int ret;
   
- -      last_mapped_pfn = init_memory_mapping(start, start + size);
- -      if (last_mapped_pfn > max_pfn_mapped)
- -              max_pfn_mapped = last_mapped_pfn;
+ +      init_memory_mapping(start, start + size);
   
         ret = __add_pages(nid, zone, start_pfn, nr_pages);
         WARN_ON_ONCE(ret);
@@@ -629,16 -686,6 +631,16 @@@ EXPORT_SYMBOL_GPL(arch_add_memory)
   
   static struct kcore_list kcore_vsyscall;
   
+ +static void __init register_page_bootmem_info(void)
+ +{
+ +#ifdef CONFIG_NUMA
+ +      int i;
+ +
+ +      for_each_online_node(i)
+ +              register_page_bootmem_info_node(NODE_DATA(i));
+ +#endif
+ +}
+ +
   void __init mem_init(void)
   {
         long codesize, reservedpages, datasize, initsize;
@@@ -651,8 -698,11 +653,8 @@@
         reservedpages = 0;
   
         /* this will put all low memory onto the freelists */
- -#ifdef CONFIG_NUMA
- -      totalram_pages = numa_free_all_bootmem();
- -#else
+ +      register_page_bootmem_info();
         totalram_pages = free_all_bootmem();
- -#endif
   
         absent_pages = absent_pages_in_range(0, max_pfn);
         reservedpages = max_pfn - totalram_pages - absent_pages;
diff --combined arch/x86/xen/mmu.c

index bbb883f58bc408c566fd93d989f9d0d9c38df9c9,01de35c772210120075300504189c22bd00c5899..f5e86eee4e0ec9c7b80c94433f4ace0be0fe7403
--- 1/arch/x86/xen/mmu.c
--- 2/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@@ -1178,6 -1178,20 +1178,6 @@@ static void xen_exit_mmap(struct mm_str
   
   static void xen_post_allocator_init(void);
   
- -static __init void xen_mapping_pagetable_reserve(u64 start, u64 end)
- -{
- -      /* reserve the range used */
- -      native_pagetable_reserve(start, end);
- -
- -      /* set as RW the rest */
- -      printk(KERN_DEBUG "xen: setting RW the range %llx - %llx\n", end,
- -                      PFN_PHYS(pgt_buf_top));
- -      while (end < PFN_PHYS(pgt_buf_top)) {
- -              make_lowmem_page_readwrite(__va(end));
- -              end += PAGE_SIZE;
- -      }
- -}
- -
   #ifdef CONFIG_X86_64
   static void __init xen_cleanhighmap(unsigned long vaddr,
                                     unsigned long vaddr_end)
@@@ -1489,6 -1503,19 +1489,6 @@@ static pte_t __init mask_rw_pte(pte_t *
   #else /* CONFIG_X86_64 */
   static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
   {
- -      unsigned long pfn = pte_pfn(pte);
- -
- -      /*
- -       * If the new pfn is within the range of the newly allocated
- -       * kernel pagetable, and it isn't being mapped into an
- -       * early_ioremap fixmap slot as a freshly allocated page, make sure
- -       * it is RO.
- -       */
- -      if (((!is_early_ioremap_ptep(ptep) &&
- -                      pfn >= pgt_buf_start && pfn < pgt_buf_top)) ||
- -                      (is_early_ioremap_ptep(ptep) && pfn != (pgt_buf_end - 1)))
- -              pte = pte_wrprotect(pte);
- -
         return pte;
   }
   #endif /* CONFIG_X86_64 */
@@@ -2170,6 -2197,7 +2170,6 @@@ static const struct pv_mmu_ops xen_mmu_
   
   void __init xen_init_mmu_ops(void)
   {
- -      x86_init.mapping.pagetable_reserve = xen_mapping_pagetable_reserve;
         x86_init.paging.pagetable_init = xen_pagetable_init;
         pv_mmu_ops = xen_mmu_ops;
   
@@@ -2469,8 -2497,10 +2469,10 @@@ static int remap_area_mfn_pte_fn(pte_t 
   
   int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
                                unsigned long addr,
-                              unsigned long mfn, int nr,
-                              pgprot_t prot, unsigned domid)
+                              xen_pfn_t mfn, int nr,
+                              pgprot_t prot, unsigned domid,
+                              struct page **pages)
+ 
   {
         struct remap_data rmd;
         struct mmu_update mmu_update[REMAP_BATCH_SIZE];
@@@ -2514,3 -2544,14 +2516,14 @@@ out
         return err;
   }
   EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range);
+ 
+ /* Returns: 0 success */
+ int xen_unmap_domain_mfn_range(struct vm_area_struct *vma,
+                              int numpgs, struct page **pages)
+ {
+       if (!pages || !xen_feature(XENFEAT_auto_translated_physmap))
+               return 0;
+ 
+       return -EINVAL;
+ }
+ EXPORT_SYMBOL_GPL(xen_unmap_domain_mfn_range);
diff --combined include/linux/mm.h

index 64d5271a3d3651e6a749f597f561e7625d3f232f,66e2f7c61e5c9d3a2924389e28ffa9d32c8728bf..9d9dcc35d6a1b00208cd3842a4791c192a0c4118
--- 1/include/linux/mm.h
--- 2/include/linux/mm.h
+++ b/include/linux/mm.h
@@@ -455,7 -455,6 +455,6 @@@ void put_pages_list(struct list_head *p
   
   void split_page(struct page *page, unsigned int order);
   int split_free_page(struct page *page);
- int capture_free_page(struct page *page, int alloc_order, int migratetype);
   
   /*
    * Compound pages have a destructor function.  Provide a
@@@ -693,6 -692,36 +692,36 @@@ static inline int page_to_nid(const str
   }
   #endif
   
+ #ifdef CONFIG_NUMA_BALANCING
+ static inline int page_xchg_last_nid(struct page *page, int nid)
+ {
+       return xchg(&page->_last_nid, nid);
+ }
+ 
+ static inline int page_last_nid(struct page *page)
+ {
+       return page->_last_nid;
+ }
+ static inline void reset_page_last_nid(struct page *page)
+ {
+       page->_last_nid = -1;
+ }
+ #else
+ static inline int page_xchg_last_nid(struct page *page, int nid)
+ {
+       return page_to_nid(page);
+ }
+ 
+ static inline int page_last_nid(struct page *page)
+ {
+       return page_to_nid(page);
+ }
+ 
+ static inline void reset_page_last_nid(struct page *page)
+ {
+ }
+ #endif
+ 
   static inline struct zone *page_zone(const struct page *page)
   {
         return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)];
@@@ -977,7 -1006,6 +1006,6 @@@ static inline void unmap_shared_mapping
   
   extern void truncate_pagecache(struct inode *inode, loff_t old, loff_t new);
   extern void truncate_setsize(struct inode *inode, loff_t newsize);
- extern int vmtruncate(struct inode *inode, loff_t offset);
   void truncate_pagecache_range(struct inode *inode, loff_t offset, loff_t end);
   int truncate_inode_page(struct address_space *mapping, struct page *page);
   int generic_error_remove_page(struct address_space *mapping, struct page *page);
@@@ -1078,6 -1106,9 +1106,9 @@@ extern unsigned long move_page_tables(s
   extern unsigned long do_mremap(unsigned long addr,
                                unsigned long old_len, unsigned long new_len,
                                unsigned long flags, unsigned long new_addr);
+ extern unsigned long change_protection(struct vm_area_struct *vma, unsigned long start,
+                             unsigned long end, pgprot_t newprot,
+                             int dirty_accountable, int prot_numa);
   extern int mprotect_fixup(struct vm_area_struct *vma,
                           struct vm_area_struct **pprev, unsigned long start,
                           unsigned long end, unsigned long newflags);
@@@ -1355,6 -1386,7 +1386,6 @@@ extern void __init mmap_init(void)
   extern void show_mem(unsigned int flags);
   extern void si_meminfo(struct sysinfo * val);
   extern void si_meminfo_node(struct sysinfo *val, int nid);
- -extern int after_bootmem;
   
   extern __printf(3, 4)
   void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...);
@@@ -1455,6 -1487,37 +1486,37 @@@ extern unsigned long vm_mmap(struct fil
           unsigned long, unsigned long,
           unsigned long, unsigned long);
   
+ struct vm_unmapped_area_info {
+ #define VM_UNMAPPED_AREA_TOPDOWN 1
+       unsigned long flags;
+       unsigned long length;
+       unsigned long low_limit;
+       unsigned long high_limit;
+       unsigned long align_mask;
+       unsigned long align_offset;
+ };
+ 
+ extern unsigned long unmapped_area(struct vm_unmapped_area_info *info);
+ extern unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info);
+ 
+ /*
+  * Search for an unmapped address range.
+  *
+  * We are looking for a range that:
+  * - does not intersect with any VMA;
+  * - is contained within the [low_limit, high_limit) interval;
+  * - is at least the desired size.
+  * - satisfies (begin_addr & align_mask) == (align_offset & align_mask)
+  */
+ static inline unsigned long
+ vm_unmapped_area(struct vm_unmapped_area_info *info)
+ {
+       if (!(info->flags & VM_UNMAPPED_AREA_TOPDOWN))
+               return unmapped_area(info);
+       else
+               return unmapped_area_topdown(info);
+ }
+ 
   /* truncate.c */
   extern void truncate_inode_pages(struct address_space *, loff_t);
   extern void truncate_inode_pages_range(struct address_space *,
@@@ -1547,6 -1610,11 +1609,11 @@@ static inline pgprot_t vm_get_page_prot
   }
   #endif
   
+ #ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE
+ unsigned long change_prot_numa(struct vm_area_struct *vma,
+                       unsigned long start, unsigned long end);
+ #endif
+ 
   struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr);
   int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
                         unsigned long pfn, unsigned long size, pgprot_t);
@@@ -1568,6 -1636,7 +1635,7 @@@ struct page *follow_page(struct vm_area
   #define FOLL_MLOCK    0x40    /* mark page as mlocked */
   #define FOLL_SPLIT    0x80    /* don't return transhuge pages, split them */
   #define FOLL_HWPOISON 0x100   /* check page is hwpoisoned */
+ #define FOLL_NUMA     0x200   /* force NUMA hinting page fault */
   
   typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
                         void *data);
diff --combined mm/nobootmem.c

index ecc2f13d557d3b28f5bb82b3d5dca2e621cfcf9c,b8294fc03df869153378f47f41f0ecd595c10887..03d152a76acf5e18396b74e62a3372e55b469750
--- 1/mm/nobootmem.c
--- 2/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@@ -137,6 -137,37 +137,22 @@@ unsigned long __init free_low_memory_co
         return count;
   }
   
- -/**
- - * free_all_bootmem_node - release a node's free pages to the buddy allocator
- - * @pgdat: node to be released
- - *
- - * Returns the number of pages actually released.
- - */
- -unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
- -{
- -      register_page_bootmem_info_node(pgdat);
- -      reset_node_lowmem_managed_pages(pgdat);
- -
- -      /* free_low_memory_core_early(MAX_NUMNODES) will be called later */
- -      return 0;
- -}
- -
+ static void reset_node_lowmem_managed_pages(pg_data_t *pgdat)
+ {
+       struct zone *z;
+ 
+       /*
+        * In free_area_init_core(), highmem zone's managed_pages is set to
+        * present_pages, and bootmem allocator doesn't allocate from highmem
+        * zones. So there's no need to recalculate managed_pages because all
+        * highmem pages will be managed by the buddy system. Here highmem
+        * zone also includes highmem movable zone.
+        */
+       for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
+               if (!is_highmem(z))
+                       z->managed_pages = 0;
+ }
+ 
   /**
    * free_all_bootmem - release free pages to the buddy allocator
    *
@@@ -144,6 -175,11 +160,11 @@@
    */
   unsigned long __init free_all_bootmem(void)
   {
+       struct pglist_data *pgdat;
+ 
+       for_each_online_pgdat(pgdat)
+               reset_node_lowmem_managed_pages(pgdat);
+ 
         /*
          * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id
          *  because in some case like Node0 doesn't have RAM installed
author	H. Peter Anvin <hpa@linux.intel.com>
	Tue, 29 Jan 2013 22:59:09 +0000 (14:59 -0800)
committer	H. Peter Anvin <hpa@linux.intel.com>
	Tue, 29 Jan 2013 23:10:15 +0000 (15:10 -0800)
		1	2
arch/sparc/mm/init_64.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/include/asm/pgtable.h	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/include/asm/pgtable_types.h	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kernel/acpi/boot.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kernel/cpu/amd.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kernel/cpu/intel.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kernel/setup.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/mm/init_32.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/mm/init_64.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/xen/mmu.c	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/mm.h	patch \|	diff1 \|	diff2 \|	blob \| history
mm/nobootmem.c	patch \|	diff1 \|	diff2 \|	blob \| history