Merge remote-tracking branch 'origin/x86/boot' into x86/mm2
author H. Peter Anvin <hpa@linux.intel.com>
Tue, 29 Jan 2013 22:59:09 +0000 (14:59 -0800)
committer H. Peter Anvin <hpa@linux.intel.com>
Tue, 29 Jan 2013 23:10:15 +0000 (15:10 -0800)
Upcoming patches to x86/mm2 require the changes and the more advanced baseline
in x86/boot.

Resolved Conflicts:
arch/x86/kernel/setup.c
mm/nobootmem.c

Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
25 files changed:
arch/sparc/mm/init_64.c
arch/x86/include/asm/init.h
arch/x86/include/asm/numa.h
arch/x86/include/asm/numa_64.h [deleted file]
arch/x86/include/asm/page_types.h
arch/x86/include/asm/pgtable.h
arch/x86/include/asm/pgtable_types.h
arch/x86/include/asm/x86_init.h
arch/x86/kernel/acpi/boot.c
arch/x86/kernel/amd_gart_64.c
arch/x86/kernel/cpu/amd.c
arch/x86/kernel/cpu/intel.c
arch/x86/kernel/e820.c
arch/x86/kernel/setup.c
arch/x86/kernel/x86_init.c
arch/x86/mm/init.c
arch/x86/mm/init_32.c
arch/x86/mm/init_64.c
arch/x86/mm/mm_internal.h [new file with mode: 0644]
arch/x86/mm/numa_64.c
arch/x86/mm/pageattr.c
arch/x86/platform/efi/efi.c
arch/x86/xen/mmu.c
include/linux/mm.h
mm/nobootmem.c

index c3b72423c846465373e6c06ab961ac99efce4ea6..fc5a7c4bd9e8dab06e452589df3ab8b950d0ca66 100644 (file)
@@ -2021,6 +2021,16 @@ static void __init patch_tlb_miss_handler_bitmap(void)
        flushi(&valid_addr_bitmap_insn[0]);
 }
 
+static void __init register_page_bootmem_info(void)
+{
+#ifdef CONFIG_NEED_MULTIPLE_NODES
+       int i;
+
+       for_each_online_node(i)
+               if (NODE_DATA(i)->node_spanned_pages)
+                       register_page_bootmem_info_node(NODE_DATA(i));
+#endif
+}
 void __init mem_init(void)
 {
        unsigned long codepages, datapages, initpages;
@@ -2038,20 +2048,8 @@ void __init mem_init(void)
 
        high_memory = __va(last_valid_pfn << PAGE_SHIFT);
 
-#ifdef CONFIG_NEED_MULTIPLE_NODES
-       {
-               int i;
-               for_each_online_node(i) {
-                       if (NODE_DATA(i)->node_spanned_pages != 0) {
-                               totalram_pages +=
-                                       free_all_bootmem_node(NODE_DATA(i));
-                       }
-               }
-               totalram_pages += free_low_memory_core_early(MAX_NUMNODES);
-       }
-#else
+       register_page_bootmem_info();
        totalram_pages = free_all_bootmem();
-#endif
 
        /* We subtract one to account for the mem_map_zero page
         * allocated below.
index adcc0ae73d0914b4fe7b816c4512d296b290ba70..bac770b7cbc4debe98a6d6c3a1280fae58a6d5f4 100644 (file)
@@ -1,20 +1,5 @@
-#ifndef _ASM_X86_INIT_32_H
-#define _ASM_X86_INIT_32_H
+#ifndef _ASM_X86_INIT_H
+#define _ASM_X86_INIT_H
 
-#ifdef CONFIG_X86_32
-extern void __init early_ioremap_page_table_range_init(void);
-#endif
 
-extern void __init zone_sizes_init(void);
-
-extern unsigned long __init
-kernel_physical_mapping_init(unsigned long start,
-                            unsigned long end,
-                            unsigned long page_size_mask);
-
-
-extern unsigned long __initdata pgt_buf_start;
-extern unsigned long __meminitdata pgt_buf_end;
-extern unsigned long __meminitdata pgt_buf_top;
-
-#endif /* _ASM_X86_INIT_32_H */
+#endif /* _ASM_X86_INIT_H */
index 49119fcea2dc9771505b04928dbbb5800e030eaf..52560a2038e103fd359a4bcbc4e188052de6f2cf 100644 (file)
@@ -54,8 +54,6 @@ static inline int numa_cpu_node(int cpu)
 
 #ifdef CONFIG_X86_32
 # include <asm/numa_32.h>
-#else
-# include <asm/numa_64.h>
 #endif
 
 #ifdef CONFIG_NUMA
diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h
deleted file mode 100644 (file)
index 0c05f7a..0000000
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef _ASM_X86_NUMA_64_H
-#define _ASM_X86_NUMA_64_H
-
-extern unsigned long numa_free_all_bootmem(void);
-
-#endif /* _ASM_X86_NUMA_64_H */
index e21fdd10479f88e339e1e7351da837c563361635..54c97879195e7c5137d2a45fbda51c40a1c41be2 100644 (file)
@@ -51,6 +51,8 @@ static inline phys_addr_t get_max_mapped(void)
        return (phys_addr_t)max_pfn_mapped << PAGE_SHIFT;
 }
 
+bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long end_pfn);
+
 extern unsigned long init_memory_mapping(unsigned long start,
                                         unsigned long end);
 
index 5199db2923d31ff88b94c54397daae2b279a7bc7..3c7c6985045d1c84109f1bb59432cee143793f24 100644 (file)
@@ -615,6 +615,8 @@ static inline int pgd_none(pgd_t pgd)
 #ifndef __ASSEMBLY__
 
 extern int direct_gbpages;
+void init_mem_mapping(void);
+void early_alloc_pgt_buf(void);
 
 /* local pte updates need not use xchg for locking */
 static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep)
index 3c32db8c539d7d037ac93afc679da31e1808c402..696fa7eafb1d8873bceea72ba4dde5840b974b09 100644 (file)
@@ -321,7 +321,6 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
 /* Install a pte for a particular vaddr in kernel space. */
 void set_pte_vaddr(unsigned long vaddr, pte_t pte);
 
-extern void native_pagetable_reserve(u64 start, u64 end);
 #ifdef CONFIG_X86_32
 extern void native_pagetable_init(void);
 #else
index 57693498519c4a69962417c8889b376182b49e2f..3b2ce8fc995ac73fa151d7e6abb3de79cd07f5b2 100644 (file)
@@ -68,17 +68,6 @@ struct x86_init_oem {
        void (*banner)(void);
 };
 
-/**
- * struct x86_init_mapping - platform specific initial kernel pagetable setup
- * @pagetable_reserve: reserve a range of addresses for kernel pagetable usage
- *
- * For more details on the purpose of this hook, look in
- * init_memory_mapping and the commit that added it.
- */
-struct x86_init_mapping {
-       void (*pagetable_reserve)(u64 start, u64 end);
-};
-
 /**
  * struct x86_init_paging - platform specific paging functions
  * @pagetable_init:    platform specific paging initialization call to setup
@@ -136,7 +125,6 @@ struct x86_init_ops {
        struct x86_init_mpparse         mpparse;
        struct x86_init_irqs            irqs;
        struct x86_init_oem             oem;
-       struct x86_init_mapping         mapping;
        struct x86_init_paging          paging;
        struct x86_init_timers          timers;
        struct x86_init_iommu           iommu;
index bacf4b0d91f4e0e01c4b7bc7cbcfebe6e1e3a680..cfc755dc1607b0c8d51bef3b2b12a9ffa94b2c25 100644 (file)
@@ -51,7 +51,6 @@ EXPORT_SYMBOL(acpi_disabled);
 
 #ifdef CONFIG_X86_64
 # include <asm/proto.h>
-# include <asm/numa_64.h>
 #endif                         /* X86 */
 
 #define BAD_MADT_ENTRY(entry, end) (                                       \
index e66311200cbd8ae78274e7f3525d9098cec43ffe..b574b295a2f9922c03673f8784cf72c0a743dbf5 100644 (file)
@@ -768,10 +768,9 @@ int __init gart_iommu_init(void)
        aper_base       = info.aper_base;
        end_pfn         = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT);
 
-       if (end_pfn > max_low_pfn_mapped) {
-               start_pfn = (aper_base>>PAGE_SHIFT);
+       start_pfn = PFN_DOWN(aper_base);
+       if (!pfn_range_is_mapped(start_pfn, end_pfn))
                init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
-       }
 
        pr_info("PCI-DMA: using GART IOMMU.\n");
        iommu_size = check_iommu_size(info.aper_base, aper_size);
index 15239fffd6fee747913a1f0e493c512885239379..eafb084e80f87e5904fbdd3fff36aecec19d6326 100644 (file)
@@ -12,7 +12,6 @@
 #include <asm/pci-direct.h>
 
 #ifdef CONFIG_X86_64
-# include <asm/numa_64.h>
 # include <asm/mmconfig.h>
 # include <asm/cacheflush.h>
 #endif
@@ -685,12 +684,10 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
                 * benefit in doing so.
                 */
                if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) {
+                       unsigned long pfn = tseg >> PAGE_SHIFT;
+
                        printk(KERN_DEBUG "tseg: %010llx\n", tseg);
-                       if ((tseg>>PMD_SHIFT) <
-                               (max_low_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) ||
-                               ((tseg>>PMD_SHIFT) <
-                               (max_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) &&
-                               (tseg>>PMD_SHIFT) >= (1ULL<<(32 - PMD_SHIFT))))
+                       if (pfn_range_is_mapped(pfn, pfn + 1))
                                set_memory_4k((unsigned long)__va(tseg), 1);
                }
        }
index fcaabd0432c5dda0fa5e3c8f8b37473ee177c1d8..a24c462888f0722915ff40c2ba1a6ccad1e58d66 100644 (file)
@@ -17,7 +17,6 @@
 
 #ifdef CONFIG_X86_64
 #include <linux/topology.h>
-#include <asm/numa_64.h>
 #endif
 
 #include "cpu.h"
index df06ade26bef8485af1a66d797370385d0012e52..d32abeabbda556ea0c9387a331f15a129bf4a0de 100644 (file)
@@ -835,7 +835,7 @@ static int __init parse_memopt(char *p)
 }
 early_param("mem", parse_memopt);
 
-static int __init parse_memmap_opt(char *p)
+static int __init parse_memmap_one(char *p)
 {
        char *oldp;
        u64 start_at, mem_size;
@@ -877,6 +877,20 @@ static int __init parse_memmap_opt(char *p)
 
        return *p == '\0' ? 0 : -EINVAL;
 }
+static int __init parse_memmap_opt(char *str)
+{
+       while (str) {
+               char *k = strchr(str, ',');
+
+               if (k)
+                       *k++ = 0;
+
+               parse_memmap_one(str);
+               str = k;
+       }
+
+       return 0;
+}
 early_param("memmap", parse_memmap_opt);
 
 void __init finish_e820_parsing(void)
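
For reference on the e820.c change above: a single memmap= argument may now carry
several comma-separated region specifications, each handed to the single-region
parser. A minimal userspace sketch of that splitting loop, with parse_one()
standing in for the kernel's parse_memmap_one():

#include <stdio.h>
#include <string.h>

static void parse_one(char *p)
{
	/* the kernel's parse_memmap_one() would decode size@addr, size$addr, ... */
	printf("region spec: %s\n", p);
}

static void parse_memmap_list(char *str)
{
	while (str) {
		char *k = strchr(str, ',');

		if (k)
			*k++ = '\0';

		parse_one(str);
		str = k;
	}
}

int main(void)
{
	char arg[] = "64M@0x1000000,0x200000$0xf0000000";

	parse_memmap_list(arg);
	return 0;
}
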
index 00f6c1472b850472e5f9759dd5ad9613f6c026be..268193746cd86efb66b1845021d1688388502102 100644 (file)
 #include <asm/topology.h>
 #include <asm/apicdef.h>
 #include <asm/amd_nb.h>
-#ifdef CONFIG_X86_64
-#include <asm/numa_64.h>
-#endif
 #include <asm/mce.h>
 #include <asm/alternative.h>
 #include <asm/prom.h>
 
 /*
- * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
- * The direct mapping extends to max_pfn_mapped, so that we can directly access
- * apertures, ACPI and other tables without having to play with fixmaps.
+ * max_low_pfn_mapped: highest direct mapped pfn under 4GB
+ * max_pfn_mapped:     highest direct mapped pfn over 4GB
+ *
+ * The direct mapping only covers E820_RAM regions, so the ranges and gaps are
+ * represented by pfn_mapped
  */
 unsigned long max_low_pfn_mapped;
 unsigned long max_pfn_mapped;
@@ -276,18 +275,7 @@ void * __init extend_brk(size_t size, size_t align)
        return ret;
 }
 
-#ifdef CONFIG_X86_64
-static void __init init_gbpages(void)
-{
-       if (direct_gbpages && cpu_has_gbpages)
-               printk(KERN_INFO "Using GB pages for direct mapping\n");
-       else
-               direct_gbpages = 0;
-}
-#else
-static inline void init_gbpages(void)
-{
-}
+#ifdef CONFIG_X86_32
 static void __init cleanup_highmap(void)
 {
 }
@@ -313,20 +301,19 @@ static void __init relocate_initrd(void)
        u64 ramdisk_image = boot_params.hdr.ramdisk_image;
        u64 ramdisk_size  = boot_params.hdr.ramdisk_size;
        u64 area_size     = PAGE_ALIGN(ramdisk_size);
-       u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT;
        u64 ramdisk_here;
        unsigned long slop, clen, mapaddr;
        char *p, *q;
 
-       /* We need to move the initrd down into lowmem */
-       ramdisk_here = memblock_find_in_range(0, end_of_lowmem, area_size,
-                                        PAGE_SIZE);
+       /* We need to move the initrd down into directly mapped mem */
+       ramdisk_here = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
+                                                area_size, PAGE_SIZE);
 
        if (!ramdisk_here)
                panic("Cannot find place for new RAMDISK of size %lld\n",
                         ramdisk_size);
 
-       /* Note: this includes all the lowmem currently occupied by
+       /* Note: this includes all the mem currently occupied by
           the initrd, we rely on that fact to keep the data intact. */
        memblock_reserve(ramdisk_here, area_size);
        initrd_start = ramdisk_here + PAGE_OFFSET;
@@ -336,17 +323,7 @@ static void __init relocate_initrd(void)
 
        q = (char *)initrd_start;
 
-       /* Copy any lowmem portion of the initrd */
-       if (ramdisk_image < end_of_lowmem) {
-               clen = end_of_lowmem - ramdisk_image;
-               p = (char *)__va(ramdisk_image);
-               memcpy(q, p, clen);
-               q += clen;
-               ramdisk_image += clen;
-               ramdisk_size  -= clen;
-       }
-
-       /* Copy the highmem portion of the initrd */
+       /* Copy the initrd */
        while (ramdisk_size) {
                slop = ramdisk_image & ~PAGE_MASK;
                clen = ramdisk_size;
@@ -360,7 +337,7 @@ static void __init relocate_initrd(void)
                ramdisk_image += clen;
                ramdisk_size  -= clen;
        }
-       /* high pages is not converted by early_res_to_bootmem */
+
        ramdisk_image = boot_params.hdr.ramdisk_image;
        ramdisk_size  = boot_params.hdr.ramdisk_size;
        printk(KERN_INFO "Move RAMDISK from [mem %#010llx-%#010llx] to"
@@ -369,13 +346,27 @@ static void __init relocate_initrd(void)
                ramdisk_here, ramdisk_here + ramdisk_size - 1);
 }
 
+static u64 __init get_mem_size(unsigned long limit_pfn)
+{
+       int i;
+       u64 mapped_pages = 0;
+       unsigned long start_pfn, end_pfn;
+
+       for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) {
+               start_pfn = min_t(unsigned long, start_pfn, limit_pfn);
+               end_pfn = min_t(unsigned long, end_pfn, limit_pfn);
+               mapped_pages += end_pfn - start_pfn;
+       }
+
+       return mapped_pages << PAGE_SHIFT;
+}
 static void __init reserve_initrd(void)
 {
        /* Assume only end is not page aligned */
        u64 ramdisk_image = boot_params.hdr.ramdisk_image;
        u64 ramdisk_size  = boot_params.hdr.ramdisk_size;
        u64 ramdisk_end   = PAGE_ALIGN(ramdisk_image + ramdisk_size);
-       u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT;
+       u64 mapped_size;
 
        if (!boot_params.hdr.type_of_loader ||
            !ramdisk_image || !ramdisk_size)
@@ -383,18 +374,18 @@ static void __init reserve_initrd(void)
 
        initrd_start = 0;
 
-       if (ramdisk_size >= (end_of_lowmem>>1)) {
+       mapped_size = get_mem_size(max_pfn_mapped);
+       if (ramdisk_size >= (mapped_size>>1))
                panic("initrd too large to handle, "
                       "disabling initrd (%lld needed, %lld available)\n",
-                      ramdisk_size, end_of_lowmem>>1);
-       }
+                      ramdisk_size, mapped_size>>1);
 
        printk(KERN_INFO "RAMDISK: [mem %#010llx-%#010llx]\n", ramdisk_image,
                        ramdisk_end - 1);
 
-
-       if (ramdisk_end <= end_of_lowmem) {
-               /* All in lowmem, easy case */
+       if (pfn_range_is_mapped(PFN_DOWN(ramdisk_image),
+                               PFN_DOWN(ramdisk_end))) {
+               /* All are mapped, easy case */
                /*
                 * don't need to reserve again, already reserved early
                 * in i386_start_kernel
@@ -906,6 +897,20 @@ void __init setup_arch(char **cmdline_p)
        insert_resource(&iomem_resource, &data_resource);
        insert_resource(&iomem_resource, &bss_resource);
 
+       /*
+        * Complain if .text .data and .bss are not marked as E820_RAM and
+        * attempt to fix it by adding the range. We may have a confused BIOS,
+        * or the user may have incorrectly supplied it via memmap=exactmap. If
+        * we really are running on top non-RAM, we will crash later anyways.
+        */
+       if (!e820_all_mapped(code_resource.start, __pa(__brk_limit), E820_RAM)) {
+               pr_warn(".text .data .bss are not marked as E820_RAM!\n");
+
+               e820_add_region(code_resource.start,
+                               __pa(__brk_limit) - code_resource.start + 1,
+                               E820_RAM);
+       }
+
        trim_bios_range();
 #ifdef CONFIG_X86_32
        if (ppro_with_ram_bug()) {
@@ -955,6 +960,8 @@ void __init setup_arch(char **cmdline_p)
 
        reserve_ibft_region();
 
+       early_alloc_pgt_buf();
+
        /*
         * Need to conclude brk, before memblock_x86_fill()
         *  it could use memblock_find_in_range, could overlap with
@@ -964,7 +971,7 @@ void __init setup_arch(char **cmdline_p)
 
        cleanup_highmap();
 
-       memblock.current_limit = get_max_mapped();
+       memblock.current_limit = ISA_END_ADDRESS;
        memblock_x86_fill();
 
        /*
@@ -988,34 +995,8 @@ void __init setup_arch(char **cmdline_p)
 
        trim_platform_memory_ranges();
 
-       init_gbpages();
+       init_mem_mapping();
 
-       /* max_pfn_mapped is updated here */
-       max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT);
-       max_pfn_mapped = max_low_pfn_mapped;
-
-#ifdef CONFIG_X86_64
-       if (max_pfn > max_low_pfn) {
-               int i;
-               unsigned long start, end;
-               unsigned long start_pfn, end_pfn;
-
-               for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn,
-                                                        NULL) {
-
-                       end = PFN_PHYS(end_pfn);
-                       if (end <= (1UL<<32))
-                               continue;
-
-                       start = PFN_PHYS(start_pfn);
-                       max_pfn_mapped = init_memory_mapping(
-                                               max((1UL<<32), start), end);
-               }
-
-               /* can we preseve max_low_pfn ?*/
-               max_low_pfn = max_pfn;
-       }
-#endif
        memblock.current_limit = get_max_mapped();
        dma_contiguous_reserve(0);
 
index 7a3d075a814a9c83a603df93b5961f247049e624..50cf83ecd32e29e75f7742b23c703ac9da0e256d 100644 (file)
@@ -62,10 +62,6 @@ struct x86_init_ops x86_init __initdata = {
                .banner                 = default_banner,
        },
 
-       .mapping = {
-               .pagetable_reserve              = native_pagetable_reserve,
-       },
-
        .paging = {
                .pagetable_init         = native_pagetable_init,
        },
index d7aea41563b372437eb227a499259be23d755564..6f85de8a1f281a63b5c2d632b473279bb7c639fe 100644 (file)
 #include <asm/proto.h>
 #include <asm/dma.h>           /* for MAX_DMA_PFN */
 
-unsigned long __initdata pgt_buf_start;
-unsigned long __meminitdata pgt_buf_end;
-unsigned long __meminitdata pgt_buf_top;
+#include "mm_internal.h"
 
-int after_bootmem;
-
-int direct_gbpages
-#ifdef CONFIG_DIRECT_GBPAGES
-                               = 1
-#endif
-;
+static unsigned long __initdata pgt_buf_start;
+static unsigned long __initdata pgt_buf_end;
+static unsigned long __initdata pgt_buf_top;
 
-struct map_range {
-       unsigned long start;
-       unsigned long end;
-       unsigned page_size_mask;
-};
+static unsigned long min_pfn_mapped;
 
 /*
- * First calculate space needed for kernel direct mapping page tables to cover
- * mr[0].start to mr[nr_range - 1].end, while accounting for possible 2M and 1GB
- * pages. Then find enough contiguous space for those page tables.
+ * Pages returned are already directly mapped.
+ *
+ * Changing that is likely to break Xen, see commit:
+ *
+ *    279b706 x86,xen: introduce x86_init.mapping.pagetable_reserve
+ *
+ * for detailed information.
  */
-static void __init find_early_table_space(struct map_range *mr, int nr_range)
+__ref void *alloc_low_pages(unsigned int num)
 {
+       unsigned long pfn;
        int i;
-       unsigned long puds = 0, pmds = 0, ptes = 0, tables;
-       unsigned long start = 0, good_end;
-       phys_addr_t base;
 
-       for (i = 0; i < nr_range; i++) {
-               unsigned long range, extra;
+       if (after_bootmem) {
+               unsigned int order;
 
-               range = mr[i].end - mr[i].start;
-               puds += (range + PUD_SIZE - 1) >> PUD_SHIFT;
+               order = get_order((unsigned long)num << PAGE_SHIFT);
+               return (void *)__get_free_pages(GFP_ATOMIC | __GFP_NOTRACK |
+                                               __GFP_ZERO, order);
+       }
 
-               if (mr[i].page_size_mask & (1 << PG_LEVEL_1G)) {
-                       extra = range - ((range >> PUD_SHIFT) << PUD_SHIFT);
-                       pmds += (extra + PMD_SIZE - 1) >> PMD_SHIFT;
-               } else {
-                       pmds += (range + PMD_SIZE - 1) >> PMD_SHIFT;
-               }
+       if ((pgt_buf_end + num) >= pgt_buf_top) {
+               unsigned long ret;
+               if (min_pfn_mapped >= max_pfn_mapped)
+                       panic("alloc_low_page: ran out of memory");
+               ret = memblock_find_in_range(min_pfn_mapped << PAGE_SHIFT,
+                                       max_pfn_mapped << PAGE_SHIFT,
+                                       PAGE_SIZE * num , PAGE_SIZE);
+               if (!ret)
+                       panic("alloc_low_page: can not alloc memory");
+               memblock_reserve(ret, PAGE_SIZE * num);
+               pfn = ret >> PAGE_SHIFT;
+       } else {
+               pfn = pgt_buf_end;
+               pgt_buf_end += num;
+       }
 
-               if (mr[i].page_size_mask & (1 << PG_LEVEL_2M)) {
-                       extra = range - ((range >> PMD_SHIFT) << PMD_SHIFT);
-#ifdef CONFIG_X86_32
-                       extra += PMD_SIZE;
-#endif
-                       ptes += (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
-               } else {
-                       ptes += (range + PAGE_SIZE - 1) >> PAGE_SHIFT;
-               }
+       for (i = 0; i < num; i++) {
+               void *adr;
+
+               adr = __va((pfn + i) << PAGE_SHIFT);
+               clear_page(adr);
        }
 
-       tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
-       tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);
-       tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE);
+       return __va(pfn << PAGE_SHIFT);
+}
 
-#ifdef CONFIG_X86_32
-       /* for fixmap */
-       tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE);
-#endif
-       good_end = max_pfn_mapped << PAGE_SHIFT;
+/* need 4 4k for initial PMD_SIZE, 4k for 0-ISA_END_ADDRESS */
+#define INIT_PGT_BUF_SIZE      (5 * PAGE_SIZE)
+RESERVE_BRK(early_pgt_alloc, INIT_PGT_BUF_SIZE);
+void  __init early_alloc_pgt_buf(void)
+{
+       unsigned long tables = INIT_PGT_BUF_SIZE;
+       phys_addr_t base;
 
-       base = memblock_find_in_range(start, good_end, tables, PAGE_SIZE);
-       if (!base)
-               panic("Cannot find space for the kernel page tables");
+       base = __pa(extend_brk(tables, PAGE_SIZE));
 
        pgt_buf_start = base >> PAGE_SHIFT;
        pgt_buf_end = pgt_buf_start;
        pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT);
+}
+
+int after_bootmem;
+
+int direct_gbpages
+#ifdef CONFIG_DIRECT_GBPAGES
+                               = 1
+#endif
+;
 
-       printk(KERN_DEBUG "kernel direct mapping tables up to %#lx @ [mem %#010lx-%#010lx]\n",
-               mr[nr_range - 1].end - 1, pgt_buf_start << PAGE_SHIFT,
-               (pgt_buf_top << PAGE_SHIFT) - 1);
+static void __init init_gbpages(void)
+{
+#ifdef CONFIG_X86_64
+       if (direct_gbpages && cpu_has_gbpages)
+               printk(KERN_INFO "Using GB pages for direct mapping\n");
+       else
+               direct_gbpages = 0;
+#endif
 }
 
-void __init native_pagetable_reserve(u64 start, u64 end)
+struct map_range {
+       unsigned long start;
+       unsigned long end;
+       unsigned page_size_mask;
+};
+
+static int page_size_mask;
+
+static void __init probe_page_size_mask(void)
 {
-       memblock_reserve(start, end - start);
+       init_gbpages();
+
+#if !defined(CONFIG_DEBUG_PAGEALLOC) && !defined(CONFIG_KMEMCHECK)
+       /*
+        * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
+        * This will simplify cpa(), which otherwise needs to support splitting
+        * large pages into small in interrupt context, etc.
+        */
+       if (direct_gbpages)
+               page_size_mask |= 1 << PG_LEVEL_1G;
+       if (cpu_has_pse)
+               page_size_mask |= 1 << PG_LEVEL_2M;
+#endif
+
+       /* Enable PSE if available */
+       if (cpu_has_pse)
+               set_in_cr4(X86_CR4_PSE);
+
+       /* Enable PGE if available */
+       if (cpu_has_pge) {
+               set_in_cr4(X86_CR4_PGE);
+               __supported_pte_mask |= _PAGE_GLOBAL;
+       }
 }
 
 #ifdef CONFIG_X86_32
@@ -122,58 +164,51 @@ static int __meminit save_mr(struct map_range *mr, int nr_range,
 }
 
 /*
- * Setup the direct mapping of the physical memory at PAGE_OFFSET.
- * This runs before bootmem is initialized and gets pages directly from
- * the physical memory. To access them they are temporarily mapped.
+ * adjust the page_size_mask for small range to go with
+ *     big page size instead small one if nearby are ram too.
  */
-unsigned long __init_refok init_memory_mapping(unsigned long start,
-                                              unsigned long end)
+static void __init_refok adjust_range_page_size_mask(struct map_range *mr,
+                                                        int nr_range)
 {
-       unsigned long page_size_mask = 0;
-       unsigned long start_pfn, end_pfn;
-       unsigned long ret = 0;
-       unsigned long pos;
-
-       struct map_range mr[NR_RANGE_MR];
-       int nr_range, i;
-       int use_pse, use_gbpages;
+       int i;
 
-       printk(KERN_INFO "init_memory_mapping: [mem %#010lx-%#010lx]\n",
-              start, end - 1);
+       for (i = 0; i < nr_range; i++) {
+               if ((page_size_mask & (1<<PG_LEVEL_2M)) &&
+                   !(mr[i].page_size_mask & (1<<PG_LEVEL_2M))) {
+                       unsigned long start = round_down(mr[i].start, PMD_SIZE);
+                       unsigned long end = round_up(mr[i].end, PMD_SIZE);
 
-#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK)
-       /*
-        * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
-        * This will simplify cpa(), which otherwise needs to support splitting
-        * large pages into small in interrupt context, etc.
-        */
-       use_pse = use_gbpages = 0;
-#else
-       use_pse = cpu_has_pse;
-       use_gbpages = direct_gbpages;
+#ifdef CONFIG_X86_32
+                       if ((end >> PAGE_SHIFT) > max_low_pfn)
+                               continue;
 #endif
 
-       /* Enable PSE if available */
-       if (cpu_has_pse)
-               set_in_cr4(X86_CR4_PSE);
+                       if (memblock_is_region_memory(start, end - start))
+                               mr[i].page_size_mask |= 1<<PG_LEVEL_2M;
+               }
+               if ((page_size_mask & (1<<PG_LEVEL_1G)) &&
+                   !(mr[i].page_size_mask & (1<<PG_LEVEL_1G))) {
+                       unsigned long start = round_down(mr[i].start, PUD_SIZE);
+                       unsigned long end = round_up(mr[i].end, PUD_SIZE);
 
-       /* Enable PGE if available */
-       if (cpu_has_pge) {
-               set_in_cr4(X86_CR4_PGE);
-               __supported_pte_mask |= _PAGE_GLOBAL;
+                       if (memblock_is_region_memory(start, end - start))
+                               mr[i].page_size_mask |= 1<<PG_LEVEL_1G;
+               }
        }
+}
 
-       if (use_gbpages)
-               page_size_mask |= 1 << PG_LEVEL_1G;
-       if (use_pse)
-               page_size_mask |= 1 << PG_LEVEL_2M;
+static int __meminit split_mem_range(struct map_range *mr, int nr_range,
+                                    unsigned long start,
+                                    unsigned long end)
+{
+       unsigned long start_pfn, end_pfn, limit_pfn;
+       unsigned long pfn;
+       int i;
 
-       memset(mr, 0, sizeof(mr));
-       nr_range = 0;
+       limit_pfn = PFN_DOWN(end);
 
        /* head if not big page alignment ? */
-       start_pfn = start >> PAGE_SHIFT;
-       pos = start_pfn << PAGE_SHIFT;
+       pfn = start_pfn = PFN_DOWN(start);
 #ifdef CONFIG_X86_32
        /*
         * Don't use a large page for the first 2/4MB of memory
@@ -181,66 +216,60 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
         * and overlapping MTRRs into large pages can cause
         * slowdowns.
         */
-       if (pos == 0)
-               end_pfn = 1<<(PMD_SHIFT - PAGE_SHIFT);
+       if (pfn == 0)
+               end_pfn = PFN_DOWN(PMD_SIZE);
        else
-               end_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
-                                << (PMD_SHIFT - PAGE_SHIFT);
+               end_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
 #else /* CONFIG_X86_64 */
-       end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT)
-                       << (PMD_SHIFT - PAGE_SHIFT);
+       end_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
 #endif
-       if (end_pfn > (end >> PAGE_SHIFT))
-               end_pfn = end >> PAGE_SHIFT;
+       if (end_pfn > limit_pfn)
+               end_pfn = limit_pfn;
        if (start_pfn < end_pfn) {
                nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
-               pos = end_pfn << PAGE_SHIFT;
+               pfn = end_pfn;
        }
 
        /* big page (2M) range */
-       start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
-                        << (PMD_SHIFT - PAGE_SHIFT);
+       start_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
 #ifdef CONFIG_X86_32
-       end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
+       end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE));
 #else /* CONFIG_X86_64 */
-       end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
-                        << (PUD_SHIFT - PAGE_SHIFT);
-       if (end_pfn > ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT)))
-               end_pfn = ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT));
+       end_pfn = round_up(pfn, PFN_DOWN(PUD_SIZE));
+       if (end_pfn > round_down(limit_pfn, PFN_DOWN(PMD_SIZE)))
+               end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE));
 #endif
 
        if (start_pfn < end_pfn) {
                nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
                                page_size_mask & (1<<PG_LEVEL_2M));
-               pos = end_pfn << PAGE_SHIFT;
+               pfn = end_pfn;
        }
 
 #ifdef CONFIG_X86_64
        /* big page (1G) range */
-       start_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
-                        << (PUD_SHIFT - PAGE_SHIFT);
-       end_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
+       start_pfn = round_up(pfn, PFN_DOWN(PUD_SIZE));
+       end_pfn = round_down(limit_pfn, PFN_DOWN(PUD_SIZE));
        if (start_pfn < end_pfn) {
                nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
                                page_size_mask &
                                 ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
-               pos = end_pfn << PAGE_SHIFT;
+               pfn = end_pfn;
        }
 
        /* tail is not big page (1G) alignment */
-       start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
-                        << (PMD_SHIFT - PAGE_SHIFT);
-       end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
+       start_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
+       end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE));
        if (start_pfn < end_pfn) {
                nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
                                page_size_mask & (1<<PG_LEVEL_2M));
-               pos = end_pfn << PAGE_SHIFT;
+               pfn = end_pfn;
        }
 #endif
 
        /* tail is not big page (2M) alignment */
-       start_pfn = pos>>PAGE_SHIFT;
-       end_pfn = end>>PAGE_SHIFT;
+       start_pfn = pfn;
+       end_pfn = limit_pfn;
        nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
 
        /* try to merge same page size and continuous */
@@ -257,59 +286,161 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
                nr_range--;
        }
 
+       if (!after_bootmem)
+               adjust_range_page_size_mask(mr, nr_range);
+
        for (i = 0; i < nr_range; i++)
                printk(KERN_DEBUG " [mem %#010lx-%#010lx] page %s\n",
                                mr[i].start, mr[i].end - 1,
                        (mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
                         (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
 
-       /*
-        * Find space for the kernel direct mapping tables.
-        *
-        * Later we should allocate these tables in the local node of the
-        * memory mapped. Unfortunately this is done currently before the
-        * nodes are discovered.
-        */
-       if (!after_bootmem)
-               find_early_table_space(mr, nr_range);
+       return nr_range;
+}
+
+static struct range pfn_mapped[E820_X_MAX];
+static int nr_pfn_mapped;
+
+static void add_pfn_range_mapped(unsigned long start_pfn, unsigned long end_pfn)
+{
+       nr_pfn_mapped = add_range_with_merge(pfn_mapped, E820_X_MAX,
+                                            nr_pfn_mapped, start_pfn, end_pfn);
+       nr_pfn_mapped = clean_sort_range(pfn_mapped, E820_X_MAX);
+
+       max_pfn_mapped = max(max_pfn_mapped, end_pfn);
+
+       if (start_pfn < (1UL<<(32-PAGE_SHIFT)))
+               max_low_pfn_mapped = max(max_low_pfn_mapped,
+                                        min(end_pfn, 1UL<<(32-PAGE_SHIFT)));
+}
+
+bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long end_pfn)
+{
+       int i;
+
+       for (i = 0; i < nr_pfn_mapped; i++)
+               if ((start_pfn >= pfn_mapped[i].start) &&
+                   (end_pfn <= pfn_mapped[i].end))
+                       return true;
+
+       return false;
+}
+
+/*
+ * Setup the direct mapping of the physical memory at PAGE_OFFSET.
+ * This runs before bootmem is initialized and gets pages directly from
+ * the physical memory. To access them they are temporarily mapped.
+ */
+unsigned long __init_refok init_memory_mapping(unsigned long start,
+                                              unsigned long end)
+{
+       struct map_range mr[NR_RANGE_MR];
+       unsigned long ret = 0;
+       int nr_range, i;
+
+       pr_info("init_memory_mapping: [mem %#010lx-%#010lx]\n",
+              start, end - 1);
+
+       memset(mr, 0, sizeof(mr));
+       nr_range = split_mem_range(mr, 0, start, end);
 
        for (i = 0; i < nr_range; i++)
                ret = kernel_physical_mapping_init(mr[i].start, mr[i].end,
                                                   mr[i].page_size_mask);
 
-#ifdef CONFIG_X86_32
-       early_ioremap_page_table_range_init();
+       add_pfn_range_mapped(start >> PAGE_SHIFT, ret >> PAGE_SHIFT);
 
-       load_cr3(swapper_pg_dir);
-#endif
+       return ret >> PAGE_SHIFT;
+}
 
-       __flush_tlb_all();
+/*
+ * would have hole in the middle or ends, and only ram parts will be mapped.
+ */
+static unsigned long __init init_range_memory_mapping(
+                                          unsigned long r_start,
+                                          unsigned long r_end)
+{
+       unsigned long start_pfn, end_pfn;
+       unsigned long mapped_ram_size = 0;
+       int i;
 
-       /*
-        * Reserve the kernel pagetable pages we used (pgt_buf_start -
-        * pgt_buf_end) and free the other ones (pgt_buf_end - pgt_buf_top)
-        * so that they can be reused for other purposes.
-        *
-        * On native it just means calling memblock_reserve, on Xen it also
-        * means marking RW the pagetable pages that we allocated before
-        * but that haven't been used.
-        *
-        * In fact on xen we mark RO the whole range pgt_buf_start -
-        * pgt_buf_top, because we have to make sure that when
-        * init_memory_mapping reaches the pagetable pages area, it maps
-        * RO all the pagetable pages, including the ones that are beyond
-        * pgt_buf_end at that time.
-        */
-       if (!after_bootmem && pgt_buf_end > pgt_buf_start)
-               x86_init.mapping.pagetable_reserve(PFN_PHYS(pgt_buf_start),
-                               PFN_PHYS(pgt_buf_end));
+       for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) {
+               u64 start = clamp_val(PFN_PHYS(start_pfn), r_start, r_end);
+               u64 end = clamp_val(PFN_PHYS(end_pfn), r_start, r_end);
+               if (start >= end)
+                       continue;
 
-       if (!after_bootmem)
-               early_memtest(start, end);
+               init_memory_mapping(start, end);
+               mapped_ram_size += end - start;
+       }
 
-       return ret >> PAGE_SHIFT;
+       return mapped_ram_size;
 }
 
+/* (PUD_SHIFT-PMD_SHIFT)/2 */
+#define STEP_SIZE_SHIFT 5
+void __init init_mem_mapping(void)
+{
+       unsigned long end, real_end, start, last_start;
+       unsigned long step_size;
+       unsigned long addr;
+       unsigned long mapped_ram_size = 0;
+       unsigned long new_mapped_ram_size;
+
+       probe_page_size_mask();
+
+#ifdef CONFIG_X86_64
+       end = max_pfn << PAGE_SHIFT;
+#else
+       end = max_low_pfn << PAGE_SHIFT;
+#endif
+
+       /* the ISA range is always mapped regardless of memory holes */
+       init_memory_mapping(0, ISA_END_ADDRESS);
+
+       /* xen has big range in reserved near end of ram, skip it at first */
+       addr = memblock_find_in_range(ISA_END_ADDRESS, end, PMD_SIZE,
+                        PAGE_SIZE);
+       real_end = addr + PMD_SIZE;
+
+       /* step_size need to be small so pgt_buf from BRK could cover it */
+       step_size = PMD_SIZE;
+       max_pfn_mapped = 0; /* will get exact value next */
+       min_pfn_mapped = real_end >> PAGE_SHIFT;
+       last_start = start = real_end;
+       while (last_start > ISA_END_ADDRESS) {
+               if (last_start > step_size) {
+                       start = round_down(last_start - 1, step_size);
+                       if (start < ISA_END_ADDRESS)
+                               start = ISA_END_ADDRESS;
+               } else
+                       start = ISA_END_ADDRESS;
+               new_mapped_ram_size = init_range_memory_mapping(start,
+                                                       last_start);
+               last_start = start;
+               min_pfn_mapped = last_start >> PAGE_SHIFT;
+               /* only increase step_size after big range get mapped */
+               if (new_mapped_ram_size > mapped_ram_size)
+                       step_size <<= STEP_SIZE_SHIFT;
+               mapped_ram_size += new_mapped_ram_size;
+       }
+
+       if (real_end < end)
+               init_range_memory_mapping(real_end, end);
+
+#ifdef CONFIG_X86_64
+       if (max_pfn > max_low_pfn) {
+               /* can we preseve max_low_pfn ?*/
+               max_low_pfn = max_pfn;
+       }
+#else
+       early_ioremap_page_table_range_init();
+       load_cr3(swapper_pg_dir);
+       __flush_tlb_all();
+#endif
+
+       early_memtest(0, max_pfn_mapped << PAGE_SHIFT);
+}
 
 /*
  * devmem_is_allowed() checks to see if /dev/mem access to a certain address
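
The init_mem_mapping() added above replaces the old "map everything below
max_low_pfn in one pass" approach: after mapping the ISA range, it walks downward
from the top of RAM in chunks, widening step_size by STEP_SIZE_SHIFT once a chunk
has mapped new RAM, so the page tables for each chunk can be allocated from memory
that an earlier chunk already mapped. A userspace sketch of just the loop bounds
(addresses are illustrative, and the kernel grows step_size only after a chunk
mapped additional RAM; the growth is unconditional here for brevity):

#include <stdio.h>

#define PMD_SIZE	(2UL << 20)
#define ISA_END_ADDRESS	0x100000UL
#define STEP_SIZE_SHIFT	5

static unsigned long round_down_ul(unsigned long x, unsigned long align)
{
	return x & ~(align - 1);	/* align must be a power of two */
}

int main(void)
{
	unsigned long real_end = 0x100000000UL;		/* pretend top of RAM: 4GB */
	unsigned long step_size = PMD_SIZE;
	unsigned long last_start = real_end, start;

	while (last_start > ISA_END_ADDRESS) {
		if (last_start > step_size) {
			start = round_down_ul(last_start - 1, step_size);
			if (start < ISA_END_ADDRESS)
				start = ISA_END_ADDRESS;
		} else
			start = ISA_END_ADDRESS;

		printf("map [%#lx, %#lx)\n", start, last_start);
		last_start = start;
		step_size <<= STEP_SIZE_SHIFT;	/* simplified: kernel grows only after new RAM got mapped */
	}
	return 0;
}
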
index 745d66b843c84241f1d849e865dc2796a00b6f1f..b299724f6e34e9ace068a4c7f39d504a7cc5c7b4 100644 (file)
 #include <asm/page_types.h>
 #include <asm/init.h>
 
+#include "mm_internal.h"
+
 unsigned long highstart_pfn, highend_pfn;
 
 static noinline int do_test_wp_bit(void);
 
 bool __read_mostly __vmalloc_start_set = false;
 
-static __init void *alloc_low_page(void)
-{
-       unsigned long pfn = pgt_buf_end++;
-       void *adr;
-
-       if (pfn >= pgt_buf_top)
-               panic("alloc_low_page: ran out of memory");
-
-       adr = __va(pfn * PAGE_SIZE);
-       clear_page(adr);
-       return adr;
-}
-
 /*
  * Creates a middle page table and puts a pointer to it in the
  * given global directory entry. This only returns the gd entry
@@ -84,10 +73,7 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd)
 
 #ifdef CONFIG_X86_PAE
        if (!(pgd_val(*pgd) & _PAGE_PRESENT)) {
-               if (after_bootmem)
-                       pmd_table = (pmd_t *)alloc_bootmem_pages(PAGE_SIZE);
-               else
-                       pmd_table = (pmd_t *)alloc_low_page();
+               pmd_table = (pmd_t *)alloc_low_page();
                paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
                set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
                pud = pud_offset(pgd, 0);
@@ -109,17 +95,7 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd)
 static pte_t * __init one_page_table_init(pmd_t *pmd)
 {
        if (!(pmd_val(*pmd) & _PAGE_PRESENT)) {
-               pte_t *page_table = NULL;
-
-               if (after_bootmem) {
-#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK)
-                       page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
-#endif
-                       if (!page_table)
-                               page_table =
-                               (pte_t *)alloc_bootmem_pages(PAGE_SIZE);
-               } else
-                       page_table = (pte_t *)alloc_low_page();
+               pte_t *page_table = (pte_t *)alloc_low_page();
 
                paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT);
                set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
@@ -146,8 +122,39 @@ pte_t * __init populate_extra_pte(unsigned long vaddr)
        return one_page_table_init(pmd) + pte_idx;
 }
 
+static unsigned long __init
+page_table_range_init_count(unsigned long start, unsigned long end)
+{
+       unsigned long count = 0;
+#ifdef CONFIG_HIGHMEM
+       int pmd_idx_kmap_begin = fix_to_virt(FIX_KMAP_END) >> PMD_SHIFT;
+       int pmd_idx_kmap_end = fix_to_virt(FIX_KMAP_BEGIN) >> PMD_SHIFT;
+       int pgd_idx, pmd_idx;
+       unsigned long vaddr;
+
+       if (pmd_idx_kmap_begin == pmd_idx_kmap_end)
+               return 0;
+
+       vaddr = start;
+       pgd_idx = pgd_index(vaddr);
+
+       for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd_idx++) {
+               for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end);
+                                                       pmd_idx++) {
+                       if ((vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin &&
+                           (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end)
+                               count++;
+                       vaddr += PMD_SIZE;
+               }
+               pmd_idx = 0;
+       }
+#endif
+       return count;
+}
+
 static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
-                                          unsigned long vaddr, pte_t *lastpte)
+                                          unsigned long vaddr, pte_t *lastpte,
+                                          void **adr)
 {
 #ifdef CONFIG_HIGHMEM
        /*
@@ -161,16 +168,15 @@ static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
 
        if (pmd_idx_kmap_begin != pmd_idx_kmap_end
            && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin
-           && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end
-           && ((__pa(pte) >> PAGE_SHIFT) < pgt_buf_start
-               || (__pa(pte) >> PAGE_SHIFT) >= pgt_buf_end)) {
+           && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end) {
                pte_t *newpte;
                int i;
 
                BUG_ON(after_bootmem);
-               newpte = alloc_low_page();
+               newpte = *adr;
                for (i = 0; i < PTRS_PER_PTE; i++)
                        set_pte(newpte + i, pte[i]);
+               *adr = (void *)(((unsigned long)(*adr)) + PAGE_SIZE);
 
                paravirt_alloc_pte(&init_mm, __pa(newpte) >> PAGE_SHIFT);
                set_pmd(pmd, __pmd(__pa(newpte)|_PAGE_TABLE));
@@ -204,6 +210,11 @@ page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base)
        pgd_t *pgd;
        pmd_t *pmd;
        pte_t *pte = NULL;
+       unsigned long count = page_table_range_init_count(start, end);
+       void *adr = NULL;
+
+       if (count)
+               adr = alloc_low_pages(count);
 
        vaddr = start;
        pgd_idx = pgd_index(vaddr);
@@ -216,7 +227,7 @@ page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base)
                for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end);
                                                        pmd++, pmd_idx++) {
                        pte = page_table_kmap_check(one_page_table_init(pmd),
-                                                   pmd, vaddr, pte);
+                                                   pmd, vaddr, pte, &adr);
 
                        vaddr += PMD_SIZE;
                }
@@ -310,6 +321,7 @@ repeat:
                                        __pgprot(PTE_IDENT_ATTR |
                                                 _PAGE_PSE);
 
+                               pfn &= PMD_MASK >> PAGE_SHIFT;
                                addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE +
                                        PAGE_OFFSET + PAGE_SIZE-1;
 
@@ -455,9 +467,14 @@ void __init native_pagetable_init(void)
 
        /*
         * Remove any mappings which extend past the end of physical
-        * memory from the boot time page table:
+        * memory from the boot time page table.
+        * In virtual address space, we should have at least two pages
+        * from VMALLOC_END to pkmap or fixmap according to VMALLOC_END
+        * definition. And max_low_pfn is set to VMALLOC_END physical
+        * address. If initial memory mapping is doing right job, we
+        * should have pte used near max_low_pfn or one pmd is not present.
         */
-       for (pfn = max_low_pfn + 1; pfn < 1<<(32-PAGE_SHIFT); pfn++) {
+       for (pfn = max_low_pfn; pfn < 1<<(32-PAGE_SHIFT); pfn++) {
                va = PAGE_OFFSET + (pfn<<PAGE_SHIFT);
                pgd = base + pgd_index(va);
                if (!pgd_present(*pgd))
@@ -468,10 +485,19 @@ void __init native_pagetable_init(void)
                if (!pmd_present(*pmd))
                        break;
 
+               /* should not be large page here */
+               if (pmd_large(*pmd)) {
+                       pr_warn("try to clear pte for ram above max_low_pfn: pfn: %lx pmd: %p pmd phys: %lx, but pmd is big page and is not using pte !\n",
+                               pfn, pmd, __pa(pmd));
+                       BUG_ON(1);
+               }
+
                pte = pte_offset_kernel(pmd, va);
                if (!pte_present(*pte))
                        break;
 
+               printk(KERN_DEBUG "clearing pte for ram above max_low_pfn: pfn: %lx pmd: %p pmd phys: %lx pte: %p pte phys: %lx\n",
+                               pfn, pmd, __pa(pmd), pte, __pa(pte));
                pte_clear(NULL, va, pte);
        }
        paravirt_alloc_pmd(&init_mm, __pa(base) >> PAGE_SHIFT);
@@ -550,7 +576,7 @@ early_param("highmem", parse_highmem);
  * artificially via the highmem=x boot parameter then create
  * it:
  */
-void __init lowmem_pfn_init(void)
+static void __init lowmem_pfn_init(void)
 {
        /* max_low_pfn is 0, we already have early_res support */
        max_low_pfn = max_pfn;
@@ -586,7 +612,7 @@ void __init lowmem_pfn_init(void)
  * We have more RAM than fits into lowmem - we try to put it into
  * highmem, also taking the highmem=x boot parameter into account:
  */
-void __init highmem_pfn_init(void)
+static void __init highmem_pfn_init(void)
 {
        max_low_pfn = MAXMEM_PFN;
 
@@ -669,8 +695,6 @@ void __init setup_bootmem_allocator(void)
        printk(KERN_INFO "  mapped low ram: 0 - %08lx\n",
                 max_pfn_mapped<<PAGE_SHIFT);
        printk(KERN_INFO "  low ram: 0 - %08lx\n", max_low_pfn<<PAGE_SHIFT);
-
-       after_bootmem = 1;
 }
 
 /*
@@ -753,6 +777,8 @@ void __init mem_init(void)
                if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
                        reservedpages++;
 
+       after_bootmem = 1;
+
        codesize =  (unsigned long) &_etext - (unsigned long) &_text;
        datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
        initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;
index 2ead3c8a4c8419da92a61fbba35eb695e5205dde..191ab12f5ff37b0dc93f74efa01208f92a0d461b 100644 (file)
@@ -54,6 +54,8 @@
 #include <asm/uv/uv.h>
 #include <asm/setup.h>
 
+#include "mm_internal.h"
+
 static int __init parse_direct_gbpages_off(char *arg)
 {
        direct_gbpages = 0;
@@ -314,69 +316,24 @@ void __init cleanup_highmap(void)
        }
 }
 
-static __ref void *alloc_low_page(unsigned long *phys)
-{
-       unsigned long pfn = pgt_buf_end++;
-       void *adr;
-
-       if (after_bootmem) {
-               adr = (void *)get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK);
-               *phys = __pa(adr);
-
-               return adr;
-       }
-
-       if (pfn >= pgt_buf_top)
-               panic("alloc_low_page: ran out of memory");
-
-       adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE);
-       clear_page(adr);
-       *phys  = pfn * PAGE_SIZE;
-       return adr;
-}
-
-static __ref void *map_low_page(void *virt)
-{
-       void *adr;
-       unsigned long phys, left;
-
-       if (after_bootmem)
-               return virt;
-
-       phys = __pa(virt);
-       left = phys & (PAGE_SIZE - 1);
-       adr = early_memremap(phys & PAGE_MASK, PAGE_SIZE);
-       adr = (void *)(((unsigned long)adr) | left);
-
-       return adr;
-}
-
-static __ref void unmap_low_page(void *adr)
-{
-       if (after_bootmem)
-               return;
-
-       early_iounmap((void *)((unsigned long)adr & PAGE_MASK), PAGE_SIZE);
-}
-
 static unsigned long __meminit
 phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end,
              pgprot_t prot)
 {
-       unsigned pages = 0;
+       unsigned long pages = 0, next;
        unsigned long last_map_addr = end;
        int i;
 
        pte_t *pte = pte_page + pte_index(addr);
 
-       for(i = pte_index(addr); i < PTRS_PER_PTE; i++, addr += PAGE_SIZE, pte++) {
-
+       for (i = pte_index(addr); i < PTRS_PER_PTE; i++, addr = next, pte++) {
+               next = (addr & PAGE_MASK) + PAGE_SIZE;
                if (addr >= end) {
-                       if (!after_bootmem) {
-                               for(; i < PTRS_PER_PTE; i++, pte++)
-                                       set_pte(pte, __pte(0));
-                       }
-                       break;
+                       if (!after_bootmem &&
+                           !e820_any_mapped(addr & PAGE_MASK, next, E820_RAM) &&
+                           !e820_any_mapped(addr & PAGE_MASK, next, E820_RESERVED_KERN))
+                               set_pte(pte, __pte(0));
+                       continue;
                }
 
                /*
@@ -414,28 +371,25 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
        int i = pmd_index(address);
 
        for (; i < PTRS_PER_PMD; i++, address = next) {
-               unsigned long pte_phys;
                pmd_t *pmd = pmd_page + pmd_index(address);
                pte_t *pte;
                pgprot_t new_prot = prot;
 
+               next = (address & PMD_MASK) + PMD_SIZE;
                if (address >= end) {
-                       if (!after_bootmem) {
-                               for (; i < PTRS_PER_PMD; i++, pmd++)
-                                       set_pmd(pmd, __pmd(0));
-                       }
-                       break;
+                       if (!after_bootmem &&
+                           !e820_any_mapped(address & PMD_MASK, next, E820_RAM) &&
+                           !e820_any_mapped(address & PMD_MASK, next, E820_RESERVED_KERN))
+                               set_pmd(pmd, __pmd(0));
+                       continue;
                }
 
-               next = (address & PMD_MASK) + PMD_SIZE;
-
                if (pmd_val(*pmd)) {
                        if (!pmd_large(*pmd)) {
                                spin_lock(&init_mm.page_table_lock);
-                               pte = map_low_page((pte_t *)pmd_page_vaddr(*pmd));
+                               pte = (pte_t *)pmd_page_vaddr(*pmd);
                                last_map_addr = phys_pte_init(pte, address,
                                                                end, prot);
-                               unmap_low_page(pte);
                                spin_unlock(&init_mm.page_table_lock);
                                continue;
                        }
@@ -464,19 +418,18 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
                        pages++;
                        spin_lock(&init_mm.page_table_lock);
                        set_pte((pte_t *)pmd,
-                               pfn_pte(address >> PAGE_SHIFT,
+                               pfn_pte((address & PMD_MASK) >> PAGE_SHIFT,
                                        __pgprot(pgprot_val(prot) | _PAGE_PSE)));
                        spin_unlock(&init_mm.page_table_lock);
                        last_map_addr = next;
                        continue;
                }
 
-               pte = alloc_low_page(&pte_phys);
+               pte = alloc_low_page();
                last_map_addr = phys_pte_init(pte, address, end, new_prot);
-               unmap_low_page(pte);
 
                spin_lock(&init_mm.page_table_lock);
-               pmd_populate_kernel(&init_mm, pmd, __va(pte_phys));
+               pmd_populate_kernel(&init_mm, pmd, pte);
                spin_unlock(&init_mm.page_table_lock);
        }
        update_page_count(PG_LEVEL_2M, pages);
@@ -492,27 +445,24 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
        int i = pud_index(addr);
 
        for (; i < PTRS_PER_PUD; i++, addr = next) {
-               unsigned long pmd_phys;
                pud_t *pud = pud_page + pud_index(addr);
                pmd_t *pmd;
                pgprot_t prot = PAGE_KERNEL;
 
-               if (addr >= end)
-                       break;
-
                next = (addr & PUD_MASK) + PUD_SIZE;
-
-               if (!after_bootmem && !e820_any_mapped(addr, next, 0)) {
-                       set_pud(pud, __pud(0));
+               if (addr >= end) {
+                       if (!after_bootmem &&
+                           !e820_any_mapped(addr & PUD_MASK, next, E820_RAM) &&
+                           !e820_any_mapped(addr & PUD_MASK, next, E820_RESERVED_KERN))
+                               set_pud(pud, __pud(0));
                        continue;
                }
 
                if (pud_val(*pud)) {
                        if (!pud_large(*pud)) {
-                               pmd = map_low_page(pmd_offset(pud, 0));
+                               pmd = pmd_offset(pud, 0);
                                last_map_addr = phys_pmd_init(pmd, addr, end,
                                                         page_size_mask, prot);
-                               unmap_low_page(pmd);
                                __flush_tlb_all();
                                continue;
                        }
@@ -541,19 +491,19 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
                        pages++;
                        spin_lock(&init_mm.page_table_lock);
                        set_pte((pte_t *)pud,
-                               pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
+                               pfn_pte((addr & PUD_MASK) >> PAGE_SHIFT,
+                                       PAGE_KERNEL_LARGE));
                        spin_unlock(&init_mm.page_table_lock);
                        last_map_addr = next;
                        continue;
                }
 
-               pmd = alloc_low_page(&pmd_phys);
+               pmd = alloc_low_page();
                last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask,
                                              prot);
-               unmap_low_page(pmd);
 
                spin_lock(&init_mm.page_table_lock);
-               pud_populate(&init_mm, pud, __va(pmd_phys));
+               pud_populate(&init_mm, pud, pmd);
                spin_unlock(&init_mm.page_table_lock);
        }
        __flush_tlb_all();
@@ -578,7 +528,6 @@ kernel_physical_mapping_init(unsigned long start,
 
        for (; start < end; start = next) {
                pgd_t *pgd = pgd_offset_k(start);
-               unsigned long pud_phys;
                pud_t *pud;
 
                next = (start + PGDIR_SIZE) & PGDIR_MASK;
@@ -586,20 +535,18 @@ kernel_physical_mapping_init(unsigned long start,
                        next = end;
 
                if (pgd_val(*pgd)) {
-                       pud = map_low_page((pud_t *)pgd_page_vaddr(*pgd));
+                       pud = (pud_t *)pgd_page_vaddr(*pgd);
                        last_map_addr = phys_pud_init(pud, __pa(start),
                                                 __pa(end), page_size_mask);
-                       unmap_low_page(pud);
                        continue;
                }
 
-               pud = alloc_low_page(&pud_phys);
+               pud = alloc_low_page();
                last_map_addr = phys_pud_init(pud, __pa(start), __pa(next),
                                                 page_size_mask);
-               unmap_low_page(pud);
 
                spin_lock(&init_mm.page_table_lock);
-               pgd_populate(&init_mm, pgd, __va(pud_phys));
+               pgd_populate(&init_mm, pgd, pud);
                spin_unlock(&init_mm.page_table_lock);
                pgd_changed = true;
        }
@@ -664,13 +611,11 @@ int arch_add_memory(int nid, u64 start, u64 size)
 {
        struct pglist_data *pgdat = NODE_DATA(nid);
        struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
-       unsigned long last_mapped_pfn, start_pfn = start >> PAGE_SHIFT;
+       unsigned long start_pfn = start >> PAGE_SHIFT;
        unsigned long nr_pages = size >> PAGE_SHIFT;
        int ret;
 
-       last_mapped_pfn = init_memory_mapping(start, start + size);
-       if (last_mapped_pfn > max_pfn_mapped)
-               max_pfn_mapped = last_mapped_pfn;
+       init_memory_mapping(start, start + size);
 
        ret = __add_pages(nid, zone, start_pfn, nr_pages);
        WARN_ON_ONCE(ret);
@@ -686,6 +631,16 @@ EXPORT_SYMBOL_GPL(arch_add_memory);
 
 static struct kcore_list kcore_vsyscall;
 
+static void __init register_page_bootmem_info(void)
+{
+#ifdef CONFIG_NUMA
+       int i;
+
+       for_each_online_node(i)
+               register_page_bootmem_info_node(NODE_DATA(i));
+#endif
+}
+
 void __init mem_init(void)
 {
        long codesize, reservedpages, datasize, initsize;
@@ -698,11 +653,8 @@ void __init mem_init(void)
        reservedpages = 0;
 
        /* this will put all low memory onto the freelists */
-#ifdef CONFIG_NUMA
-       totalram_pages = numa_free_all_bootmem();
-#else
+       register_page_bootmem_info();
        totalram_pages = free_all_bootmem();
-#endif
 
        absent_pages = absent_pages_in_range(0, max_pfn);
        reservedpages = max_pfn - totalram_pages - absent_pages;
diff --git a/arch/x86/mm/mm_internal.h b/arch/x86/mm/mm_internal.h
new file mode 100644 (file)
index 0000000..6b563a1
--- /dev/null
+++ b/arch/x86/mm/mm_internal.h
@@ -0,0 +1,19 @@
+#ifndef __X86_MM_INTERNAL_H
+#define __X86_MM_INTERNAL_H
+
+void *alloc_low_pages(unsigned int num);
+static inline void *alloc_low_page(void)
+{
+       return alloc_low_pages(1);
+}
+
+void early_ioremap_page_table_range_init(void);
+
+unsigned long kernel_physical_mapping_init(unsigned long start,
+                                            unsigned long end,
+                                            unsigned long page_size_mask);
+void zone_sizes_init(void);
+
+extern int after_bootmem;
+
+#endif /* __X86_MM_INTERNAL_H */
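
For readers skimming the patch, here is a minimal userspace model of the allocator convention that the new mm_internal.h header introduces and that the phys_pud_init()/kernel_physical_mapping_init() hunks above switch to: alloc_low_page() is simply the single-page case of alloc_low_pages(), and callers use the returned virtual address directly instead of the old out-parameter plus __va()/map_low_page()/unmap_low_page() round trip. Everything below (the calloc() backing, main(), PAGE_SIZE) is an illustrative assumption, not the kernel implementation.

/*
 * Illustrative userspace model only -- not kernel code.  The real helper
 * hands out zeroed, already-mapped pages from the early page-table buffer.
 */
#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE 4096                  /* stand-in for the kernel's page size */

/* modelled with calloc(); returns a directly usable (virtual) pointer */
static void *alloc_low_pages(unsigned int num)
{
        void *p = calloc(num, PAGE_SIZE);

        if (!p) {
                fprintf(stderr, "out of memory\n");
                exit(EXIT_FAILURE);
        }
        return p;
}

/* one page is just the num == 1 case, as in mm_internal.h above */
static inline void *alloc_low_page(void)
{
        return alloc_low_pages(1);
}

int main(void)
{
        /* new-style caller: the pointer is usable as-is, no __va() conversion */
        void *pmd = alloc_low_page();

        printf("page table page at %p\n", pmd);
        free(pmd);
        return 0;
}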
index 92e27119ee1a0b5db2559ff8787a3ce83c150a59..9405ffc915026a14e5658d82ab9a6c54c9a781b2 100644 (file)
@@ -10,16 +10,3 @@ void __init initmem_init(void)
 {
        x86_numa_init();
 }
-
-unsigned long __init numa_free_all_bootmem(void)
-{
-       unsigned long pages = 0;
-       int i;
-
-       for_each_online_node(i)
-               pages += free_all_bootmem_node(NODE_DATA(i));
-
-       pages += free_low_memory_core_early(MAX_NUMNODES);
-
-       return pages;
-}
index a718e0d23503fdc4bb3149d4ad5c7046458f2a57..44acfcd6c16f2730f8235e302b62f80d5955378c 100644 (file)
@@ -551,16 +551,10 @@ static int split_large_page(pte_t *kpte, unsigned long address)
        for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc)
                set_pte(&pbase[i], pfn_pte(pfn, ref_prot));
 
-       if (address >= (unsigned long)__va(0) &&
-               address < (unsigned long)__va(max_low_pfn_mapped << PAGE_SHIFT))
+       if (pfn_range_is_mapped(PFN_DOWN(__pa(address)),
+                               PFN_DOWN(__pa(address)) + 1))
                split_page_count(level);
 
-#ifdef CONFIG_X86_64
-       if (address >= (unsigned long)__va(1UL<<32) &&
-               address < (unsigned long)__va(max_pfn_mapped << PAGE_SHIFT))
-               split_page_count(level);
-#endif
-
        /*
         * Install the new, split up pagetable.
         *
@@ -729,13 +723,9 @@ static int cpa_process_alias(struct cpa_data *cpa)
        unsigned long vaddr;
        int ret;
 
-       if (cpa->pfn >= max_pfn_mapped)
+       if (!pfn_range_is_mapped(cpa->pfn, cpa->pfn + 1))
                return 0;
 
-#ifdef CONFIG_X86_64
-       if (cpa->pfn >= max_low_pfn_mapped && cpa->pfn < (1UL<<(32-PAGE_SHIFT)))
-               return 0;
-#endif
        /*
         * No need to redo, when the primary call touched the direct
         * mapping already:
index ad4439145f858314dfe518cf7cd9336c5fe9c96d..36e53f0a9ce3e101774f677b8bc2cd03c21e720b 100644 (file)
@@ -835,7 +835,7 @@ void __init efi_enter_virtual_mode(void)
        efi_memory_desc_t *md, *prev_md = NULL;
        efi_status_t status;
        unsigned long size;
-       u64 end, systab, end_pfn;
+       u64 end, systab, start_pfn, end_pfn;
        void *p, *va, *new_memmap = NULL;
        int count = 0;
 
@@ -888,10 +888,9 @@ void __init efi_enter_virtual_mode(void)
                size = md->num_pages << EFI_PAGE_SHIFT;
                end = md->phys_addr + size;
 
+               start_pfn = PFN_DOWN(md->phys_addr);
                end_pfn = PFN_UP(end);
-               if (end_pfn <= max_low_pfn_mapped
-                   || (end_pfn > (1UL << (32 - PAGE_SHIFT))
-                       && end_pfn <= max_pfn_mapped)) {
+               if (pfn_range_is_mapped(start_pfn, end_pfn)) {
                        va = __va(md->phys_addr);
 
                        if (!(md->attribute & EFI_MEMORY_WB))
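
The pageattr.c and efi.c hunks above both replace comparisons against the max_low_pfn_mapped / max_pfn_mapped watermarks with a single pfn_range_is_mapped() query. The sketch below is a self-contained userspace model of that interface only; the struct, the range table and the pfn values are invented for illustration and are not the kernel's actual bookkeeping.

/*
 * Illustrative model: "is this whole pfn range covered by the direct
 * mapping?" asked against a list of mapped ranges, instead of comparing a
 * single pfn against two global high-water marks.
 */
#include <stdbool.h>
#include <stdio.h>

struct mapped_range {
        unsigned long start_pfn;
        unsigned long end_pfn;          /* exclusive */
};

/* pretend the direct mapping covers pfns [0, 0xa0) and [0x100, 0x100000) */
static const struct mapped_range ranges[] = {
        { 0x000, 0x0a0 },
        { 0x100, 0x100000 },
};

static bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long end_pfn)
{
        for (unsigned int i = 0; i < sizeof(ranges) / sizeof(ranges[0]); i++) {
                if (start_pfn >= ranges[i].start_pfn &&
                    end_pfn <= ranges[i].end_pfn)
                        return true;
        }
        return false;
}

int main(void)
{
        printf("[0x80, 0x81) mapped: %d\n", pfn_range_is_mapped(0x80, 0x81));
        printf("[0xc0, 0xc1) mapped: %d\n", pfn_range_is_mapped(0xc0, 0xc1));
        return 0;
}

A range query of this shape can also describe a direct mapping with holes, which the old two-watermark comparison could not express.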
index 01de35c772210120075300504189c22bd00c5899..f5e86eee4e0ec9c7b80c94433f4ace0be0fe7403 100644 (file)
@@ -1178,20 +1178,6 @@ static void xen_exit_mmap(struct mm_struct *mm)
 
 static void xen_post_allocator_init(void);
 
-static __init void xen_mapping_pagetable_reserve(u64 start, u64 end)
-{
-       /* reserve the range used */
-       native_pagetable_reserve(start, end);
-
-       /* set as RW the rest */
-       printk(KERN_DEBUG "xen: setting RW the range %llx - %llx\n", end,
-                       PFN_PHYS(pgt_buf_top));
-       while (end < PFN_PHYS(pgt_buf_top)) {
-               make_lowmem_page_readwrite(__va(end));
-               end += PAGE_SIZE;
-       }
-}
-
 #ifdef CONFIG_X86_64
 static void __init xen_cleanhighmap(unsigned long vaddr,
                                    unsigned long vaddr_end)
@@ -1503,19 +1489,6 @@ static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
 #else /* CONFIG_X86_64 */
 static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
 {
-       unsigned long pfn = pte_pfn(pte);
-
-       /*
-        * If the new pfn is within the range of the newly allocated
-        * kernel pagetable, and it isn't being mapped into an
-        * early_ioremap fixmap slot as a freshly allocated page, make sure
-        * it is RO.
-        */
-       if (((!is_early_ioremap_ptep(ptep) &&
-                       pfn >= pgt_buf_start && pfn < pgt_buf_top)) ||
-                       (is_early_ioremap_ptep(ptep) && pfn != (pgt_buf_end - 1)))
-               pte = pte_wrprotect(pte);
-
        return pte;
 }
 #endif /* CONFIG_X86_64 */
@@ -2197,7 +2170,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = {
 
 void __init xen_init_mmu_ops(void)
 {
-       x86_init.mapping.pagetable_reserve = xen_mapping_pagetable_reserve;
        x86_init.paging.pagetable_init = xen_pagetable_init;
        pv_mmu_ops = xen_mmu_ops;
 
index 66e2f7c61e5c9d3a2924389e28ffa9d32c8728bf..9d9dcc35d6a1b00208cd3842a4791c192a0c4118 100644 (file)
@@ -1386,7 +1386,6 @@ extern void __init mmap_init(void);
 extern void show_mem(unsigned int flags);
 extern void si_meminfo(struct sysinfo * val);
 extern void si_meminfo_node(struct sysinfo *val, int nid);
-extern int after_bootmem;
 
 extern __printf(3, 4)
 void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...);
index b8294fc03df869153378f47f41f0ecd595c10887..03d152a76acf5e18396b74e62a3372e55b469750 100644 (file)
@@ -153,21 +153,6 @@ static void reset_node_lowmem_managed_pages(pg_data_t *pgdat)
                        z->managed_pages = 0;
 }
 
-/**
- * free_all_bootmem_node - release a node's free pages to the buddy allocator
- * @pgdat: node to be released
- *
- * Returns the number of pages actually released.
- */
-unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
-{
-       register_page_bootmem_info_node(pgdat);
-       reset_node_lowmem_managed_pages(pgdat);
-
-       /* free_low_memory_core_early(MAX_NUMNODES) will be called later */
-       return 0;
-}
-
 /**
  * free_all_bootmem - release free pages to the buddy allocator
  *