diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 40e29429e7b0995bd5799bd6263b18d6ce8261cb..ebffa0e4a9c0451cfbd78bcf4835825ffd72c6de 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1032,11 +1032,9 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
 static int fallbacks[MIGRATE_TYPES][4] = {
        [MIGRATE_UNMOVABLE]   = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE,     MIGRATE_RESERVE },
        [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE,   MIGRATE_MOVABLE,     MIGRATE_RESERVE },
+       [MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE,   MIGRATE_RESERVE },
 #ifdef CONFIG_CMA
-       [MIGRATE_MOVABLE]     = { MIGRATE_CMA,         MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
        [MIGRATE_CMA]         = { MIGRATE_RESERVE }, /* Never used */
-#else
-       [MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE,   MIGRATE_RESERVE },
 #endif
        [MIGRATE_RESERVE]     = { MIGRATE_RESERVE }, /* Never used */
 #ifdef CONFIG_MEMORY_ISOLATION
@@ -1044,6 +1042,17 @@ static int fallbacks[MIGRATE_TYPES][4] = {
 #endif
 };
 
+#ifdef CONFIG_CMA
+static struct page *__rmqueue_cma_fallback(struct zone *zone,
+                                       unsigned int order)
+{
+       return __rmqueue_smallest(zone, order, MIGRATE_CMA);
+}
+#else
+static inline struct page *__rmqueue_cma_fallback(struct zone *zone,
+                                       unsigned int order) { return NULL; }
+#endif
+
 /*
  * Move the free pages in a range to the free lists of the requested type.
  * Note that start_page and end_pages are not aligned on a pageblock
@@ -1136,14 +1145,40 @@ static void change_pageblock_range(struct page *pageblock_page,
  * as fragmentation caused by those allocations polluting movable pageblocks
  * is worse than movable allocations stealing from unmovable and reclaimable
  * pageblocks.
- *
- * If we claim more than half of the pageblock, change pageblock's migratetype
- * as well.
  */
-static void try_to_steal_freepages(struct zone *zone, struct page *page,
-                                 int start_type, int fallback_type)
+static bool can_steal_fallback(unsigned int order, int start_mt)
+{
+       /*
+        * Leaving this order check here is intentional, even though the
+        * next check also tests the order. Meeting this condition lets
+        * us steal a whole pageblock, whereas the check below does not
+        * guarantee that and is only a heuristic, so it could be changed
+        * at any time.
+        */
+       if (order >= pageblock_order)
+               return true;
+
+       if (order >= pageblock_order / 2 ||
+               start_mt == MIGRATE_RECLAIMABLE ||
+               start_mt == MIGRATE_UNMOVABLE ||
+               page_group_by_mobility_disabled)
+               return true;
+
+       return false;
+}
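
To make the heuristic concrete, here is a minimal user-space sketch of can_steal_fallback(), assuming pageblock_order == 9 (a typical value with 4 KiB pages and 2 MiB pageblocks) and leaving out the page_group_by_mobility_disabled case. The constants and enum values below are illustrative, not the kernel's:

#include <stdbool.h>
#include <stdio.h>

#define PAGEBLOCK_ORDER 9       /* assumption for this sketch */

enum { MIGRATE_UNMOVABLE, MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE };

static bool can_steal_fallback(unsigned int order, int start_mt)
{
        /* A request of at least pageblock size can take the whole block. */
        if (order >= PAGEBLOCK_ORDER)
                return true;

        /* Requests of at least half the pageblock order, and unmovable or
         * reclaimable requests of any size, may steal to limit long-term
         * fragmentation. */
        return order >= PAGEBLOCK_ORDER / 2 ||
               start_mt == MIGRATE_RECLAIMABLE ||
               start_mt == MIGRATE_UNMOVABLE;
}

int main(void)
{
        printf("order 3, movable:   %d\n", can_steal_fallback(3, MIGRATE_MOVABLE));   /* 0 */
        printf("order 3, unmovable: %d\n", can_steal_fallback(3, MIGRATE_UNMOVABLE)); /* 1 */
        printf("order 4, movable:   %d\n", can_steal_fallback(4, MIGRATE_MOVABLE));   /* 1 */
        return 0;
}
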
+
+/*
+ * This function implements the actual steal behaviour. If the order is
+ * large enough, we can steal the whole pageblock. If not, we first move
+ * the free pages in this pageblock and check whether at least half of
+ * them were moved. If so, we change the pageblock's migratetype and its
+ * pages are permanently used as the requested migratetype from then on.
+ */
+static void steal_suitable_fallback(struct zone *zone, struct page *page,
+                                                         int start_type)
 {
        int current_order = page_order(page);
+       int pages;
 
        /* Take ownership for orders >= pageblock_order */
        if (current_order >= pageblock_order) {
@@ -1151,19 +1186,49 @@ static void try_to_steal_freepages(struct zone *zone, struct page *page,
                return;
        }
 
-       if (current_order >= pageblock_order / 2 ||
-           start_type == MIGRATE_RECLAIMABLE ||
-           start_type == MIGRATE_UNMOVABLE ||
-           page_group_by_mobility_disabled) {
-               int pages;
+       pages = move_freepages_block(zone, page, start_type);
+
+       /* Claim the whole block if over half of it is free */
+       if (pages >= (1 << (pageblock_order-1)) ||
+                       page_group_by_mobility_disabled)
+               set_pageblock_migratetype(page, start_type);
+}
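
The claim threshold used above is half a pageblock: with the same assumed pageblock_order of 9, a pageblock holds 1 << 9 = 512 pages, so ownership changes once move_freepages_block() reports at least 1 << 8 = 256 pages moved. A short illustration of that arithmetic:

#include <stdio.h>

int main(void)
{
        unsigned int pageblock_order = 9;       /* assumed, as above */
        unsigned int pages_moved = 300;         /* example return of move_freepages_block() */

        /* Same comparison as steal_suitable_fallback(): claim the block if
         * at least half of its pages now sit on the requested free list. */
        if (pages_moved >= (1u << (pageblock_order - 1)))
                printf("claim pageblock: %u >= %u\n",
                       pages_moved, 1u << (pageblock_order - 1));
        return 0;
}
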
+
+/*
+ * Check whether there is a suitable fallback freepage with the requested
+ * order. If only_stealable is true, this function returns fallback_mt only
+ * if we can steal the other freepages altogether. This helps to reduce
+ * fragmentation caused by mixing migratetypes within one pageblock.
+ */
+int find_suitable_fallback(struct free_area *area, unsigned int order,
+                       int migratetype, bool only_stealable, bool *can_steal)
+{
+       int i;
+       int fallback_mt;
+
+       if (area->nr_free == 0)
+               return -1;
+
+       *can_steal = false;
+       for (i = 0;; i++) {
+               fallback_mt = fallbacks[migratetype][i];
+               if (fallback_mt == MIGRATE_RESERVE)
+                       break;
+
+               if (list_empty(&area->free_list[fallback_mt]))
+                       continue;
 
-               pages = move_freepages_block(zone, page, start_type);
+               if (can_steal_fallback(order, migratetype))
+                       *can_steal = true;
 
-               /* Claim the whole block if over half of it is free */
-               if (pages >= (1 << (pageblock_order-1)) ||
-                               page_group_by_mobility_disabled)
-                       set_pageblock_migratetype(page, start_type);
+               if (!only_stealable)
+                       return fallback_mt;
+
+               if (*can_steal)
+                       return fallback_mt;
        }
+
+       return -1;
 }
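
A caller probes one free_area (one order) at a time, exactly as __rmqueue_fallback() does in the next hunk; with only_stealable == true a fallback is reported only when stealing (in the can_steal_fallback() sense) is allowed. Below is a self-contained sketch of that behaviour, with the kernel's list and free_area types replaced by simple stand-ins, the order-9 pageblock assumption carried over, and the same migratetype values as in the sketch above:

#include <stdbool.h>
#include <stdio.h>

enum { MIGRATE_UNMOVABLE, MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE,
       MIGRATE_RESERVE, MIGRATE_TYPES };

struct free_area {
        unsigned long nr_free;
        bool list_nonempty[MIGRATE_TYPES];      /* models !list_empty() */
};

static const int fallbacks[MIGRATE_TYPES][4] = {
        [MIGRATE_UNMOVABLE]   = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE,   MIGRATE_RESERVE },
        [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE,   MIGRATE_MOVABLE,   MIGRATE_RESERVE },
        [MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
        [MIGRATE_RESERVE]     = { MIGRATE_RESERVE },
};

static bool can_steal_fallback(unsigned int order, int start_mt)
{
        return order >= 9 || order >= 9 / 2 ||
               start_mt == MIGRATE_RECLAIMABLE || start_mt == MIGRATE_UNMOVABLE;
}

/* Same control flow as the new helper, operating on the mock free_area. */
static int find_suitable_fallback(struct free_area *area, unsigned int order,
                                  int migratetype, bool only_stealable,
                                  bool *can_steal)
{
        *can_steal = false;
        if (area->nr_free == 0)
                return -1;

        for (int i = 0;; i++) {
                int fallback_mt = fallbacks[migratetype][i];

                if (fallback_mt == MIGRATE_RESERVE)
                        break;
                if (!area->list_nonempty[fallback_mt])
                        continue;
                if (can_steal_fallback(order, migratetype))
                        *can_steal = true;
                if (!only_stealable || *can_steal)
                        return fallback_mt;
        }
        return -1;
}

int main(void)
{
        struct free_area area = {
                .nr_free = 1,
                .list_nonempty = { [MIGRATE_RECLAIMABLE] = true },
        };
        bool can_steal;

        /* An order-2 movable request finds a reclaimable page, but with
         * only_stealable set it is rejected because stealing is not allowed. */
        printf("any fallback:      %d\n",
               find_suitable_fallback(&area, 2, MIGRATE_MOVABLE, false, &can_steal));
        printf("only if stealable: %d\n",
               find_suitable_fallback(&area, 2, MIGRATE_MOVABLE, true, &can_steal));
        return 0;
}
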
 
 /* Remove an element from the buddy allocator from the fallback list */
@@ -1173,64 +1238,45 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
        struct free_area *area;
        unsigned int current_order;
        struct page *page;
+       int fallback_mt;
+       bool can_steal;
 
        /* Find the largest possible block of pages in the other list */
        for (current_order = MAX_ORDER-1;
                                current_order >= order && current_order <= MAX_ORDER-1;
                                --current_order) {
-               int i;
-               for (i = 0;; i++) {
-                       int migratetype = fallbacks[start_migratetype][i];
-                       int buddy_type = start_migratetype;
-
-                       /* MIGRATE_RESERVE handled later if necessary */
-                       if (migratetype == MIGRATE_RESERVE)
-                               break;
-
-                       area = &(zone->free_area[current_order]);
-                       if (list_empty(&area->free_list[migratetype]))
-                               continue;
-
-                       page = list_entry(area->free_list[migratetype].next,
-                                       struct page, lru);
-                       area->nr_free--;
-
-                       if (!is_migrate_cma(migratetype)) {
-                               try_to_steal_freepages(zone, page,
-                                                       start_migratetype,
-                                                       migratetype);
-                       } else {
-                               /*
-                                * When borrowing from MIGRATE_CMA, we need to
-                                * release the excess buddy pages to CMA
-                                * itself, and we do not try to steal extra
-                                * free pages.
-                                */
-                               buddy_type = migratetype;
-                       }
+               area = &(zone->free_area[current_order]);
+               fallback_mt = find_suitable_fallback(area, current_order,
+                               start_migratetype, false, &can_steal);
+               if (fallback_mt == -1)
+                       continue;
 
-                       /* Remove the page from the freelists */
-                       list_del(&page->lru);
-                       rmv_page_order(page);
+               page = list_entry(area->free_list[fallback_mt].next,
+                                               struct page, lru);
+               if (can_steal)
+                       steal_suitable_fallback(zone, page, start_migratetype);
 
-                       expand(zone, page, order, current_order, area,
-                                       buddy_type);
+               /* Remove the page from the freelists */
+               area->nr_free--;
+               list_del(&page->lru);
+               rmv_page_order(page);
 
-                       /*
-                        * The freepage_migratetype may differ from pageblock's
-                        * migratetype depending on the decisions in
-                        * try_to_steal_freepages(). This is OK as long as it
-                        * does not differ for MIGRATE_CMA pageblocks. For CMA
-                        * we need to make sure unallocated pages flushed from
-                        * pcp lists are returned to the correct freelist.
-                        */
-                       set_freepage_migratetype(page, buddy_type);
+               expand(zone, page, order, current_order, area,
+                                       start_migratetype);
+               /*
+                * The freepage_migratetype may differ from the pageblock's
+                * migratetype depending on the decisions in
+                * steal_suitable_fallback(). This is OK as long as it
+                * does not differ for MIGRATE_CMA pageblocks. For CMA
+                * we need to make sure unallocated pages flushed from
+                * pcp lists are returned to the correct freelist.
+                */
+               set_freepage_migratetype(page, start_migratetype);
 
-                       trace_mm_page_alloc_extfrag(page, order, current_order,
-                               start_migratetype, migratetype);
+               trace_mm_page_alloc_extfrag(page, order, current_order,
+                       start_migratetype, fallback_mt);
 
-                       return page;
-               }
+               return page;
        }
 
        return NULL;
@@ -1249,7 +1295,11 @@ retry_reserve:
        page = __rmqueue_smallest(zone, order, migratetype);
 
        if (unlikely(!page) && migratetype != MIGRATE_RESERVE) {
-               page = __rmqueue_fallback(zone, order, migratetype);
+               if (migratetype == MIGRATE_MOVABLE)
+                       page = __rmqueue_cma_fallback(zone, order);
+
+               if (!page)
+                       page = __rmqueue_fallback(zone, order, migratetype);
 
                /*
                 * Use MIGRATE_RESERVE rather than fail an allocation. goto
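
Taken together with the table change at the top of the diff, the order of attempts for a movable request is now: the requested migratetype, then CMA via __rmqueue_cma_fallback(), then the generic fallback walk, and finally the MIGRATE_RESERVE retry. A toy model of that decision chain, with stub routines standing in for the real __rmqueue_* helpers (names and return values are invented for illustration):

#include <stdio.h>

struct page { const char *source; };

/* Stubs standing in for the real allocator paths; NULL means "no page
 * of that kind was available". */
static struct page *rmqueue_smallest(const char *type)
{
        (void)type;
        return NULL;            /* pretend the preferred free list is empty */
}

static struct page *rmqueue_cma_fallback(void)
{
        static struct page p = { .source = "cma" };
        return &p;              /* pretend CMA has free pages */
}

static struct page *rmqueue_fallback(void)
{
        static struct page p = { .source = "stolen fallback" };
        return &p;
}

static struct page *rmqueue(int movable)
{
        /* Same ordering as the patched __rmqueue(): preferred type first,
         * then CMA (movable requests only), then the generic fallback. */
        struct page *page = rmqueue_smallest("preferred");

        if (!page && movable)
                page = rmqueue_cma_fallback();
        if (!page)
                page = rmqueue_fallback();
        return page;            /* the kernel would retry MIGRATE_RESERVE here */
}

int main(void)
{
        printf("movable request served from:   %s\n", rmqueue(1)->source);
        printf("unmovable request served from: %s\n", rmqueue(0)->source);
        return 0;
}
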
@@ -1321,7 +1371,7 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
        int to_drain, batch;
 
        local_irq_save(flags);
-       batch = ACCESS_ONCE(pcp->batch);
+       batch = READ_ONCE(pcp->batch);
        to_drain = min(pcp->count, batch);
        if (to_drain > 0) {
                free_pcppages_bulk(zone, to_drain, pcp);
@@ -1520,7 +1570,7 @@ void free_hot_cold_page(struct page *page, bool cold)
                list_add_tail(&page->lru, &pcp->lists[migratetype]);
        pcp->count++;
        if (pcp->count >= pcp->high) {
-               unsigned long batch = ACCESS_ONCE(pcp->batch);
+               unsigned long batch = READ_ONCE(pcp->batch);
                free_pcppages_bulk(zone, batch, pcp);
                pcp->count -= batch;
        }
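
Both pcp->batch reads above move from ACCESS_ONCE() to READ_ONCE(), which forces a single load of a value that other contexts may update concurrently. A minimal user-space illustration of the same idea, using a volatile-cast macro comparable to what the kernel does for scalar loads (this is a sketch, not the kernel's definition):

#include <stdio.h>

/* Rough user-space stand-in for the kernel macro: read the value exactly
 * once, preventing the compiler from re-reading or caching it. */
#define READ_ONCE(x) (*(const volatile __typeof__(x) *)&(x))

static int pcp_batch = 31;      /* imagine another thread may change this */

int main(void)
{
        int batch = READ_ONCE(pcp_batch);       /* one snapshot ... */

        /* ... reused consistently afterwards, as drain_zone_pages() and
         * free_hot_cold_page() now do with pcp->batch. */
        printf("using batch = %d\n", batch);
        return 0;
}
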
@@ -2362,13 +2412,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
                        *did_some_progress = 1;
                        goto out;
                }
-               /*
-                * GFP_THISNODE contains __GFP_NORETRY and we never hit this.
-                * Sanity check for bare calls of __GFP_THISNODE, not real OOM.
-                * The caller should handle page allocation failure by itself if
-                * it specifies __GFP_THISNODE.
-                * Note: Hugepage uses it but will hit PAGE_ALLOC_COSTLY_ORDER.
-                */
+               /* The OOM killer may not free memory on a specific node */
                if (gfp_mask & __GFP_THISNODE)
                        goto out;
        }
@@ -2623,15 +2667,11 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
        }
 
        /*
-        * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
-        * __GFP_NOWARN set) should not cause reclaim since the subsystem
-        * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim
-        * using a larger set of nodes after it has established that the
-        * allowed per node queues are empty and that nodes are
-        * over allocated.
+        * If this allocation cannot block and it is for a specific node, then
+        * fail early.  There's no need to wakeup kswapd or retry for a
+        * speculative node-specific allocation.
         */
-       if (IS_ENABLED(CONFIG_NUMA) &&
-           (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
+       if (IS_ENABLED(CONFIG_NUMA) && (gfp_mask & __GFP_THISNODE) && !wait)
                goto nopage;
 
 retry:
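
The removed test only fired for the full GFP_THISNODE combination (__GFP_THISNODE, __GFP_NORETRY and __GFP_NOWARN all set, per the old comment); the new test bails out for any node-specific allocation that cannot block, where wait reflects __GFP_WAIT in this kernel. A toy comparison with made-up flag bits (not the values from gfp.h) showing which requests are newly affected:

#include <stdio.h>

/* Illustrative flag bits only; the real values live in gfp.h. */
#define __GFP_WAIT     0x01u
#define __GFP_THISNODE 0x02u
#define __GFP_NORETRY  0x04u
#define __GFP_NOWARN   0x08u
#define GFP_THISNODE   (__GFP_THISNODE | __GFP_NORETRY | __GFP_NOWARN)

static int old_bail(unsigned int gfp)
{
        return (gfp & GFP_THISNODE) == GFP_THISNODE;
}

static int new_bail(unsigned int gfp)
{
        unsigned int wait = gfp & __GFP_WAIT;

        return (gfp & __GFP_THISNODE) && !wait;
}

int main(void)
{
        /* An atomic, node-specific allocation: previously it fell through
         * to the reclaim/retry logic, now it fails fast in the slowpath. */
        unsigned int gfp = __GFP_THISNODE;      /* no __GFP_WAIT set */

        printf("old: %d  new: %d\n", old_bail(gfp), new_bail(gfp));
        return 0;
}
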
@@ -2824,7 +2864,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
        /*
         * Check the zones suitable for the gfp_mask contain at least one
         * valid zone. It's possible to have an empty zonelist as a result
-        * of GFP_THISNODE and a memoryless node
+        * of __GFP_THISNODE and a memoryless node
         */
        if (unlikely(!zonelist->_zonerefs->zone))
                return NULL;
@@ -3201,38 +3241,31 @@ static void show_migration_types(unsigned char type)
  * Show free area list (used inside shift_scroll-lock stuff)
  * We also calculate the percentage fragmentation. We do this by counting the
  * memory on each free list with the exception of the first item on the list.
- * Suppresses nodes that are not allowed by current's cpuset if
- * SHOW_MEM_FILTER_NODES is passed.
+ *
+ * Bits in @filter:
+ * SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's
+ *   cpuset.
  */
 void show_free_areas(unsigned int filter)
 {
+       unsigned long free_pcp = 0;
        int cpu;
        struct zone *zone;
 
        for_each_populated_zone(zone) {
                if (skip_free_areas_node(filter, zone_to_nid(zone)))
                        continue;
-               show_node(zone);
-               printk("%s per-cpu:\n", zone->name);
 
-               for_each_online_cpu(cpu) {
-                       struct per_cpu_pageset *pageset;
-
-                       pageset = per_cpu_ptr(zone->pageset, cpu);
-
-                       printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n",
-                              cpu, pageset->pcp.high,
-                              pageset->pcp.batch, pageset->pcp.count);
-               }
+               for_each_online_cpu(cpu)
+                       free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count;
        }
 
        printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
                " active_file:%lu inactive_file:%lu isolated_file:%lu\n"
-               " unevictable:%lu"
-               " dirty:%lu writeback:%lu unstable:%lu\n"
-               " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n"
+               " unevictable:%lu dirty:%lu writeback:%lu unstable:%lu\n"
+               " slab_reclaimable:%lu slab_unreclaimable:%lu\n"
                " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"
-               " free_cma:%lu\n",
+               " free:%lu free_pcp:%lu free_cma:%lu\n",
                global_page_state(NR_ACTIVE_ANON),
                global_page_state(NR_INACTIVE_ANON),
                global_page_state(NR_ISOLATED_ANON),
@@ -3243,13 +3276,14 @@ void show_free_areas(unsigned int filter)
                global_page_state(NR_FILE_DIRTY),
                global_page_state(NR_WRITEBACK),
                global_page_state(NR_UNSTABLE_NFS),
-               global_page_state(NR_FREE_PAGES),
                global_page_state(NR_SLAB_RECLAIMABLE),
                global_page_state(NR_SLAB_UNRECLAIMABLE),
                global_page_state(NR_FILE_MAPPED),
                global_page_state(NR_SHMEM),
                global_page_state(NR_PAGETABLE),
                global_page_state(NR_BOUNCE),
+               global_page_state(NR_FREE_PAGES),
+               free_pcp,
                global_page_state(NR_FREE_CMA_PAGES));
 
        for_each_populated_zone(zone) {
@@ -3257,6 +3291,11 @@ void show_free_areas(unsigned int filter)
 
                if (skip_free_areas_node(filter, zone_to_nid(zone)))
                        continue;
+
+               free_pcp = 0;
+               for_each_online_cpu(cpu)
+                       free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count;
+
                show_node(zone);
                printk("%s"
                        " free:%lukB"
@@ -3283,6 +3322,8 @@ void show_free_areas(unsigned int filter)
                        " pagetables:%lukB"
                        " unstable:%lukB"
                        " bounce:%lukB"
+                       " free_pcp:%lukB"
+                       " local_pcp:%ukB"
                        " free_cma:%lukB"
                        " writeback_tmp:%lukB"
                        " pages_scanned:%lu"
@@ -3314,6 +3355,8 @@ void show_free_areas(unsigned int filter)
                        K(zone_page_state(zone, NR_PAGETABLE)),
                        K(zone_page_state(zone, NR_UNSTABLE_NFS)),
                        K(zone_page_state(zone, NR_BOUNCE)),
+                       K(free_pcp),
+                       K(this_cpu_read(zone->pageset->pcp.count)),
                        K(zone_page_state(zone, NR_FREE_CMA_PAGES)),
                        K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
                        K(zone_page_state(zone, NR_PAGES_SCANNED)),
@@ -5717,7 +5760,7 @@ static void __setup_per_zone_wmarks(void)
                         * value here.
                         *
                         * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN)
-                        * deltas controls asynch page reclaim, and so should
+                        * deltas control asynch page reclaim, and so should
                         * not be capped for highmem.
                         */
                        unsigned long min_pages;
@@ -6164,7 +6207,7 @@ void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
        mask <<= (BITS_PER_LONG - bitidx - 1);
        flags <<= (BITS_PER_LONG - bitidx - 1);
 
-       word = ACCESS_ONCE(bitmap[word_bitidx]);
+       word = READ_ONCE(bitmap[word_bitidx]);
        for (;;) {
                old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags);
                if (word == old_word)
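
The loop above is the usual lock-free read-modify-write pattern: take one READ_ONCE() snapshot of the word, compute the new value, and rely on cmpxchg() to detect a racing writer, retrying on the freshly returned value. A user-space sketch of the same pattern, using a GCC __sync builtin in place of the kernel's cmpxchg():

#include <stdio.h>

#define READ_ONCE(x) (*(const volatile __typeof__(x) *)&(x))

static unsigned long bitmap_word;       /* packed per-pageblock flag bits */

static void set_flags_mask(unsigned long flags, unsigned long mask)
{
        unsigned long word, old_word;

        word = READ_ONCE(bitmap_word);          /* one initial snapshot */
        for (;;) {
                /* Try to install (word & ~mask) | flags; the builtin returns
                 * the value that was actually there before the attempt. */
                old_word = __sync_val_compare_and_swap(&bitmap_word, word,
                                                       (word & ~mask) | flags);
                if (old_word == word)
                        break;                  /* nobody raced us: done */
                word = old_word;                /* lost the race: retry */
        }
}

int main(void)
{
        set_flags_mask(0x2UL, 0x3UL);           /* set the low two bits to 0b10 */
        printf("bitmap word = %#lx\n", bitmap_word);
        return 0;
}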