diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 40e29429e7b0995bd5799bd6263b18d6ce8261cb..ebffa0e4a9c0451cfbd78bcf4835825ffd72c6de 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1032,11 +1032,9 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
 static int fallbacks[MIGRATE_TYPES][4] = {
        [MIGRATE_UNMOVABLE]   = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE,     MIGRATE_RESERVE },
        [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE,   MIGRATE_MOVABLE,     MIGRATE_RESERVE },
+       [MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE,   MIGRATE_RESERVE },
 #ifdef CONFIG_CMA
-       [MIGRATE_MOVABLE]     = { MIGRATE_CMA,         MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
        [MIGRATE_CMA]         = { MIGRATE_RESERVE }, /* Never used */
-#else
-       [MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE,   MIGRATE_RESERVE },
 #endif
        [MIGRATE_RESERVE]     = { MIGRATE_RESERVE }, /* Never used */
 #ifdef CONFIG_MEMORY_ISOLATION
@@ -1044,6 +1042,17 @@ static int fallbacks[MIGRATE_TYPES][4] = {
 #endif
 };
 
+#ifdef CONFIG_CMA
+static struct page *__rmqueue_cma_fallback(struct zone *zone,
+                                       unsigned int order)
+{
+       return __rmqueue_smallest(zone, order, MIGRATE_CMA);
+}
+#else
+static inline struct page *__rmqueue_cma_fallback(struct zone *zone,
+                                       unsigned int order) { return NULL; }
+#endif
+
 /*
  * Move the free pages in a range to the free lists of the requested type.
  * Note that start_page and end_pages are not aligned on a pageblock
@@ -1136,14 +1145,40 @@ static void change_pageblock_range(struct page *pageblock_page,
  * as fragmentation caused by those allocations polluting movable pageblocks
  * is worse than movable allocations stealing from unmovable and reclaimable
  * pageblocks.
- *
- * If we claim more than half of the pageblock, change pageblock's migratetype
- * as well.
  */
-static void try_to_steal_freepages(struct zone *zone, struct page *page,
-                                 int start_type, int fallback_type)
+static bool can_steal_fallback(unsigned int order, int start_mt)
+{
+       /*
+        * Leaving this order check here is intentional, even though the
+        * next check also tests the order. Meeting this condition lets
+        * us steal a whole pageblock, whereas the check below does not
+        * guarantee that and is only a heuristic, so it could be changed
+        * at any time.
+        */
+       if (order >= pageblock_order)
+               return true;
+
+       if (order >= pageblock_order / 2 ||
+               start_mt == MIGRATE_RECLAIMABLE ||
+               start_mt == MIGRATE_UNMOVABLE ||
+               page_group_by_mobility_disabled)
+               return true;
+
+       return false;
+}
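
To make the heuristic concrete, here is a minimal user-space sketch of can_steal_fallback(), assuming pageblock_order == 9 (a typical value with 4 KiB pages and 2 MiB pageblocks) and leaving out the page_group_by_mobility_disabled case. The constants and enum values below are illustrative, not the kernel's:

#include <stdbool.h>
#include <stdio.h>

#define PAGEBLOCK_ORDER 9       /* assumption for this sketch */

enum { MIGRATE_UNMOVABLE, MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE };

static bool can_steal_fallback(unsigned int order, int start_mt)
{
        /* A request of at least pageblock size can take the whole block. */
        if (order >= PAGEBLOCK_ORDER)
                return true;

        /* Requests of at least half the pageblock order, and unmovable or
         * reclaimable requests of any size, may steal to limit long-term
         * fragmentation. */
        return order >= PAGEBLOCK_ORDER / 2 ||
               start_mt == MIGRATE_RECLAIMABLE ||
               start_mt == MIGRATE_UNMOVABLE;
}

int main(void)
{
        printf("order 3, movable:   %d\n", can_steal_fallback(3, MIGRATE_MOVABLE));   /* 0 */
        printf("order 3, unmovable: %d\n", can_steal_fallback(3, MIGRATE_UNMOVABLE)); /* 1 */
        printf("order 4, movable:   %d\n", can_steal_fallback(4, MIGRATE_MOVABLE));   /* 1 */
        return 0;
}
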
+
+/*
+ * This function implements the actual steal behaviour. If the order is
+ * large enough, we can steal the whole pageblock. If not, we first move
+ * the free pages in this pageblock and check whether at least half of
+ * them were moved. If so, we change the pageblock's migratetype and its
+ * pages are permanently used as the requested migratetype from then on.
+ */
+static void steal_suitable_fallback(struct zone *zone, struct page *page,
+                                                         int start_type)
 {
        int current_order = page_order(page);
+       int pages;
 
        /* Take ownership for orders >= pageblock_order */
        if (current_order >= pageblock_order) {
@@ -1151,19 +1186,49 @@ static void try_to_steal_freepages(struct zone *zone, struct page *page,
                return;
        }
 
-       if (current_order >= pageblock_order / 2 ||
-           start_type == MIGRATE_RECLAIMABLE ||
-           start_type == MIGRATE_UNMOVABLE ||
-           page_group_by_mobility_disabled) {
-               int pages;
+       pages = move_freepages_block(zone, page, start_type);
+
+       /* Claim the whole block if over half of it is free */
+       if (pages >= (1 << (pageblock_order-1)) ||
+                       page_group_by_mobility_disabled)
+               set_pageblock_migratetype(page, start_type);
+}
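
The claim threshold used above is half a pageblock: with the same assumed pageblock_order of 9, a pageblock holds 1 << 9 = 512 pages, so ownership changes once move_freepages_block() reports at least 1 << 8 = 256 pages moved. A short illustration of that arithmetic:

#include <stdio.h>

int main(void)
{
        unsigned int pageblock_order = 9;       /* assumed, as above */
        unsigned int pages_moved = 300;         /* example return of move_freepages_block() */

        /* Same comparison as steal_suitable_fallback(): claim the block if
         * at least half of its pages now sit on the requested free list. */
        if (pages_moved >= (1u << (pageblock_order - 1)))
                printf("claim pageblock: %u >= %u\n",
                       pages_moved, 1u << (pageblock_order - 1));
        return 0;
}
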
+
+/*
+ * Check whether there is a suitable fallback freepage with the requested
+ * order. If only_stealable is true, this function returns fallback_mt only
+ * if we can steal the other freepages altogether. This helps to reduce
+ * fragmentation caused by mixing migratetypes within one pageblock.
+ */
+int find_suitable_fallback(struct free_area *area, unsigned int order,
+                       int migratetype, bool only_stealable, bool *can_steal)
+{
+       int i;
+       int fallback_mt;
+
+       if (area->nr_free == 0)
+               return -1;
+
+       *can_steal = false;
+       for (i = 0;; i++) {
+               fallback_mt = fallbacks[migratetype][i];
+               if (fallback_mt == MIGRATE_RESERVE)
+                       break;
+
+               if (list_empty(&area->free_list[fallback_mt]))
+                       continue;
 
-               pages = move_freepages_block(zone, page, start_type);
+               if (can_steal_fallback(order, migratetype))
+                       *can_steal = true;
 
-               /* Claim the whole block if over half of it is free */
-               if (pages >= (1 << (pageblock_order-1)) ||
-                               page_group_by_mobility_disabled)
-                       set_pageblock_migratetype(page, start_type);
+               if (!only_stealable)
+                       return fallback_mt;
+
+               if (*can_steal)
+                       return fallback_mt;
        }
+
+       return -1;
 }
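
A caller probes one free_area (one order) at a time, exactly as __rmqueue_fallback() does in the next hunk; with only_stealable == true a fallback is reported only when stealing (in the can_steal_fallback() sense) is allowed. Below is a self-contained sketch of that behaviour, with the kernel's list and free_area types replaced by simple stand-ins, the order-9 pageblock assumption carried over, and the same migratetype values as in the sketch above:

#include <stdbool.h>
#include <stdio.h>

enum { MIGRATE_UNMOVABLE, MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE,
       MIGRATE_RESERVE, MIGRATE_TYPES };

struct free_area {
        unsigned long nr_free;
        bool list_nonempty[MIGRATE_TYPES];      /* models !list_empty() */
};

static const int fallbacks[MIGRATE_TYPES][4] = {
        [MIGRATE_UNMOVABLE]   = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE,   MIGRATE_RESERVE },
        [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE,   MIGRATE_MOVABLE,   MIGRATE_RESERVE },
        [MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
        [MIGRATE_RESERVE]     = { MIGRATE_RESERVE },
};

static bool can_steal_fallback(unsigned int order, int start_mt)
{
        return order >= 9 || order >= 9 / 2 ||
               start_mt == MIGRATE_RECLAIMABLE || start_mt == MIGRATE_UNMOVABLE;
}

/* Same control flow as the new helper, operating on the mock free_area. */
static int find_suitable_fallback(struct free_area *area, unsigned int order,
                                  int migratetype, bool only_stealable,
                                  bool *can_steal)
{
        *can_steal = false;
        if (area->nr_free == 0)
                return -1;

        for (int i = 0;; i++) {
                int fallback_mt = fallbacks[migratetype][i];

                if (fallback_mt == MIGRATE_RESERVE)
                        break;
                if (!area->list_nonempty[fallback_mt])
                        continue;
                if (can_steal_fallback(order, migratetype))
                        *can_steal = true;
                if (!only_stealable || *can_steal)
                        return fallback_mt;
        }
        return -1;
}

int main(void)
{
        struct free_area area = {
                .nr_free = 1,
                .list_nonempty = { [MIGRATE_RECLAIMABLE] = true },
        };
        bool can_steal;

        /* An order-2 movable request finds a reclaimable page, but with
         * only_stealable set it is rejected because stealing is not allowed. */
        printf("any fallback:      %d\n",
               find_suitable_fallback(&area, 2, MIGRATE_MOVABLE, false, &can_steal));
        printf("only if stealable: %d\n",
               find_suitable_fallback(&area, 2, MIGRATE_MOVABLE, true, &can_steal));
        return 0;
}
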
 
 /* Remove an element from the buddy allocator from the fallback list */
@@ -1173,64 +1238,45 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
        struct free_area *area;
        unsigned int current_order;
        struct page *page;
+       int fallback_mt;
+       bool can_steal;
 
        /* Find the largest possible block of pages in the other list */
        for (current_order = MAX_ORDER-1;
                                current_order >= order && current_order <= MAX_ORDER-1;
                                --current_order) {
-               int i;
-               for (i = 0;; i++) {
-                       int migratetype = fallbacks[start_migratetype][i];
-                       int buddy_type = start_migratetype;
-
-                       /* MIGRATE_RESERVE handled later if necessary */
-                       if (migratetype == MIGRATE_RESERVE)
-                               break;
-
-                       area = &(zone->free_area[current_order]);
-                       if (list_empty(&area->free_list[migratetype]))
-                               continue;
-
-                       page = list_entry(area->free_list[migratetype].next,
-                                       struct page, lru);
-                       area->nr_free--;
-
-                       if (!is_migrate_cma(migratetype)) {
-                               try_to_steal_freepages(zone, page,
-                                                       start_migratetype,
-                                                       migratetype);
-                       } else {
-                               /*
-                                * When borrowing from MIGRATE_CMA, we need to
-                                * release the excess buddy pages to CMA
-                                * itself, and we do not try to steal extra
-                                * free pages.
-                                */
-                               buddy_type = migratetype;
-                       }
+               area = &(zone->free_area[current_order]);
+               fallback_mt = find_suitable_fallback(area, current_order,
+                               start_migratetype, false, &can_steal);
+               if (fallback_mt == -1)
+                       continue;
 
-                       /* Remove the page from the freelists */
-                       list_del(&page->lru);
-                       rmv_page_order(page);
+               page = list_entry(area->free_list[fallback_mt].next,
+                                               struct page, lru);
+               if (can_steal)
+                       steal_suitable_fallback(zone, page, start_migratetype);
 
-                       expand(zone, page, order, current_order, area,
-                                       buddy_type);
+               /* Remove the page from the freelists */
+               area->nr_free--;
+               list_del(&page->lru);
+               rmv_page_order(page);
 
-                       /*
-                        * The freepage_migratetype may differ from pageblock's
-                        * migratetype depending on the decisions in
-                        * try_to_steal_freepages(). This is OK as long as it
-                        * does not differ for MIGRATE_CMA pageblocks. For CMA
-                        * we need to make sure unallocated pages flushed from
-                        * pcp lists are returned to the correct freelist.
-                        */
-                       set_freepage_migratetype(page, buddy_type);
+               expand(zone, page, order, current_order, area,
+                                       start_migratetype);
+               /*
+                * The freepage_migratetype may differ from the pageblock's
+                * migratetype depending on the decisions in
+                * steal_suitable_fallback(). This is OK as long as it
+                * does not differ for MIGRATE_CMA pageblocks. For CMA
+                * we need to make sure unallocated pages flushed from
+                * pcp lists are returned to the correct freelist.
+                */
+               set_freepage_migratetype(page, start_migratetype);
 
-                       trace_mm_page_alloc_extfrag(page, order, current_order,
-                               start_migratetype, migratetype);
+               trace_mm_page_alloc_extfrag(page, order, current_order,
+                       start_migratetype, fallback_mt);
 
-                       return page;
-               }
+               return page;
        }
 
        return NULL;
@@ -1249,7 +1295,11 @@ retry_reserve:
        page = __rmqueue_smallest(zone, order, migratetype);
 
        if (unlikely(!page) && migratetype != MIGRATE_RESERVE) {
-               page = __rmqueue_fallback(zone, order, migratetype);
+               if (migratetype == MIGRATE_MOVABLE)
+                       page = __rmqueue_cma_fallback(zone, order);
+
+               if (!page)
+                       page = __rmqueue_fallback(zone, order, migratetype);
 
                /*
                 * Use MIGRATE_RESERVE rather than fail an allocation. goto
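
Taken together with the table change at the top of the diff, the order of attempts for a movable request is now: the requested migratetype, then CMA via __rmqueue_cma_fallback(), then the generic fallback walk, and finally the MIGRATE_RESERVE retry. A toy model of that decision chain, with stub routines standing in for the real __rmqueue_* helpers (names and return values are invented for illustration):

#include <stdio.h>

struct page { const char *source; };

/* Stubs standing in for the real allocator paths; NULL means "no page
 * of that kind was available". */
static struct page *rmqueue_smallest(const char *type)
{
        (void)type;
        return NULL;            /* pretend the preferred free list is empty */
}

static struct page *rmqueue_cma_fallback(void)
{
        static struct page p = { .source = "cma" };
        return &p;              /* pretend CMA has free pages */
}

static struct page *rmqueue_fallback(void)
{
        static struct page p = { .source = "stolen fallback" };
        return &p;
}

static struct page *rmqueue(int movable)
{
        /* Same ordering as the patched __rmqueue(): preferred type first,
         * then CMA (movable requests only), then the generic fallback. */
        struct page *page = rmqueue_smallest("preferred");

        if (!page && movable)
                page = rmqueue_cma_fallback();
        if (!page)
                page = rmqueue_fallback();
        return page;            /* the kernel would retry MIGRATE_RESERVE here */
}

int main(void)
{
        printf("movable request served from:   %s\n", rmqueue(1)->source);
        printf("unmovable request served from: %s\n", rmqueue(0)->source);
        return 0;
}
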
@@ -1321,7 +1371,7 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
        int to_drain, batch;
 
        local_irq_save(flags);
-       batch = ACCESS_ONCE(pcp->batch);
+       batch = READ_ONCE(pcp->batch);
        to_drain = min(pcp->count, batch);
        if (to_drain > 0) {
                free_pcppages_bulk(zone, to_drain, pcp);
@@ -1520,7 +1570,7 @@ void free_hot_cold_page(struct page *page, bool cold)
                list_add_tail(&page->lru, &pcp->lists[migratetype]);
        pcp->count++;
        if (pcp->count >= pcp->high) {
-               unsigned long batch = ACCESS_ONCE(pcp->batch);
+               unsigned long batch = READ_ONCE(pcp->batch);
                free_pcppages_bulk(zone, batch, pcp);
                pcp->count -= batch;
        }
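
Both pcp->batch reads above move from ACCESS_ONCE() to READ_ONCE(), which forces a single load of a value that other contexts may update concurrently. A minimal user-space illustration of the same idea, using a volatile-cast macro comparable to what the kernel does for scalar loads (this is a sketch, not the kernel's definition):

#include <stdio.h>

/* Rough user-space stand-in for the kernel macro: read the value exactly
 * once, preventing the compiler from re-reading or caching it. */
#define READ_ONCE(x) (*(const volatile __typeof__(x) *)&(x))

static int pcp_batch = 31;      /* imagine another thread may change this */

int main(void)
{
        int batch = READ_ONCE(pcp_batch);       /* one snapshot ... */

        /* ... reused consistently afterwards, as drain_zone_pages() and
         * free_hot_cold_page() now do with pcp->batch. */
        printf("using batch = %d\n", batch);
        return 0;
}
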
@@ -2362,13 +2412,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
                        *did_some_progress = 1;
                        goto out;
                }
-               /*
-                * GFP_THISNODE contains __GFP_NORETRY and we never hit this.
-                * Sanity check for bare calls of __GFP_THISNODE, not real OOM.
-                * The caller should handle page allocation failure by itself if
-                * it specifies __GFP_THISNODE.
-                * Note: Hugepage uses it but will hit PAGE_ALLOC_COSTLY_ORDER.
-                */
+               /* The OOM killer may not free memory on a specific node */
                if (gfp_mask & __GFP_THISNODE)
                        goto out;
        }
@@ -2623,15 +2667,11 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
        }
 
        /*
-        * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
-        * __GFP_NOWARN set) should not cause reclaim since the subsystem
-        * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim
-        * using a larger set of nodes after it has established that the
-        * allowed per node queues are empty and that nodes are
-        * over allocated.
+        * If this allocation cannot block and it is for a specific node, then
+        * fail early.  There's no need to wakeup kswapd or retry for a
+        * speculative node-specific allocation.
         */
-       if (IS_ENABLED(CONFIG_NUMA) &&
-           (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
+       if (IS_ENABLED(CONFIG_NUMA) && (gfp_mask & __GFP_THISNODE) && !wait)
                goto nopage;
 
 retry:
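
The removed test only fired for the full GFP_THISNODE combination (__GFP_THISNODE, __GFP_NORETRY and __GFP_NOWARN all set, per the old comment); the new test bails out for any node-specific allocation that cannot block, where wait reflects __GFP_WAIT in this kernel. A toy comparison with made-up flag bits (not the values from gfp.h) showing which requests are newly affected:

#include <stdio.h>

/* Illustrative flag bits only; the real values live in gfp.h. */
#define __GFP_WAIT     0x01u
#define __GFP_THISNODE 0x02u
#define __GFP_NORETRY  0x04u
#define __GFP_NOWARN   0x08u
#define GFP_THISNODE   (__GFP_THISNODE | __GFP_NORETRY | __GFP_NOWARN)

static int old_bail(unsigned int gfp)
{
        return (gfp & GFP_THISNODE) == GFP_THISNODE;
}

static int new_bail(unsigned int gfp)
{
        unsigned int wait = gfp & __GFP_WAIT;

        return (gfp & __GFP_THISNODE) && !wait;
}

int main(void)
{
        /* An atomic, node-specific allocation: previously it fell through
         * to the reclaim/retry logic, now it fails fast in the slowpath. */
        unsigned int gfp = __GFP_THISNODE;      /* no __GFP_WAIT set */

        printf("old: %d  new: %d\n", old_bail(gfp), new_bail(gfp));
        return 0;
}
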
@@ -2824,7 +2864,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
        /*
         * Check the zones suitable for the gfp_mask contain at least one
         * valid zone. It's possible to have an empty zonelist as a result
-        * of GFP_THISNODE and a memoryless node
+        * of __GFP_THISNODE and a memoryless node
         */
        if (unlikely(!zonelist->_zonerefs->zone))
                return NULL;
@@ -3201,38 +3241,31 @@ static void show_migration_types(unsigned char type)
  * Show free area list (used inside shift_scroll-lock stuff)
  * We also calculate the percentage fragmentation. We do this by counting the
  * memory on each free list with the exception of the first item on the list.
- * Suppresses nodes that are not allowed by current's cpuset if
- * SHOW_MEM_FILTER_NODES is passed.
+ *
+ * Bits in @filter:
+ * SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's
+ *   cpuset.
  */
 void show_free_areas(unsigned int filter)
 {
+       unsigned long free_pcp = 0;
        int cpu;
        struct zone *zone;
 
        for_each_populated_zone(zone) {
                if (skip_free_areas_node(filter, zone_to_nid(zone)))
                        continue;
-               show_node(zone);
-               printk("%s per-cpu:\n", zone->name);
 
-               for_each_online_cpu(cpu) {
-                       struct per_cpu_pageset *pageset;
-
-                       pageset = per_cpu_ptr(zone->pageset, cpu);
-
-                       printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n",
-                              cpu, pageset->pcp.high,
-                              pageset->pcp.batch, pageset->pcp.count);
-               }
+               for_each_online_cpu(cpu)
+                       free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count;
        }
 
        printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
                " active_file:%lu inactive_file:%lu isolated_file:%lu\n"
-               " unevictable:%lu"
-               " dirty:%lu writeback:%lu unstable:%lu\n"
-               " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n"
+               " unevictable:%lu dirty:%lu writeback:%lu unstable:%lu\n"
+               " slab_reclaimable:%lu slab_unreclaimable:%lu\n"
                " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"
-               " free_cma:%lu\n",
+               " free:%lu free_pcp:%lu free_cma:%lu\n",
                global_page_state(NR_ACTIVE_ANON),
                global_page_state(NR_INACTIVE_ANON),
                global_page_state(NR_ISOLATED_ANON),
@@ -3243,13 +3276,14 @@ void show_free_areas(unsigned int filter)
                global_page_state(NR_FILE_DIRTY),
                global_page_state(NR_WRITEBACK),
                global_page_state(NR_UNSTABLE_NFS),
-               global_page_state(NR_FREE_PAGES),
                global_page_state(NR_SLAB_RECLAIMABLE),
                global_page_state(NR_SLAB_UNRECLAIMABLE),
                global_page_state(NR_FILE_MAPPED),
                global_page_state(NR_SHMEM),
                global_page_state(NR_PAGETABLE),
                global_page_state(NR_BOUNCE),
+               global_page_state(NR_FREE_PAGES),
+               free_pcp,
                global_page_state(NR_FREE_CMA_PAGES));
 
        for_each_populated_zone(zone) {
@@ -3257,6 +3291,11 @@ void show_free_areas(unsigned int filter)
 
                if (skip_free_areas_node(filter, zone_to_nid(zone)))
                        continue;
+
+               free_pcp = 0;
+               for_each_online_cpu(cpu)
+                       free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count;
+
                show_node(zone);
                printk("%s"
                        " free:%lukB"
@@ -3283,6 +3322,8 @@ void show_free_areas(unsigned int filter)
                        " pagetables:%lukB"
                        " unstable:%lukB"
                        " bounce:%lukB"
+                       " free_pcp:%lukB"
+                       " local_pcp:%ukB"
                        " free_cma:%lukB"
                        " writeback_tmp:%lukB"
                        " pages_scanned:%lu"
@@ -3314,6 +3355,8 @@ void show_free_areas(unsigned int filter)
                        K(zone_page_state(zone, NR_PAGETABLE)),
                        K(zone_page_state(zone, NR_UNSTABLE_NFS)),
                        K(zone_page_state(zone, NR_BOUNCE)),
+                       K(free_pcp),
+                       K(this_cpu_read(zone->pageset->pcp.count)),
                        K(zone_page_state(zone, NR_FREE_CMA_PAGES)),
                        K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
                        K(zone_page_state(zone, NR_PAGES_SCANNED)),
@@ -5717,7 +5760,7 @@ static void __setup_per_zone_wmarks(void)
                         * value here.
                         *
                         * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN)
-                        * deltas controls asynch page reclaim, and so should
+                        * deltas control asynch page reclaim, and so should
                         * not be capped for highmem.
                         */
                        unsigned long min_pages;
@@ -6164,7 +6207,7 @@ void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
        mask <<= (BITS_PER_LONG - bitidx - 1);
        flags <<= (BITS_PER_LONG - bitidx - 1);
 
-       word = ACCESS_ONCE(bitmap[word_bitidx]);
+       word = READ_ONCE(bitmap[word_bitidx]);
        for (;;) {
                old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags);
                if (word == old_word)
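
The loop above is the usual lock-free read-modify-write pattern: take one READ_ONCE() snapshot of the word, compute the new value, and rely on cmpxchg() to detect a racing writer, retrying on the freshly returned value. A user-space sketch of the same pattern, using a GCC __sync builtin in place of the kernel's cmpxchg():

#include <stdio.h>

#define READ_ONCE(x) (*(const volatile __typeof__(x) *)&(x))

static unsigned long bitmap_word;       /* packed per-pageblock flag bits */

static void set_flags_mask(unsigned long flags, unsigned long mask)
{
        unsigned long word, old_word;

        word = READ_ONCE(bitmap_word);          /* one initial snapshot */
        for (;;) {
                /* Try to install (word & ~mask) | flags; the builtin returns
                 * the value that was actually there before the attempt. */
                old_word = __sync_val_compare_and_swap(&bitmap_word, word,
                                                       (word & ~mask) | flags);
                if (old_word == word)
                        break;                  /* nobody raced us: done */
                word = old_word;                /* lost the race: retry */
        }
}

int main(void)
{
        set_flags_mask(0x2UL, 0x3UL);           /* set the low two bits to 0b10 */
        printf("bitmap word = %#lx\n", bitmap_word);
        return 0;
}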