mm: extract in_gate_area() case from __get_user_pages()
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 3bac76ae4b30ec8a62042bdff9644e87ee181d3c..132c337dbe55d5ce4807740007c05b809ffe3808 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -261,8 +261,9 @@ static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
        } while (zone_span_seqretry(zone, seq));
 
        if (ret)
-               pr_err("page %lu outside zone [ %lu - %lu ]\n",
-                       pfn, start_pfn, start_pfn + sp);
+               pr_err("page 0x%lx outside node %d zone %s [ 0x%lx - 0x%lx ]\n",
+                       pfn, zone_to_nid(zone), zone->name,
+                       start_pfn, start_pfn + sp);
 
        return ret;
 }
@@ -295,7 +296,8 @@ static inline int bad_range(struct zone *zone, struct page *page)
 }
 #endif
 
-static void bad_page(struct page *page, char *reason, unsigned long bad_flags)
+static void bad_page(struct page *page, const char *reason,
+               unsigned long bad_flags)
 {
        static unsigned long resume;
        static unsigned long nr_shown;
@@ -623,7 +625,7 @@ out:
 
 static inline int free_pages_check(struct page *page)
 {
-       char *bad_reason = NULL;
+       const char *bad_reason = NULL;
        unsigned long bad_flags = 0;
 
        if (unlikely(page_mapcount(page)))
@@ -859,7 +861,7 @@ static inline void expand(struct zone *zone, struct page *page,
  */
 static inline int check_new_page(struct page *page)
 {
-       char *bad_reason = NULL;
+       const char *bad_reason = NULL;
        unsigned long bad_flags = 0;
 
        if (unlikely(page_mapcount(page)))
@@ -930,6 +932,7 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
                rmv_page_order(page);
                area->nr_free--;
                expand(zone, page, order, current_order, area, migratetype);
+               set_freepage_migratetype(page, migratetype);
                return page;
        }
 
@@ -1056,7 +1059,9 @@ static int try_to_steal_freepages(struct zone *zone, struct page *page,
 
        /*
         * When borrowing from MIGRATE_CMA, we need to release the excess
-        * buddy pages to CMA itself.
+        * buddy pages to CMA itself. We also ensure the freepage_migratetype
+        * is set to CMA so it is returned to the correct freelist in case
+        * the page ends up being not actually allocated from the pcp lists.
         */
        if (is_migrate_cma(fallback_type))
                return fallback_type;
@@ -1124,6 +1129,12 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
 
                        expand(zone, page, order, current_order, area,
                               new_type);
+                       /* The freepage_migratetype may differ from pageblock's
+                        * migratetype depending on the decisions in
+                        * try_to_steal_freepages. This is OK as long as it does
+                        * not differ for MIGRATE_CMA type.
+                        */
+                       set_freepage_migratetype(page, new_type);
 
                        trace_mm_page_alloc_extfrag(page, order, current_order,
                                start_migratetype, migratetype, new_type);
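
The set_freepage_migratetype() calls added here and in __rmqueue_smallest() record, on the page itself, which free list the page should return to, so the free path and the CMA accounting in the later hunks can read that tag instead of re-deriving it from the pageblock. Below is a minimal userspace model of that bookkeeping with hypothetical stand-ins for the kernel structures; it illustrates the idea only and is not the kernel implementation. The point of the tag is that a page handed out from a fallback list (here CMA) goes back to that same list and keeps the free-CMA counter consistent, even though the request that took it asked for a different migratetype.

    /*
     * Simplified userspace model of the per-page "freepage migratetype" tag.
     * The names mirror the kernel's, but this is an illustration only.
     */
    #include <stdio.h>

    enum migratetype { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_CMA, MIGRATE_TYPES };

    struct page {
        int freepage_migratetype;       /* free list this page returns to */
    };

    static long nr_free_cma_pages;      /* stands in for NR_FREE_CMA_PAGES */

    static void set_freepage_migratetype(struct page *page, int mt)
    {
        page->freepage_migratetype = mt;
    }

    static int get_freepage_migratetype(const struct page *page)
    {
        return page->freepage_migratetype;
    }

    /*
     * Allocation side: tag the page with the list it was actually taken from
     * (possibly a fallback type such as CMA), so the free side need not guess.
     */
    static void take_from_list(struct page *page, int taken_from)
    {
        set_freepage_migratetype(page, taken_from);
        if (taken_from == MIGRATE_CMA)
            nr_free_cma_pages--;
    }

    /* Free side: trust the tag instead of re-deriving the migratetype. */
    static void return_to_list(struct page *page, long *freelist_len)
    {
        int mt = get_freepage_migratetype(page);

        freelist_len[mt]++;
        if (mt == MIGRATE_CMA)
            nr_free_cma_pages++;
    }

    int main(void)
    {
        long freelist_len[MIGRATE_TYPES] = { 0 };
        struct page p;

        nr_free_cma_pages = 1;
        take_from_list(&p, MIGRATE_CMA);        /* movable request fell back to CMA */
        return_to_list(&p, freelist_len);       /* page goes back to the CMA list */

        printf("CMA freelist length: %ld, free CMA pages: %ld\n",
               freelist_len[MIGRATE_CMA], nr_free_cma_pages);
        return 0;
    }
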
@@ -1174,7 +1185,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
                        unsigned long count, struct list_head *list,
                        int migratetype, int cold)
 {
-       int mt = migratetype, i;
+       int i;
 
        spin_lock(&zone->lock);
        for (i = 0; i < count; ++i) {
@@ -1195,14 +1206,8 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
                        list_add(&page->lru, list);
                else
                        list_add_tail(&page->lru, list);
-               if (IS_ENABLED(CONFIG_CMA)) {
-                       mt = get_pageblock_migratetype(page);
-                       if (!is_migrate_cma(mt) && !is_migrate_isolate(mt))
-                               mt = migratetype;
-               }
-               set_freepage_migratetype(page, mt);
                list = &page->lru;
-               if (is_migrate_cma(mt))
+               if (is_migrate_cma(get_freepage_migratetype(page)))
                        __mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
                                              -(1 << order));
        }
@@ -1238,15 +1243,6 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
        }
        local_irq_restore(flags);
 }
-static bool gfp_thisnode_allocation(gfp_t gfp_mask)
-{
-       return (gfp_mask & GFP_THISNODE) == GFP_THISNODE;
-}
-#else
-static bool gfp_thisnode_allocation(gfp_t gfp_mask)
-{
-       return false;
-}
 #endif
 
 /*
@@ -1580,15 +1576,10 @@ again:
                if (!page)
                        goto failed;
                __mod_zone_freepage_state(zone, -(1 << order),
-                                         get_pageblock_migratetype(page));
+                                         get_freepage_migratetype(page));
        }
 
-       /*
-        * NOTE: GFP_THISNODE allocations do not partake in the kswapd
-        * aging protocol, so they can't be fair.
-        */
-       if (!gfp_thisnode_allocation(gfp_flags))
-               __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
+       __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
 
        __count_zone_vm_events(PGALLOC, zone, 1 << order);
        zone_statistics(preferred_zone, zone, gfp_flags);
@@ -1863,18 +1854,8 @@ static bool zone_local(struct zone *local_zone, struct zone *zone)
 
 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
 {
-       return node_isset(local_zone->node, zone->zone_pgdat->reclaim_nodes);
-}
-
-static void __paginginit init_zone_allows_reclaim(int nid)
-{
-       int i;
-
-       for_each_online_node(i)
-               if (node_distance(nid, i) <= RECLAIM_DISTANCE)
-                       node_set(i, NODE_DATA(nid)->reclaim_nodes);
-               else
-                       zone_reclaim_mode = 1;
+       return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <
+                               RECLAIM_DISTANCE;
 }
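
The rewritten zone_allows_reclaim() drops the per-node reclaim_nodes mask (and its init_zone_allows_reclaim() setup) and simply compares the NUMA distance of the two nodes against RECLAIM_DISTANCE at call time. A small sketch of that check, assuming a typical SLIT-style distance table (10 local, 20 one hop, 40 far) and the default RECLAIM_DISTANCE of 30; the table values are illustrative and not taken from this patch:

    #include <stdbool.h>
    #include <stdio.h>

    #define RECLAIM_DISTANCE 30     /* kernel default; architectures may override */

    /* Assumed SLIT-style distances: 10 = local, 20 = one hop, 40 = far. */
    static const int node_distance[3][3] = {
        { 10, 20, 40 },
        { 20, 10, 20 },
        { 40, 20, 10 },
    };

    /* Mirrors the new check: reclaim from a zone only if its node is "close". */
    static bool zone_allows_reclaim(int local_nid, int nid)
    {
        return node_distance[local_nid][nid] < RECLAIM_DISTANCE;
    }

    int main(void)
    {
        printf("node 0 -> node 1: %s\n", zone_allows_reclaim(0, 1) ? "reclaim" : "skip");
        printf("node 0 -> node 2: %s\n", zone_allows_reclaim(0, 2) ? "reclaim" : "skip");
        return 0;
    }
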
 
 #else  /* CONFIG_NUMA */
@@ -1908,9 +1889,6 @@ static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
        return true;
 }
 
-static inline void init_zone_allows_reclaim(int nid)
-{
-}
 #endif /* CONFIG_NUMA */
 
 /*
@@ -1954,23 +1932,12 @@ zonelist_scan:
                 * zone size to ensure fair page aging.  The zone a
                 * page was allocated in should have no effect on the
                 * time the page has in memory before being reclaimed.
-                *
-                * Try to stay in local zones in the fastpath.  If
-                * that fails, the slowpath is entered, which will do
-                * another pass starting with the local zones, but
-                * ultimately fall back to remote zones that do not
-                * partake in the fairness round-robin cycle of this
-                * zonelist.
-                *
-                * NOTE: GFP_THISNODE allocations do not partake in
-                * the kswapd aging protocol, so they can't be fair.
                 */
-               if ((alloc_flags & ALLOC_WMARK_LOW) &&
-                   !gfp_thisnode_allocation(gfp_mask)) {
-                       if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0)
-                               continue;
+               if (alloc_flags & ALLOC_FAIR) {
                        if (!zone_local(preferred_zone, zone))
                                continue;
+                       if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0)
+                               continue;
                }
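
With ALLOC_FAIR set, the fast path now skips every zone that is not local to the preferred zone's node, and every local zone whose NR_ALLOC_BATCH is exhausted; the flag is only dropped on the retry added further down in __alloc_pages_nodemask(). A condensed sketch of that per-zone filter, with made-up zones and field names:

    #include <stdbool.h>
    #include <stdio.h>

    struct zone {
        const char *name;
        int nid;                /* node the zone belongs to */
        long alloc_batch;       /* stands in for NR_ALLOC_BATCH */
    };

    /* The per-zone skip rule applied while ALLOC_FAIR is set. */
    static bool fair_pass_skips(const struct zone *z, int preferred_nid)
    {
        if (z->nid != preferred_nid)    /* remote: left for the second pass */
            return true;
        if (z->alloc_batch <= 0)        /* local, but batch exhausted */
            return true;
        return false;
    }

    int main(void)
    {
        const struct zone zones[] = {
            { "Normal(node0)", 0, 0   },    /* skipped: batch used up */
            { "DMA32(node0)",  0, 64  },    /* eligible */
            { "Normal(node1)", 1, 512 },    /* skipped: not local */
        };

        for (unsigned int i = 0; i < sizeof(zones) / sizeof(zones[0]); i++)
            printf("%-14s %s\n", zones[i].name,
                   fair_pass_skips(&zones[i], 0) ? "skip" : "try");
        return 0;
    }
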
                /*
                 * When allocating a page cache page for writing, we
@@ -2408,32 +2375,40 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
        return page;
 }
 
-static void prepare_slowpath(gfp_t gfp_mask, unsigned int order,
-                            struct zonelist *zonelist,
-                            enum zone_type high_zoneidx,
-                            struct zone *preferred_zone)
+static void reset_alloc_batches(struct zonelist *zonelist,
+                               enum zone_type high_zoneidx,
+                               struct zone *preferred_zone)
 {
        struct zoneref *z;
        struct zone *zone;
 
        for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
-               if (!(gfp_mask & __GFP_NO_KSWAPD))
-                       wakeup_kswapd(zone, order, zone_idx(preferred_zone));
                /*
                 * Only reset the batches of zones that were actually
-                * considered in the fast path, we don't want to
-                * thrash fairness information for zones that are not
+                * considered in the fairness pass, we don't want to
+                * trash fairness information for zones that are not
                 * actually part of this zonelist's round-robin cycle.
                 */
                if (!zone_local(preferred_zone, zone))
                        continue;
                mod_zone_page_state(zone, NR_ALLOC_BATCH,
-                                   high_wmark_pages(zone) -
-                                   low_wmark_pages(zone) -
-                                   zone_page_state(zone, NR_ALLOC_BATCH));
+                       high_wmark_pages(zone) - low_wmark_pages(zone) -
+                       atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
        }
 }
 
+static void wake_all_kswapds(unsigned int order,
+                            struct zonelist *zonelist,
+                            enum zone_type high_zoneidx,
+                            struct zone *preferred_zone)
+{
+       struct zoneref *z;
+       struct zone *zone;
+
+       for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
+               wakeup_kswapd(zone, order, zone_idx(preferred_zone));
+}
+
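
reset_alloc_batches() adds (high_wmark - low_wmark - current batch) to each local zone, so afterwards NR_ALLOC_BATCH is exactly high_wmark - low_wmark no matter how far negative it had drifted; reading the raw vm_stat counter avoids the clamping to zero that zone_page_state() would apply. A short worked example of the arithmetic, with assumed watermark values:

    #include <stdio.h>

    int main(void)
    {
        long high_wmark = 1000;     /* assumed per-zone high watermark, in pages */
        long low_wmark  = 750;      /* assumed per-zone low watermark */
        long batch      = -120;     /* raw NR_ALLOC_BATCH, may be negative */

        /* Same delta that reset_alloc_batches() feeds to mod_zone_page_state(). */
        long delta = high_wmark - low_wmark - batch;

        batch += delta;
        printf("delta = %ld, new batch = %ld\n", delta, batch);    /* 370, 250 */
        return 0;
    }
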
 static inline int
 gfp_to_alloc_flags(gfp_t gfp_mask)
 {
@@ -2522,12 +2497,13 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
         * allowed per node queues are empty and that nodes are
         * over allocated.
         */
-       if (gfp_thisnode_allocation(gfp_mask))
+       if (IS_ENABLED(CONFIG_NUMA) &&
+           (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
                goto nopage;
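
The open-coded test matters because GFP_THISNODE is, on NUMA configurations of this era, a composite of several __GFP_* bits (__GFP_THISNODE | __GFP_NORETRY | __GFP_NOWARN), so all of them must be present; a plain non-zero bitwise-AND test would also fire for masks that carry only one of the bits. A tiny sketch of the difference, using placeholder bit values rather than the real gfp.h constants:

    #include <stdbool.h>
    #include <stdio.h>

    /* Placeholder flag values; the real __GFP_* constants live in gfp.h. */
    #define __GFP_THISNODE 0x1u
    #define __GFP_NORETRY  0x2u
    #define __GFP_NOWARN   0x4u
    #define GFP_THISNODE   (__GFP_THISNODE | __GFP_NORETRY | __GFP_NOWARN)

    static bool is_thisnode_alloc(unsigned int gfp_mask)
    {
        /* Every bit of the composite must be present, not just any one of them. */
        return (gfp_mask & GFP_THISNODE) == GFP_THISNODE;
    }

    int main(void)
    {
        unsigned int just_nowarn = __GFP_NOWARN;

        printf("GFP_THISNODE:  %d\n", is_thisnode_alloc(GFP_THISNODE));     /* 1 */
        printf("__GFP_NOWARN:  %d\n", is_thisnode_alloc(just_nowarn));      /* 0 */
        printf("naive & test:  %d\n", (just_nowarn & GFP_THISNODE) != 0);   /* 1: wrong */
        return 0;
    }
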
 
 restart:
-       prepare_slowpath(gfp_mask, order, zonelist,
-                        high_zoneidx, preferred_zone);
+       if (!(gfp_mask & __GFP_NO_KSWAPD))
+               wake_all_kswapds(order, zonelist, high_zoneidx, preferred_zone);
 
        /*
         * OK, we're below the kswapd watermark and have kicked background
@@ -2711,8 +2687,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
        struct page *page = NULL;
        int migratetype = allocflags_to_migratetype(gfp_mask);
        unsigned int cpuset_mems_cookie;
-       int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET;
-       struct mem_cgroup *memcg = NULL;
+       int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR;
 
        gfp_mask &= gfp_allowed_mask;
 
@@ -2731,15 +2706,8 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
        if (unlikely(!zonelist->_zonerefs->zone))
                return NULL;
 
-       /*
-        * Will only have any effect when __GFP_KMEMCG is set.  This is
-        * verified in the (always inline) callee
-        */
-       if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order))
-               return NULL;
-
 retry_cpuset:
-       cpuset_mems_cookie = get_mems_allowed();
+       cpuset_mems_cookie = read_mems_allowed_begin();
 
        /* The preferred zone is used for statistics later */
        first_zones_zonelist(zonelist, high_zoneidx,
@@ -2752,11 +2720,28 @@ retry_cpuset:
        if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
                alloc_flags |= ALLOC_CMA;
 #endif
+retry:
        /* First allocation attempt */
        page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
                        zonelist, high_zoneidx, alloc_flags,
                        preferred_zone, migratetype);
        if (unlikely(!page)) {
+               /*
+                * The first pass makes sure allocations are spread
+                * fairly within the local node.  However, the local
+                * node might have free pages left after the fairness
+                * batches are exhausted, and remote zones haven't
+                * even been considered yet.  Try once more without
+                * fairness, and include remote zones now, before
+                * entering the slowpath and waking kswapd: prefer
+                * spilling to a remote zone over swapping locally.
+                */
+               if (alloc_flags & ALLOC_FAIR) {
+                       reset_alloc_batches(zonelist, high_zoneidx,
+                                           preferred_zone);
+                       alloc_flags &= ~ALLOC_FAIR;
+                       goto retry;
+               }
                /*
                 * Runtime PM, block IO and its error handling path
                 * can deadlock because I/O on the device might not
@@ -2777,11 +2762,9 @@ out:
         * the mask is being updated. If a page allocation is about to fail,
         * check if the cpuset changed during allocation and if so, retry.
         */
-       if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+       if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
                goto retry_cpuset;
 
-       memcg_kmem_commit_charge(page, memcg, order);
-
        return page;
 }
 EXPORT_SYMBOL(__alloc_pages_nodemask);
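
Together with the ALLOC_FAIR check in get_page_from_freelist(), the retry added above gives the fast path two passes: a fair pass limited to local zones with batch left, then a batch reset and an unrestricted pass over all zones before the slow path (and its kswapd wakeup) is reached. A compressed model of that control flow, using hypothetical helpers rather than the real allocator internals:

    #include <stdbool.h>
    #include <stdio.h>

    #define ALLOC_FAIR 0x1

    static int attempts;        /* counts fast-path passes, for illustration */

    static bool get_page_from_freelist(int alloc_flags)
    {
        attempts++;
        /* Pretend the fair pass finds every local batch exhausted. */
        if (alloc_flags & ALLOC_FAIR)
            return false;
        return true;            /* unrestricted pass succeeds */
    }

    static void reset_alloc_batches(void)
    {
        printf("batches recharged for the local zones\n");
    }

    static bool alloc_page(void)
    {
        int alloc_flags = ALLOC_FAIR;   /* the fast path starts with fairness on */

    retry:
        if (get_page_from_freelist(alloc_flags))
            return true;
        if (alloc_flags & ALLOC_FAIR) {
            reset_alloc_batches();
            alloc_flags &= ~ALLOC_FAIR; /* second pass: any zone, no batch limit */
            goto retry;
        }
        return false;                   /* only now would the slow path run */
    }

    int main(void)
    {
        printf("allocated: %d after %d fast-path passes\n", alloc_page(), attempts);
        return 0;
    }
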
@@ -2835,27 +2818,51 @@ void free_pages(unsigned long addr, unsigned int order)
 EXPORT_SYMBOL(free_pages);
 
 /*
- * __free_memcg_kmem_pages and free_memcg_kmem_pages will free
- * pages allocated with __GFP_KMEMCG.
+ * alloc_kmem_pages charges newly allocated pages to the kmem resource counter
+ * of the current memory cgroup.
  *
- * Those pages are accounted to a particular memcg, embedded in the
- * corresponding page_cgroup. To avoid adding a hit in the allocator to search
- * for that information only to find out that it is NULL for users who have no
- * interest in that whatsoever, we provide these functions.
- *
- * The caller knows better which flags it relies on.
+ * It should be used when the caller would like to use kmalloc, but since the
+ * allocation is large, it has to fall back to the page allocator.
+ */
+struct page *alloc_kmem_pages(gfp_t gfp_mask, unsigned int order)
+{
+       struct page *page;
+       struct mem_cgroup *memcg = NULL;
+
+       if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order))
+               return NULL;
+       page = alloc_pages(gfp_mask, order);
+       memcg_kmem_commit_charge(page, memcg, order);
+       return page;
+}
+
+struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask, unsigned int order)
+{
+       struct page *page;
+       struct mem_cgroup *memcg = NULL;
+
+       if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order))
+               return NULL;
+       page = alloc_pages_node(nid, gfp_mask, order);
+       memcg_kmem_commit_charge(page, memcg, order);
+       return page;
+}
+
+/*
+ * __free_kmem_pages and free_kmem_pages will free pages allocated with
+ * alloc_kmem_pages.
  */
-void __free_memcg_kmem_pages(struct page *page, unsigned int order)
+void __free_kmem_pages(struct page *page, unsigned int order)
 {
        memcg_kmem_uncharge_pages(page, order);
        __free_pages(page, order);
 }
 
-void free_memcg_kmem_pages(unsigned long addr, unsigned int order)
+void free_kmem_pages(unsigned long addr, unsigned int order)
 {
        if (addr != 0) {
                VM_BUG_ON(!virt_addr_valid((void *)addr));
-               __free_memcg_kmem_pages(virt_to_page((void *)addr), order);
+               __free_kmem_pages(virt_to_page((void *)addr), order);
        }
 }
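
The memcg charge/commit pair that used to sit in __alloc_pages_nodemask() behind __GFP_KMEMCG is now confined to these helpers, so ordinary allocations no longer pay for it. A hedged sketch of how a caller that outgrows kmalloc might use the new pair, written against the interfaces as they appear in this diff (kernel code, not a standalone program; the helper names and error handling are illustrative):

    #include <linux/gfp.h>
    #include <linux/mm.h>

    static void *big_buffer_alloc(size_t size)
    {
        unsigned int order = get_order(size);
        struct page *page;

        /* Charged to the current memcg's kmem counter before being handed out. */
        page = alloc_kmem_pages(GFP_KERNEL, order);
        if (!page)
            return NULL;
        return page_address(page);
    }

    static void big_buffer_free(void *buf, size_t size)
    {
        /* Uncharges the memcg and returns the pages to the buddy allocator. */
        free_kmem_pages((unsigned long)buf, get_order(size));
    }
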
 
@@ -3045,9 +3052,9 @@ bool skip_free_areas_node(unsigned int flags, int nid)
                goto out;
 
        do {
-               cpuset_mems_cookie = get_mems_allowed();
+               cpuset_mems_cookie = read_mems_allowed_begin();
                ret = !node_isset(nid, cpuset_current_mems_allowed);
-       } while (!put_mems_allowed(cpuset_mems_cookie));
+       } while (read_mems_allowed_retry(cpuset_mems_cookie));
 out:
        return ret;
 }
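
read_mems_allowed_begin()/read_mems_allowed_retry() turn the old get/put_mems_allowed() pair into a sequence-count style read side: sample a counter, read the cpuset state, and retry only if a concurrent update bumped the counter in between. A generic userspace sketch of that retry pattern, using a plain atomic counter instead of the kernel's seqcount (which additionally encodes an in-progress writer):

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    /* A writer increments this around every update of the protected data. */
    static atomic_uint mems_seq;
    static int mems_allowed;            /* the protected data */

    static unsigned int read_begin(void)
    {
        return atomic_load(&mems_seq);
    }

    static bool read_retry(unsigned int seq)
    {
        /* Retry if a writer ran since read_begin(). */
        return atomic_load(&mems_seq) != seq;
    }

    int main(void)
    {
        unsigned int seq;
        int snapshot;

        do {
            seq = read_begin();
            snapshot = mems_allowed;    /* read the data without locking */
        } while (read_retry(seq));

        printf("consistent snapshot: %d\n", snapshot);
        return 0;
    }
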
@@ -4919,7 +4926,6 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
 
        pgdat->node_id = nid;
        pgdat->node_start_pfn = node_start_pfn;
-       init_zone_allows_reclaim(nid);
 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
        get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
 #endif
@@ -5070,7 +5076,7 @@ static void __init find_zone_movable_pfns_for_nodes(void)
        nodemask_t saved_node_state = node_states[N_MEMORY];
        unsigned long totalpages = early_calculate_totalpages();
        int usable_nodes = nodes_weight(node_states[N_MEMORY]);
-       struct memblock_type *type = &memblock.memory;
+       struct memblock_region *r;
 
        /* Need to find movable_zone earlier when movable_node is specified. */
        find_usable_zone_for_movable();
@@ -5080,13 +5086,13 @@ static void __init find_zone_movable_pfns_for_nodes(void)
         * options.
         */
        if (movable_node_is_enabled()) {
-               for (i = 0; i < type->cnt; i++) {
-                       if (!memblock_is_hotpluggable(&type->regions[i]))
+               for_each_memblock(memory, r) {
+                       if (!memblock_is_hotpluggable(r))
                                continue;
 
-                       nid = type->regions[i].nid;
+                       nid = r->nid;
 
-                       usable_startpfn = PFN_DOWN(type->regions[i].base);
+                       usable_startpfn = PFN_DOWN(r->base);
                        zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
                                min(usable_startpfn, zone_movable_pfn[nid]) :
                                usable_startpfn;
@@ -6544,7 +6550,8 @@ static void dump_page_flags(unsigned long flags)
        printk(")\n");
 }
 
-void dump_page_badflags(struct page *page, char *reason, unsigned long badflags)
+void dump_page_badflags(struct page *page, const char *reason,
+               unsigned long badflags)
 {
        printk(KERN_ALERT
               "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n",
@@ -6560,8 +6567,8 @@ void dump_page_badflags(struct page *page, char *reason, unsigned long badflags)
        mem_cgroup_print_bad_page(page);
 }
 
-void dump_page(struct page *page, char *reason)
+void dump_page(struct page *page, const char *reason)
 {
        dump_page_badflags(page, reason, 0);
 }
-EXPORT_SYMBOL_GPL(dump_page);
+EXPORT_SYMBOL(dump_page);