Merge branch 'x86-fixes-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git...

[linux-2.6-block.git] / mm / page_alloc.c
diff --git a/mm/page_alloc.c b/mm/page_alloc.c

index 8be81422d4bd8de1d81d522884f159fa54e25f97..a873e61e312e6dd7795b4b734a0370bc844d9f29 100644 (file)
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -357,6 +357,7 @@ void prep_compound_page(struct page *page, unsigned long order)
         }
  }
  
+/* update __split_huge_page_refcount if you change this function */
  static int destroy_compound_page(struct page *page, unsigned long order)
  {
         int i;
@@ -426,18 +427,10 @@ static inline void rmv_page_order(struct page *page)
   *
   * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
   */
-static inline struct page *
-__page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order)
-{
-       unsigned long buddy_idx = page_idx ^ (1 << order);
-
-       return page + (buddy_idx - page_idx);
-}
-
  static inline unsigned long
-__find_combined_index(unsigned long page_idx, unsigned int order)
+__find_buddy_index(unsigned long page_idx, unsigned int order)
  {
-       return (page_idx & ~(1 << order));
+       return page_idx ^ (1 << order);
  }
  
  /*
@@ -448,8 +441,8 @@ __find_combined_index(unsigned long page_idx, unsigned int order)
   * (c) a page and its buddy have the same order &&
   * (d) a page and its buddy are in the same zone.
   *
- * For recording whether a page is in the buddy system, we use PG_buddy.
- * Setting, clearing, and testing PG_buddy is serialized by zone->lock.
+ * For recording whether a page is in the buddy system, we set ->_mapcount -2.
+ * Setting, clearing, and testing _mapcount -2 is serialized by zone->lock.
   *
   * For recording page's order, we use page_private(page).
   */
@@ -482,7 +475,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
   * as necessary, plus some accounting needed to play nicely with other
   * parts of the VM system.
   * At each level, we keep a list of pages, which are heads of continuous
- * free pages of length of (1 << order) and marked with PG_buddy. Page's
+ * free pages of length of (1 << order) and marked with _mapcount -2. Page's
   * order is recorded in page_private(page) field.
   * So when we are allocating or freeing one, we can derive the state of the
   * other.  That is, if we allocate a small block, and both were   
@@ -499,6 +492,7 @@ static inline void __free_one_page(struct page *page,
  {
         unsigned long page_idx;
         unsigned long combined_idx;
+       unsigned long uninitialized_var(buddy_idx);
         struct page *buddy;
  
         if (unlikely(PageCompound(page)))
@@ -513,7 +507,8 @@ static inline void __free_one_page(struct page *page,
         VM_BUG_ON(bad_range(zone, page));
  
         while (order < MAX_ORDER-1) {
-               buddy = __page_find_buddy(page, page_idx, order);
+               buddy_idx = __find_buddy_index(page_idx, order);
+               buddy = page + (buddy_idx - page_idx);
                 if (!page_is_buddy(page, buddy, order))
                         break;
  
@@ -521,7 +516,7 @@ static inline void __free_one_page(struct page *page,
                 list_del(&buddy->lru);
                 zone->free_area[order].nr_free--;
                 rmv_page_order(buddy);
-               combined_idx = __find_combined_index(page_idx, order);
+               combined_idx = buddy_idx & page_idx;
                 page = page + (combined_idx - page_idx);
                 page_idx = combined_idx;
                 order++;
@@ -538,9 +533,10 @@ static inline void __free_one_page(struct page *page,
          */
         if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) {
                 struct page *higher_page, *higher_buddy;
-               combined_idx = __find_combined_index(page_idx, order);
-               higher_page = page + combined_idx - page_idx;
-               higher_buddy = __page_find_buddy(higher_page, combined_idx, order + 1);
+               combined_idx = buddy_idx & page_idx;
+               higher_page = page + (combined_idx - page_idx);
+               buddy_idx = __find_buddy_index(combined_idx, order + 1);
+               higher_buddy = higher_page + (buddy_idx - combined_idx);
                 if (page_is_buddy(higher_page, higher_buddy, order + 1)) {
                         list_add_tail(&page->lru,
                                 &zone->free_area[order].free_list[migratetype]);
@@ -1092,8 +1088,10 @@ static void drain_pages(unsigned int cpu)
                 pset = per_cpu_ptr(zone->pageset, cpu);
  
                 pcp = &pset->pcp;
-               free_pcppages_bulk(zone, pcp->count, pcp);
-               pcp->count = 0;
+               if (pcp->count) {
+                       free_pcppages_bulk(zone, pcp->count, pcp);
+                       pcp->count = 0;
+               }
                 local_irq_restore(flags);
         }
  }
@@ -1813,15 +1811,14 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
         bool sync_migration)
  {
         struct page *page;
-       struct task_struct *tsk = current;
  
         if (!order || compaction_deferred(preferred_zone))
                 return NULL;
  
-       tsk->flags |= PF_MEMALLOC;
+       current->flags |= PF_MEMALLOC;
         *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
                                                 nodemask, sync_migration);
-       tsk->flags &= ~PF_MEMALLOC;
+       current->flags &= ~PF_MEMALLOC;
         if (*did_some_progress != COMPACT_SKIPPED) {
  
                 /* Page migration frees to the PCP lists but we want merging */
@@ -1873,23 +1870,22 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
  {
         struct page *page = NULL;
         struct reclaim_state reclaim_state;
-       struct task_struct *p = current;
         bool drained = false;
  
         cond_resched();
  
         /* We now go into synchronous reclaim */
         cpuset_memory_pressure_bump();
-       p->flags |= PF_MEMALLOC;
+       current->flags |= PF_MEMALLOC;
         lockdep_set_current_reclaim_state(gfp_mask);
         reclaim_state.reclaimed_slab = 0;
-       p->reclaim_state = &reclaim_state;
+       current->reclaim_state = &reclaim_state;
  
         *did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);
  
-       p->reclaim_state = NULL;
+       current->reclaim_state = NULL;
         lockdep_clear_current_reclaim_state();
-       p->flags &= ~PF_MEMALLOC;
+       current->flags &= ~PF_MEMALLOC;
  
         cond_resched();
  
@@ -1954,7 +1950,6 @@ void wake_all_kswapd(unsigned int order, struct zonelist *zonelist,
  static inline int
  gfp_to_alloc_flags(gfp_t gfp_mask)
  {
-       struct task_struct *p = current;
         int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
         const gfp_t wait = gfp_mask & __GFP_WAIT;
  
@@ -1970,18 +1965,23 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
         alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
  
         if (!wait) {
-               alloc_flags |= ALLOC_HARDER;
+               /*
+                * Not worth trying to allocate harder for
+                * __GFP_NOMEMALLOC even if it can't schedule.
+                */
+               if  (!(gfp_mask & __GFP_NOMEMALLOC))
+                       alloc_flags |= ALLOC_HARDER;
                 /*
                  * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
                  * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
                  */
                 alloc_flags &= ~ALLOC_CPUSET;
-       } else if (unlikely(rt_task(p)) && !in_interrupt())
+       } else if (unlikely(rt_task(current)) && !in_interrupt())
                 alloc_flags |= ALLOC_HARDER;
  
         if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
                 if (!in_interrupt() &&
-                   ((p->flags & PF_MEMALLOC) ||
+                   ((current->flags & PF_MEMALLOC) ||
                      unlikely(test_thread_flag(TIF_MEMDIE))))
                         alloc_flags |= ALLOC_NO_WATERMARKS;
         }
@@ -2000,7 +2000,6 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
         int alloc_flags;
         unsigned long pages_reclaimed = 0;
         unsigned long did_some_progress;
-       struct task_struct *p = current;
         bool sync_migration = false;
  
         /*
@@ -2026,7 +2025,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
                 goto nopage;
  
  restart:
-       wake_all_kswapd(order, zonelist, high_zoneidx,
+       if (!(gfp_mask & __GFP_NO_KSWAPD))
+               wake_all_kswapd(order, zonelist, high_zoneidx,
                                                 zone_idx(preferred_zone));
  
         /*
@@ -2036,6 +2036,14 @@ restart:
          */
         alloc_flags = gfp_to_alloc_flags(gfp_mask);
  
+       /*
+        * Find the true preferred zone if the allocation is unconstrained by
+        * cpusets.
+        */
+       if (!(alloc_flags & ALLOC_CPUSET) && !nodemask)
+               first_zones_zonelist(zonelist, high_zoneidx, NULL,
+                                       &preferred_zone);
+
         /* This is the last chance, in general, before the goto nopage. */
         page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
                         high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
@@ -2058,7 +2066,7 @@ rebalance:
                 goto nopage;
  
         /* Avoid recursion of direct reclaim */
-       if (p->flags & PF_MEMALLOC)
+       if (current->flags & PF_MEMALLOC)
                 goto nopage;
  
         /* Avoid allocations with no watermarks from looping endlessly */
@@ -2151,7 +2159,7 @@ nopage:
         if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
                 printk(KERN_WARNING "%s: page allocation failure."
                         " order:%d, mode:0x%x\n",
-                       p->comm, order, gfp_mask);
+                       current->comm, order, gfp_mask);
                 dump_stack();
                 show_mem();
         }
@@ -2194,7 +2202,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
  
         get_mems_allowed();
         /* The preferred zone is used for statistics later */
-       first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone);
+       first_zones_zonelist(zonelist, high_zoneidx,
+                               nodemask ? : &cpuset_current_mems_allowed,
+                               &preferred_zone);
         if (!preferred_zone) {
                 put_mems_allowed();
                 return NULL;
@@ -5567,7 +5577,6 @@ static struct trace_print_flags pageflag_names[] = {
         {1UL << PG_swapcache,           "swapcache"     },
         {1UL << PG_mappedtodisk,        "mappedtodisk"  },
         {1UL << PG_reclaim,             "reclaim"       },
-       {1UL << PG_buddy,               "buddy"         },
         {1UL << PG_swapbacked,          "swapbacked"    },
         {1UL << PG_unevictable,         "unevictable"   },
  #ifdef CONFIG_MMU