mm: FOLL flags for GUP flags
[linux-block.git] / mm / page_alloc.c
index 80f954d82d773f311ed66810126802865fe2b7e6..6877e22e3aa19c6e81551fb9ffa2b8643f19413b 100644 (file)
@@ -48,6 +48,7 @@
 #include <linux/page_cgroup.h>
 #include <linux/debugobjects.h>
 #include <linux/kmemleak.h>
+#include <trace/events/kmem.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -123,8 +124,8 @@ static char * const zone_names[MAX_NR_ZONES] = {
 
 int min_free_kbytes = 1024;
 
-unsigned long __meminitdata nr_kernel_pages;
-unsigned long __meminitdata nr_all_pages;
+static unsigned long __meminitdata nr_kernel_pages;
+static unsigned long __meminitdata nr_all_pages;
 static unsigned long __meminitdata dma_reserve;
 
 #ifdef CONFIG_ARCH_POPULATES_NODE_MAP
@@ -510,7 +511,7 @@ static inline int free_pages_check(struct page *page)
 }
 
 /*
- * Frees a list of pages. 
+ * Frees a number of pages from the PCP lists
  * Assumes all pages on list are in same zone, and of same order.
  * count is the number of pages to free.
  *
@@ -520,22 +521,42 @@ static inline int free_pages_check(struct page *page)
  * And clear the zone's pages_scanned counter, to hold off the "all pages are
  * pinned" detection logic.
  */
-static void free_pages_bulk(struct zone *zone, int count,
-                                       struct list_head *list, int order)
+static void free_pcppages_bulk(struct zone *zone, int count,
+                                       struct per_cpu_pages *pcp)
 {
+       int migratetype = 0;
+       int batch_free = 0;
+
        spin_lock(&zone->lock);
        zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE);
        zone->pages_scanned = 0;
 
-       __mod_zone_page_state(zone, NR_FREE_PAGES, count << order);
-       while (count--) {
+       __mod_zone_page_state(zone, NR_FREE_PAGES, count);
+       while (count) {
                struct page *page;
+               struct list_head *list;
 
-               VM_BUG_ON(list_empty(list));
-               page = list_entry(list->prev, struct page, lru);
-               /* have to delete it as __free_one_page list manipulates */
-               list_del(&page->lru);
-               __free_one_page(page, zone, order, page_private(page));
+               /*
+                * Round-robin over the pcp lists. batch_free grows with each
+                * list visited, so after skipping empty lists more pages come
+                * off the fuller ones. Pages go back under page_private(), not
+                * the list index: free_hot_cold_page() files RESERVE pages on
+                * the MOVABLE list. count must not exceed the pages queued.
+                */
+               do {
+                       batch_free++;
+                       if (++migratetype == MIGRATE_PCPTYPES)
+                               migratetype = 0;
+                       list = &pcp->lists[migratetype];
+               } while (list_empty(list));
+
+               do {
+                       page = list_entry(list->prev, struct page, lru);
+                       /* must delete as __free_one_page list manipulates */
+                       list_del(&page->lru);
+                       __free_one_page(page, zone, 0, page_private(page));
+                       trace_mm_page_pcpu_drain(page, 0, page_private(page));
+               } while (--count && --batch_free && !list_empty(list));
        }
        spin_unlock(&zone->lock);
 }
@@ -853,6 +874,10 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
                                                        start_migratetype);
 
                        expand(zone, page, order, current_order, area, migratetype);
+
+                       trace_mm_page_alloc_extfrag(page, order, current_order,
+                               start_migratetype, migratetype);
+
                        return page;
                }
        }
@@ -886,6 +911,7 @@ retry_reserve:
                }
        }
 
+       trace_mm_page_alloc_zone_locked(page, order, migratetype);
        return page;
 }
 
@@ -946,7 +972,7 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
                to_drain = pcp->batch;
        else
                to_drain = pcp->count;
-       free_pages_bulk(zone, to_drain, &pcp->list, 0);
+       free_pcppages_bulk(zone, to_drain, pcp);
        pcp->count -= to_drain;
        local_irq_restore(flags);
 }
@@ -972,7 +998,7 @@ static void drain_pages(unsigned int cpu)
 
                pcp = &pset->pcp;
                local_irq_save(flags);
-               free_pages_bulk(zone, pcp->count, &pcp->list, 0);
+               free_pcppages_bulk(zone, pcp->count, pcp);
                pcp->count = 0;
                local_irq_restore(flags);
        }
@@ -1038,6 +1064,7 @@ static void free_hot_cold_page(struct page *page, int cold)
        struct zone *zone = page_zone(page);
        struct per_cpu_pages *pcp;
        unsigned long flags;
+       int migratetype;
        int wasMlocked = __TestClearPageMlocked(page);
 
        kmemcheck_free_shadow(page, 0);
@@ -1055,21 +1082,39 @@ static void free_hot_cold_page(struct page *page, int cold)
        kernel_map_pages(page, 1, 0);
 
        pcp = &zone_pcp(zone, get_cpu())->pcp;
-       set_page_private(page, get_pageblock_migratetype(page));
+       migratetype = get_pageblock_migratetype(page);
+       set_page_private(page, migratetype);
        local_irq_save(flags);
        if (unlikely(wasMlocked))
                free_page_mlock(page);
        __count_vm_event(PGFREE);
 
+       /*
+        * We only track unmovable, reclaimable and movable on pcp lists.
+        * Free ISOLATE pages back to the allocator because they are being
+        * offlined but treat RESERVE as movable pages so we can get those
+        * areas back if necessary. Otherwise, we may have to free
+        * excessively into the page allocator
+        */
+       if (migratetype >= MIGRATE_PCPTYPES) {
+               if (unlikely(migratetype == MIGRATE_ISOLATE)) {
+                       free_one_page(zone, page, 0, migratetype);
+                       goto out;
+               }
+               migratetype = MIGRATE_MOVABLE;
+       }
+
        if (cold)
-               list_add_tail(&page->lru, &pcp->list);
+               list_add_tail(&page->lru, &pcp->lists[migratetype]);
        else
-               list_add(&page->lru, &pcp->list);
+               list_add(&page->lru, &pcp->lists[migratetype]);
        pcp->count++;
        if (pcp->count >= pcp->high) {
-               free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
+               free_pcppages_bulk(zone, pcp->batch, pcp);
                pcp->count -= pcp->batch;
        }
+
+out:
        local_irq_restore(flags);
        put_cpu();
 }
@@ -1127,46 +1172,24 @@ again:
        cpu  = get_cpu();
        if (likely(order == 0)) {
                struct per_cpu_pages *pcp;
+               struct list_head *list;
 
                pcp = &zone_pcp(zone, cpu)->pcp;
+               list = &pcp->lists[migratetype];
                local_irq_save(flags);
-               if (!pcp->count) {
-                       pcp->count = rmqueue_bulk(zone, 0,
-                                       pcp->batch, &pcp->list,
-                                       migratetype, cold);
-                       if (unlikely(!pcp->count))
-                               goto failed;
-               }
-
-               /* Find a page of the appropriate migrate type */
-               if (cold) {
-                       list_for_each_entry_reverse(page, &pcp->list, lru)
-                               if (page_private(page) == migratetype)
-                                       break;
-               } else {
-                       list_for_each_entry(page, &pcp->list, lru)
-                               if (page_private(page) == migratetype)
-                                       break;
-               }
-
-               /* Allocate more to the pcp list if necessary */
-               if (unlikely(&page->lru == &pcp->list)) {
-                       int get_one_page = 0;
-
+               if (list_empty(list)) {
                        pcp->count += rmqueue_bulk(zone, 0,
-                                       pcp->batch, &pcp->list,
+                                       pcp->batch, list,
                                        migratetype, cold);
-                       list_for_each_entry(page, &pcp->list, lru) {
-                               if (get_pageblock_migratetype(page) !=
-                                           MIGRATE_ISOLATE) {
-                                       get_one_page = 1;
-                                       break;
-                               }
-                       }
-                       if (!get_one_page)
+                       if (unlikely(list_empty(list)))
                                goto failed;
                }
 
+               if (cold)
+                       page = list_entry(list->prev, struct page, lru);
+               else
+                       page = list_entry(list->next, struct page, lru);
+
                list_del(&page->lru);
                pcp->count--;
        } else {
@@ -2829,7 +2852,8 @@ static void setup_zone_migrate_reserve(struct zone *zone)
 {
        unsigned long start_pfn, pfn, end_pfn;
        struct page *page;
-       unsigned long reserve, block_migratetype;
+       unsigned long block_migratetype;
+       int reserve;
 
        /* Get the start pfn, end pfn and the number of blocks to reserve */
        start_pfn = zone->zone_start_pfn;
@@ -2837,6 +2861,15 @@ static void setup_zone_migrate_reserve(struct zone *zone)
        reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >>
                                                        pageblock_order;
 
+       /*
+        * Reserve blocks are generally in place to help high-order atomic
+        * allocations that are short-lived. A min_free_kbytes value that
+        * would result in more than 2 reserve blocks for atomic allocations
+        * is assumed to be in place to help anti-fragmentation for the
+        * future allocation of hugepages at runtime.
+        */
+       reserve = min(2, reserve);
+
        for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
                if (!pfn_valid(pfn))
                        continue;
@@ -3007,6 +3040,7 @@ static int zone_batchsize(struct zone *zone)
 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
 {
        struct per_cpu_pages *pcp;
+       int migratetype;
 
        memset(p, 0, sizeof(*p));
 
@@ -3014,7 +3048,8 @@ static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
        pcp->count = 0;
        pcp->high = 6 * batch;
        pcp->batch = max(1UL, 1 * batch);
-       INIT_LIST_HEAD(&pcp->list);
+       for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
+               INIT_LIST_HEAD(&pcp->lists[migratetype]);
 }
 
 /*
@@ -3206,7 +3241,7 @@ static int __zone_pcp_update(void *data)
                pcp = &pset->pcp;
 
                local_irq_save(flags);
-               free_pages_bulk(zone, pcp->count, &pcp->list, 0);
+               free_pcppages_bulk(zone, pcp->count, pcp);
                setup_pageset(pset, batch);
                local_irq_restore(flags);
        }
@@ -3792,7 +3827,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
                zone_pcp_init(zone);
                for_each_lru(l) {
                        INIT_LIST_HEAD(&zone->lru[l].list);
-                       zone->lru[l].nr_saved_scan = 0;
+                       zone->reclaim_stat.nr_saved_scan[l] = 0;
                }
                zone->reclaim_stat.recent_rotated[0] = 0;
                zone->reclaim_stat.recent_rotated[1] = 0;
@@ -4804,7 +4839,14 @@ void *__init alloc_large_system_hash(const char *tablename,
                        numentries <<= (PAGE_SHIFT - scale);
 
                /* Make sure we've got at least a 0-order allocation.. */
-               if (unlikely((numentries * bucketsize) < PAGE_SIZE))
+               if (unlikely(flags & HASH_SMALL)) {
+                       /* Makes no sense without HASH_EARLY */
+                       WARN_ON(!(flags & HASH_EARLY));
+                       if (!(numentries >> *_hash_shift)) {
+                               numentries = 1UL << *_hash_shift;
+                               BUG_ON(!numentries);
+                       }
+               } else if (unlikely((numentries * bucketsize) < PAGE_SIZE))
                        numentries = PAGE_SIZE / bucketsize;
        }
        numentries = roundup_pow_of_two(numentries);