mm: remove __GFP_NOFAIL is deprecated comment
diff --git a/mm/compaction.c b/mm/compaction.c
index 585de54dbe8ccef06543b547eadfe8e4fe8e3117..ccf97b02b85f32d38f6f68a41bedf89b9a1ee596 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -7,6 +7,7 @@
  *
  * Copyright IBM Corp. 2007-2010 Mel Gorman <mel@csn.ul.ie>
  */
+#include <linux/cpu.h>
 #include <linux/swap.h>
 #include <linux/migrate.h>
 #include <linux/compaction.h>
@@ -17,6 +18,8 @@
 #include <linux/balloon_compaction.h>
 #include <linux/page-isolation.h>
 #include <linux/kasan.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
 #include "internal.h"
 
 #ifdef CONFIG_COMPACTION
@@ -71,49 +74,6 @@ static inline bool migrate_async_suitable(int migratetype)
        return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE;
 }
 
-/*
- * Check that the whole (or subset of) a pageblock given by the interval of
- * [start_pfn, end_pfn) is valid and within the same zone, before scanning it
- * with the migration of free compaction scanner. The scanners then need to
- * use only pfn_valid_within() check for arches that allow holes within
- * pageblocks.
- *
- * Return struct page pointer of start_pfn, or NULL if checks were not passed.
- *
- * It's possible on some configurations to have a setup like node0 node1 node0
- * i.e. it's possible that all pages within a zones range of pages do not
- * belong to a single zone. We assume that a border between node0 and node1
- * can occur within a single pageblock, but not a node0 node1 node0
- * interleaving within a single pageblock. It is therefore sufficient to check
- * the first and last page of a pageblock and avoid checking each individual
- * page in a pageblock.
- */
-static struct page *pageblock_pfn_to_page(unsigned long start_pfn,
-                               unsigned long end_pfn, struct zone *zone)
-{
-       struct page *start_page;
-       struct page *end_page;
-
-       /* end_pfn is one past the range we are checking */
-       end_pfn--;
-
-       if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn))
-               return NULL;
-
-       start_page = pfn_to_page(start_pfn);
-
-       if (page_zone(start_page) != zone)
-               return NULL;
-
-       end_page = pfn_to_page(end_pfn);
-
-       /* This gives a shorter code than deriving page_zone(end_page) */
-       if (page_zone_id(start_page) != page_zone_id(end_page))
-               return NULL;
-
-       return start_page;
-}
-
 #ifdef CONFIG_COMPACTION
 
 /* Do not skip compaction more than 64 times */
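A minimal sketch of the pageblock alignment arithmetic the updated callers below now perform before calling pageblock_pfn_to_page(), assuming pageblock_nr_pages is a power of two; the helper name is made up for illustration and is not part of the patch:

static unsigned long example_pageblock_start_pfn(unsigned long pfn,
                                                 unsigned long zone_start_pfn)
{
        /* Round pfn down to the start of the pageblock containing it... */
        unsigned long block_start_pfn = pfn & ~(pageblock_nr_pages - 1);

        /*
         * ...but never below the zone: the pageblock holding the zone's
         * first pfn may begin before zone_start_pfn.
         */
        if (block_start_pfn < zone_start_pfn)
                block_start_pfn = zone_start_pfn;

        return block_start_pfn;
}
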
@@ -200,7 +160,8 @@ static void reset_cached_positions(struct zone *zone)
 {
        zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn;
        zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn;
-       zone->compact_cached_free_pfn = zone_end_pfn(zone);
+       zone->compact_cached_free_pfn =
+                       round_down(zone_end_pfn(zone) - 1, pageblock_nr_pages);
 }
 
 /*
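A worked example of the round_down() initialization above, with made-up numbers (pageblock_nr_pages == 512, i.e. 2MB pageblocks with 4KB pages); the helper is illustrative only:

static unsigned long example_last_pageblock_pfn(struct zone *zone)
{
        /*
         * e.g. zone_end_pfn(zone) == 1000000: round_down(999999, 512) is
         * 999936, the start of the last pageblock inside the zone, whereas
         * the old code cached 1000000, which points one past the zone.
         */
        return round_down(zone_end_pfn(zone) - 1, pageblock_nr_pages);
}
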
@@ -554,13 +515,17 @@ unsigned long
 isolate_freepages_range(struct compact_control *cc,
                        unsigned long start_pfn, unsigned long end_pfn)
 {
-       unsigned long isolated, pfn, block_end_pfn;
+       unsigned long isolated, pfn, block_start_pfn, block_end_pfn;
        LIST_HEAD(freelist);
 
        pfn = start_pfn;
+       block_start_pfn = pfn & ~(pageblock_nr_pages - 1);
+       if (block_start_pfn < cc->zone->zone_start_pfn)
+               block_start_pfn = cc->zone->zone_start_pfn;
        block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
 
        for (; pfn < end_pfn; pfn += isolated,
+                               block_start_pfn = block_end_pfn,
                                block_end_pfn += pageblock_nr_pages) {
                /* Protect pfn from changing by isolate_freepages_block */
                unsigned long isolate_start_pfn = pfn;
@@ -573,11 +538,13 @@ isolate_freepages_range(struct compact_control *cc,
                 * scanning range to right one.
                 */
                if (pfn >= block_end_pfn) {
+                       block_start_pfn = pfn & ~(pageblock_nr_pages - 1);
                        block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
                        block_end_pfn = min(block_end_pfn, end_pfn);
                }
 
-               if (!pageblock_pfn_to_page(pfn, block_end_pfn, cc->zone))
+               if (!pageblock_pfn_to_page(block_start_pfn,
+                                       block_end_pfn, cc->zone))
                        break;
 
                isolated = isolate_freepages_block(cc, &isolate_start_pfn,
@@ -863,18 +830,23 @@ unsigned long
 isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
                                                        unsigned long end_pfn)
 {
-       unsigned long pfn, block_end_pfn;
+       unsigned long pfn, block_start_pfn, block_end_pfn;
 
        /* Scan block by block. First and last block may be incomplete */
        pfn = start_pfn;
+       block_start_pfn = pfn & ~(pageblock_nr_pages - 1);
+       if (block_start_pfn < cc->zone->zone_start_pfn)
+               block_start_pfn = cc->zone->zone_start_pfn;
        block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
 
        for (; pfn < end_pfn; pfn = block_end_pfn,
+                               block_start_pfn = block_end_pfn,
                                block_end_pfn += pageblock_nr_pages) {
 
                block_end_pfn = min(block_end_pfn, end_pfn);
 
-               if (!pageblock_pfn_to_page(pfn, block_end_pfn, cc->zone))
+               if (!pageblock_pfn_to_page(block_start_pfn,
+                                       block_end_pfn, cc->zone))
                        continue;
 
                pfn = isolate_migratepages_block(cc, pfn, block_end_pfn,
@@ -1103,7 +1075,9 @@ int sysctl_compact_unevictable_allowed __read_mostly = 1;
 static isolate_migrate_t isolate_migratepages(struct zone *zone,
                                        struct compact_control *cc)
 {
-       unsigned long low_pfn, end_pfn;
+       unsigned long block_start_pfn;
+       unsigned long block_end_pfn;
+       unsigned long low_pfn;
        unsigned long isolate_start_pfn;
        struct page *page;
        const isolate_mode_t isolate_mode =
@@ -1115,16 +1089,21 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
         * initialized by compact_zone()
         */
        low_pfn = cc->migrate_pfn;
+       block_start_pfn = cc->migrate_pfn & ~(pageblock_nr_pages - 1);
+       if (block_start_pfn < zone->zone_start_pfn)
+               block_start_pfn = zone->zone_start_pfn;
 
        /* Only scan within a pageblock boundary */
-       end_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages);
+       block_end_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages);
 
        /*
         * Iterate over whole pageblocks until we find the first suitable.
         * Do not cross the free scanner.
         */
-       for (; end_pfn <= cc->free_pfn;
-                       low_pfn = end_pfn, end_pfn += pageblock_nr_pages) {
+       for (; block_end_pfn <= cc->free_pfn;
+                       low_pfn = block_end_pfn,
+                       block_start_pfn = block_end_pfn,
+                       block_end_pfn += pageblock_nr_pages) {
 
                /*
                 * This can potentially iterate a massively long zone with
@@ -1135,7 +1114,8 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
                                                && compact_should_abort(cc))
                        break;
 
-               page = pageblock_pfn_to_page(low_pfn, end_pfn, zone);
+               page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn,
+                                                                       zone);
                if (!page)
                        continue;
 
@@ -1154,8 +1134,8 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
 
                /* Perform the isolation */
                isolate_start_pfn = low_pfn;
-               low_pfn = isolate_migratepages_block(cc, low_pfn, end_pfn,
-                                                               isolate_mode);
+               low_pfn = isolate_migratepages_block(cc, low_pfn,
+                                               block_end_pfn, isolate_mode);
 
                if (!low_pfn || cc->contended) {
                        acct_isolated(zone, cc);
@@ -1211,11 +1191,11 @@ static int __compact_finished(struct zone *zone, struct compact_control *cc,
 
                /*
                 * Mark that the PG_migrate_skip information should be cleared
-                * by kswapd when it goes to sleep. kswapd does not set the
+                * by kswapd when it goes to sleep. kcompactd does not set the
                 * flag itself as the decision to be clear should be directly
                 * based on an allocation request.
                 */
-               if (!current_is_kswapd())
+               if (cc->direct_compaction)
                        zone->compact_blockskip_flush = true;
 
                return COMPACT_COMPLETE;
@@ -1358,10 +1338,9 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 
        /*
         * Clear pageblock skip if there were failures recently and compaction
-        * is about to be retried after being deferred. kswapd does not do
-        * this reset as it'll reset the cached information when going to sleep.
+        * is about to be retried after being deferred.
         */
-       if (compaction_restarting(zone, cc->order) && !current_is_kswapd())
+       if (compaction_restarting(zone, cc->order))
                __reset_isolation_suitable(zone);
 
        /*
@@ -1371,11 +1350,11 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
         */
        cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync];
        cc->free_pfn = zone->compact_cached_free_pfn;
-       if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) {
-               cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1);
+       if (cc->free_pfn < start_pfn || cc->free_pfn >= end_pfn) {
+               cc->free_pfn = round_down(end_pfn - 1, pageblock_nr_pages);
                zone->compact_cached_free_pfn = cc->free_pfn;
        }
-       if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) {
+       if (cc->migrate_pfn < start_pfn || cc->migrate_pfn >= end_pfn) {
                cc->migrate_pfn = start_pfn;
                zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
                zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
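The switch from '>' to '>=' above closes a boundary case; a sketch with made-up numbers:

/*
 * Illustrative only. Suppose end_pfn == 1000000 and the cached free pfn is
 * a stale 1000000, exactly one past the zone:
 *
 *   old check: cc->free_pfn > end_pfn   -> false, the stale value is kept
 *   new check: cc->free_pfn >= end_pfn  -> true, it is reset to
 *              round_down(end_pfn - 1, pageblock_nr_pages)
 */
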
@@ -1497,6 +1476,7 @@ static unsigned long compact_zone_order(struct zone *zone, int order,
                .mode = mode,
                .alloc_flags = alloc_flags,
                .classzone_idx = classzone_idx,
+               .direct_compaction = true,
        };
        INIT_LIST_HEAD(&cc.freepages);
        INIT_LIST_HEAD(&cc.migratepages);
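Only this direct-compaction entry point sets the new flag; kcompactd below leaves it clear, which is what __compact_finished() now tests instead of !current_is_kswapd(). An abridged sketch of the two setups (fields other than the flag omitted):

/* Illustrative only -- abridged from the two call sites in this file. */
struct compact_control direct_cc = {
        /* allocation path: may mark pageblock skip bits for flushing */
        .direct_compaction = true,
        /* ... */
};

struct compact_control kcompactd_cc = {
        /*
         * .direct_compaction stays false: the background daemon, like
         * kswapd, must not request the PG_migrate_skip flush itself.
         */
        /* ... */
};
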
@@ -1759,4 +1739,223 @@ void compaction_unregister_node(struct node *node)
 }
 #endif /* CONFIG_SYSFS && CONFIG_NUMA */
 
+static inline bool kcompactd_work_requested(pg_data_t *pgdat)
+{
+       return pgdat->kcompactd_max_order > 0;
+}
+
+static bool kcompactd_node_suitable(pg_data_t *pgdat)
+{
+       int zoneid;
+       struct zone *zone;
+       enum zone_type classzone_idx = pgdat->kcompactd_classzone_idx;
+
+       for (zoneid = 0; zoneid < classzone_idx; zoneid++) {
+               zone = &pgdat->node_zones[zoneid];
+
+               if (!populated_zone(zone))
+                       continue;
+
+               if (compaction_suitable(zone, pgdat->kcompactd_max_order, 0,
+                                       classzone_idx) == COMPACT_CONTINUE)
+                       return true;
+       }
+
+       return false;
+}
+
+static void kcompactd_do_work(pg_data_t *pgdat)
+{
+       /*
+        * With no special task, compact all zones so that a page of requested
+        * order is allocatable.
+        */
+       int zoneid;
+       struct zone *zone;
+       struct compact_control cc = {
+               .order = pgdat->kcompactd_max_order,
+               .classzone_idx = pgdat->kcompactd_classzone_idx,
+               .mode = MIGRATE_SYNC_LIGHT,
+               .ignore_skip_hint = true,
+
+       };
+       bool success = false;
+
+       trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order,
+                                                       cc.classzone_idx);
+       count_vm_event(KCOMPACTD_WAKE);
+
+       for (zoneid = 0; zoneid < cc.classzone_idx; zoneid++) {
+               int status;
+
+               zone = &pgdat->node_zones[zoneid];
+               if (!populated_zone(zone))
+                       continue;
+
+               if (compaction_deferred(zone, cc.order))
+                       continue;
+
+               if (compaction_suitable(zone, cc.order, 0, zoneid) !=
+                                                       COMPACT_CONTINUE)
+                       continue;
+
+               cc.nr_freepages = 0;
+               cc.nr_migratepages = 0;
+               cc.zone = zone;
+               INIT_LIST_HEAD(&cc.freepages);
+               INIT_LIST_HEAD(&cc.migratepages);
+
+               status = compact_zone(zone, &cc);
+
+               if (zone_watermark_ok(zone, cc.order, low_wmark_pages(zone),
+                                               cc.classzone_idx, 0)) {
+                       success = true;
+                       compaction_defer_reset(zone, cc.order, false);
+               } else if (status == COMPACT_COMPLETE) {
+                       /*
+                        * We use sync migration mode here, so we defer like
+                        * sync direct compaction does.
+                        */
+                       defer_compaction(zone, cc.order);
+               }
+
+               VM_BUG_ON(!list_empty(&cc.freepages));
+               VM_BUG_ON(!list_empty(&cc.migratepages));
+       }
+
+       /*
+        * Regardless of success, we are done until woken up next. But remember
+        * the requested order/classzone_idx in case it was higher/tighter than
+        * our current ones.
+        */
+       if (pgdat->kcompactd_max_order <= cc.order)
+               pgdat->kcompactd_max_order = 0;
+       if (pgdat->kcompactd_classzone_idx >= cc.classzone_idx)
+               pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1;
+}
+
+void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx)
+{
+       if (!order)
+               return;
+
+       if (pgdat->kcompactd_max_order < order)
+               pgdat->kcompactd_max_order = order;
+
+       if (pgdat->kcompactd_classzone_idx > classzone_idx)
+               pgdat->kcompactd_classzone_idx = classzone_idx;
+
+       if (!waitqueue_active(&pgdat->kcompactd_wait))
+               return;
+
+       if (!kcompactd_node_suitable(pgdat))
+               return;
+
+       trace_mm_compaction_wakeup_kcompactd(pgdat->node_id, order,
+                                                       classzone_idx);
+       wake_up_interruptible(&pgdat->kcompactd_wait);
+}
+
+/*
+ * The background compaction daemon, started as a kernel thread
+ * from the init process.
+ */
+static int kcompactd(void *p)
+{
+       pg_data_t *pgdat = (pg_data_t*)p;
+       struct task_struct *tsk = current;
+
+       const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
+
+       if (!cpumask_empty(cpumask))
+               set_cpus_allowed_ptr(tsk, cpumask);
+
+       set_freezable();
+
+       pgdat->kcompactd_max_order = 0;
+       pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1;
+
+       while (!kthread_should_stop()) {
+               trace_mm_compaction_kcompactd_sleep(pgdat->node_id);
+               wait_event_freezable(pgdat->kcompactd_wait,
+                               kcompactd_work_requested(pgdat));
+
+               kcompactd_do_work(pgdat);
+       }
+
+       return 0;
+}
+
+/*
+ * This kcompactd start function will be called by init and node-hot-add.
+ * On node-hot-add, kcompactd will be moved to proper cpus if cpus are hot-added.
+ */
+int kcompactd_run(int nid)
+{
+       pg_data_t *pgdat = NODE_DATA(nid);
+       int ret = 0;
+
+       if (pgdat->kcompactd)
+               return 0;
+
+       pgdat->kcompactd = kthread_run(kcompactd, pgdat, "kcompactd%d", nid);
+       if (IS_ERR(pgdat->kcompactd)) {
+               pr_err("Failed to start kcompactd on node %d\n", nid);
+               ret = PTR_ERR(pgdat->kcompactd);
+               pgdat->kcompactd = NULL;
+       }
+       return ret;
+}
+
+/*
+ * Called by memory hotplug when all memory in a node is offlined. Caller must
+ * hold mem_hotplug_begin/end().
+ */
+void kcompactd_stop(int nid)
+{
+       struct task_struct *kcompactd = NODE_DATA(nid)->kcompactd;
+
+       if (kcompactd) {
+               kthread_stop(kcompactd);
+               NODE_DATA(nid)->kcompactd = NULL;
+       }
+}
+
+/*
+ * It's optimal to keep kcompactd on the same CPUs as their memory, but
+ * not required for correctness. So if the last cpu in a node goes
+ * away, we get changed to run anywhere: as the first one comes back,
+ * restore their cpu bindings.
+ */
+static int cpu_callback(struct notifier_block *nfb, unsigned long action,
+                       void *hcpu)
+{
+       int nid;
+
+       if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) {
+               for_each_node_state(nid, N_MEMORY) {
+                       pg_data_t *pgdat = NODE_DATA(nid);
+                       const struct cpumask *mask;
+
+                       mask = cpumask_of_node(pgdat->node_id);
+
+                       if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
+                               /* One of our CPUs online: restore mask */
+                               set_cpus_allowed_ptr(pgdat->kcompactd, mask);
+               }
+       }
+       return NOTIFY_OK;
+}
+
+static int __init kcompactd_init(void)
+{
+       int nid;
+
+       for_each_node_state(nid, N_MEMORY)
+               kcompactd_run(nid);
+       hotcpu_notifier(cpu_callback, 0);
+       return 0;
+}
+subsys_initcall(kcompactd_init)
+
 #endif /* CONFIG_COMPACTION */
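
For context, a sketch of how the new kcompactd entry points are meant to be driven from the rest of mm: the hotplug calls follow the comments above, while the wakeup from the reclaim path lives outside this file and is shown only as an assumption, with made-up function names:

/* Illustrative only -- the real callers live outside mm/compaction.c. */

static void example_node_gained_memory(int nid)
{
        kcompactd_run(nid);     /* also called for every N_MEMORY node at boot */
}

static void example_node_lost_all_memory(int nid)
{
        /* caller must hold mem_hotplug_begin/end(), per the comment above */
        kcompactd_stop(nid);
}

static void example_allocation_pressure(pg_data_t *pgdat, int order,
                                        int classzone_idx)
{
        /*
         * Assumed wiring: the same event that wakes kswapd for this node
         * nudges kcompactd with the requested order and classzone_idx.
         */
        wakeup_kcompactd(pgdat, order, classzone_idx);
}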