[PATCH] Zone reclaim: Reclaim logic

author Christoph Lameter <clameter@sgi.com>

Thu, 19 Jan 2006 01:42:31 +0000 (17:42 -0800)

committer Linus Torvalds <torvalds@g5.osdl.org>

Thu, 19 Jan 2006 03:20:17 +0000 (19:20 -0800)
author Christoph Lameter <clameter@sgi.com>
Thu, 19 Jan 2006 01:42:31 +0000 (17:42 -0800)
committer Linus Torvalds <torvalds@g5.osdl.org>
Thu, 19 Jan 2006 03:20:17 +0000 (19:20 -0800)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h

index 34cbefd2ebded8437ab2cb82d6613264593df9a5..93a849f742dbbb47ae5ee00464f328b757d597fe 100644 (file)
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -149,14 +149,16 @@ struct zone {
         unsigned long           pages_scanned;     /* since last reclaim */
         int                     all_unreclaimable; /* All pages pinned */
  
-       /*
-        * Does the allocator try to reclaim pages from the zone as soon
-        * as it fails a watermark_ok() in __alloc_pages?
-        */
-       int                     reclaim_pages;
         /* A count of how many reclaimers are scanning this zone */
         atomic_t                reclaim_in_progress;
  
+       /*
+        * timestamp (in jiffies) of the last zone reclaim that did not
+        * result in freeing of pages. This is used to avoid repeated scans
+        * if all memory in the zone is in use.
+        */
+       unsigned long           last_unsuccessful_zone_reclaim;
+
         /*
          * prev_priority holds the scanning priority for this zone.  It is
          * defined as the scanning priority at which we achieved our reclaim
diff --git a/include/linux/swap.h b/include/linux/swap.h

index d01f7efb0f2c9a1052fe807964bbcc67115d4b00..4a99e4a7fbf31edf83a86c5c35bfd0b4adbe9baa 100644 (file)
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -176,6 +176,17 @@ extern int try_to_free_pages(struct zone **, gfp_t);
  extern int shrink_all_memory(int);
  extern int vm_swappiness;
  
+#ifdef CONFIG_NUMA
+extern int zone_reclaim_mode;  /* non-zero: zone reclaim enabled */
+extern int zone_reclaim(struct zone *, gfp_t, unsigned int);
+#else /* !CONFIG_NUMA */
+#define zone_reclaim_mode 0    /* constant 0 without NUMA */
+static inline int zone_reclaim(struct zone *z, gfp_t mask, unsigned int order)
+{
+       return 0;       /* stub: never reports reclaimed pages */
+}
+#endif /* CONFIG_NUMA */
+
  #ifdef CONFIG_MIGRATION
  extern int isolate_lru_page(struct page *p);
  extern int putback_lru_pages(struct list_head *l);
diff --git a/include/linux/topology.h b/include/linux/topology.h

index 315a5163d6a01a7f891251550feaf89716219803..e8eb0040ce3a241928476e46f8c1e6ed673e158f 100644 (file)
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -56,6 +56,14 @@
  #define REMOTE_DISTANCE                20
  #define node_distance(from,to) ((from) == (to) ? LOCAL_DISTANCE : REMOTE_DISTANCE)
  #endif
+#ifndef RECLAIM_DISTANCE
+/*
+ * If the distance between two nodes, in the arch specific units returned by
+ * node_distance(), is larger than RECLAIM_DISTANCE then build_zonelists()
+ * switches on zone reclaim at boot.
+ */
+#define RECLAIM_DISTANCE 20    /* default unless already defined */
+#endif /* RECLAIM_DISTANCE */
  #ifndef PENALTY_FOR_NODE_WITH_CPUS
  #define PENALTY_FOR_NODE_WITH_CPUS     (1)
  #endif
diff --git a/mm/page_alloc.c b/mm/page_alloc.c

index c2e29743a8d156068581c05c027a37be2269a9d4..df54e2fc8ee09760c67d2dd4e6bd496e07286124 100644 (file)
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -878,7 +878,9 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
                                 mark = (*z)->pages_high;
                         if (!zone_watermark_ok(*z, order, mark,
                                     classzone_idx, alloc_flags))
-                               continue;
+                               if (!zone_reclaim_mode ||
+                                   !zone_reclaim(*z, gfp_mask, order))
+                                       continue;
                 }
  
                 page = buffered_rmqueue(zonelist, *z, order, gfp_mask);
@@ -1595,13 +1597,22 @@ static void __init build_zonelists(pg_data_t *pgdat)
         prev_node = local_node;
         nodes_clear(used_mask);
         while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
+               int distance = node_distance(local_node, node);
+
+               /*
+                * If another node is sufficiently far away then it is better
+                * to reclaim pages in a zone before going off node.
+                */
+               if (distance > RECLAIM_DISTANCE)
+                       zone_reclaim_mode = 1;
+
                 /*
                  * We don't want to pressure a particular node.
                  * So adding penalty to the first node in same
                  * distance group to make it round-robin.
                  */
-               if (node_distance(local_node, node) !=
-                               node_distance(local_node, prev_node))
+
+               if (distance != node_distance(local_node, prev_node))
                         node_load[node] += load;
                 prev_node = node;
                 load--;
diff --git a/mm/vmscan.c b/mm/vmscan.c

index e5117b6897a97b6b494fe79fd5d7ba76ee634fe3..2e34b61a70c727afc2895529c1997a1a8d399eee 100644 (file)
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1572,3 +1572,93 @@ static int __init kswapd_init(void)
  }
  
  module_init(kswapd_init)
+
+#ifdef CONFIG_NUMA
+/*
+ * Zone reclaim mode
+ *
+ * If non-zero call zone_reclaim when the number of free pages falls below
+ * the watermarks.
+ *
+ * In the future we may add flags to the mode. However, the page allocator
+ * should only have to check that zone_reclaim_mode != 0 before calling
+ * zone_reclaim().
+ */
+int zone_reclaim_mode __read_mostly;
+
+/*
+ * Minimum time between zone reclaim scans after a scan that freed nothing.
+ * Parenthesized so the macro stays a single operand wherever it expands.
+ */
+#define ZONE_RECLAIM_INTERVAL (HZ/2)
+
+/*
+ * Try to free up some pages from this zone through reclaim.
+ *
+ * zone:     zone to scan
+ * gfp_mask: allocation flags of the request that missed the watermark
+ * order:    order of that request; 1 << order pages are wanted
+ *
+ * Returns non-zero if at least 1 << order pages were reclaimed, 0 if the
+ * scan was skipped or did not free enough.
+ */
+int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
+{
+       int nr_pages = 1 << order;
+       struct task_struct *p = current;
+       struct reclaim_state reclaim_state;
+       struct scan_control sc = {
+               .gfp_mask       = gfp_mask,
+               .may_writepage  = 0,
+               .may_swap       = 0,
+               .nr_mapped      = read_page_state(nr_mapped),
+               .nr_scanned     = 0,
+               .nr_reclaimed   = 0,
+               .priority       = 0
+       };
+
+       /*
+        * Do not scan if the caller cannot wait, if the zone is not on the
+        * local node, if the zone is marked all_unreclaimable, or if
+        * another reclaimer is already working on it. Also back off when
+        * the task already has PF_MEMALLOC set: the flag is cleared
+        * unconditionally below, which would otherwise strip it from a
+        * caller that had set it before getting here.
+        */
+       if (!(gfp_mask & __GFP_WAIT) ||
+               zone->zone_pgdat->node_id != numa_node_id() ||
+               zone->all_unreclaimable ||
+               atomic_read(&zone->reclaim_in_progress) > 0 ||
+               (p->flags & PF_MEMALLOC))
+                       return 0;
+
+       /* A recent scan freed nothing; wait out the interval first. */
+       if (time_before(jiffies,
+               zone->last_unsuccessful_zone_reclaim + ZONE_RECLAIM_INTERVAL))
+                       return 0;
+
+       disable_swap_token();
+
+       /* Use the larger of the request size and SWAP_CLUSTER_MAX. */
+       if (nr_pages > SWAP_CLUSTER_MAX)
+               sc.swap_cluster_max = nr_pages;
+       else
+               sc.swap_cluster_max = SWAP_CLUSTER_MAX;
+
+       cond_resched();
+       p->flags |= PF_MEMALLOC;
+       reclaim_state.reclaimed_slab = 0;
+       p->reclaim_state = &reclaim_state;
+       shrink_zone(zone, &sc);
+       p->reclaim_state = NULL;
+       p->flags &= ~PF_MEMALLOC;
+
+       /* Remember the failure so the next attempts are rate limited. */
+       if (sc.nr_reclaimed == 0)
+               zone->last_unsuccessful_zone_reclaim = jiffies;
+
+       /* Freeing exactly the requested number of pages is a success. */
+       return sc.nr_reclaimed >= nr_pages;
+}
+#endif
+
author	Christoph Lameter <clameter@sgi.com>
	Thu, 19 Jan 2006 01:42:31 +0000 (17:42 -0800)
committer	Linus Torvalds <torvalds@g5.osdl.org>
	Thu, 19 Jan 2006 03:20:17 +0000 (19:20 -0800)
include/linux/mmzone.h		patch \| blob \| blame \| history
include/linux/swap.h		patch \| blob \| blame \| history
include/linux/topology.h		patch \| blob \| blame \| history
mm/page_alloc.c		patch \| blob \| blame \| history
mm/vmscan.c		patch \| blob \| blame \| history