Merge tag 'ntb-4.10' of git://github.com/jonmason/ntb
diff --git a/mm/workingset.c b/mm/workingset.c
index 617475f529f42aed1e1ba34565e034ed7cad8157..241fa5d6b3b2fe155ed0f458b9d188a926a51e80 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -10,6 +10,7 @@
 #include <linux/atomic.h>
 #include <linux/module.h>
 #include <linux/swap.h>
+#include <linux/dax.h>
 #include <linux/fs.h>
 #include <linux/mm.h>
 
@@ -334,48 +335,81 @@ out:
  * point where they would still be useful.
  */
 
-struct list_lru workingset_shadow_nodes;
+static struct list_lru shadow_nodes;
+
+void workingset_update_node(struct radix_tree_node *node, void *private)
+{
+       struct address_space *mapping = private;
+
+       /* Only regular page cache has shadow entries */
+       if (dax_mapping(mapping) || shmem_mapping(mapping))
+               return;
+
+       /*
+        * Track non-empty nodes that contain only shadow entries;
+        * unlink those that contain pages or are being freed.
+        *
+        * Avoid acquiring the list_lru lock when the nodes are
+        * already where they should be. The list_empty() test is safe
+        * as node->private_list is protected by &mapping->tree_lock.
+        */
+       if (node->count && node->count == node->exceptional) {
+               if (list_empty(&node->private_list)) {
+                       node->private_data = mapping;
+                       list_lru_add(&shadow_nodes, &node->private_list);
+               }
+       } else {
+               if (!list_empty(&node->private_list))
+                       list_lru_del(&shadow_nodes, &node->private_list);
+       }
+}
 
 static unsigned long count_shadow_nodes(struct shrinker *shrinker,
                                        struct shrink_control *sc)
 {
-       unsigned long shadow_nodes;
        unsigned long max_nodes;
-       unsigned long pages;
+       unsigned long nodes;
+       unsigned long cache;
 
        /* list_lru lock nests inside IRQ-safe mapping->tree_lock */
        local_irq_disable();
-       shadow_nodes = list_lru_shrink_count(&workingset_shadow_nodes, sc);
+       nodes = list_lru_shrink_count(&shadow_nodes, sc);
        local_irq_enable();
 
-       if (memcg_kmem_enabled()) {
-               pages = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid,
-                                                    LRU_ALL_FILE);
-       } else {
-               pages = node_page_state(NODE_DATA(sc->nid), NR_ACTIVE_FILE) +
-                       node_page_state(NODE_DATA(sc->nid), NR_INACTIVE_FILE);
-       }
-
        /*
-        * Active cache pages are limited to 50% of memory, and shadow
-        * entries that represent a refault distance bigger than that
-        * do not have any effect.  Limit the number of shadow nodes
-        * such that shadow entries do not exceed the number of active
-        * cache pages, assuming a worst-case node population density
-        * of 1/8th on average.
+        * Approximate a reasonable limit for the radix tree nodes
+        * containing shadow entries. We don't need to keep more
+        * shadow entries than possible pages on the active list,
+        * since refault distances bigger than that are dismissed.
+        *
+        * The size of the active list converges toward 100% of
+        * overall page cache as memory grows, with only a tiny
+        * inactive list. Assume the total cache size for that.
+        *
+        * Nodes might be sparsely populated, with only one shadow
+        * entry in the extreme case. Obviously, we cannot keep one
+        * node for every eligible shadow entry, so compromise on a
+        * worst-case density of 1/8th. Below that, not all eligible
+        * refaults can be detected anymore.
         *
         * On 64-bit with 7 radix_tree_nodes per page and 64 slots
         * each, this will reclaim shadow entries when they consume
-        * ~2% of available memory:
+        * ~1.8% of available memory:
         *
-        * PAGE_SIZE / radix_tree_nodes / node_entries / PAGE_SIZE
+        * PAGE_SIZE / radix_tree_nodes / node_entries * 8 / PAGE_SIZE
         */
-       max_nodes = pages >> (1 + RADIX_TREE_MAP_SHIFT - 3);
+       if (sc->memcg) {
+               cache = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid,
+                                                    LRU_ALL_FILE);
+       } else {
+               cache = node_page_state(NODE_DATA(sc->nid), NR_ACTIVE_FILE) +
+                       node_page_state(NODE_DATA(sc->nid), NR_INACTIVE_FILE);
+       }
+       max_nodes = cache >> (RADIX_TREE_MAP_SHIFT - 3);
 
-       if (shadow_nodes <= max_nodes)
+       if (nodes <= max_nodes)
                return 0;
-
-       return shadow_nodes - max_nodes;
+       return nodes - max_nodes;
 }
 
 static enum lru_status shadow_lru_isolate(struct list_head *item,
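The tracking rule added in workingset_update_node() above comes down to one test on the node's two counters: a non-empty node belongs on the shadow-node LRU only while every populated slot is an exceptional (shadow) entry, and it is unlinked as soon as a real page shows up. Below is a minimal userspace sketch of just that predicate, using a hypothetical fake_node stand-in instead of the real struct radix_tree_node:

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-in for the two counters on struct radix_tree_node;
 * illustration only, not taken from the patch. */
struct fake_node {
        unsigned int count;             /* populated slots */
        unsigned int exceptional;       /* slots holding shadow entries */
};

/* Mirrors the test in workingset_update_node(): keep the node on the
 * shadow LRU only while it is non-empty and holds nothing but shadows. */
static bool worth_tracking(const struct fake_node *node)
{
        return node->count && node->count == node->exceptional;
}

int main(void)
{
        struct fake_node only_shadows = { .count = 3, .exceptional = 3 };
        struct fake_node mixed        = { .count = 3, .exceptional = 1 };

        printf("only shadows: %d\n", worth_tracking(&only_shadows));   /* 1 */
        printf("mixed:        %d\n", worth_tracking(&mixed));          /* 0 */
        return 0;
}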
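To make the ~1.8% figure in count_shadow_nodes() concrete: with RADIX_TREE_MAP_SHIFT == 6 the shift is cache >> 3, i.e. one tracked node per eight cache pages, and at roughly seven radix_tree_nodes per slab page those nodes cost about 1/56 of the cache they describe. A small standalone check of that arithmetic; the two constants are the 64-bit defaults assumed here, not values taken from this file:

#include <stdio.h>

#define RADIX_TREE_MAP_SHIFT    6       /* 64 slots per node (64-bit default) */
#define NODES_PER_PAGE          7       /* ~7 radix_tree_nodes per slab page */

int main(void)
{
        unsigned long cache = 1UL << 20;        /* 1M file pages = 4G of cache */
        unsigned long max_nodes = cache >> (RADIX_TREE_MAP_SHIFT - 3);
        unsigned long node_pages = max_nodes / NODES_PER_PAGE;

        printf("max_nodes  = %lu\n", max_nodes);                /* 131072 */
        printf("node pages = %lu (~%.1f%% of cache)\n",
               node_pages, 100.0 * node_pages / cache);         /* ~1.8% */
        return 0;
}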
@@ -418,23 +452,30 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
         * no pages, so we expect to be able to remove them all and
         * delete and free the empty node afterwards.
         */
-       BUG_ON(!workingset_node_shadows(node));
-       BUG_ON(workingset_node_pages(node));
-
+       if (WARN_ON_ONCE(!node->exceptional))
+               goto out_invalid;
+       if (WARN_ON_ONCE(node->count != node->exceptional))
+               goto out_invalid;
        for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) {
                if (node->slots[i]) {
-                       BUG_ON(!radix_tree_exceptional_entry(node->slots[i]));
+                       if (WARN_ON_ONCE(!radix_tree_exceptional_entry(node->slots[i])))
+                               goto out_invalid;
+                       if (WARN_ON_ONCE(!node->exceptional))
+                               goto out_invalid;
+                       if (WARN_ON_ONCE(!mapping->nrexceptional))
+                               goto out_invalid;
                        node->slots[i] = NULL;
-                       workingset_node_shadows_dec(node);
-                       BUG_ON(!mapping->nrexceptional);
+                       node->exceptional--;
+                       node->count--;
                        mapping->nrexceptional--;
                }
        }
-       BUG_ON(workingset_node_shadows(node));
+       if (WARN_ON_ONCE(node->exceptional))
+               goto out_invalid;
        inc_node_state(page_pgdat(virt_to_page(node)), WORKINGSET_NODERECLAIM);
-       if (!__radix_tree_delete_node(&mapping->page_tree, node))
-               BUG();
+       __radix_tree_delete_node(&mapping->page_tree, node);
 
+out_invalid:
        spin_unlock(&mapping->tree_lock);
        ret = LRU_REMOVED_RETRY;
 out:
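In the hunk above, the fatal BUG_ON() assertions become WARN_ON_ONCE() checks that bail out to a common unlock path, and every cleared shadow slot now drops node->count together with node->exceptional and mapping->nrexceptional. A toy model of that lockstep bookkeeping, with hypothetical demo_* names rather than kernel types:

#include <stdio.h>

#define DEMO_SLOTS 64   /* stands in for RADIX_TREE_MAP_SIZE */

/* Hypothetical miniature of a radix tree node holding only shadow entries. */
struct demo_node {
        void *slots[DEMO_SLOTS];
        unsigned int count;             /* populated slots */
        unsigned int exceptional;       /* shadow (exceptional) slots */
};

/* Clearing a shadow slot drops all three counters together, as the loop
 * in shadow_lru_isolate() does; afterwards the node is empty. */
static void demo_clear_shadows(struct demo_node *node,
                               unsigned long *nrexceptional)
{
        for (int i = 0; i < DEMO_SLOTS; i++) {
                if (!node->slots[i])
                        continue;
                node->slots[i] = NULL;
                node->exceptional--;
                node->count--;
                (*nrexceptional)--;
        }
}

int main(void)
{
        static int shadow;      /* dummy payload standing in for a shadow entry */
        struct demo_node node = { .count = 2, .exceptional = 2 };
        unsigned long nrexceptional = 2;

        node.slots[3] = &shadow;
        node.slots[7] = &shadow;

        demo_clear_shadows(&node, &nrexceptional);
        printf("count=%u exceptional=%u nrexceptional=%lu\n",
               node.count, node.exceptional, nrexceptional);    /* all zero */
        return 0;
}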
@@ -452,8 +493,7 @@ static unsigned long scan_shadow_nodes(struct shrinker *shrinker,
 
        /* list_lru lock nests inside IRQ-safe mapping->tree_lock */
        local_irq_disable();
-       ret =  list_lru_shrink_walk(&workingset_shadow_nodes, sc,
-                                   shadow_lru_isolate, NULL);
+       ret = list_lru_shrink_walk(&shadow_nodes, sc, shadow_lru_isolate, NULL);
        local_irq_enable();
        return ret;
 }
@@ -492,7 +532,7 @@ static int __init workingset_init(void)
        pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n",
               timestamp_bits, max_order, bucket_order);
 
-       ret = list_lru_init_key(&workingset_shadow_nodes, &shadow_nodes_key);
+       ret = list_lru_init_key(&shadow_nodes, &shadow_nodes_key);
        if (ret)
                goto err;
        ret = register_shrinker(&workingset_shadow_shrinker);
@@ -500,7 +540,7 @@ static int __init workingset_init(void)
                goto err_list_lru;
        return 0;
 err_list_lru:
-       list_lru_destroy(&workingset_shadow_nodes);
+       list_lru_destroy(&shadow_nodes);
 err:
        return ret;
 }