diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 4be518d4e68ab437549693161c189c7caefe87ae..a63a8f8326647b92bdc63810c8a93be96047f748 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -68,7 +68,7 @@
 #include <net/ip.h>
 #include "slab.h"
 
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 
 #include <trace/events/vmscan.h>
 
@@ -625,8 +625,8 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
 unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
                                           int nid, unsigned int lru_mask)
 {
+       struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg);
        unsigned long nr = 0;
-       struct mem_cgroup_per_node *mz;
        enum lru_list lru;
 
        VM_BUG_ON((unsigned)nid >= nr_node_ids);
@@ -634,8 +634,7 @@ unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
        for_each_lru(lru) {
                if (!(BIT(lru) & lru_mask))
                        continue;
-               mz = mem_cgroup_nodeinfo(memcg, nid);
-               nr += mz->lru_size[lru];
+               nr += mem_cgroup_get_lru_size(lruvec, lru);
        }
        return nr;
 }
@@ -920,6 +919,43 @@ static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
             iter != NULL;                              \
             iter = mem_cgroup_iter(NULL, iter, NULL))
 
+/**
+ * mem_cgroup_scan_tasks - iterate over tasks of a memory cgroup hierarchy
+ * @memcg: hierarchy root
+ * @fn: function to call for each task
+ * @arg: argument passed to @fn
+ *
+ * This function iterates over tasks attached to @memcg or to any of its
+ * descendants and calls @fn for each task. If @fn returns a non-zero
+ * value, the function breaks the iteration loop and returns the value.
+ * Otherwise, it will iterate over all tasks and return 0.
+ *
+ * This function must not be called for the root memory cgroup.
+ */
+int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
+                         int (*fn)(struct task_struct *, void *), void *arg)
+{
+       struct mem_cgroup *iter;
+       int ret = 0;
+
+       BUG_ON(memcg == root_mem_cgroup);
+
+       for_each_mem_cgroup_tree(iter, memcg) {
+               struct css_task_iter it;
+               struct task_struct *task;
+
+               css_task_iter_start(&iter->css, &it);
+               while (!ret && (task = css_task_iter_next(&it)))
+                       ret = fn(task, arg);
+               css_task_iter_end(&it);
+               if (ret) {
+                       mem_cgroup_iter_break(memcg, iter);
+                       break;
+               }
+       }
+       return ret;
+}
+
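
A minimal usage sketch for the new iterator above; the callback and wrapper names (count_task, count_memcg_tasks) are hypothetical and only illustrate the contract described in the kernel-doc:

	/* Hypothetical callback: count every task in the hierarchy. */
	static int count_task(struct task_struct *task, void *arg)
	{
		unsigned long *count = arg;

		(*count)++;
		return 0;	/* a non-zero return would stop the walk */
	}

	static unsigned long count_memcg_tasks(struct mem_cgroup *memcg)
	{
		unsigned long count = 0;

		/* must not be called for root_mem_cgroup */
		mem_cgroup_scan_tasks(memcg, count_task, &count);
		return count;
	}
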
 /**
  * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page
  * @page: the page
@@ -965,6 +1001,7 @@ out:
  * mem_cgroup_update_lru_size - account for adding or removing an lru page
  * @lruvec: mem_cgroup per zone lru vector
  * @lru: index of lru list the page is sitting on
+ * @zid: zone id of the accounted pages
  * @nr_pages: positive when adding or negative when removing
  *
  * This function must be called under lru_lock, just before a page is added
@@ -972,27 +1009,25 @@ out:
  * so as to allow it to check that lru_size 0 is consistent with list_empty).
  */
 void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
-                               int nr_pages)
+                               int zid, int nr_pages)
 {
        struct mem_cgroup_per_node *mz;
        unsigned long *lru_size;
        long size;
-       bool empty;
 
        if (mem_cgroup_disabled())
                return;
 
        mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
-       lru_size = mz->lru_size + lru;
-       empty = list_empty(lruvec->lists + lru);
+       lru_size = &mz->lru_zone_size[zid][lru];
 
        if (nr_pages < 0)
                *lru_size += nr_pages;
 
        size = *lru_size;
-       if (WARN_ONCE(size < 0 || empty != !size,
-               "%s(%p, %d, %d): lru_size %ld but %sempty\n",
-               __func__, lruvec, lru, nr_pages, size, empty ? "" : "not ")) {
+       if (WARN_ONCE(size < 0,
+               "%s(%p, %d, %d): lru_size %ld\n",
+               __func__, lruvec, lru, nr_pages, size)) {
                VM_BUG_ON(1);
                *lru_size = 0;
        }
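
For context, callers are expected to derive the new zone id from the page being accounted. The wrapper below mirrors the update_lru_size() helper in mm/mm_inline.h from the same series, but its exact shape is an assumption, not part of this hunk:

	/* Sketch: per-zone LRU accounting, zid taken from the page itself. */
	static __always_inline void update_lru_size(struct lruvec *lruvec,
						    enum lru_list lru,
						    enum zone_type zid,
						    int nr_pages)
	{
	#ifdef CONFIG_MEMCG
		mem_cgroup_update_lru_size(lruvec, lru, zid, nr_pages);
	#endif
		__update_lru_size(lruvec, lru, zid, nr_pages);
	}

	/* e.g. update_lru_size(lruvec, page_lru(page), page_zonenum(page), nr); */
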
@@ -1178,7 +1213,7 @@ static int mem_cgroup_count_children(struct mem_cgroup *memcg)
 /*
  * Return the memory (and swap, if configured) limit for a memcg.
  */
-static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg)
+unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg)
 {
        unsigned long limit;
 
@@ -1205,79 +1240,12 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
                .gfp_mask = gfp_mask,
                .order = order,
        };
-       struct mem_cgroup *iter;
-       unsigned long chosen_points = 0;
-       unsigned long totalpages;
-       unsigned int points = 0;
-       struct task_struct *chosen = NULL;
+       bool ret;
 
        mutex_lock(&oom_lock);
-
-       /*
-        * If current has a pending SIGKILL or is exiting, then automatically
-        * select it.  The goal is to allow it to allocate so that it may
-        * quickly exit and free its memory.
-        */
-       if (task_will_free_mem(current)) {
-               mark_oom_victim(current);
-               wake_oom_reaper(current);
-               goto unlock;
-       }
-
-       check_panic_on_oom(&oc, CONSTRAINT_MEMCG);
-       totalpages = mem_cgroup_get_limit(memcg) ? : 1;
-       for_each_mem_cgroup_tree(iter, memcg) {
-               struct css_task_iter it;
-               struct task_struct *task;
-
-               css_task_iter_start(&iter->css, &it);
-               while ((task = css_task_iter_next(&it))) {
-                       switch (oom_scan_process_thread(&oc, task)) {
-                       case OOM_SCAN_SELECT:
-                               if (chosen)
-                                       put_task_struct(chosen);
-                               chosen = task;
-                               chosen_points = ULONG_MAX;
-                               get_task_struct(chosen);
-                               /* fall through */
-                       case OOM_SCAN_CONTINUE:
-                               continue;
-                       case OOM_SCAN_ABORT:
-                               css_task_iter_end(&it);
-                               mem_cgroup_iter_break(memcg, iter);
-                               if (chosen)
-                                       put_task_struct(chosen);
-                               /* Set a dummy value to return "true". */
-                               chosen = (void *) 1;
-                               goto unlock;
-                       case OOM_SCAN_OK:
-                               break;
-                       };
-                       points = oom_badness(task, memcg, NULL, totalpages);
-                       if (!points || points < chosen_points)
-                               continue;
-                       /* Prefer thread group leaders for display purposes */
-                       if (points == chosen_points &&
-                           thread_group_leader(chosen))
-                               continue;
-
-                       if (chosen)
-                               put_task_struct(chosen);
-                       chosen = task;
-                       chosen_points = points;
-                       get_task_struct(chosen);
-               }
-               css_task_iter_end(&it);
-       }
-
-       if (chosen) {
-               points = chosen_points * 1000 / totalpages;
-               oom_kill_process(&oc, chosen, points, totalpages,
-                                "Memory cgroup out of memory");
-       }
-unlock:
+       ret = out_of_memory(&oc);
        mutex_unlock(&oom_lock);
-       return chosen;
+       return ret;
 }
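
The victim-selection loop deleted above is not simply dropped: in this series the generic out_of_memory() path handles memcg OOM itself, walking the hierarchy with the new mem_cgroup_scan_tasks(). A rough sketch of that dispatch in mm/oom_kill.c, reconstructed here as an assumption rather than quoted from the patch:

	/* Sketch of select_bad_process() after the deduplication (assumed shape). */
	static void select_bad_process(struct oom_control *oc)
	{
		if (is_memcg_oom(oc)) {
			/* memcg OOM: reuse the common evaluator via the new iterator */
			mem_cgroup_scan_tasks(oc->memcg, oom_evaluate_task, oc);
		} else {
			struct task_struct *p;

			rcu_read_lock();
			for_each_process(p)
				if (oom_evaluate_task(p, oc))
					break;
			rcu_read_unlock();
		}
	}
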
 
 #if MAX_NUMNODES > 1
@@ -1600,7 +1568,7 @@ bool mem_cgroup_oom_synchronize(bool handle)
        if (!memcg)
                return false;
 
-       if (!handle || oom_killer_disabled)
+       if (!handle)
                goto cleanup;
 
        owait.memcg = memcg;
@@ -1846,22 +1814,13 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
        mutex_unlock(&percpu_charge_mutex);
 }
 
-static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
-                                       unsigned long action,
-                                       void *hcpu)
+static int memcg_hotplug_cpu_dead(unsigned int cpu)
 {
-       int cpu = (unsigned long)hcpu;
        struct memcg_stock_pcp *stock;
 
-       if (action == CPU_ONLINE)
-               return NOTIFY_OK;
-
-       if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
-               return NOTIFY_OK;
-
        stock = &per_cpu(memcg_stock, cpu);
        drain_stock(stock);
-       return NOTIFY_OK;
+       return 0;
 }
 
 static void reclaim_high(struct mem_cgroup *memcg,
@@ -1947,6 +1906,15 @@ retry:
                     current->flags & PF_EXITING))
                goto force;
 
+       /*
+        * Prevent unbounded recursion when reclaim operations need to
+        * allocate memory. This might exceed the limits temporarily,
+        * but we prefer facilitating memory reclaim and getting back
+        * under the limit over triggering OOM kills in these cases.
+        */
+       if (unlikely(current->flags & PF_MEMALLOC))
+               goto force;
+
        if (unlikely(task_in_memcg_oom(current)))
                goto nomem;
 
@@ -2175,6 +2143,8 @@ struct memcg_kmem_cache_create_work {
        struct work_struct work;
 };
 
+static struct workqueue_struct *memcg_kmem_cache_create_wq;
+
 static void memcg_kmem_cache_create_func(struct work_struct *w)
 {
        struct memcg_kmem_cache_create_work *cw =
@@ -2206,7 +2176,7 @@ static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
        cw->cachep = cachep;
        INIT_WORK(&cw->work, memcg_kmem_cache_create_func);
 
-       schedule_work(&cw->work);
+       queue_work(memcg_kmem_cache_create_wq, &cw->work);
 }
 
 static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
@@ -2969,16 +2939,16 @@ static int memcg_update_tcp_limit(struct mem_cgroup *memcg, unsigned long limit)
                /*
                 * The active flag needs to be written after the static_key
                 * update. This is what guarantees that the socket activation
-                * function is the last one to run. See sock_update_memcg() for
-                * details, and note that we don't mark any socket as belonging
-                * to this memcg until that flag is up.
+                * function is the last one to run. See mem_cgroup_sk_alloc()
+                * for details, and note that we don't mark any socket as
+                * belonging to this memcg until that flag is up.
                 *
                 * We need to do this, because static_keys will span multiple
                 * sites, but we can't control their order. If we mark a socket
                 * as accounted, but the accounting functions are not patched in
                 * yet, we'll lose accounting.
                 *
-                * We never race with the readers in sock_update_memcg(),
+                * We never race with the readers in mem_cgroup_sk_alloc(),
                 * because when this value change, the code to process it is not
                 * patched in yet.
                 */
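
The comment spells out a write-side ordering rule: flip the static key first, and only then publish the flag that readers such as mem_cgroup_sk_alloc() consult. A condensed sketch of that pattern; the helper names are hypothetical and the tcpmem_active field is assumed from the surrounding function:

	/* Writer (sketch): enable socket accounting for @memcg. */
	static void tcpmem_activate(struct mem_cgroup *memcg)
	{
		static_branch_inc(&memcg_sockets_enabled_key);	/* patch call sites first */
		memcg->tcpmem_active = true;			/* then publish the flag */
	}

	/* Reader (sketch): the flag is only consulted once the key is live,
	 * so no accounted socket can slip through unpatched call sites. */
	static bool tcpmem_should_charge(struct mem_cgroup *memcg)
	{
		return static_branch_unlikely(&memcg_sockets_enabled_key) &&
		       memcg->tcpmem_active;
	}
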
@@ -4092,11 +4062,13 @@ static DEFINE_IDR(mem_cgroup_idr);
 
 static void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n)
 {
+       VM_BUG_ON(atomic_read(&memcg->id.ref) <= 0);
        atomic_add(n, &memcg->id.ref);
 }
 
 static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n)
 {
+       VM_BUG_ON(atomic_read(&memcg->id.ref) < n);
        if (atomic_sub_and_test(n, &memcg->id.ref)) {
                idr_remove(&mem_cgroup_idr, memcg->id.id);
                memcg->id.id = 0;
@@ -4285,8 +4257,10 @@ fail:
 
 static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
 {
+       struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+
        /* Online state pins memcg ID, memcg ID pins CSS */
-       mem_cgroup_id_get(mem_cgroup_from_css(css));
+       atomic_set(&memcg->id.ref, 1);
        css_get(css);
        return 0;
 }
@@ -4434,7 +4408,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
         * Because lookup_swap_cache() updates some statistics counter,
         * we call find_get_page() with swapper_space directly.
         */
-       page = find_get_page(swap_address_space(ent), ent.val);
+       page = find_get_page(swap_address_space(ent), swp_offset(ent));
        if (do_memsw_account())
                entry->val = ent.val;
 
@@ -4472,7 +4446,8 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
                        swp_entry_t swp = radix_to_swp_entry(page);
                        if (do_memsw_account())
                                *entry = swp;
-                       page = find_get_page(swap_address_space(swp), swp.val);
+                       page = find_get_page(swap_address_space(swp),
+                                            swp_offset(swp));
                }
        } else
                page = find_get_page(mapping, pgoff);
@@ -4707,7 +4682,8 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
                .mm = mm,
        };
        down_read(&mm->mmap_sem);
-       walk_page_range(0, ~0UL, &mem_cgroup_count_precharge_walk);
+       walk_page_range(0, mm->highest_vm_end,
+                       &mem_cgroup_count_precharge_walk);
        up_read(&mm->mmap_sem);
 
        precharge = mc.precharge;
@@ -4995,7 +4971,8 @@ retry:
         * When we have consumed all precharges and failed in doing
         * additional charge, the page walk just aborts.
         */
-       walk_page_range(0, ~0UL, &mem_cgroup_move_charge_walk);
+       walk_page_range(0, mc.mm->highest_vm_end, &mem_cgroup_move_charge_walk);
+
        up_read(&mc.mm->mmap_sem);
        atomic_dec(&mc.from->moving_account);
 }
@@ -5674,11 +5651,15 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
 DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
 EXPORT_SYMBOL(memcg_sockets_enabled_key);
 
-void sock_update_memcg(struct sock *sk)
+void mem_cgroup_sk_alloc(struct sock *sk)
 {
        struct mem_cgroup *memcg;
 
-       /* Socket cloning can throw us here with sk_cgrp already
+       if (!mem_cgroup_sockets_enabled)
+               return;
+
+       /*
+        * Socket cloning can throw us here with sk_memcg already
         * filled. It won't however, necessarily happen from
         * process context. So the test for root memcg given
         * the current task's memcg won't help us in this case.
@@ -5703,12 +5684,11 @@ void sock_update_memcg(struct sock *sk)
 out:
        rcu_read_unlock();
 }
-EXPORT_SYMBOL(sock_update_memcg);
 
-void sock_release_memcg(struct sock *sk)
+void mem_cgroup_sk_free(struct sock *sk)
 {
-       WARN_ON(!sk->sk_memcg);
-       css_put(&sk->sk_memcg->css);
+       if (sk->sk_memcg)
+               css_put(&sk->sk_memcg->css);
 }
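
The renamed hooks are called from the networking core rather than from memcg code; the sketch below only illustrates how they are meant to bracket a socket's lifetime (the function is hypothetical, not a real call site):

	/* Illustration: alloc-time and free-time hooks around a socket's life. */
	static void example_socket_lifetime(struct sock *sk)
	{
		mem_cgroup_sk_alloc(sk);  /* returns early if socket accounting is off */

		/* ... sk->sk_memcg, if set, holds a css reference ... */

		mem_cgroup_sk_free(sk);   /* NULL-safe: only puts the css if sk_memcg is set */
	}
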
 
 /**
@@ -5785,16 +5765,28 @@ __setup("cgroup.memory=", cgroup_memory);
 /*
  * subsys_initcall() for memory controller.
  *
- * Some parts like hotcpu_notifier() have to be initialized from this context
- * because of lock dependencies (cgroup_lock -> cpu hotplug) but basically
- * everything that doesn't depend on a specific mem_cgroup structure should
- * be initialized from here.
+ * Some parts like memcg_hotplug_cpu_dead() have to be initialized from this
+ * context because of lock dependencies (cgroup_lock -> cpu hotplug) but
+ * basically everything that doesn't depend on a specific mem_cgroup structure
+ * should be initialized from here.
  */
 static int __init mem_cgroup_init(void)
 {
        int cpu, node;
 
-       hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
+#ifndef CONFIG_SLOB
+       /*
+        * Kmem cache creation is mostly done with the slab_mutex held,
+        * so use a special workqueue to avoid stalling all worker
+        * threads in case lots of cgroups are created simultaneously.
+        */
+       memcg_kmem_cache_create_wq =
+               alloc_ordered_workqueue("memcg_kmem_cache_create", 0);
+       BUG_ON(!memcg_kmem_cache_create_wq);
+#endif
+
+       cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL,
+                                 memcg_hotplug_cpu_dead);
 
        for_each_possible_cpu(cpu)
                INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work,