diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e65f7b0131d3598cb5ba0ce3497d47b43d676dea..acb93c554f6e8456dc9312734162317d1adea54d 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -77,6 +77,7 @@ EXPORT_SYMBOL(memory_cgrp_subsys);
 
 #define MEM_CGROUP_RECLAIM_RETRIES     5
 static struct mem_cgroup *root_mem_cgroup __read_mostly;
+struct cgroup_subsys_state *mem_cgroup_root_css __read_mostly;
 
 /* Whether the swap controller is active */
 #ifdef CONFIG_MEMCG_SWAP
@@ -90,6 +91,7 @@ static const char * const mem_cgroup_stat_names[] = {
        "rss",
        "rss_huge",
        "mapped_file",
+       "dirty",
        "writeback",
        "swap",
 };
@@ -322,11 +324,6 @@ struct mem_cgroup {
         * percpu counter.
         */
        struct mem_cgroup_stat_cpu __percpu *stat;
-       /*
-        * used when a cpu is offlined or other synchronizations
-        * See mem_cgroup_read_stat().
-        */
-       struct mem_cgroup_stat_cpu nocpu_base;
        spinlock_t pcp_counter_lock;
 
 #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
@@ -346,6 +343,11 @@ struct mem_cgroup {
        atomic_t        numainfo_updating;
 #endif
 
+#ifdef CONFIG_CGROUP_WRITEBACK
+       struct list_head cgwb_list;
+       struct wb_domain cgwb_domain;
+#endif
+
        /* List of events which userspace want to receive */
        struct list_head event_list;
        spinlock_t event_list_lock;
@@ -596,6 +598,39 @@ struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)
        return &memcg->css;
 }
 
+/**
+ * mem_cgroup_css_from_page - css of the memcg associated with a page
+ * @page: page of interest
+ *
+ * If memcg is bound to the default hierarchy, css of the memcg associated
+ * with @page is returned.  The returned css remains associated with @page
+ * until it is released.
+ *
+ * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup
+ * is returned.
+ *
+ * XXX: The above description of behavior on the default hierarchy isn't
+ * strictly true yet as replace_page_cache_page() can modify the
+ * association before @page is released even on the default hierarchy;
+ * however, the current and planned usages don't mix the two functions
+ * and replace_page_cache_page() will soon be updated to make the invariant
+ * actually true.
+ */
+struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
+{
+       struct mem_cgroup *memcg;
+
+       rcu_read_lock();
+
+       memcg = page->mem_cgroup;
+
+       if (!memcg || !cgroup_on_dfl(memcg->css.cgroup))
+               memcg = root_mem_cgroup;
+
+       rcu_read_unlock();
+       return &memcg->css;
+}
+
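For callers outside memcontrol.c, the guarantee that matters is that this helper never returns NULL: on the legacy hierarchies it silently falls back to root_mem_cgroup's css. A hedged caller-side sketch of that contract follows; the helper name is invented for illustration and is not part of this patch.

#include <linux/memcontrol.h>
#include <linux/cgroup.h>

/* Illustrative only: does @page belong to the cgroup behind @css? */
static bool page_owned_by_css(struct page *page,
			      struct cgroup_subsys_state *css)
{
	/*
	 * No NULL check needed: on a traditional hierarchy the helper
	 * returns root_mem_cgroup's css rather than NULL.
	 */
	return mem_cgroup_css_from_page(page) == css;
}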
 static struct mem_cgroup_per_zone *
 mem_cgroup_page_zoneinfo(struct mem_cgroup *memcg, struct page *page)
 {
@@ -795,15 +830,8 @@ static long mem_cgroup_read_stat(struct mem_cgroup *memcg,
        long val = 0;
        int cpu;
 
-       get_online_cpus();
-       for_each_online_cpu(cpu)
+       for_each_possible_cpu(cpu)
                val += per_cpu(memcg->stat->count[idx], cpu);
-#ifdef CONFIG_HOTPLUG_CPU
-       spin_lock(&memcg->pcp_counter_lock);
-       val += memcg->nocpu_base.count[idx];
-       spin_unlock(&memcg->pcp_counter_lock);
-#endif
-       put_online_cpus();
        return val;
 }
 
@@ -813,15 +841,8 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
        unsigned long val = 0;
        int cpu;
 
-       get_online_cpus();
-       for_each_online_cpu(cpu)
+       for_each_possible_cpu(cpu)
                val += per_cpu(memcg->stat->events[idx], cpu);
-#ifdef CONFIG_HOTPLUG_CPU
-       spin_lock(&memcg->pcp_counter_lock);
-       val += memcg->nocpu_base.events[idx];
-       spin_unlock(&memcg->pcp_counter_lock);
-#endif
-       put_online_cpus();
        return val;
 }
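Replacing for_each_online_cpu() with for_each_possible_cpu() is what allows the nocpu_base fold-back and the CPU-hotplug drain below to be removed: a value left in the per-cpu slot of an offlined CPU is simply still summed at read time. A minimal, self-contained userspace model of that idea (plain arrays stand in for per-cpu storage; all names and numbers are illustrative):

#include <stdio.h>

#define NR_POSSIBLE_CPUS 4

/* per-"cpu" counters; slot 3 belongs to a CPU that went offline */
static long count[NR_POSSIBLE_CPUS] = { 10, 20, 30, 5 };
static int cpu_online[NR_POSSIBLE_CPUS] = { 1, 1, 1, 0 };

/* old scheme: misses slot 3 unless it was drained at offline time */
static long read_stat_online_only(void)
{
	long val = 0;

	for (int cpu = 0; cpu < NR_POSSIBLE_CPUS; cpu++)
		if (cpu_online[cpu])
			val += count[cpu];
	return val;
}

/* new scheme: every possible slot is summed, so no drain step is needed */
static long read_stat_possible(void)
{
	long val = 0;

	for (int cpu = 0; cpu < NR_POSSIBLE_CPUS; cpu++)
		val += count[cpu];
	return val;
}

int main(void)
{
	printf("online-only sum: %ld\n", read_stat_online_only()); /* 60 */
	printf("possible sum:    %ld\n", read_stat_possible());    /* 65 */
	return 0;
}

The cost is that reads now walk every possible CPU rather than only the online ones, which the patch accepts in exchange for dropping the hotplug fold-back and its locking.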
 
@@ -2020,6 +2041,7 @@ again:
 
        return memcg;
 }
+EXPORT_SYMBOL(mem_cgroup_begin_page_stat);
 
 /**
  * mem_cgroup_end_page_stat - finish a page state statistics transaction
@@ -2038,6 +2060,7 @@ void mem_cgroup_end_page_stat(struct mem_cgroup *memcg)
 
        rcu_read_unlock();
 }
+EXPORT_SYMBOL(mem_cgroup_end_page_stat);
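Exporting the begin/end pair lets modular code bracket page-state updates in the same transaction the core VM uses, so those updates cannot race with mem_cgroup_move_account() (see the move_lock comment further down). A hedged sketch of that updater pattern; the wrapper name is invented here, and mem_cgroup_inc_page_stat() is assumed from include/linux/memcontrol.h of this era:

#include <linux/memcontrol.h>
#include <linux/page-flags.h>

/* Illustrative only: account a newly dirtied page against its memcg */
static void account_dirty_sketch(struct page *page)
{
	struct mem_cgroup *memcg;

	/* open a page-state transaction; serializes against move_account */
	memcg = mem_cgroup_begin_page_stat(page);

	if (!TestSetPageDirty(page))
		mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_DIRTY);

	mem_cgroup_end_page_stat(memcg);
}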
 
 /**
  * mem_cgroup_update_page_stat - update page state statistics
@@ -2178,37 +2201,12 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
        mutex_unlock(&percpu_charge_mutex);
 }
 
-/*
- * This function drains percpu counter value from DEAD cpu and
- * move it to local cpu. Note that this function can be preempted.
- */
-static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu)
-{
-       int i;
-
-       spin_lock(&memcg->pcp_counter_lock);
-       for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
-               long x = per_cpu(memcg->stat->count[i], cpu);
-
-               per_cpu(memcg->stat->count[i], cpu) = 0;
-               memcg->nocpu_base.count[i] += x;
-       }
-       for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
-               unsigned long x = per_cpu(memcg->stat->events[i], cpu);
-
-               per_cpu(memcg->stat->events[i], cpu) = 0;
-               memcg->nocpu_base.events[i] += x;
-       }
-       spin_unlock(&memcg->pcp_counter_lock);
-}
-
 static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
                                        unsigned long action,
                                        void *hcpu)
 {
        int cpu = (unsigned long)hcpu;
        struct memcg_stock_pcp *stock;
-       struct mem_cgroup *iter;
 
        if (action == CPU_ONLINE)
                return NOTIFY_OK;
@@ -2216,9 +2214,6 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
        if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
                return NOTIFY_OK;
 
-       for_each_mem_cgroup(iter)
-               mem_cgroup_drain_pcp_counter(iter, cpu);
-
        stock = &per_cpu(memcg_stock, cpu);
        drain_stock(stock);
        return NOTIFY_OK;
@@ -4004,6 +3999,98 @@ static void memcg_destroy_kmem(struct mem_cgroup *memcg)
 }
 #endif
 
+#ifdef CONFIG_CGROUP_WRITEBACK
+
+struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg)
+{
+       return &memcg->cgwb_list;
+}
+
+static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
+{
+       return wb_domain_init(&memcg->cgwb_domain, gfp);
+}
+
+static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
+{
+       wb_domain_exit(&memcg->cgwb_domain);
+}
+
+static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
+{
+       wb_domain_size_changed(&memcg->cgwb_domain);
+}
+
+struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
+{
+       struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
+
+       if (!memcg->css.parent)
+               return NULL;
+
+       return &memcg->cgwb_domain;
+}
+
+/**
+ * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg
+ * @wb: bdi_writeback in question
+ * @pavail: out parameter for number of available pages
+ * @pdirty: out parameter for number of dirty pages
+ * @pwriteback: out parameter for number of pages under writeback
+ *
+ * Determine the numbers of available, dirty, and writeback pages in @wb's
+ * memcg.  Dirty and writeback are self-explanatory.  Available is a bit
+ * more involved.
+ *
+ * A memcg's headroom is "min(max, high) - used".  The available memory is
+ * calculated as the lowest headroom among the memcg and its ancestors,
+ * plus the number of pages already in use as file pages.  Note that this
+ * doesn't consider the actual amount of available memory in the system.
+ * The caller should further cap *@pavail accordingly.
+ */
+void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pavail,
+                        unsigned long *pdirty, unsigned long *pwriteback)
+{
+       struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
+       struct mem_cgroup *parent;
+       unsigned long head_room = PAGE_COUNTER_MAX;
+       unsigned long file_pages;
+
+       *pdirty = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_DIRTY);
+
+       /* this should eventually include NR_UNSTABLE_NFS */
+       *pwriteback = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_WRITEBACK);
+
+       file_pages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) |
+                                                   (1 << LRU_ACTIVE_FILE));
+       while ((parent = parent_mem_cgroup(memcg))) {
+               unsigned long ceiling = min(memcg->memory.limit, memcg->high);
+               unsigned long used = page_counter_read(&memcg->memory);
+
+               head_room = min(head_room, ceiling - min(ceiling, used));
+               memcg = parent;
+       }
+
+       *pavail = file_pages + head_room;
+}
+
+#else  /* CONFIG_CGROUP_WRITEBACK */
+
+static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
+{
+       return 0;
+}
+
+static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
+{
+}
+
+static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
+{
+}
+
+#endif /* CONFIG_CGROUP_WRITEBACK */
+
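To make the headroom walk in mem_cgroup_wb_stats() above concrete, here is a small standalone model of the same arithmetic. The two-level hierarchy, the field names, and the page counts are invented for illustration; only the min(limit, high) - used formula and the walk-to-root loop mirror the code above.

#include <stdio.h>

/* toy stand-in for a memcg: limit ~ memory.limit, high ~ memory.high */
struct toy_memcg {
	unsigned long limit, high, used;	/* all in pages */
	struct toy_memcg *parent;
};

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

/* mirrors the loop above: lowest headroom on the way up, root excluded */
static unsigned long available_pages(struct toy_memcg *memcg,
				     unsigned long file_pages)
{
	unsigned long head_room = ~0UL;		/* ~ PAGE_COUNTER_MAX */

	while (memcg->parent) {
		unsigned long ceiling = min_ul(memcg->limit, memcg->high);
		unsigned long used = memcg->used;

		head_room = min_ul(head_room, ceiling - min_ul(ceiling, used));
		memcg = memcg->parent;
	}
	return file_pages + head_room;
}

int main(void)
{
	struct toy_memcg root   = { .limit = ~0UL, .high = ~0UL, .used = 0 };
	/* parent: 512MB hard limit (131072 4K pages), 100000 pages used */
	struct toy_memcg parent = { 131072, ~0UL, 100000, &root };
	/* child: 256MB high (65536 pages), 60000 pages used */
	struct toy_memcg child  = { ~0UL, 65536, 60000, &parent };

	/* child headroom 5536, parent headroom 31072 -> 5536 wins */
	printf("available = %lu pages\n", available_pages(&child, 10000));
	return 0;
}

Capping the result against the system-wide free memory, as the comment above asks the caller to do, is deliberately left out of the model.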
 /*
  * DO NOT USE IN NEW FILES.
  *
@@ -4388,9 +4475,15 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
        memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
        if (!memcg->stat)
                goto out_free;
+
+       if (memcg_wb_domain_init(memcg, GFP_KERNEL))
+               goto out_free_stat;
+
        spin_lock_init(&memcg->pcp_counter_lock);
        return memcg;
 
+out_free_stat:
+       free_percpu(memcg->stat);
 out_free:
        kfree(memcg);
        return NULL;
@@ -4417,6 +4510,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
                free_mem_cgroup_per_zone_info(memcg, node);
 
        free_percpu(memcg->stat);
+       memcg_wb_domain_exit(memcg);
        kfree(memcg);
 }
 
@@ -4449,6 +4543,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
        /* root ? */
        if (parent_css == NULL) {
                root_mem_cgroup = memcg;
+               mem_cgroup_root_css = &memcg->css;
                page_counter_init(&memcg->memory, NULL);
                memcg->high = PAGE_COUNTER_MAX;
                memcg->soft_limit = PAGE_COUNTER_MAX;
@@ -4467,7 +4562,9 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 #ifdef CONFIG_MEMCG_KMEM
        memcg->kmemcg_id = -1;
 #endif
-
+#ifdef CONFIG_CGROUP_WRITEBACK
+       INIT_LIST_HEAD(&memcg->cgwb_list);
+#endif
        return &memcg->css;
 
 free_out:
@@ -4555,6 +4652,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
        vmpressure_cleanup(&memcg->vmpressure);
 
        memcg_deactivate_kmem(memcg);
+
+       wb_memcg_offline(memcg);
 }
 
 static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
@@ -4588,6 +4687,7 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
        memcg->low = 0;
        memcg->high = PAGE_COUNTER_MAX;
        memcg->soft_limit = PAGE_COUNTER_MAX;
+       memcg_wb_domain_size_changed(memcg);
 }
 
 #ifdef CONFIG_MMU
@@ -4757,6 +4857,7 @@ static int mem_cgroup_move_account(struct page *page,
 {
        unsigned long flags;
        int ret;
+       bool anon;
 
        VM_BUG_ON(from == to);
        VM_BUG_ON_PAGE(PageLRU(page), page);
@@ -4782,15 +4883,33 @@ static int mem_cgroup_move_account(struct page *page,
        if (page->mem_cgroup != from)
                goto out_unlock;
 
+       anon = PageAnon(page);
+
        spin_lock_irqsave(&from->move_lock, flags);
 
-       if (!PageAnon(page) && page_mapped(page)) {
+       if (!anon && page_mapped(page)) {
                __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
                               nr_pages);
                __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
                               nr_pages);
        }
 
+       /*
+        * move_lock is grabbed above and the caller has set
+        * from->moving_account, so mem_cgroup_update_page_stat() will
+        * serialize updates to PageDirty.  The mapping should therefore
+        * be stable for dirty pages.
+        */
+       if (!anon && PageDirty(page)) {
+               struct address_space *mapping = page_mapping(page);
+
+               if (mapping_cap_account_dirty(mapping)) {
+                       __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_DIRTY],
+                                      nr_pages);
+                       __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_DIRTY],
+                                      nr_pages);
+               }
+       }
+
        if (PageWriteback(page)) {
                __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK],
                               nr_pages);
@@ -5306,6 +5425,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
 
        memcg->high = high;
 
+       memcg_wb_domain_size_changed(memcg);
        return nbytes;
 }
 
@@ -5338,6 +5458,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
        if (err)
                return err;
 
+       memcg_wb_domain_size_changed(memcg);
        return nbytes;
 }