#include <net/ip.h>
#include "slab.h"
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <trace/events/vmscan.h>
unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
int nid, unsigned int lru_mask)
{
+ struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg);
unsigned long nr = 0;
- struct mem_cgroup_per_node *mz;
enum lru_list lru;
VM_BUG_ON((unsigned)nid >= nr_node_ids);
for_each_lru(lru) {
if (!(BIT(lru) & lru_mask))
continue;
- mz = mem_cgroup_nodeinfo(memcg, nid);
- nr += mz->lru_size[lru];
+ nr += mem_cgroup_get_lru_size(lruvec, lru);
}
return nr;
}
iter != NULL; \
iter = mem_cgroup_iter(NULL, iter, NULL))
+/**
+ * mem_cgroup_scan_tasks - iterate over tasks of a memory cgroup hierarchy
+ * @memcg: hierarchy root
+ * @fn: function to call for each task
+ * @arg: argument passed to @fn
+ *
+ * This function iterates over tasks attached to @memcg or to any of its
+ * descendants and calls @fn for each task. If @fn returns a non-zero
+ * value, the function breaks the iteration loop and returns the value.
+ * Otherwise, it will iterate over all tasks and return 0.
+ *
+ * This function must not be called for the root memory cgroup.
+ */
+int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
+ int (*fn)(struct task_struct *, void *), void *arg)
+{
+ struct mem_cgroup *iter;
+ int ret = 0;
+
+ BUG_ON(memcg == root_mem_cgroup);
+
+ for_each_mem_cgroup_tree(iter, memcg) {
+ struct css_task_iter it;
+ struct task_struct *task;
+
+ css_task_iter_start(&iter->css, &it);
+ while (!ret && (task = css_task_iter_next(&it)))
+ ret = fn(task, arg);
+ css_task_iter_end(&it);
+ if (ret) {
+ mem_cgroup_iter_break(memcg, iter);
+ break;
+ }
+ }
+ return ret;
+}
+
/**
* mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page
* @page: the page
* mem_cgroup_update_lru_size - account for adding or removing an lru page
* @lruvec: mem_cgroup per zone lru vector
* @lru: index of lru list the page is sitting on
+ * @zid: zone id of the accounted pages
* @nr_pages: positive when adding or negative when removing
*
* This function must be called under lru_lock, just before a page is added
* so as to allow it to check that lru_size 0 is consistent with list_empty).
*/
void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
- int nr_pages)
+ int zid, int nr_pages)
{
struct mem_cgroup_per_node *mz;
unsigned long *lru_size;
long size;
- bool empty;
if (mem_cgroup_disabled())
return;
mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
- lru_size = mz->lru_size + lru;
- empty = list_empty(lruvec->lists + lru);
+ lru_size = &mz->lru_zone_size[zid][lru];
if (nr_pages < 0)
*lru_size += nr_pages;
size = *lru_size;
- if (WARN_ONCE(size < 0 || empty != !size,
- "%s(%p, %d, %d): lru_size %ld but %sempty\n",
- __func__, lruvec, lru, nr_pages, size, empty ? "" : "not ")) {
+ if (WARN_ONCE(size < 0,
+ "%s(%p, %d, %d): lru_size %ld\n",
+ __func__, lruvec, lru, nr_pages, size)) {
VM_BUG_ON(1);
*lru_size = 0;
}
/*
* Return the memory (and swap, if configured) limit for a memcg.
*/
-static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg)
+unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg)
{
unsigned long limit;
.gfp_mask = gfp_mask,
.order = order,
};
- struct mem_cgroup *iter;
- unsigned long chosen_points = 0;
- unsigned long totalpages;
- unsigned int points = 0;
- struct task_struct *chosen = NULL;
+ bool ret;
mutex_lock(&oom_lock);
-
- /*
- * If current has a pending SIGKILL or is exiting, then automatically
- * select it. The goal is to allow it to allocate so that it may
- * quickly exit and free its memory.
- */
- if (task_will_free_mem(current)) {
- mark_oom_victim(current);
- wake_oom_reaper(current);
- goto unlock;
- }
-
- check_panic_on_oom(&oc, CONSTRAINT_MEMCG);
- totalpages = mem_cgroup_get_limit(memcg) ? : 1;
- for_each_mem_cgroup_tree(iter, memcg) {
- struct css_task_iter it;
- struct task_struct *task;
-
- css_task_iter_start(&iter->css, &it);
- while ((task = css_task_iter_next(&it))) {
- switch (oom_scan_process_thread(&oc, task)) {
- case OOM_SCAN_SELECT:
- if (chosen)
- put_task_struct(chosen);
- chosen = task;
- chosen_points = ULONG_MAX;
- get_task_struct(chosen);
- /* fall through */
- case OOM_SCAN_CONTINUE:
- continue;
- case OOM_SCAN_ABORT:
- css_task_iter_end(&it);
- mem_cgroup_iter_break(memcg, iter);
- if (chosen)
- put_task_struct(chosen);
- /* Set a dummy value to return "true". */
- chosen = (void *) 1;
- goto unlock;
- case OOM_SCAN_OK:
- break;
- };
- points = oom_badness(task, memcg, NULL, totalpages);
- if (!points || points < chosen_points)
- continue;
- /* Prefer thread group leaders for display purposes */
- if (points == chosen_points &&
- thread_group_leader(chosen))
- continue;
-
- if (chosen)
- put_task_struct(chosen);
- chosen = task;
- chosen_points = points;
- get_task_struct(chosen);
- }
- css_task_iter_end(&it);
- }
-
- if (chosen) {
- points = chosen_points * 1000 / totalpages;
- oom_kill_process(&oc, chosen, points, totalpages,
- "Memory cgroup out of memory");
- }
-unlock:
+ ret = out_of_memory(&oc);
mutex_unlock(&oom_lock);
- return chosen;
+ return ret;
}
#if MAX_NUMNODES > 1
if (!memcg)
return false;
- if (!handle || oom_killer_disabled)
+ if (!handle)
goto cleanup;
owait.memcg = memcg;
mutex_unlock(&percpu_charge_mutex);
}
-static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
- unsigned long action,
- void *hcpu)
+static int memcg_hotplug_cpu_dead(unsigned int cpu)
{
- int cpu = (unsigned long)hcpu;
struct memcg_stock_pcp *stock;
- if (action == CPU_ONLINE)
- return NOTIFY_OK;
-
- if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
- return NOTIFY_OK;
-
stock = &per_cpu(memcg_stock, cpu);
drain_stock(stock);
- return NOTIFY_OK;
+ return 0;
}
static void reclaim_high(struct mem_cgroup *memcg,
current->flags & PF_EXITING))
goto force;
+ /*
+ * Prevent unbounded recursion when reclaim operations need to
+ * allocate memory. This might exceed the limits temporarily,
+ * but we prefer facilitating memory reclaim and getting back
+ * under the limit over triggering OOM kills in these cases.
+ */
+ if (unlikely(current->flags & PF_MEMALLOC))
+ goto force;
+
if (unlikely(task_in_memcg_oom(current)))
goto nomem;
struct work_struct work;
};
+static struct workqueue_struct *memcg_kmem_cache_create_wq;
+
static void memcg_kmem_cache_create_func(struct work_struct *w)
{
struct memcg_kmem_cache_create_work *cw =
cw->cachep = cachep;
INIT_WORK(&cw->work, memcg_kmem_cache_create_func);
- schedule_work(&cw->work);
+ queue_work(memcg_kmem_cache_create_wq, &cw->work);
}
static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
/*
* The active flag needs to be written after the static_key
* update. This is what guarantees that the socket activation
- * function is the last one to run. See sock_update_memcg() for
- * details, and note that we don't mark any socket as belonging
- * to this memcg until that flag is up.
+ * function is the last one to run. See mem_cgroup_sk_alloc()
+ * for details, and note that we don't mark any socket as
+ * belonging to this memcg until that flag is up.
*
* We need to do this, because static_keys will span multiple
* sites, but we can't control their order. If we mark a socket
* as accounted, but the accounting functions are not patched in
* yet, we'll lose accounting.
*
- * We never race with the readers in sock_update_memcg(),
+ * We never race with the readers in mem_cgroup_sk_alloc(),
* because when this value change, the code to process it is not
* patched in yet.
*/
static void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n)
{
+ VM_BUG_ON(atomic_read(&memcg->id.ref) <= 0);
atomic_add(n, &memcg->id.ref);
}
static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n)
{
+ VM_BUG_ON(atomic_read(&memcg->id.ref) < n);
if (atomic_sub_and_test(n, &memcg->id.ref)) {
idr_remove(&mem_cgroup_idr, memcg->id.id);
memcg->id.id = 0;
static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+
/* Online state pins memcg ID, memcg ID pins CSS */
- mem_cgroup_id_get(mem_cgroup_from_css(css));
+ atomic_set(&memcg->id.ref, 1);
css_get(css);
return 0;
}
* Because lookup_swap_cache() updates some statistics counter,
* we call find_get_page() with swapper_space directly.
*/
- page = find_get_page(swap_address_space(ent), ent.val);
+ page = find_get_page(swap_address_space(ent), swp_offset(ent));
if (do_memsw_account())
entry->val = ent.val;
swp_entry_t swp = radix_to_swp_entry(page);
if (do_memsw_account())
*entry = swp;
- page = find_get_page(swap_address_space(swp), swp.val);
+ page = find_get_page(swap_address_space(swp),
+ swp_offset(swp));
}
} else
page = find_get_page(mapping, pgoff);
.mm = mm,
};
down_read(&mm->mmap_sem);
- walk_page_range(0, ~0UL, &mem_cgroup_count_precharge_walk);
+ walk_page_range(0, mm->highest_vm_end,
+ &mem_cgroup_count_precharge_walk);
up_read(&mm->mmap_sem);
precharge = mc.precharge;
* When we have consumed all precharges and failed in doing
* additional charge, the page walk just aborts.
*/
- walk_page_range(0, ~0UL, &mem_cgroup_move_charge_walk);
+ walk_page_range(0, mc.mm->highest_vm_end, &mem_cgroup_move_charge_walk);
+
up_read(&mc.mm->mmap_sem);
atomic_dec(&mc.from->moving_account);
}
DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
EXPORT_SYMBOL(memcg_sockets_enabled_key);
-void sock_update_memcg(struct sock *sk)
+void mem_cgroup_sk_alloc(struct sock *sk)
{
struct mem_cgroup *memcg;
- /* Socket cloning can throw us here with sk_cgrp already
+ if (!mem_cgroup_sockets_enabled)
+ return;
+
+ /*
+ * Socket cloning can throw us here with sk_memcg already
* filled. It won't however, necessarily happen from
* process context. So the test for root memcg given
* the current task's memcg won't help us in this case.
out:
rcu_read_unlock();
}
-EXPORT_SYMBOL(sock_update_memcg);
-void sock_release_memcg(struct sock *sk)
+void mem_cgroup_sk_free(struct sock *sk)
{
- WARN_ON(!sk->sk_memcg);
- css_put(&sk->sk_memcg->css);
+ if (sk->sk_memcg)
+ css_put(&sk->sk_memcg->css);
}
/**
/*
* subsys_initcall() for memory controller.
*
- * Some parts like hotcpu_notifier() have to be initialized from this context
- * because of lock dependencies (cgroup_lock -> cpu hotplug) but basically
- * everything that doesn't depend on a specific mem_cgroup structure should
- * be initialized from here.
+ * Some parts like memcg_hotplug_cpu_dead() have to be initialized from this
+ * context because of lock dependencies (cgroup_lock -> cpu hotplug) but
+ * basically everything that doesn't depend on a specific mem_cgroup structure
+ * should be initialized from here.
*/
static int __init mem_cgroup_init(void)
{
int cpu, node;
- hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
+#ifndef CONFIG_SLOB
+ /*
+ * Kmem cache creation is mostly done with the slab_mutex held,
+ * so use a special workqueue to avoid stalling all worker
+ * threads in case lots of cgroups are created simultaneously.
+ */
+ memcg_kmem_cache_create_wq =
+ alloc_ordered_workqueue("memcg_kmem_cache_create", 0);
+ BUG_ON(!memcg_kmem_cache_create_wq);
+#endif
+
+ cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL,
+ memcg_hotplug_cpu_dead);
for_each_possible_cpu(cpu)
INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work,