oom: move oom_adj value from task_struct to mm_struct
authorDavid Rientjes <rientjes@google.com>
Tue, 16 Jun 2009 22:32:56 +0000 (15:32 -0700)
committerLinus Torvalds <torvalds@linux-foundation.org>
Wed, 17 Jun 2009 02:47:43 +0000 (19:47 -0700)
The per-task oom_adj value is a characteristic of its mm more than the
task itself since it's not possible to oom kill any thread that shares the
mm.  If a task were to be killed while attached to an mm that could not be
freed because another thread were set to OOM_DISABLE, it would have
needlessly been terminated since there is no potential for future memory
freeing.

This patch moves oomkilladj (now more appropriately named oom_adj) from
struct task_struct to struct mm_struct.  This requires task_lock() on a
task to check its oom_adj value to protect against exec, but it's already
necessary to take the lock when dereferencing the mm to find the total VM
size for the badness heuristic.

This fixes a livelock if the oom killer chooses a task and another thread
sharing the same memory has an oom_adj value of OOM_DISABLE.  This occurs
because oom_kill_task() repeatedly returns 1 and refuses to kill the
chosen task while select_bad_process() will repeatedly choose the same
task during the next retry.

Taking task_lock() in select_bad_process() to check for OOM_DISABLE and in
oom_kill_task() to check for threads sharing the same memory will be
removed in the next patch in this series where it will no longer be
necessary.

Writing to /proc/pid/oom_adj for a kthread will now return -EINVAL since
these threads are immune from oom killing already.  They simply report an
oom_adj value of OOM_DISABLE.

Cc: Nick Piggin <npiggin@suse.de>
Cc: Rik van Riel <riel@redhat.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Documentation/filesystems/proc.txt
fs/proc/base.c
include/linux/mm_types.h
include/linux/sched.h
mm/oom_kill.c

index cd8717a362712fbb537bf562eaa95b0be778dd3b..ebff3c10a07fc124b865f815de5b38d65f38d1fa 100644 (file)
@@ -1003,11 +1003,13 @@ CHAPTER 3: PER-PROCESS PARAMETERS
 3.1 /proc/<pid>/oom_adj - Adjust the oom-killer score
 ------------------------------------------------------
 
-This file can be used to adjust the score used to select which processes
-should be killed in an  out-of-memory  situation.  Giving it a high score will
-increase the likelihood of this process being killed by the oom-killer.  Valid
-values are in the range -16 to +15, plus the special value -17, which disables
-oom-killing altogether for this process.
+This file can be used to adjust the score used to select which processes should
+be killed in an out-of-memory situation.  The oom_adj value is a characteristic
+of the task's mm, so all threads that share an mm with pid will have the same
+oom_adj value.  A high value will increase the likelihood of this process being
+killed by the oom-killer.  Valid values are in the range -16 to +15 as
+explained below and a special value of -17, which disables oom-killing
+altogether for threads sharing pid's mm.
 
 The process to be killed in an out-of-memory situation is selected among all others
 based on its badness score. This value equals the original memory size of the process
@@ -1021,6 +1023,9 @@ the parent's score if they do not share the same memory. Thus forking servers
 are the prime candidates to be killed. Having only one 'hungry' child will make
 parent less preferable than the child.
 
+/proc/<pid>/oom_adj cannot be changed for kthreads since they are immune from
+oom-killing already.
+
 /proc/<pid>/oom_score shows process' current badness score.
 
 The following heuristics are then applied:
index 1539e630c47d524b1df251236e638dbb1e8d279b..3ce5ae9e3d2dabd36dce105ecb4545a4723c1202 100644 (file)
@@ -1006,7 +1006,12 @@ static ssize_t oom_adjust_read(struct file *file, char __user *buf,
 
        if (!task)
                return -ESRCH;
-       oom_adjust = task->oomkilladj;
+       task_lock(task);
+       if (task->mm)
+               oom_adjust = task->mm->oom_adj;
+       else
+               oom_adjust = OOM_DISABLE;
+       task_unlock(task);
        put_task_struct(task);
 
        len = snprintf(buffer, sizeof(buffer), "%i\n", oom_adjust);
@@ -1035,11 +1040,19 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
        task = get_proc_task(file->f_path.dentry->d_inode);
        if (!task)
                return -ESRCH;
-       if (oom_adjust < task->oomkilladj && !capable(CAP_SYS_RESOURCE)) {
+       task_lock(task);
+       if (!task->mm) {
+               task_unlock(task);
+               put_task_struct(task);
+               return -EINVAL;
+       }
+       if (oom_adjust < task->mm->oom_adj && !capable(CAP_SYS_RESOURCE)) {
+               task_unlock(task);
                put_task_struct(task);
                return -EACCES;
        }
-       task->oomkilladj = oom_adjust;
+       task->mm->oom_adj = oom_adjust;
+       task_unlock(task);
        put_task_struct(task);
        if (end - buffer == 0)
                return -EIO;
index 0e80e26ecf21220104d8d2abbeb9cca6a1215e6e..f4408106fcbc4c2047380a913f886c757b247ee0 100644 (file)
@@ -232,6 +232,8 @@ struct mm_struct {
 
        unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */
 
+       s8 oom_adj;     /* OOM kill score adjustment (bit shift) */
+
        cpumask_t cpu_vm_mask;
 
        /* Architecture-specific MM context */
index 1048bf50540a18825731d614fe396b6aa01347c3..1bc6fae0c1351b96ffd7e7f1e88dda544891f73c 100644 (file)
@@ -1178,7 +1178,6 @@ struct task_struct {
         * a short time
         */
        unsigned char fpu_counter;
-       s8 oomkilladj; /* OOM kill score adjustment (bit shift). */
 #ifdef CONFIG_BLK_DEV_IO_TRACE
        unsigned int btrace_seq;
 #endif
index a7b2460e922b779252ebcc225cecaf6060b0e964..b60913520ef3a746cb56839d8fd4813337f2b271 100644 (file)
@@ -58,6 +58,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
        unsigned long points, cpu_time, run_time;
        struct mm_struct *mm;
        struct task_struct *child;
+       int oom_adj;
 
        task_lock(p);
        mm = p->mm;
@@ -65,6 +66,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
                task_unlock(p);
                return 0;
        }
+       oom_adj = mm->oom_adj;
 
        /*
         * The memory size of the process is the basis for the badness.
@@ -148,15 +150,15 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
                points /= 8;
 
        /*
-        * Adjust the score by oomkilladj.
+        * Adjust the score by oom_adj.
         */
-       if (p->oomkilladj) {
-               if (p->oomkilladj > 0) {
+       if (oom_adj) {
+               if (oom_adj > 0) {
                        if (!points)
                                points = 1;
-                       points <<= p->oomkilladj;
+                       points <<= oom_adj;
                } else
-                       points >>= -(p->oomkilladj);
+                       points >>= -(oom_adj);
        }
 
 #ifdef DEBUG
@@ -251,8 +253,12 @@ static struct task_struct *select_bad_process(unsigned long *ppoints,
                        *ppoints = ULONG_MAX;
                }
 
-               if (p->oomkilladj == OOM_DISABLE)
+               task_lock(p);
+               if (p->mm && p->mm->oom_adj == OOM_DISABLE) {
+                       task_unlock(p);
                        continue;
+               }
+               task_unlock(p);
 
                points = badness(p, uptime.tv_sec);
                if (points > *ppoints || !chosen) {
@@ -304,8 +310,7 @@ static void dump_tasks(const struct mem_cgroup *mem)
                }
                printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d     %3d %s\n",
                       p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm,
-                      get_mm_rss(mm), (int)task_cpu(p), p->oomkilladj,
-                      p->comm);
+                      get_mm_rss(mm), (int)task_cpu(p), mm->oom_adj, p->comm);
                task_unlock(p);
        } while_each_thread(g, p);
 }
@@ -367,8 +372,12 @@ static int oom_kill_task(struct task_struct *p)
         * Don't kill the process if any threads are set to OOM_DISABLE
         */
        do_each_thread(g, q) {
-               if (q->mm == mm && q->oomkilladj == OOM_DISABLE)
+               task_lock(q);
+               if (q->mm == mm && q->mm && q->mm->oom_adj == OOM_DISABLE) {
+                       task_unlock(q);
                        return 1;
+               }
+               task_unlock(q);
        } while_each_thread(g, q);
 
        __oom_kill_task(p, 1);
@@ -393,10 +402,11 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
        struct task_struct *c;
 
        if (printk_ratelimit()) {
-               printk(KERN_WARNING "%s invoked oom-killer: "
-                       "gfp_mask=0x%x, order=%d, oomkilladj=%d\n",
-                       current->comm, gfp_mask, order, current->oomkilladj);
                task_lock(current);
+               printk(KERN_WARNING "%s invoked oom-killer: "
+                       "gfp_mask=0x%x, order=%d, oom_adj=%d\n",
+                       current->comm, gfp_mask, order,
+                       current->mm ? current->mm->oom_adj : OOM_DISABLE);
                cpuset_print_task_mems_allowed(current);
                task_unlock(current);
                dump_stack();