time, signal: Protect resource use statistics with seqlock
authorRik van Riel <riel@redhat.com>
Sat, 16 Aug 2014 17:40:10 +0000 (13:40 -0400)
committerIngo Molnar <mingo@kernel.org>
Mon, 8 Sep 2014 06:17:01 +0000 (08:17 +0200)
Both times() and clock_gettime(CLOCK_PROCESS_CPUTIME_ID) have scalability
issues on large systems, due to both functions being serialized with a
lock.

The lock protects against reporting a wrong value, due to a thread in the
task group exiting, its statistics reporting up to the signal struct, and
that exited task's statistics being counted twice (or not at all).

Protecting that with a lock results in times() and clock_gettime() being
completely serialized on large systems.

This can be fixed by using a seqlock around the events that gather and
propagate statistics. As an additional benefit, the protection code can
be moved into thread_group_cputime(), slightly simplifying the calling
functions.

In the case of posix_cpu_clock_get_task() things can be simplified a
lot, because the calling function already ensures that the task sticks
around, and the rest is now taken care of in thread_group_cputime().

This way the statistics reporting code can run lockless.

Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alex Thorlton <athorlton@sgi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Daeseok Youn <daeseok.youn@gmail.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Dongsheng Yang <yangds.fnst@cn.fujitsu.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Guillaume Morin <guillaume@morinfr.org>
Cc: Ionut Alexa <ionut.m.alexa@gmail.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Li Zefan <lizefan@huawei.com>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Michal Schmidt <mschmidt@redhat.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Cc: umgwanakikbuti@gmail.com
Cc: fweisbec@gmail.com
Cc: srao@redhat.com
Cc: lwoodman@redhat.com
Cc: atheurer@redhat.com
Link: http://lkml.kernel.org/r/20140816134010.26a9b572@annuminas.surriel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
include/linux/sched.h
kernel/exit.c
kernel/fork.c
kernel/sched/cputime.c
kernel/sys.c
kernel/time/posix-cpu-timers.c

index 5c2c885ee52b3996a2665dc3d8c0e21ff9245aaf..dd9eb4807389c50f5595b7cf2d3254fcaa87b3fa 100644 (file)
@@ -645,6 +645,7 @@ struct signal_struct {
         * Live threads maintain their own counters and add to these
         * in __exit_signal, except for the group leader.
         */
+       seqlock_t stats_lock;
        cputime_t utime, stime, cutime, cstime;
        cputime_t gtime;
        cputime_t cgtime;
index b93d46dab6fc016e8a684bc8acb69962278507d4..fa09b86609db733e441b3cf9d4a815420a908ac3 100644 (file)
@@ -127,6 +127,7 @@ static void __exit_signal(struct task_struct *tsk)
         * the signal_struct.
         */
        task_cputime(tsk, &utime, &stime);
+       write_seqlock(&sig->stats_lock);
        sig->utime += utime;
        sig->stime += stime;
        sig->gtime += task_gtime(tsk);
@@ -140,6 +141,7 @@ static void __exit_signal(struct task_struct *tsk)
        sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
        sig->nr_threads--;
        __unhash_process(tsk, group_dead);
+       write_sequnlock(&sig->stats_lock);
 
        /*
         * Do this under ->siglock, we can race with another thread
@@ -1042,6 +1044,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
                spin_lock_irq(&p->real_parent->sighand->siglock);
                psig = p->real_parent->signal;
                sig = p->signal;
+               write_seqlock(&psig->stats_lock);
                psig->cutime += tgutime + sig->cutime;
                psig->cstime += tgstime + sig->cstime;
                psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime;
@@ -1064,6 +1067,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
                        psig->cmaxrss = maxrss;
                task_io_accounting_add(&psig->ioac, &p->ioac);
                task_io_accounting_add(&psig->ioac, &sig->ioac);
+               write_sequnlock(&psig->stats_lock);
                spin_unlock_irq(&p->real_parent->sighand->siglock);
        }
 
index 0cf9cdb6e4919f254b32d04f6fdeb4943143a8d3..9387ae8ab048114fda9fb505dbbf97fea1312ed9 100644 (file)
@@ -1068,6 +1068,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
        sig->curr_target = tsk;
        init_sigpending(&sig->shared_pending);
        INIT_LIST_HEAD(&sig->posix_timers);
+       seqlock_init(&sig->stats_lock);
 
        hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        sig->real_timer.function = it_real_fn;
index 3e52836359baff9bdcc352f523e9374e9754f88d..49b7cfe98f7a63ba99574002900660dfeff5296a 100644 (file)
@@ -288,18 +288,28 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
        struct signal_struct *sig = tsk->signal;
        cputime_t utime, stime;
        struct task_struct *t;
-
-       times->utime = sig->utime;
-       times->stime = sig->stime;
-       times->sum_exec_runtime = sig->sum_sched_runtime;
+       unsigned int seq, nextseq;
 
        rcu_read_lock();
-       for_each_thread(tsk, t) {
-               task_cputime(t, &utime, &stime);
-               times->utime += utime;
-               times->stime += stime;
-               times->sum_exec_runtime += task_sched_runtime(t);
-       }
+       /* Attempt a lockless read on the first round. */
+       nextseq = 0;
+       do {
+               seq = nextseq;
+               read_seqbegin_or_lock(&sig->stats_lock, &seq);
+               times->utime = sig->utime;
+               times->stime = sig->stime;
+               times->sum_exec_runtime = sig->sum_sched_runtime;
+
+               for_each_thread(tsk, t) {
+                       task_cputime(t, &utime, &stime);
+                       times->utime += utime;
+                       times->stime += stime;
+                       times->sum_exec_runtime += task_sched_runtime(t);
+               }
+               /* If lockless access failed, take the lock. */
+               nextseq = 1;
+       } while (need_seqretry(&sig->stats_lock, seq));
+       done_seqretry(&sig->stats_lock, seq);
        rcu_read_unlock();
 }
 
@@ -611,9 +621,6 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
        cputime_adjust(&cputime, &p->prev_cputime, ut, st);
 }
 
-/*
- * Must be called with siglock held.
- */
 void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
 {
        struct task_cputime cputime;
index ce8129192a26029da0d5b33b75daaeb85b1d08e5..b6636643cbd1c961278f9aca3a08e5bd0ed03219 100644 (file)
@@ -862,11 +862,9 @@ void do_sys_times(struct tms *tms)
 {
        cputime_t tgutime, tgstime, cutime, cstime;
 
-       spin_lock_irq(&current->sighand->siglock);
        thread_group_cputime_adjusted(current, &tgutime, &tgstime);
        cutime = current->signal->cutime;
        cstime = current->signal->cstime;
-       spin_unlock_irq(&current->sighand->siglock);
        tms->tms_utime = cputime_to_clock_t(tgutime);
        tms->tms_stime = cputime_to_clock_t(tgstime);
        tms->tms_cutime = cputime_to_clock_t(cutime);
index 3b8946416a5f836227edcf0db8dd1f8b2eb95063..492b986195d53350abe899b0ecd059f91edb77a4 100644 (file)
@@ -272,22 +272,8 @@ static int posix_cpu_clock_get_task(struct task_struct *tsk,
                if (same_thread_group(tsk, current))
                        err = cpu_clock_sample(which_clock, tsk, &rtn);
        } else {
-               unsigned long flags;
-               struct sighand_struct *sighand;
-
-               /*
-                * while_each_thread() is not yet entirely RCU safe,
-                * keep locking the group while sampling process
-                * clock for now.
-                */
-               sighand = lock_task_sighand(tsk, &flags);
-               if (!sighand)
-                       return err;
-
                if (tsk == current || thread_group_leader(tsk))
                        err = cpu_clock_sample_group(which_clock, tsk, &rtn);
-
-               unlock_task_sighand(tsk, &flags);
        }
 
        if (!err)