Merge branch 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
author     Linus Torvalds <torvalds@linux-foundation.org>
           Sat, 28 Sep 2019 19:39:07 +0000 (12:39 -0700)
committer  Linus Torvalds <torvalds@linux-foundation.org>
           Sat, 28 Sep 2019 19:39:07 +0000 (12:39 -0700)
Pull scheduler fixes from Ingo Molnar:

 - Apply a number of membarrier-related fixes and cleanups, which fix
   a use-after-free race in the membarrier code (the core idea is
   sketched right after this list)

 - Introduce proper RCU protection for tasks on the runqueue, to get
   rid of the subtle task_rcu_dereference() interface that was easy to
   get wrong (a reader-side sketch follows further below)

 - Misc fixes, but also an EAS speedup
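
A sketch of the membarrier fix's central idea, for orientation. The
membarrier code itself is not part of the diff excerpted below, and the
per-runqueue 'membarrier_state' cache field plus the exact helper body
are assumptions based on this series, not verbatim kernel code: rather
than letting membarrier() read a remote CPU's current->mm->membarrier_state
(which can race with that mm being freed), the mm's state is copied into
the runqueue whenever the mm changes, so remote readers only touch
per-CPU data:

    static inline void membarrier_switch_mm(struct rq *rq,
                                            struct mm_struct *prev_mm,
                                            struct mm_struct *next_mm)
    {
            int membarrier_state;

            /* Nothing to update when the mm does not change. */
            if (prev_mm == next_mm)
                    return;

            /* Snapshot the incoming mm's registration flags... */
            membarrier_state = atomic_read(&next_mm->membarrier_state);
            if (READ_ONCE(rq->membarrier_state) == membarrier_state)
                    return;

            /* ...and publish them where remote CPUs can read them safely. */
            WRITE_ONCE(rq->membarrier_state, membarrier_state);
    }

This is why the kernel/sched/core.c hunk below adds a
membarrier_switch_mm() call right before switch_mm_irqs_off(), and why
the mm_types.h hunk moves membarrier_state next to pgd: both fields are
touched on every mm switch, so sharing a cache line is deliberate.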

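A minimal reader-side sketch of what the rq->curr RCU conversion
enables. The helper name below is hypothetical and only illustrates the
access pattern: because rq->curr is now published with
rcu_assign_pointer()/RCU_INIT_POINTER() and a task remains valid for an
RCU grace period after leaving the runqueue (see the rcu_users refcount
added in kernel/fork.c), a plain RCU read-side critical section is
enough to inspect a remote CPU's current task, without
task_rcu_dereference():

    /* Hypothetical helper, illustrating the new access pattern. */
    static bool remote_curr_is_kthread(int cpu)
    {
            struct task_struct *p;
            bool ret;

            rcu_read_lock();
            /* rq->curr is an RCU-protected pointer after this series. */
            p = rcu_dereference(cpu_rq(cpu)->curr);
            ret = !!(p->flags & PF_KTHREAD);
            rcu_read_unlock();

            return ret;
    }
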
* 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched/fair: Avoid redundant EAS calculation
  sched/core: Remove double update_max_interval() call on CPU startup
  sched/core: Fix preempt_schedule() interrupt return comment
  sched/fair: Fix -Wunused-but-set-variable warnings
  sched/core: Fix migration to invalid CPU in __set_cpus_allowed_ptr()
  sched/membarrier: Return -ENOMEM to userspace on memory allocation failure
  sched/membarrier: Skip IPIs when mm->mm_users == 1
  selftests, sched/membarrier: Add multi-threaded test
  sched/membarrier: Fix p->mm->membarrier_state racy load
  sched/membarrier: Call sync_core only before usermode for same mm
  sched/membarrier: Remove redundant check
  sched/membarrier: Fix private expedited registration check
  tasks, sched/core: RCUify the assignment of rq->curr
  tasks, sched/core: With a grace period after finish_task_switch(), remove unnecessary code
  tasks, sched/core: Ensure tasks are available for a grace period after leaving the runqueue
  tasks: Add a count of task RCU users
  sched/core: Convert vcpu_is_preempted() from macro to an inline function
  sched/fair: Remove unused cfs_rq_clock_task() function

include/linux/mm_types.h
include/linux/sched.h
kernel/fork.c
kernel/sched/core.c

diff --combined include/linux/mm_types.h
index 5183e0d77dfa6cf6fdcfb1c928bf315a61aec900,ec9bd3a6c82717fa9a1803bab4fb4ad414ed68cd..2222fa795284183344d603c5286fcfd8c40dffa3
@@@ -25,6 -25,7 +25,6 @@@
  
  struct address_space;
  struct mem_cgroup;
 -struct hmm;
  
  /*
   * Each physical page in the system has a struct page associated with
@@@ -138,7 -139,6 +138,7 @@@ struct page 
                struct {        /* Second tail page of compound page */
                        unsigned long _compound_pad_1;  /* compound_head */
                        unsigned long _compound_pad_2;
 +                      /* For both global and memcg */
                        struct list_head deferred_list;
                };
                struct {        /* Page table pages */
@@@ -383,6 -383,16 +383,16 @@@ struct mm_struct 
                unsigned long highest_vm_end;   /* highest vma end address */
                pgd_t * pgd;
  
+ #ifdef CONFIG_MEMBARRIER
+               /**
+                * @membarrier_state: Flags controlling membarrier behavior.
+                *
+                * This field is close to @pgd to hopefully fit in the same
+                * cache-line, which needs to be touched by switch_mm().
+                */
+               atomic_t membarrier_state;
+ #endif
                /**
                 * @mm_users: The number of users including userspace.
                 *
                unsigned long flags; /* Must use atomic bitops to access */
  
                struct core_state *core_state; /* coredumping support */
- #ifdef CONFIG_MEMBARRIER
-               atomic_t membarrier_state;
- #endif
  #ifdef CONFIG_AIO
                spinlock_t                      ioctx_lock;
                struct kioctx_table __rcu       *ioctx_table;
                atomic_long_t hugetlb_usage;
  #endif
                struct work_struct async_put_work;
 -
 -#ifdef CONFIG_HMM_MIRROR
 -              /* HMM needs to track a few things per mm */
 -              struct hmm *hmm;
 -#endif
        } __randomize_layout;
  
        /*
diff --combined include/linux/sched.h
index 70db597d6fd4f2ceb53b1a71c0db243e681d7070,8e43e54a02c732bde8f2f1d7c9e42fb81a8dba86..2c2e56bd8913250e88982ee8e47e36b23081fe23
  #include <linux/resource.h>
  #include <linux/latencytop.h>
  #include <linux/sched/prio.h>
 +#include <linux/sched/types.h>
  #include <linux/signal_types.h>
  #include <linux/mm_types_task.h>
  #include <linux/task_io_accounting.h>
 +#include <linux/posix-timers.h>
  #include <linux/rseq.h>
  
  /* task_struct member predeclarations (sorted alphabetically): */
@@@ -246,6 -244,27 +246,6 @@@ struct prev_cputime 
  #endif
  };
  
 -/**
 - * struct task_cputime - collected CPU time counts
 - * @utime:            time spent in user mode, in nanoseconds
 - * @stime:            time spent in kernel mode, in nanoseconds
 - * @sum_exec_runtime: total time spent on the CPU, in nanoseconds
 - *
 - * This structure groups together three kinds of CPU time that are tracked for
 - * threads and thread groups.  Most things considering CPU time want to group
 - * these counts together and treat all three of them in parallel.
 - */
 -struct task_cputime {
 -      u64                             utime;
 -      u64                             stime;
 -      unsigned long long              sum_exec_runtime;
 -};
 -
 -/* Alternate field names when used on cache expirations: */
 -#define virt_exp                      utime
 -#define prof_exp                      stime
 -#define sched_exp                     sum_exec_runtime
 -
  enum vtime_state {
        /* Task is sleeping or running in a CPU with VTIME inactive: */
        VTIME_INACTIVE = 0,
@@@ -862,8 -881,10 +862,8 @@@ struct task_struct 
        unsigned long                   min_flt;
        unsigned long                   maj_flt;
  
 -#ifdef CONFIG_POSIX_TIMERS
 -      struct task_cputime             cputime_expires;
 -      struct list_head                cpu_timers[3];
 -#endif
 +      /* Empty if CONFIG_POSIX_CPUTIMERS=n */
 +      struct posix_cputimers          posix_cputimers;
  
        /* Process credentials: */
  
        struct mutex_waiter             *blocked_on;
  #endif
  
 +#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
 +      int                             non_block_count;
 +#endif
 +
  #ifdef CONFIG_TRACE_IRQFLAGS
        unsigned int                    irq_events;
        unsigned long                   hardirq_enable_ip;
  
        struct tlbflush_unmap_batch     tlb_ubc;
  
-       struct rcu_head                 rcu;
+       union {
+               refcount_t              rcu_users;
+               struct rcu_head         rcu;
+       };
  
        /* Cache last used pipe for splice(): */
        struct pipe_inode_info          *splice_pipe;
@@@ -1839,7 -1859,10 +1842,10 @@@ static inline void set_task_cpu(struct 
   * running or not.
   */
  #ifndef vcpu_is_preempted
- # define vcpu_is_preempted(cpu)       false
+ static inline bool vcpu_is_preempted(int cpu)
+ {
+       return false;
+ }
  #endif
  
  extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask);
diff --combined kernel/fork.c
index 60763c043aa3c7d71da8c3a1eaa78b4b0b634c68,d6e552552e526d5ec9bc078170f0fcabc8b12abf..f9572f416126283dd2e8ac6ce3bfd66899e58016
@@@ -125,15 -125,6 +125,15 @@@ int nr_threads;                  /* The idle threads d
  
  static int max_threads;               /* tunable limit on nr_threads */
  
 +#define NAMED_ARRAY_INDEX(x)  [x] = __stringify(x)
 +
 +static const char * const resident_page_types[] = {
 +      NAMED_ARRAY_INDEX(MM_FILEPAGES),
 +      NAMED_ARRAY_INDEX(MM_ANONPAGES),
 +      NAMED_ARRAY_INDEX(MM_SWAPENTS),
 +      NAMED_ARRAY_INDEX(MM_SHMEMPAGES),
 +};
 +
  DEFINE_PER_CPU(unsigned long, process_counts) = 0;
  
  __cacheline_aligned DEFINE_RWLOCK(tasklist_lock);  /* outer */
@@@ -654,15 -645,12 +654,15 @@@ static void check_mm(struct mm_struct *
  {
        int i;
  
 +      BUILD_BUG_ON_MSG(ARRAY_SIZE(resident_page_types) != NR_MM_COUNTERS,
 +                       "Please make sure 'struct resident_page_types[]' is updated as well");
 +
        for (i = 0; i < NR_MM_COUNTERS; i++) {
                long x = atomic_long_read(&mm->rss_stat.count[i]);
  
                if (unlikely(x))
 -                      printk(KERN_ALERT "BUG: Bad rss-counter state "
 -                                        "mm:%p idx:%d val:%ld\n", mm, i, x);
 +                      pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n",
 +                               mm, resident_page_types[i], x);
        }
  
        if (mm_pgtables_bytes(mm))
@@@ -915,10 -903,12 +915,12 @@@ static struct task_struct *dup_task_str
                tsk->cpus_ptr = &tsk->cpus_mask;
  
        /*
-        * One for us, one for whoever does the "release_task()" (usually
-        * parent)
+        * One for the user space visible state that goes away when reaped.
+        * One for the scheduler.
         */
-       refcount_set(&tsk->usage, 2);
+       refcount_set(&tsk->rcu_users, 2);
+       /* One for the rcu users */
+       refcount_set(&tsk->usage, 1);
  #ifdef CONFIG_BLK_DEV_IO_TRACE
        tsk->btrace_seq = 0;
  #endif
@@@ -1021,6 -1011,7 +1023,6 @@@ static struct mm_struct *mm_init(struc
        mm_init_owner(mm, p);
        RCU_INIT_POINTER(mm->exe_file, NULL);
        mmu_notifier_mm_init(mm);
 -      hmm_mm_init(mm);
        init_tlb_flush_pending(mm);
  #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
        mm->pmd_huge_pte = NULL;
@@@ -1530,17 -1521,28 +1532,17 @@@ void __cleanup_sighand(struct sighand_s
        }
  }
  
 -#ifdef CONFIG_POSIX_TIMERS
  /*
   * Initialize POSIX timer handling for a thread group.
   */
  static void posix_cpu_timers_init_group(struct signal_struct *sig)
  {
 +      struct posix_cputimers *pct = &sig->posix_cputimers;
        unsigned long cpu_limit;
  
        cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
 -      if (cpu_limit != RLIM_INFINITY) {
 -              sig->cputime_expires.prof_exp = cpu_limit * NSEC_PER_SEC;
 -              sig->cputimer.running = true;
 -      }
 -
 -      /* The timer lists. */
 -      INIT_LIST_HEAD(&sig->cpu_timers[0]);
 -      INIT_LIST_HEAD(&sig->cpu_timers[1]);
 -      INIT_LIST_HEAD(&sig->cpu_timers[2]);
 +      posix_cputimers_group_init(pct, cpu_limit);
  }
 -#else
 -static inline void posix_cpu_timers_init_group(struct signal_struct *sig) { }
 -#endif
  
  static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
  {
@@@ -1642,6 -1644,23 +1644,6 @@@ static void rt_mutex_init_task(struct t
  #endif
  }
  
 -#ifdef CONFIG_POSIX_TIMERS
 -/*
 - * Initialize POSIX timer handling for a single task.
 - */
 -static void posix_cpu_timers_init(struct task_struct *tsk)
 -{
 -      tsk->cputime_expires.prof_exp = 0;
 -      tsk->cputime_expires.virt_exp = 0;
 -      tsk->cputime_expires.sched_exp = 0;
 -      INIT_LIST_HEAD(&tsk->cpu_timers[0]);
 -      INIT_LIST_HEAD(&tsk->cpu_timers[1]);
 -      INIT_LIST_HEAD(&tsk->cpu_timers[2]);
 -}
 -#else
 -static inline void posix_cpu_timers_init(struct task_struct *tsk) { }
 -#endif
 -
  static inline void init_task_pid_links(struct task_struct *task)
  {
        enum pid_type type;
@@@ -1928,7 -1947,7 +1930,7 @@@ static __latent_entropy struct task_str
        task_io_accounting_init(&p->ioac);
        acct_clear_integrals(p);
  
 -      posix_cpu_timers_init(p);
 +      posix_cputimers_init(&p->posix_cputimers);
  
        p->io_context = NULL;
        audit_set_context(p, NULL);
diff --combined kernel/sched/core.c
index f9a1346a5fa9502be6ca45ecb1bd822bf706725e,1b61cf48eee89eba7386916afbb4c85b962fea49..7880f4f64d0eea19dab03a0408625a505b4454ed
@@@ -255,7 -255,7 +255,7 @@@ static void __hrtick_restart(struct rq 
  {
        struct hrtimer *timer = &rq->hrtick_timer;
  
 -      hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
 +      hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD);
  }
  
  /*
@@@ -314,7 -314,7 +314,7 @@@ void hrtick_start(struct rq *rq, u64 de
         */
        delay = max_t(u64, delay, 10000LL);
        hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
 -                    HRTIMER_MODE_REL_PINNED);
 +                    HRTIMER_MODE_REL_PINNED_HARD);
  }
  #endif /* CONFIG_SMP */
  
@@@ -328,7 -328,7 +328,7 @@@ static void hrtick_rq_init(struct rq *r
        rq->hrtick_csd.info = rq;
  #endif
  
 -      hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 +      hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
        rq->hrtick_timer.function = hrtick;
  }
  #else /* CONFIG_SCHED_HRTICK */
@@@ -1656,7 -1656,8 +1656,8 @@@ static int __set_cpus_allowed_ptr(struc
        if (cpumask_equal(p->cpus_ptr, new_mask))
                goto out;
  
-       if (!cpumask_intersects(new_mask, cpu_valid_mask)) {
+       dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
+       if (dest_cpu >= nr_cpu_ids) {
                ret = -EINVAL;
                goto out;
        }
        if (cpumask_test_cpu(task_cpu(p), new_mask))
                goto out;
  
-       dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
        if (task_running(rq, p) || p->state == TASK_WAKING) {
                struct migration_arg arg = { p, dest_cpu };
                /* Need help from migration thread: drop lock and wait. */
@@@ -3254,7 -3254,7 +3254,7 @@@ static struct rq *finish_task_switch(st
                /* Task is done with its stack. */
                put_task_stack(prev);
  
-               put_task_struct(prev);
+               put_task_struct_rcu_user(prev);
        }
  
        tick_nohz_task_switch();
@@@ -3358,15 -3358,15 +3358,15 @@@ context_switch(struct rq *rq, struct ta
                else
                        prev->active_mm = NULL;
        } else {                                        // to user
+               membarrier_switch_mm(rq, prev->active_mm, next->mm);
                /*
                 * sys_membarrier() requires an smp_mb() between setting
-                * rq->curr and returning to userspace.
+                * rq->curr / membarrier_switch_mm() and returning to userspace.
                 *
                 * The below provides this either through switch_mm(), or in
                 * case 'prev->active_mm == next->mm' through
                 * finish_task_switch()'s mmdrop().
                 */
                switch_mm_irqs_off(prev->active_mm, next->mm, next);
  
                if (!prev->mm) {                        // from kernel
@@@ -3871,22 -3871,13 +3871,22 @@@ static noinline void __schedule_bug(str
  /*
   * Various schedule()-time debugging checks and statistics:
   */
 -static inline void schedule_debug(struct task_struct *prev)
 +static inline void schedule_debug(struct task_struct *prev, bool preempt)
  {
  #ifdef CONFIG_SCHED_STACK_END_CHECK
        if (task_stack_end_corrupted(prev))
                panic("corrupted stack end detected inside scheduler\n");
  #endif
  
 +#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
 +      if (!preempt && prev->state && prev->non_block_count) {
 +              printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n",
 +                      prev->comm, prev->pid, prev->non_block_count);
 +              dump_stack();
 +              add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
 +      }
 +#endif
 +
        if (unlikely(in_atomic_preempt_off())) {
                __schedule_bug(prev);
                preempt_count_set(PREEMPT_DISABLED);
@@@ -3998,7 -3989,7 +3998,7 @@@ static void __sched notrace __schedule(
        rq = cpu_rq(cpu);
        prev = rq->curr;
  
 -      schedule_debug(prev);
 +      schedule_debug(prev, preempt);
  
        if (sched_feat(HRTICK))
                hrtick_clear(rq);
  
        if (likely(prev != next)) {
                rq->nr_switches++;
-               rq->curr = next;
+               /*
+                * RCU users of rcu_dereference(rq->curr) may not see
+                * changes to task_struct made by pick_next_task().
+                */
+               RCU_INIT_POINTER(rq->curr, next);
                /*
                 * The membarrier system call requires each architecture
                 * to have a full memory barrier after updating
@@@ -4223,9 -4218,8 +4227,8 @@@ static void __sched notrace preempt_sch
  
  #ifdef CONFIG_PREEMPTION
  /*
-  * this is the entry point to schedule() from in-kernel preemption
-  * off of preempt_enable. Kernel preemptions off return from interrupt
-  * occur there and call schedule directly.
+  * This is the entry point to schedule() from in-kernel preemption
+  * off of preempt_enable.
   */
  asmlinkage __visible void __sched notrace preempt_schedule(void)
  {
@@@ -4296,7 -4290,7 +4299,7 @@@ EXPORT_SYMBOL_GPL(preempt_schedule_notr
  #endif /* CONFIG_PREEMPTION */
  
  /*
-  * this is the entry point to schedule() from kernel preemption
+  * This is the entry point to schedule() from kernel preemption
   * off of irq context.
   * Note, that this is called and return with irqs disabled. This will
   * protect us against recursive calling from irq.
@@@ -6069,7 -6063,8 +6072,8 @@@ void init_idle(struct task_struct *idle
        __set_task_cpu(idle, cpu);
        rcu_read_unlock();
  
-       rq->curr = rq->idle = idle;
+       rq->idle = idle;
+       rcu_assign_pointer(rq->curr, idle);
        idle->on_rq = TASK_ON_RQ_QUEUED;
  #ifdef CONFIG_SMP
        idle->on_cpu = 1;
@@@ -6430,8 -6425,6 +6434,6 @@@ int sched_cpu_activate(unsigned int cpu
        }
        rq_unlock_irqrestore(rq, &rf);
  
-       update_max_interval();
        return 0;
  }
  
@@@ -6772,7 -6765,7 +6774,7 @@@ void ___might_sleep(const char *file, i
        rcu_sleep_check();
  
        if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
 -           !is_idle_task(current)) ||
 +           !is_idle_task(current) && !current->non_block_count) ||
            system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING ||
            oops_in_progress)
                return;
                "BUG: sleeping function called from invalid context at %s:%d\n",
                        file, line);
        printk(KERN_ERR
 -              "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
 -                      in_atomic(), irqs_disabled(),
 +              "in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n",
 +                      in_atomic(), irqs_disabled(), current->non_block_count,
                        current->pid, current->comm);
  
        if (task_stack_end_corrupted(current))