Merge branch 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
author     Linus Torvalds <torvalds@linux-foundation.org>
           Sat, 28 Sep 2019 19:39:07 +0000 (12:39 -0700)
committer  Linus Torvalds <torvalds@linux-foundation.org>
           Sat, 28 Sep 2019 19:39:07 +0000 (12:39 -0700)
Pull scheduler fixes from Ingo Molnar:

 - Apply a number of membarrier-related fixes and cleanups, which fix
   a use-after-free race in the membarrier code (the core idea is
   sketched right after this list)

 - Introduce proper RCU protection for tasks on the runqueue, to get
   rid of the subtle task_rcu_dereference() interface that was easy to
   get wrong (a reader-side sketch follows further below)

 - Misc fixes, but also an EAS speedup
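
A sketch of the membarrier fix's central idea, for orientation. The
membarrier code itself is not part of the diff excerpted below, and the
per-runqueue 'membarrier_state' cache field plus the exact helper body
are assumptions based on this series, not verbatim kernel code: rather
than letting membarrier() read a remote CPU's current->mm->membarrier_state
(which can race with that mm being freed), the mm's state is copied into
the runqueue whenever the mm changes, so remote readers only touch
per-CPU data:

    static inline void membarrier_switch_mm(struct rq *rq,
                                            struct mm_struct *prev_mm,
                                            struct mm_struct *next_mm)
    {
            int membarrier_state;

            /* Nothing to update when the mm does not change. */
            if (prev_mm == next_mm)
                    return;

            /* Snapshot the incoming mm's registration flags... */
            membarrier_state = atomic_read(&next_mm->membarrier_state);
            if (READ_ONCE(rq->membarrier_state) == membarrier_state)
                    return;

            /* ...and publish them where remote CPUs can read them safely. */
            WRITE_ONCE(rq->membarrier_state, membarrier_state);
    }

This is why the kernel/sched/core.c hunk below adds a
membarrier_switch_mm() call right before switch_mm_irqs_off(), and why
the mm_types.h hunk moves membarrier_state next to pgd: both fields are
touched on every mm switch, so sharing a cache line is deliberate.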

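A minimal reader-side sketch of what the rq->curr RCU conversion
enables. The helper name below is hypothetical and only illustrates the
access pattern: because rq->curr is now published with
rcu_assign_pointer()/RCU_INIT_POINTER() and a task remains valid for an
RCU grace period after leaving the runqueue (see the rcu_users refcount
added in kernel/fork.c), a plain RCU read-side critical section is
enough to inspect a remote CPU's current task, without
task_rcu_dereference():

    /* Hypothetical helper, illustrating the new access pattern. */
    static bool remote_curr_is_kthread(int cpu)
    {
            struct task_struct *p;
            bool ret;

            rcu_read_lock();
            /* rq->curr is an RCU-protected pointer after this series. */
            p = rcu_dereference(cpu_rq(cpu)->curr);
            ret = !!(p->flags & PF_KTHREAD);
            rcu_read_unlock();

            return ret;
    }
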
* 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched/fair: Avoid redundant EAS calculation
  sched/core: Remove double update_max_interval() call on CPU startup
  sched/core: Fix preempt_schedule() interrupt return comment
  sched/fair: Fix -Wunused-but-set-variable warnings
  sched/core: Fix migration to invalid CPU in __set_cpus_allowed_ptr()
  sched/membarrier: Return -ENOMEM to userspace on memory allocation failure
  sched/membarrier: Skip IPIs when mm->mm_users == 1
  selftests, sched/membarrier: Add multi-threaded test
  sched/membarrier: Fix p->mm->membarrier_state racy load
  sched/membarrier: Call sync_core only before usermode for same mm
  sched/membarrier: Remove redundant check
  sched/membarrier: Fix private expedited registration check
  tasks, sched/core: RCUify the assignment of rq->curr
  tasks, sched/core: With a grace period after finish_task_switch(), remove unnecessary code
  tasks, sched/core: Ensure tasks are available for a grace period after leaving the runqueue
  tasks: Add a count of task RCU users
  sched/core: Convert vcpu_is_preempted() from macro to an inline function
  sched/fair: Remove unused cfs_rq_clock_task() function

include/linux/mm_types.h
include/linux/sched.h
kernel/fork.c
kernel/sched/core.c

diff --combined include/linux/mm_types.h
index 5183e0d77dfa6cf6fdcfb1c928bf315a61aec900,ec9bd3a6c82717fa9a1803bab4fb4ad414ed68cd..2222fa795284183344d603c5286fcfd8c40dffa3
@@@ -25,6 -25,7 +25,6 @@@
  
  struct address_space;
  struct mem_cgroup;
 -struct hmm;
  
  /*
   * Each physical page in the system has a struct page associated with
@@@ -138,7 -139,6 +138,7 @@@ struct page 
                struct {        /* Second tail page of compound page */
                        unsigned long _compound_pad_1;  /* compound_head */
                        unsigned long _compound_pad_2;
 +                      /* For both global and memcg */
                        struct list_head deferred_list;
                };
                struct {        /* Page table pages */
@@@ -383,6 -383,16 +383,16 @@@ struct mm_struct 
                unsigned long highest_vm_end;   /* highest vma end address */
                pgd_t * pgd;
  
+ #ifdef CONFIG_MEMBARRIER
+               /**
+                * @membarrier_state: Flags controlling membarrier behavior.
+                *
+                * This field is close to @pgd to hopefully fit in the same
+                * cache-line, which needs to be touched by switch_mm().
+                */
+               atomic_t membarrier_state;
+ #endif
                /**
                 * @mm_users: The number of users including userspace.
                 *
                unsigned long flags; /* Must use atomic bitops to access */
  
                struct core_state *core_state; /* coredumping support */
- #ifdef CONFIG_MEMBARRIER
-               atomic_t membarrier_state;
- #endif
  #ifdef CONFIG_AIO
                spinlock_t                      ioctx_lock;
                struct kioctx_table __rcu       *ioctx_table;
                atomic_long_t hugetlb_usage;
  #endif
                struct work_struct async_put_work;
 -
 -#ifdef CONFIG_HMM_MIRROR
 -              /* HMM needs to track a few things per mm */
 -              struct hmm *hmm;
 -#endif
        } __randomize_layout;
  
        /*
diff --combined include/linux/sched.h
index 70db597d6fd4f2ceb53b1a71c0db243e681d7070,8e43e54a02c732bde8f2f1d7c9e42fb81a8dba86..2c2e56bd8913250e88982ee8e47e36b23081fe23
  #include <linux/resource.h>
  #include <linux/latencytop.h>
  #include <linux/sched/prio.h>
 +#include <linux/sched/types.h>
  #include <linux/signal_types.h>
  #include <linux/mm_types_task.h>
  #include <linux/task_io_accounting.h>
 +#include <linux/posix-timers.h>
  #include <linux/rseq.h>
  
  /* task_struct member predeclarations (sorted alphabetically): */
@@@ -246,6 -244,27 +246,6 @@@ struct prev_cputime 
  #endif
  };
  
 -/**
 - * struct task_cputime - collected CPU time counts
 - * @utime:            time spent in user mode, in nanoseconds
 - * @stime:            time spent in kernel mode, in nanoseconds
 - * @sum_exec_runtime: total time spent on the CPU, in nanoseconds
 - *
 - * This structure groups together three kinds of CPU time that are tracked for
 - * threads and thread groups.  Most things considering CPU time want to group
 - * these counts together and treat all three of them in parallel.
 - */
 -struct task_cputime {
 -      u64                             utime;
 -      u64                             stime;
 -      unsigned long long              sum_exec_runtime;
 -};
 -
 -/* Alternate field names when used on cache expirations: */
 -#define virt_exp                      utime
 -#define prof_exp                      stime
 -#define sched_exp                     sum_exec_runtime
 -
  enum vtime_state {
        /* Task is sleeping or running in a CPU with VTIME inactive: */
        VTIME_INACTIVE = 0,
@@@ -862,8 -881,10 +862,8 @@@ struct task_struct 
        unsigned long                   min_flt;
        unsigned long                   maj_flt;
  
 -#ifdef CONFIG_POSIX_TIMERS
 -      struct task_cputime             cputime_expires;
 -      struct list_head                cpu_timers[3];
 -#endif
 +      /* Empty if CONFIG_POSIX_CPUTIMERS=n */
 +      struct posix_cputimers          posix_cputimers;
  
        /* Process credentials: */
  
        struct mutex_waiter             *blocked_on;
  #endif
  
 +#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
 +      int                             non_block_count;
 +#endif
 +
  #ifdef CONFIG_TRACE_IRQFLAGS
        unsigned int                    irq_events;
        unsigned long                   hardirq_enable_ip;
  
        struct tlbflush_unmap_batch     tlb_ubc;
  
-       struct rcu_head                 rcu;
+       union {
+               refcount_t              rcu_users;
+               struct rcu_head         rcu;
+       };
  
        /* Cache last used pipe for splice(): */
        struct pipe_inode_info          *splice_pipe;
@@@ -1839,7 -1859,10 +1842,10 @@@ static inline void set_task_cpu(struct 
   * running or not.
   */
  #ifndef vcpu_is_preempted
- # define vcpu_is_preempted(cpu)       false
+ static inline bool vcpu_is_preempted(int cpu)
+ {
+       return false;
+ }
  #endif
  
  extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask);
diff --combined kernel/fork.c
index 60763c043aa3c7d71da8c3a1eaa78b4b0b634c68,d6e552552e526d5ec9bc078170f0fcabc8b12abf..f9572f416126283dd2e8ac6ce3bfd66899e58016
@@@ -125,15 -125,6 +125,15 @@@ int nr_threads;                  /* The idle threads d
  
  static int max_threads;               /* tunable limit on nr_threads */
  
 +#define NAMED_ARRAY_INDEX(x)  [x] = __stringify(x)
 +
 +static const char * const resident_page_types[] = {
 +      NAMED_ARRAY_INDEX(MM_FILEPAGES),
 +      NAMED_ARRAY_INDEX(MM_ANONPAGES),
 +      NAMED_ARRAY_INDEX(MM_SWAPENTS),
 +      NAMED_ARRAY_INDEX(MM_SHMEMPAGES),
 +};
 +
  DEFINE_PER_CPU(unsigned long, process_counts) = 0;
  
  __cacheline_aligned DEFINE_RWLOCK(tasklist_lock);  /* outer */
@@@ -654,15 -645,12 +654,15 @@@ static void check_mm(struct mm_struct *
  {
        int i;
  
 +      BUILD_BUG_ON_MSG(ARRAY_SIZE(resident_page_types) != NR_MM_COUNTERS,
 +                       "Please make sure 'struct resident_page_types[]' is updated as well");
 +
        for (i = 0; i < NR_MM_COUNTERS; i++) {
                long x = atomic_long_read(&mm->rss_stat.count[i]);
  
                if (unlikely(x))
 -                      printk(KERN_ALERT "BUG: Bad rss-counter state "
 -                                        "mm:%p idx:%d val:%ld\n", mm, i, x);
 +                      pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n",
 +                               mm, resident_page_types[i], x);
        }
  
        if (mm_pgtables_bytes(mm))
@@@ -915,10 -903,12 +915,12 @@@ static struct task_struct *dup_task_str
                tsk->cpus_ptr = &tsk->cpus_mask;
  
        /*
-        * One for us, one for whoever does the "release_task()" (usually
-        * parent)
+        * One for the user space visible state that goes away when reaped.
+        * One for the scheduler.
         */
-       refcount_set(&tsk->usage, 2);
+       refcount_set(&tsk->rcu_users, 2);
+       /* One for the rcu users */
+       refcount_set(&tsk->usage, 1);
  #ifdef CONFIG_BLK_DEV_IO_TRACE
        tsk->btrace_seq = 0;
  #endif
@@@ -1021,6 -1011,7 +1023,6 @@@ static struct mm_struct *mm_init(struc
        mm_init_owner(mm, p);
        RCU_INIT_POINTER(mm->exe_file, NULL);
        mmu_notifier_mm_init(mm);
 -      hmm_mm_init(mm);
        init_tlb_flush_pending(mm);
  #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
        mm->pmd_huge_pte = NULL;
@@@ -1530,17 -1521,28 +1532,17 @@@ void __cleanup_sighand(struct sighand_s
        }
  }
  
 -#ifdef CONFIG_POSIX_TIMERS
  /*
   * Initialize POSIX timer handling for a thread group.
   */
  static void posix_cpu_timers_init_group(struct signal_struct *sig)
  {
 +      struct posix_cputimers *pct = &sig->posix_cputimers;
        unsigned long cpu_limit;
  
        cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
 -      if (cpu_limit != RLIM_INFINITY) {
 -              sig->cputime_expires.prof_exp = cpu_limit * NSEC_PER_SEC;
 -              sig->cputimer.running = true;
 -      }
 -
 -      /* The timer lists. */
 -      INIT_LIST_HEAD(&sig->cpu_timers[0]);
 -      INIT_LIST_HEAD(&sig->cpu_timers[1]);
 -      INIT_LIST_HEAD(&sig->cpu_timers[2]);
 +      posix_cputimers_group_init(pct, cpu_limit);
  }
 -#else
 -static inline void posix_cpu_timers_init_group(struct signal_struct *sig) { }
 -#endif
  
  static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
  {
@@@ -1642,6 -1644,23 +1644,6 @@@ static void rt_mutex_init_task(struct t
  #endif
  }
  
 -#ifdef CONFIG_POSIX_TIMERS
 -/*
 - * Initialize POSIX timer handling for a single task.
 - */
 -static void posix_cpu_timers_init(struct task_struct *tsk)
 -{
 -      tsk->cputime_expires.prof_exp = 0;
 -      tsk->cputime_expires.virt_exp = 0;
 -      tsk->cputime_expires.sched_exp = 0;
 -      INIT_LIST_HEAD(&tsk->cpu_timers[0]);
 -      INIT_LIST_HEAD(&tsk->cpu_timers[1]);
 -      INIT_LIST_HEAD(&tsk->cpu_timers[2]);
 -}
 -#else
 -static inline void posix_cpu_timers_init(struct task_struct *tsk) { }
 -#endif
 -
  static inline void init_task_pid_links(struct task_struct *task)
  {
        enum pid_type type;
@@@ -1928,7 -1947,7 +1930,7 @@@ static __latent_entropy struct task_str
        task_io_accounting_init(&p->ioac);
        acct_clear_integrals(p);
  
 -      posix_cpu_timers_init(p);
 +      posix_cputimers_init(&p->posix_cputimers);
  
        p->io_context = NULL;
        audit_set_context(p, NULL);
diff --combined kernel/sched/core.c
index f9a1346a5fa9502be6ca45ecb1bd822bf706725e,1b61cf48eee89eba7386916afbb4c85b962fea49..7880f4f64d0eea19dab03a0408625a505b4454ed
@@@ -255,7 -255,7 +255,7 @@@ static void __hrtick_restart(struct rq 
  {
        struct hrtimer *timer = &rq->hrtick_timer;
  
 -      hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
 +      hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD);
  }
  
  /*
@@@ -314,7 -314,7 +314,7 @@@ void hrtick_start(struct rq *rq, u64 de
         */
        delay = max_t(u64, delay, 10000LL);
        hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
 -                    HRTIMER_MODE_REL_PINNED);
 +                    HRTIMER_MODE_REL_PINNED_HARD);
  }
  #endif /* CONFIG_SMP */
  
@@@ -328,7 -328,7 +328,7 @@@ static void hrtick_rq_init(struct rq *r
        rq->hrtick_csd.info = rq;
  #endif
  
 -      hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 +      hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
        rq->hrtick_timer.function = hrtick;
  }
  #else /* CONFIG_SCHED_HRTICK */
@@@ -1656,7 -1656,8 +1656,8 @@@ static int __set_cpus_allowed_ptr(struc
        if (cpumask_equal(p->cpus_ptr, new_mask))
                goto out;
  
-       if (!cpumask_intersects(new_mask, cpu_valid_mask)) {
+       dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
+       if (dest_cpu >= nr_cpu_ids) {
                ret = -EINVAL;
                goto out;
        }
        if (cpumask_test_cpu(task_cpu(p), new_mask))
                goto out;
  
-       dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
        if (task_running(rq, p) || p->state == TASK_WAKING) {
                struct migration_arg arg = { p, dest_cpu };
                /* Need help from migration thread: drop lock and wait. */
@@@ -3254,7 -3254,7 +3254,7 @@@ static struct rq *finish_task_switch(st
                /* Task is done with its stack. */
                put_task_stack(prev);
  
-               put_task_struct(prev);
+               put_task_struct_rcu_user(prev);
        }
  
        tick_nohz_task_switch();
@@@ -3358,15 -3358,15 +3358,15 @@@ context_switch(struct rq *rq, struct ta
                else
                        prev->active_mm = NULL;
        } else {                                        // to user
+               membarrier_switch_mm(rq, prev->active_mm, next->mm);
                /*
                 * sys_membarrier() requires an smp_mb() between setting
-                * rq->curr and returning to userspace.
+                * rq->curr / membarrier_switch_mm() and returning to userspace.
                 *
                 * The below provides this either through switch_mm(), or in
                 * case 'prev->active_mm == next->mm' through
                 * finish_task_switch()'s mmdrop().
                 */
                switch_mm_irqs_off(prev->active_mm, next->mm, next);
  
                if (!prev->mm) {                        // from kernel
@@@ -3871,22 -3871,13 +3871,22 @@@ static noinline void __schedule_bug(str
  /*
   * Various schedule()-time debugging checks and statistics:
   */
 -static inline void schedule_debug(struct task_struct *prev)
 +static inline void schedule_debug(struct task_struct *prev, bool preempt)
  {
  #ifdef CONFIG_SCHED_STACK_END_CHECK
        if (task_stack_end_corrupted(prev))
                panic("corrupted stack end detected inside scheduler\n");
  #endif
  
 +#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
 +      if (!preempt && prev->state && prev->non_block_count) {
 +              printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n",
 +                      prev->comm, prev->pid, prev->non_block_count);
 +              dump_stack();
 +              add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
 +      }
 +#endif
 +
        if (unlikely(in_atomic_preempt_off())) {
                __schedule_bug(prev);
                preempt_count_set(PREEMPT_DISABLED);
@@@ -3998,7 -3989,7 +3998,7 @@@ static void __sched notrace __schedule(
        rq = cpu_rq(cpu);
        prev = rq->curr;
  
 -      schedule_debug(prev);
 +      schedule_debug(prev, preempt);
  
        if (sched_feat(HRTICK))
                hrtick_clear(rq);
  
        if (likely(prev != next)) {
                rq->nr_switches++;
-               rq->curr = next;
+               /*
+                * RCU users of rcu_dereference(rq->curr) may not see
+                * changes to task_struct made by pick_next_task().
+                */
+               RCU_INIT_POINTER(rq->curr, next);
                /*
                 * The membarrier system call requires each architecture
                 * to have a full memory barrier after updating
@@@ -4223,9 -4218,8 +4227,8 @@@ static void __sched notrace preempt_sch
  
  #ifdef CONFIG_PREEMPTION
  /*
-  * this is the entry point to schedule() from in-kernel preemption
-  * off of preempt_enable. Kernel preemptions off return from interrupt
-  * occur there and call schedule directly.
+  * This is the entry point to schedule() from in-kernel preemption
+  * off of preempt_enable.
   */
  asmlinkage __visible void __sched notrace preempt_schedule(void)
  {
@@@ -4296,7 -4290,7 +4299,7 @@@ EXPORT_SYMBOL_GPL(preempt_schedule_notr
  #endif /* CONFIG_PREEMPTION */
  
  /*
-  * this is the entry point to schedule() from kernel preemption
+  * This is the entry point to schedule() from kernel preemption
   * off of irq context.
   * Note, that this is called and return with irqs disabled. This will
   * protect us against recursive calling from irq.
@@@ -6069,7 -6063,8 +6072,8 @@@ void init_idle(struct task_struct *idle
        __set_task_cpu(idle, cpu);
        rcu_read_unlock();
  
-       rq->curr = rq->idle = idle;
+       rq->idle = idle;
+       rcu_assign_pointer(rq->curr, idle);
        idle->on_rq = TASK_ON_RQ_QUEUED;
  #ifdef CONFIG_SMP
        idle->on_cpu = 1;
@@@ -6430,8 -6425,6 +6434,6 @@@ int sched_cpu_activate(unsigned int cpu
        }
        rq_unlock_irqrestore(rq, &rf);
  
-       update_max_interval();
        return 0;
  }
  
@@@ -6772,7 -6765,7 +6774,7 @@@ void ___might_sleep(const char *file, i
        rcu_sleep_check();
  
        if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
 -           !is_idle_task(current)) ||
 +           !is_idle_task(current) && !current->non_block_count) ||
            system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING ||
            oops_in_progress)
                return;
                "BUG: sleeping function called from invalid context at %s:%d\n",
                        file, line);
        printk(KERN_ERR
 -              "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
 -                      in_atomic(), irqs_disabled(),
 +              "in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n",
 +                      in_atomic(), irqs_disabled(), current->non_block_count,
                        current->pid, current->comm);
  
        if (task_stack_end_corrupted(current))