Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel...

author Linus Torvalds <torvalds@linux-foundation.org>

Tue, 17 Sep 2019 00:25:49 +0000 (17:25 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Tue, 17 Sep 2019 00:25:49 +0000 (17:25 -0700)
author Linus Torvalds <torvalds@linux-foundation.org>
Tue, 17 Sep 2019 00:25:49 +0000 (17:25 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Tue, 17 Sep 2019 00:25:49 +0000 (17:25 -0700)
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst

index 3b29005aa9810d98a6604bed10e1d4c98dde6360..5f1c266131b08b7d349b68afe28086b0bed7b005 100644 (file)
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -951,6 +951,13 @@ controller implements weight and absolute bandwidth limit models for
  normal scheduling policy and absolute bandwidth allocation model for
  realtime scheduling policy.
  
+In all the above models, cycles distribution is defined only on a temporal
+base and it does not account for the frequency at which tasks are executed.
+The (optional) utilization clamping support allows hinting the schedutil
+cpufreq governor about the minimum desired frequency, which should always be
+provided by a CPU, as well as the maximum desired frequency, which should not
+be exceeded by a CPU.
+
  WARNING: cgroup2 doesn't yet support control of realtime processes and
  the cpu controller can only be enabled when all RT processes are in
  the root cgroup.  Be aware that system management software may already
@@ -1016,6 +1023,33 @@ All time durations are in microseconds.
         Shows pressure stall information for CPU. See
         Documentation/accounting/psi.rst for details.
  
+  cpu.uclamp.min
+        A read-write single value file which exists on non-root cgroups.
+        The default is "0", i.e. no utilization boosting.
+
+        The requested minimum utilization (protection) as a percentage
+        rational number, e.g. 12.34 for 12.34%.
+
+        This interface allows reading and setting minimum utilization clamp
+        values similar to sched_setattr(2). This minimum utilization
+        value is used to clamp the task specific minimum utilization clamp.
+
+        The requested minimum utilization (protection) is always capped by
+        the current value for the maximum utilization (limit), i.e.
+        `cpu.uclamp.max`.
+
+  cpu.uclamp.max
+        A read-write single value file which exists on non-root cgroups.
+        The default is "max", i.e. no utilization capping.
+
+        The requested maximum utilization (limit) as a percentage rational
+        number, e.g. 98.76 for 98.76%.
+
+        This interface allows reading and setting maximum utilization clamp
+        values similar to sched_setattr(2). This maximum utilization
+        value is used to clamp the task specific maximum utilization clamp.
+
+
  
  Memory
  ------
diff --git a/Documentation/scheduler/sched-bwc.rst b/Documentation/scheduler/sched-bwc.rst

index 3a9064219656cc97a2bc5881ed67e9fdb2ea15b2..9801d6b284b1ecfdd51b7e1ab5ab5f9bc6da3b40 100644 (file)
--- a/Documentation/scheduler/sched-bwc.rst
+++ b/Documentation/scheduler/sched-bwc.rst
@@ -9,15 +9,16 @@ CFS bandwidth control is a CONFIG_FAIR_GROUP_SCHED extension which allows the
  specification of the maximum CPU bandwidth available to a group or hierarchy.
  
  The bandwidth allowed for a group is specified using a quota and period. Within
-each given "period" (microseconds), a group is allowed to consume only up to
-"quota" microseconds of CPU time.  When the CPU bandwidth consumption of a
-group exceeds this limit (for that period), the tasks belonging to its
-hierarchy will be throttled and are not allowed to run again until the next
-period.
-
-A group's unused runtime is globally tracked, being refreshed with quota units
-above at each period boundary.  As threads consume this bandwidth it is
-transferred to cpu-local "silos" on a demand basis.  The amount transferred
+each given "period" (microseconds), a task group is allocated up to "quota"
+microseconds of CPU time. That quota is assigned to per-cpu run queues in
+slices as threads in the cgroup become runnable. Once all quota has been
+assigned any additional requests for quota will result in those threads being
+throttled. Throttled threads will not be able to run again until the next
+period when the quota is replenished.
+
+A group's unassigned quota is globally tracked, being refreshed back to
+cfs_quota units at each period boundary. As threads consume this bandwidth it
+is transferred to cpu-local "silos" on a demand basis. The amount transferred
  within each of these updates is tunable and described as the "slice".
  
  Management
@@ -35,12 +36,12 @@ The default values are::
  
  A value of -1 for cpu.cfs_quota_us indicates that the group does not have any
  bandwidth restriction in place, such a group is described as an unconstrained
-bandwidth group.  This represents the traditional work-conserving behavior for
+bandwidth group. This represents the traditional work-conserving behavior for
  CFS.
  
  Writing any (valid) positive value(s) will enact the specified bandwidth limit.
-The minimum quota allowed for the quota or period is 1ms.  There is also an
-upper bound on the period length of 1s.  Additional restrictions exist when
+The minimum quota allowed for the quota or period is 1ms. There is also an
+upper bound on the period length of 1s. Additional restrictions exist when
  bandwidth limits are used in a hierarchical fashion, these are explained in
  more detail below.
  
@@ -53,8 +54,8 @@ unthrottled if it is in a constrained state.
  System wide settings
  --------------------
  For efficiency run-time is transferred between the global pool and CPU local
-"silos" in a batch fashion.  This greatly reduces global accounting pressure
-on large systems.  The amount transferred each time such an update is required
+"silos" in a batch fashion. This greatly reduces global accounting pressure
+on large systems. The amount transferred each time such an update is required
  is described as the "slice".
  
  This is tunable via procfs::
@@ -97,6 +98,51 @@ There are two ways in which a group may become throttled:
  In case b) above, even though the child may have runtime remaining it will not
  be allowed to until the parent's runtime is refreshed.
  
+CFS Bandwidth Quota Caveats
+---------------------------
+Once a slice is assigned to a cpu it does not expire.  However all but 1ms of
+the slice may be returned to the global pool if all threads on that cpu become
+unrunnable. This is configured at compile time by the min_cfs_rq_runtime
+variable. This is a performance tweak that helps prevent added contention on
+the global lock.
+
+The fact that cpu-local slices do not expire results in some interesting corner
+cases that should be understood.
+
+For cgroup cpu constrained applications that are cpu limited this is a
+relatively moot point because they will naturally consume the entirety of their
+quota as well as the entirety of each cpu-local slice in each period. As a
+result it is expected that nr_periods roughly equal nr_throttled, and that
+cpuacct.usage will increase by roughly cfs_quota_us in each period.
+
+For highly-threaded, non-cpu bound applications this non-expiration nuance
+allows applications to briefly burst past their quota limits by the amount of
+unused slice on each cpu that the task group is running on (typically at most
+1ms per cpu or as defined by min_cfs_rq_runtime).  This slight burst only
+applies if quota had been assigned to a cpu and then not fully used or returned
+in previous periods. This burst amount will not be transferred between cores.
+As a result, this mechanism still strictly limits the task group to quota
+average usage, albeit over a longer time window than a single period.  This
+also limits the burst ability to no more than 1ms per cpu.  This provides a
+better, more predictable user experience for highly threaded applications with
+small quota limits on high core count machines. It also eliminates the
+propensity to throttle these applications while simultaneously using less than
+quota amounts of cpu. Another way to say this is that by allowing the unused
+portion of a slice to remain valid across periods we have decreased the
+possibility of wastefully expiring quota on cpu-local silos that don't need a
+full slice's amount of cpu time.
+
+The interaction between cpu-bound and non-cpu-bound-interactive applications
+should also be considered, especially when single core usage hits 100%. If you
+gave each of these applications half of a cpu-core and they both got scheduled
+on the same CPU it is theoretically possible that the non-cpu bound application
+will use up to 1ms additional quota in some periods, thereby preventing the
+cpu-bound application from fully using its quota by that same amount. In these
+instances it will be up to the CFS algorithm (see sched-design-CFS.rst) to
+decide which application is chosen to run, as they will both be runnable and
+have remaining quota. This runtime discrepancy will be made up in the following
+periods when the interactive application idles.
+
  Examples
  --------
  1. Limit a group to 1 CPU worth of runtime::
diff --git a/MAINTAINERS b/MAINTAINERS

index cbe625343277ea0277e470e413c9cdbf79cabc04..49f75d1b7b51a95d1177f6c207b9ecb0e3a1b8dc 100644 (file)
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -12578,6 +12578,7 @@ PERFORMANCE EVENTS SUBSYSTEM
  M:     Peter Zijlstra <peterz@infradead.org>
  M:     Ingo Molnar <mingo@redhat.com>
  M:     Arnaldo Carvalho de Melo <acme@kernel.org>
+R:     Mark Rutland <mark.rutland@arm.com>
  R:     Alexander Shishkin <alexander.shishkin@linux.intel.com>
  R:     Jiri Olsa <jolsa@redhat.com>
  R:     Namhyung Kim <namhyung@kernel.org>
@@ -14175,6 +14176,12 @@ F:     drivers/watchdog/sc1200wdt.c
  SCHEDULER
  M:     Ingo Molnar <mingo@redhat.com>
  M:     Peter Zijlstra <peterz@infradead.org>
+M:     Juri Lelli <juri.lelli@redhat.com> (SCHED_DEADLINE)
+M:     Vincent Guittot <vincent.guittot@linaro.org> (SCHED_NORMAL)
+R:     Dietmar Eggemann <dietmar.eggemann@arm.com> (SCHED_NORMAL)
+R:     Steven Rostedt <rostedt@goodmis.org> (SCHED_FIFO/SCHED_RR)
+R:     Ben Segall <bsegall@google.com> (CONFIG_CFS_BANDWIDTH)
+R:     Mel Gorman <mgorman@suse.de> (CONFIG_NUMA_BALANCING)
  L:     linux-kernel@vger.kernel.org
  T:     git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched/core
  S:     Maintained
diff --git a/arch/Kconfig b/arch/Kconfig

index 71d9ae0c0ea16ea8990e1a81841d5bb31b77a07d..6baedab10dcaa14130b15907313200a6f04b38b4 100644 (file)
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -106,7 +106,7 @@ config STATIC_KEYS_SELFTEST
  config OPTPROBES
         def_bool y
         depends on KPROBES && HAVE_OPTPROBES
-       select TASKS_RCU if PREEMPT
+       select TASKS_RCU if PREEMPTION
  
  config KPROBES_ON_FTRACE
         def_bool y
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig

index 13d49c232556ce9e3bbdf1862fc3ca388d1b2e6a..9711cf73092948678423b9d474030eecd24d4032 100644 (file)
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -311,6 +311,7 @@ config ARCH_DISCONTIGMEM_DEFAULT
  config NUMA
         bool "NUMA support"
         depends on !FLATMEM
+       select SMP
         help
           Say Y to compile the kernel to support NUMA (Non-Uniform Memory
           Access).  This option is for configuring high-end multiprocessor
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S

index 4f86928246e7a37ada2b324e84cf6d26be70dff2..f83ca5aa8b7794102a9bf23e0eb0123a5d162e82 100644 (file)
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -63,7 +63,7 @@
   * enough to patch inline, increasing performance.
   */
  
-#ifdef CONFIG_PREEMPT
+#ifdef CONFIG_PREEMPTION
  # define preempt_stop(clobbers)        DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF
  #else
  # define preempt_stop(clobbers)
@@ -1084,7 +1084,7 @@ restore_all:
         INTERRUPT_RETURN
  
  restore_all_kernel:
-#ifdef CONFIG_PREEMPT
+#ifdef CONFIG_PREEMPTION
         DISABLE_INTERRUPTS(CLBR_ANY)
         cmpl    $0, PER_CPU_VAR(__preempt_count)
         jnz     .Lno_preempt
@@ -1364,7 +1364,7 @@ ENTRY(xen_hypervisor_callback)
  ENTRY(xen_do_upcall)
  1:     mov     %esp, %eax
         call    xen_evtchn_do_upcall
-#ifndef CONFIG_PREEMPT
+#ifndef CONFIG_PREEMPTION
         call    xen_maybe_preempt_hcall
  #endif
         jmp     ret_from_intr
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S

index be9ca198c581aea7ed29f4417aae9c1c1b835473..af077ded196966256792af01507427fc800cf32e 100644 (file)
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -664,7 +664,7 @@ GLOBAL(swapgs_restore_regs_and_return_to_usermode)
  
  /* Returning to kernel space */
  retint_kernel:
-#ifdef CONFIG_PREEMPT
+#ifdef CONFIG_PREEMPTION
         /* Interrupts are off */
         /* Check if we need preemption */
         btl     $9, EFLAGS(%rsp)                /* were interrupts off? */
@@ -1115,7 +1115,7 @@ ENTRY(xen_do_hypervisor_callback)         /* do_hypervisor_callback(struct *pt_regs) */
         call    xen_evtchn_do_upcall
         LEAVE_IRQ_STACK
  
-#ifndef CONFIG_PREEMPT
+#ifndef CONFIG_PREEMPTION
         call    xen_maybe_preempt_hcall
  #endif
         jmp     error_exit
diff --git a/arch/x86/entry/thunk_32.S b/arch/x86/entry/thunk_32.S

index cb3464525b37ba55e3bdde523d08bd716b3b123b..2713490611a3b5361ee82f9d09555fc9f69953ae 100644 (file)
--- a/arch/x86/entry/thunk_32.S
+++ b/arch/x86/entry/thunk_32.S
@@ -34,7 +34,7 @@
         THUNK trace_hardirqs_off_thunk,trace_hardirqs_off_caller,1
  #endif
  
-#ifdef CONFIG_PREEMPT
+#ifdef CONFIG_PREEMPTION
         THUNK ___preempt_schedule, preempt_schedule
         THUNK ___preempt_schedule_notrace, preempt_schedule_notrace
         EXPORT_SYMBOL(___preempt_schedule)
diff --git a/arch/x86/entry/thunk_64.S b/arch/x86/entry/thunk_64.S

index cc20465b28672d3f8beb10a62cf9fc6fb86c196a..ea5c4167086c20325a5cccd5579e6bb9c152b42c 100644 (file)
--- a/arch/x86/entry/thunk_64.S
+++ b/arch/x86/entry/thunk_64.S
@@ -46,7 +46,7 @@
         THUNK lockdep_sys_exit_thunk,lockdep_sys_exit
  #endif
  
-#ifdef CONFIG_PREEMPT
+#ifdef CONFIG_PREEMPTION
         THUNK ___preempt_schedule, preempt_schedule
         THUNK ___preempt_schedule_notrace, preempt_schedule_notrace
         EXPORT_SYMBOL(___preempt_schedule)
@@ -55,7 +55,7 @@
  
  #if defined(CONFIG_TRACE_IRQFLAGS) \
   || defined(CONFIG_DEBUG_LOCK_ALLOC) \
- || defined(CONFIG_PREEMPT)
+ || defined(CONFIG_PREEMPTION)
  .L_restore:
         popq %r11
         popq %r10
diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h

index 99a7fa9ab0a32a1c24e4b699aa3ad10b4640b9f5..3d4cb83a88284086448f45de9f7be0c0457f78cf 100644 (file)
--- a/arch/x86/include/asm/preempt.h
+++ b/arch/x86/include/asm/preempt.h
@@ -102,7 +102,7 @@ static __always_inline bool should_resched(int preempt_offset)
         return unlikely(raw_cpu_read_4(__preempt_count) == preempt_offset);
  }
  
-#ifdef CONFIG_PREEMPT
+#ifdef CONFIG_PREEMPTION
    extern asmlinkage void ___preempt_schedule(void);
  # define __preempt_schedule() \
         asm volatile ("call ___preempt_schedule" : ASM_CALL_CONSTRAINT)
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c

index 68c363c341bf2a3794ec26f29df4dd28281d4dc5..7d6e0efcc2db3cf909c2fd868311ac496b9a1070 100644 (file)
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -8,6 +8,7 @@
  #include <linux/sched.h>
  #include <linux/sched/clock.h>
  #include <linux/random.h>
+#include <linux/topology.h>
  #include <asm/processor.h>
  #include <asm/apic.h>
  #include <asm/cacheinfo.h>
@@ -889,6 +890,10 @@ static void init_amd_zn(struct cpuinfo_x86 *c)
  {
         set_cpu_cap(c, X86_FEATURE_ZEN);
  
+#ifdef CONFIG_NUMA
+       node_reclaim_distance = 32;
+#endif
+
         /*
          * Fix erratum 1076: CPB feature bit not being set in CPUID.
          * Always set it, except when running under a hypervisor.
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c

index 2b5886401e5f4eb2c221f813675927a53cc97d61..e07424e19274b4125098c3d301fb20b6f54a8399 100644 (file)
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -367,13 +367,18 @@ NOKPROBE_SYMBOL(oops_end);
  
  int __die(const char *str, struct pt_regs *regs, long err)
  {
+       const char *pr = "";
+
         /* Save the regs of the first oops for the executive summary later. */
         if (!die_counter)
                 exec_summary_regs = *regs;
  
+       if (IS_ENABLED(CONFIG_PREEMPTION))
+               pr = IS_ENABLED(CONFIG_PREEMPT_RT) ? " PREEMPT_RT" : " PREEMPT";
+
         printk(KERN_DEFAULT
                "%s: %04lx [#%d]%s%s%s%s%s\n", str, err & 0xffff, ++die_counter,
-              IS_ENABLED(CONFIG_PREEMPT) ? " PREEMPT"         : "",
+              pr,
                IS_ENABLED(CONFIG_SMP)     ? " SMP"             : "",
                debug_pagealloc_enabled()  ? " DEBUG_PAGEALLOC" : "",
                IS_ENABLED(CONFIG_KASAN)   ? " KASAN"           : "",
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c

index 0e0b08008b5abe900515a31d7c4b56df0482d317..43fc13c831af0c77db4ff29bbfa8e4b7c9f8bd48 100644 (file)
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -580,7 +580,7 @@ static void setup_singlestep(struct kprobe *p, struct pt_regs *regs,
         if (setup_detour_execution(p, regs, reenter))
                 return;
  
-#if !defined(CONFIG_PREEMPT)
+#if !defined(CONFIG_PREEMPTION)
         if (p->ainsn.boostable && !p->post_handler) {
                 /* Boost up -- we can execute copied instructions directly */
                 if (!reenter)
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c

index 4ab377c9fffede8af8c93b620bdb9d90803fd353..4cc967178bf952ca32b01059505b588c3439db1d 100644 (file)
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -311,7 +311,7 @@ static void kvm_guest_cpu_init(void)
         if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) {
                 u64 pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason));
  
-#ifdef CONFIG_PREEMPT
+#ifdef CONFIG_PREEMPTION
                 pa |= KVM_ASYNC_PF_SEND_ALWAYS;
  #endif
                 pa |= KVM_ASYNC_PF_ENABLED;
diff --git a/include/asm-generic/preempt.h b/include/asm-generic/preempt.h

index c3046c9200630b15f9ced04213c8415258dd1c81..d683f5e6d7913be7746e375908265b827d1b74c2 100644 (file)
--- a/include/asm-generic/preempt.h
+++ b/include/asm-generic/preempt.h
@@ -78,11 +78,11 @@ static __always_inline bool should_resched(int preempt_offset)
                         tif_need_resched());
  }
  
-#ifdef CONFIG_PREEMPT
+#ifdef CONFIG_PREEMPTION
  extern asmlinkage void preempt_schedule(void);
  #define __preempt_schedule() preempt_schedule()
  extern asmlinkage void preempt_schedule_notrace(void);
  #define __preempt_schedule_notrace() preempt_schedule_notrace()
-#endif /* CONFIG_PREEMPT */
+#endif /* CONFIG_PREEMPTION */
  
  #endif /* __ASM_PREEMPT_H */
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h

index f6b048902d6c5337a05b0ef96a955f768483808e..3ba3e6da13a6fb350d93917f57066bc552c38c22 100644 (file)
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -150,6 +150,7 @@ struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset,
  struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
                                         struct cgroup_subsys_state **dst_cssp);
  
+void cgroup_enable_task_cg_lists(void);
  void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
                          struct css_task_iter *it);
  struct task_struct *css_task_iter_next(struct css_task_iter *it);
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h

index 934633a05d209138a815af3e0d073ae7a5ec1954..04c20de66afc2136e5bf3251b4b1780e30a1b4be 100644 (file)
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -40,14 +40,14 @@ static inline bool cpusets_enabled(void)
  
  static inline void cpuset_inc(void)
  {
-       static_branch_inc(&cpusets_pre_enable_key);
-       static_branch_inc(&cpusets_enabled_key);
+       static_branch_inc_cpuslocked(&cpusets_pre_enable_key);
+       static_branch_inc_cpuslocked(&cpusets_enabled_key);
  }
  
  static inline void cpuset_dec(void)
  {
-       static_branch_dec(&cpusets_enabled_key);
-       static_branch_dec(&cpusets_pre_enable_key);
+       static_branch_dec_cpuslocked(&cpusets_enabled_key);
+       static_branch_dec_cpuslocked(&cpusets_pre_enable_key);
  }
  
  extern int cpuset_init(void);
@@ -55,6 +55,8 @@ extern void cpuset_init_smp(void);
  extern void cpuset_force_rebuild(void);
  extern void cpuset_update_active_cpus(void);
  extern void cpuset_wait_for_hotplug(void);
+extern void cpuset_read_lock(void);
+extern void cpuset_read_unlock(void);
  extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask);
  extern void cpuset_cpus_allowed_fallback(struct task_struct *p);
  extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
@@ -176,6 +178,9 @@ static inline void cpuset_update_active_cpus(void)
  
  static inline void cpuset_wait_for_hotplug(void) { }
  
+static inline void cpuset_read_lock(void) { }
+static inline void cpuset_read_unlock(void) { }
+
  static inline void cpuset_cpus_allowed(struct task_struct *p,
                                        struct cpumask *mask)
  {
diff --git a/include/linux/preempt.h b/include/linux/preempt.h

index dd92b1a93919fd26af81eb1f0cb43dae5f5efd2d..bbb68dba37cc8ed00c6cc94ff835a5519c1c26e9 100644 (file)
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -182,7 +182,7 @@ do { \
  
  #define preemptible()  (preempt_count() == 0 && !irqs_disabled())
  
-#ifdef CONFIG_PREEMPT
+#ifdef CONFIG_PREEMPTION
  #define preempt_enable() \
  do { \
         barrier(); \
@@ -203,7 +203,7 @@ do { \
                 __preempt_schedule(); \
  } while (0)
  
-#else /* !CONFIG_PREEMPT */
+#else /* !CONFIG_PREEMPTION */
  #define preempt_enable() \
  do { \
         barrier(); \
@@ -217,7 +217,7 @@ do { \
  } while (0)
  
  #define preempt_check_resched() do { } while (0)
-#endif /* CONFIG_PREEMPT */
+#endif /* CONFIG_PREEMPTION */
  
  #define preempt_disable_notrace() \
  do { \
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h

index 80d6056f58556590af7adb5978daf40692cca87e..75a2eded7aa2ce6973622ecfd5a2a00772f07270 100644 (file)
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -585,7 +585,7 @@ do {                                                                              \
   *
   * In non-preemptible RCU implementations (TREE_RCU and TINY_RCU),
   * it is illegal to block while in an RCU read-side critical section.
- * In preemptible RCU implementations (PREEMPT_RCU) in CONFIG_PREEMPT
+ * In preemptible RCU implementations (PREEMPT_RCU) in CONFIG_PREEMPTION
   * kernel builds, RCU read-side critical sections may be preempted,
   * but explicit blocking is illegal.  Finally, in preemptible RCU
   * implementations in real-time (with -rt patchset) kernel builds, RCU
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h

index 735601ac27d3950f8118f200c3e8125bef1fbd80..18b1ed9864b02c8909097fceab9317dc4e97650f 100644 (file)
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -53,7 +53,7 @@ void rcu_scheduler_starting(void);
  extern int rcu_scheduler_active __read_mostly;
  void rcu_end_inkernel_boot(void);
  bool rcu_is_watching(void);
-#ifndef CONFIG_PREEMPT
+#ifndef CONFIG_PREEMPTION
  void rcu_all_qs(void);
  #endif
  
diff --git a/include/linux/sched.h b/include/linux/sched.h

index 9f51932bd543f68e0d18d93dfe7c1bdaa047e616..f0edee94834a8262db98f3abe61721121ff1c625 100644 (file)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -295,6 +295,11 @@ enum uclamp_id {
         UCLAMP_CNT
  };
  
+#ifdef CONFIG_SMP
+extern struct root_domain def_root_domain;
+extern struct mutex sched_domains_mutex;
+#endif
+
  struct sched_info {
  #ifdef CONFIG_SCHED_INFO
         /* Cumulative counters: */
@@ -1767,7 +1772,7 @@ static inline int test_tsk_need_resched(struct task_struct *tsk)
   * value indicates whether a reschedule was done in fact.
   * cond_resched_lock() will drop the spinlock before scheduling,
   */
-#ifndef CONFIG_PREEMPT
+#ifndef CONFIG_PREEMPTION
  extern int _cond_resched(void);
  #else
  static inline int _cond_resched(void) { return 0; }
@@ -1796,12 +1801,12 @@ static inline void cond_resched_rcu(void)
  
  /*
   * Does a critical section need to be broken due to another
- * task waiting?: (technically does not depend on CONFIG_PREEMPT,
+ * task waiting?: (technically does not depend on CONFIG_PREEMPTION,
   * but a general need for low latency)
   */
  static inline int spin_needbreak(spinlock_t *lock)
  {
-#ifdef CONFIG_PREEMPT
+#ifdef CONFIG_PREEMPTION
         return spin_is_contended(lock);
  #else
         return 0;
diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h

index 0cb034331cbb80e592da0d52eec8d9ae63d2e6ba..1aff00b65f3cb92c145d764e27eb81329cddeb50 100644 (file)
--- a/include/linux/sched/deadline.h
+++ b/include/linux/sched/deadline.h
@@ -24,3 +24,11 @@ static inline bool dl_time_before(u64 a, u64 b)
  {
         return (s64)(a - b) < 0;
  }
+
+#ifdef CONFIG_SMP
+
+struct root_domain;
+extern void dl_add_task_root_domain(struct task_struct *p);
+extern void dl_clear_root_domain(struct root_domain *rd);
+
+#endif /* CONFIG_SMP */
diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h

index 0497091e40c12326078554c5dda0155193075a9b..3d90ed8f75f0992e98d9169f087c74b8fd5a4d74 100644 (file)
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -105,7 +105,11 @@ extern void sched_exec(void);
  #define sched_exec()   {}
  #endif
  
-#define get_task_struct(tsk) do { refcount_inc(&(tsk)->usage); } while(0)
+static inline struct task_struct *get_task_struct(struct task_struct *t)
+{
+       refcount_inc(&t->usage);
+       return t;
+}
  
  extern void __put_task_struct(struct task_struct *t);
  
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h

index 7863bb62d2ab1543f3bea336bcb3e156c9414afa..f341163fedc90e46fbeac07430472f5d04d5da50 100644 (file)
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -150,6 +150,10 @@ static inline struct cpumask *sched_domain_span(struct sched_domain *sd)
         return to_cpumask(sd->span);
  }
  
+extern void partition_sched_domains_locked(int ndoms_new,
+                                          cpumask_var_t doms_new[],
+                                          struct sched_domain_attr *dattr_new);
+
  extern void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
                                     struct sched_domain_attr *dattr_new);
  
@@ -194,6 +198,12 @@ extern void set_sched_topology(struct sched_domain_topology_level *tl);
  
  struct sched_domain_attr;
  
+static inline void
+partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[],
+                              struct sched_domain_attr *dattr_new)
+{
+}
+
  static inline void
  partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
                         struct sched_domain_attr *dattr_new)
diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h

index ed7c4d6b8235f6c50e1847a7259cabdde5247b97..031ce8617df8fff0eb06caf65df251297b5c6862 100644 (file)
--- a/include/linux/spinlock.h
+++ b/include/linux/spinlock.h
@@ -214,7 +214,7 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock)
  
  /*
   * Define the various spin_lock methods.  Note we define these
- * regardless of whether CONFIG_SMP or CONFIG_PREEMPT are set. The
+ * regardless of whether CONFIG_SMP or CONFIG_PREEMPTION are set. The
   * various methods are defined as nops in the case they are not
   * required.
   */
diff --git a/include/linux/spinlock_api_smp.h b/include/linux/spinlock_api_smp.h

index 42dfab89e740aeb08de1896e491607789eacda4b..b762eaba4cdf47d2b6dfffa1b2708dc9c28d0e52 100644 (file)
--- a/include/linux/spinlock_api_smp.h
+++ b/include/linux/spinlock_api_smp.h
@@ -96,7 +96,7 @@ static inline int __raw_spin_trylock(raw_spinlock_t *lock)
  
  /*
   * If lockdep is enabled then we use the non-preemption spin-ops
- * even on CONFIG_PREEMPT, because lockdep assumes that interrupts are
+ * even on CONFIG_PREEMPTION, because lockdep assumes that interrupts are
   * not re-enabled during lock-acquire (which the preempt-spin-ops do):
   */
  #if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC)
diff --git a/include/linux/topology.h b/include/linux/topology.h

index 2a19d196af2886899a552dda7c68edaf1015dc66..eb2fe6edd73c80ad16ddad96fd9c10777b72d051 100644 (file)
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -60,6 +60,20 @@ int arch_update_cpu_topology(void);
   */
  #define RECLAIM_DISTANCE 30
  #endif
+
+/*
+ * The following tunable allows platforms to override the default node
+ * reclaim distance (RECLAIM_DISTANCE) if remote memory accesses are
+ * sufficiently fast that the default value actually hurts
+ * performance.
+ *
+ * AMD EPYC machines use this because even though the 2-hop distance
+ * is 32 (3.2x slower than a local memory access) performance actually
+ * *improves* if allowed to reclaim memory and load balance tasks
+ * between NUMA nodes 2-hops apart.
+ */
+extern int __read_mostly node_reclaim_distance;
+
  #ifndef PENALTY_FOR_NODE_WITH_CPUS
  #define PENALTY_FOR_NODE_WITH_CPUS     (1)
  #endif
diff --git a/include/linux/torture.h b/include/linux/torture.h

index a620118385bb1a95a137618fb3cd5345979879fe..6241f59e2d6fe19ca60b329c9eed162ed3de2cd7 100644 (file)
--- a/include/linux/torture.h
+++ b/include/linux/torture.h
@@ -86,7 +86,7 @@ void _torture_stop_kthread(char *m, struct task_struct **tp);
  #define torture_stop_kthread(n, tp) \
         _torture_stop_kthread("Stopping " #n " task", &(tp))
  
-#ifdef CONFIG_PREEMPT
+#ifdef CONFIG_PREEMPTION
  #define torture_preempt_schedule() preempt_schedule()
  #else
  #define torture_preempt_schedule()
diff --git a/init/Kconfig b/init/Kconfig

index d96127ebc44e08526f0be1586098d2ecd52e7104..ec1021fd33712afdc98b5ff454518320d858d8f5 100644 (file)
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -931,6 +931,28 @@ config RT_GROUP_SCHED
  
  endif #CGROUP_SCHED
  
+config UCLAMP_TASK_GROUP
+       bool "Utilization clamping per group of tasks"
+       depends on CGROUP_SCHED
+       depends on UCLAMP_TASK
+       default n
+       help
+         This feature enables the scheduler to track the clamped utilization
+         of each CPU based on RUNNABLE tasks currently scheduled on that CPU.
+
+         When this option is enabled, the user can specify a min and max
+         CPU bandwidth which is allowed for each single task in a group.
+         The max bandwidth allows clamping the maximum frequency a task
+         can use, while the min bandwidth allows defining a minimum
+         frequency a task will always use.
+
+         When task group based utilization clamping is enabled, any
+         task-specific clamp value that is set is constrained by the cgroup
+         specified clamp value. Both minimum and maximum task clamping cannot
+         be bigger than the corresponding clamping defined at task group level.
+
+         If in doubt, say N.
+
  config CGROUP_PIDS
         bool "PIDs controller"
         help
diff --git a/init/init_task.c b/init/init_task.c

index 7ab773b9b3cd566c504e84d6fc06a0d869705145..bfe06c53b14e06674a5fddde3944a9fdd9b96c35 100644 (file)
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -174,7 +174,7 @@ struct task_struct init_task
  #ifdef CONFIG_FUNCTION_GRAPH_TRACER
         .ret_stack      = NULL,
  #endif
-#if defined(CONFIG_TRACING) && defined(CONFIG_PREEMPT)
+#if defined(CONFIG_TRACING) && defined(CONFIG_PREEMPTION)
         .trace_recursion = 0,
  #endif
  #ifdef CONFIG_LIVEPATCH
diff --git a/init/main.c b/init/main.c

index 96f8d5af52d61a877d661bfd15ca8fc42e7296ae..653693da8da61529678ffdb6376dc2b5cc3df2ca 100644 (file)
--- a/init/main.c
+++ b/init/main.c
@@ -433,7 +433,7 @@ noinline void __ref rest_init(void)
  
         /*
          * Enable might_sleep() and smp_processor_id() checks.
-        * They cannot be enabled earlier because with CONFIG_PREEMPT=y
+        * They cannot be enabled earlier because with CONFIG_PREEMPTION=y
          * kernel_thread() would trigger might_sleep() splats. With
          * CONFIG_PREEMPT_VOLUNTARY=y the init task might have scheduled
          * already, but it's stuck on the kthreadd_done completion.
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c

index 8be1da1ebd9a4f3d4ee3f6038a85e18e8d5fa685..a7ce73a2c40198e8cdd53df154db0d16af89daaa 100644 (file)
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1891,7 +1891,7 @@ static int cgroup_reconfigure(struct fs_context *fc)
   */
  static bool use_task_css_set_links __read_mostly;
  
-static void cgroup_enable_task_cg_lists(void)
+void cgroup_enable_task_cg_lists(void)
  {
         struct task_struct *p, *g;
  
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c

index 5aa37531ce76fd3a02896dafa6550d0233c19822..c52bc91f882b29e44ea8eb56fd0ba9889c6a86df 100644 (file)
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -45,6 +45,7 @@
  #include <linux/proc_fs.h>
  #include <linux/rcupdate.h>
  #include <linux/sched.h>
+#include <linux/sched/deadline.h>
  #include <linux/sched/mm.h>
  #include <linux/sched/task.h>
  #include <linux/seq_file.h>
@@ -332,7 +333,18 @@ static struct cpuset top_cpuset = {
   * guidelines for accessing subsystem state in kernel/cgroup.c
   */
  
-static DEFINE_MUTEX(cpuset_mutex);
+DEFINE_STATIC_PERCPU_RWSEM(cpuset_rwsem);
+
+void cpuset_read_lock(void)
+{
+       percpu_down_read(&cpuset_rwsem);
+}
+
+void cpuset_read_unlock(void)
+{
+       percpu_up_read(&cpuset_rwsem);
+}
+
  static DEFINE_SPINLOCK(callback_lock);
  
  static struct workqueue_struct *cpuset_migrate_mm_wq;
@@ -894,6 +906,67 @@ done:
         return ndoms;
  }
  
+/* Pass every task attached to @cs to dl_add_task_root_domain(). */
+static void update_tasks_root_domain(struct cpuset *cs)
+{
+       struct task_struct *p;
+       struct css_task_iter iter;
+
+       css_task_iter_start(&cs->css, 0, &iter);
+       for (p = css_task_iter_next(&iter); p; p = css_task_iter_next(&iter))
+               dl_add_task_root_domain(p);
+
+       css_task_iter_end(&iter);
+}
+
+static void rebuild_root_domains(void)
+{
+       struct cpuset *cs = NULL;
+       struct cgroup_subsys_state *pos_css;
+       /* Locks the caller must already hold. */
+       percpu_rwsem_assert_held(&cpuset_rwsem);
+       lockdep_assert_cpus_held();
+       lockdep_assert_held(&sched_domains_mutex);
+
+       cgroup_enable_task_cg_lists();
+
+       rcu_read_lock();
+
+       /*
+        * Clear default root domain DL accounting, it will be computed again
+        * if a task belongs to it.
+        */
+       dl_clear_root_domain(&def_root_domain);
+
+       cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
+               /* Empty cpuset: skip ahead to its rightmost descendant. */
+               if (cpumask_empty(cs->effective_cpus)) {
+                       pos_css = css_rightmost_descendant(pos_css);
+                       continue;
+               }
+               /* Hold a css reference across the RCU-unlocked task walk. */
+               css_get(&cs->css);
+
+               rcu_read_unlock();
+
+               update_tasks_root_domain(cs);
+
+               rcu_read_lock();
+               css_put(&cs->css);
+       }
+       rcu_read_unlock();
+}
+
+static void
+partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
+                                   struct sched_domain_attr *dattr_new)
+{
+       mutex_lock(&sched_domains_mutex);
+       partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
+       rebuild_root_domains();
+       mutex_unlock(&sched_domains_mutex);
+}
+
  /*
   * Rebuild scheduler domains.
   *
@@ -911,8 +984,8 @@ static void rebuild_sched_domains_locked(void)
         cpumask_var_t *doms;
         int ndoms;
  
-       lockdep_assert_held(&cpuset_mutex);
-       get_online_cpus();
+       lockdep_assert_cpus_held();
+       percpu_rwsem_assert_held(&cpuset_rwsem);
  
         /*
          * We have raced with CPU hotplug. Don't do anything to avoid
@@ -921,19 +994,17 @@ static void rebuild_sched_domains_locked(void)
          */
         if (!top_cpuset.nr_subparts_cpus &&
             !cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
-               goto out;
+               return;
  
         if (top_cpuset.nr_subparts_cpus &&
            !cpumask_subset(top_cpuset.effective_cpus, cpu_active_mask))
-               goto out;
+               return;
  
         /* Generate domain masks and attrs */
         ndoms = generate_sched_domains(&doms, &attr);
  
         /* Have scheduler rebuild the domains */
-       partition_sched_domains(ndoms, doms, attr);
-out:
-       put_online_cpus();
+       partition_and_rebuild_sched_domains(ndoms, doms, attr);
  }
  #else /* !CONFIG_SMP */
  static void rebuild_sched_domains_locked(void)
@@ -943,9 +1014,11 @@ static void rebuild_sched_domains_locked(void)
  
  void rebuild_sched_domains(void)
  {
-       mutex_lock(&cpuset_mutex);
+       get_online_cpus();
+       percpu_down_write(&cpuset_rwsem);
         rebuild_sched_domains_locked();
-       mutex_unlock(&cpuset_mutex);
+       percpu_up_write(&cpuset_rwsem);
+       put_online_cpus();
  }
  
  /**
@@ -1051,7 +1124,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
         int deleting;   /* Moving cpus from subparts_cpus to effective_cpus */
         bool part_error = false;        /* Partition error? */
  
-       lockdep_assert_held(&cpuset_mutex);
+       percpu_rwsem_assert_held(&cpuset_rwsem);
  
         /*
          * The parent must be a partition root.
@@ -2039,7 +2112,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
         cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css));
         cs = css_cs(css);
  
-       mutex_lock(&cpuset_mutex);
+       percpu_down_write(&cpuset_rwsem);
  
         /* allow moving tasks into an empty cpuset if on default hierarchy */
         ret = -ENOSPC;
@@ -2063,7 +2136,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
         cs->attach_in_progress++;
         ret = 0;
  out_unlock:
-       mutex_unlock(&cpuset_mutex);
+       percpu_up_write(&cpuset_rwsem);
         return ret;
  }
  
@@ -2073,9 +2146,9 @@ static void cpuset_cancel_attach(struct cgroup_taskset *tset)
  
         cgroup_taskset_first(tset, &css);
  
-       mutex_lock(&cpuset_mutex);
+       percpu_down_write(&cpuset_rwsem);
         css_cs(css)->attach_in_progress--;
-       mutex_unlock(&cpuset_mutex);
+       percpu_up_write(&cpuset_rwsem);
  }
  
  /*
@@ -2098,7 +2171,7 @@ static void cpuset_attach(struct cgroup_taskset *tset)
         cgroup_taskset_first(tset, &css);
         cs = css_cs(css);
  
-       mutex_lock(&cpuset_mutex);
+       percpu_down_write(&cpuset_rwsem);
  
         /* prepare for attach */
         if (cs == &top_cpuset)
@@ -2152,7 +2225,7 @@ static void cpuset_attach(struct cgroup_taskset *tset)
         if (!cs->attach_in_progress)
                 wake_up(&cpuset_attach_wq);
  
-       mutex_unlock(&cpuset_mutex);
+       percpu_up_write(&cpuset_rwsem);
  }
  
  /* The various types of files and directories in a cpuset file system */
@@ -2183,7 +2256,8 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
         cpuset_filetype_t type = cft->private;
         int retval = 0;
  
-       mutex_lock(&cpuset_mutex);
+       get_online_cpus();
+       percpu_down_write(&cpuset_rwsem);
         if (!is_cpuset_online(cs)) {
                 retval = -ENODEV;
                 goto out_unlock;
@@ -2219,7 +2293,8 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
                 break;
         }
  out_unlock:
-       mutex_unlock(&cpuset_mutex);
+       percpu_up_write(&cpuset_rwsem);
+       put_online_cpus();
         return retval;
  }
  
@@ -2230,7 +2305,8 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
         cpuset_filetype_t type = cft->private;
         int retval = -ENODEV;
  
-       mutex_lock(&cpuset_mutex);
+       get_online_cpus();
+       percpu_down_write(&cpuset_rwsem);
         if (!is_cpuset_online(cs))
                 goto out_unlock;
  
@@ -2243,7 +2319,8 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
                 break;
         }
  out_unlock:
-       mutex_unlock(&cpuset_mutex);
+       percpu_up_write(&cpuset_rwsem);
+       put_online_cpus();
         return retval;
  }
  
@@ -2282,7 +2359,8 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
         kernfs_break_active_protection(of->kn);
         flush_work(&cpuset_hotplug_work);
  
-       mutex_lock(&cpuset_mutex);
+       get_online_cpus();
+       percpu_down_write(&cpuset_rwsem);
         if (!is_cpuset_online(cs))
                 goto out_unlock;
  
@@ -2306,7 +2384,8 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
  
         free_cpuset(trialcs);
  out_unlock:
-       mutex_unlock(&cpuset_mutex);
+       percpu_up_write(&cpuset_rwsem);
+       put_online_cpus();
         kernfs_unbreak_active_protection(of->kn);
         css_put(&cs->css);
         flush_workqueue(cpuset_migrate_mm_wq);
@@ -2437,13 +2516,15 @@ static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf,
                 return -EINVAL;
  
         css_get(&cs->css);
-       mutex_lock(&cpuset_mutex);
+       get_online_cpus();
+       percpu_down_write(&cpuset_rwsem);
         if (!is_cpuset_online(cs))
                 goto out_unlock;
  
         retval = update_prstate(cs, val);
  out_unlock:
-       mutex_unlock(&cpuset_mutex);
+       percpu_up_write(&cpuset_rwsem);
+       put_online_cpus();
         css_put(&cs->css);
         return retval ?: nbytes;
  }
@@ -2649,7 +2730,8 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
         if (!parent)
                 return 0;
  
-       mutex_lock(&cpuset_mutex);
+       get_online_cpus();
+       percpu_down_write(&cpuset_rwsem);
  
         set_bit(CS_ONLINE, &cs->flags);
         if (is_spread_page(parent))
@@ -2700,7 +2782,8 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
         cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
         spin_unlock_irq(&callback_lock);
  out_unlock:
-       mutex_unlock(&cpuset_mutex);
+       percpu_up_write(&cpuset_rwsem);
+       put_online_cpus();
         return 0;
  }
  
@@ -2719,7 +2802,8 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
  {
         struct cpuset *cs = css_cs(css);
  
-       mutex_lock(&cpuset_mutex);
+       get_online_cpus();
+       percpu_down_write(&cpuset_rwsem);
  
         if (is_partition_root(cs))
                 update_prstate(cs, 0);
@@ -2738,7 +2822,8 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
         cpuset_dec();
         clear_bit(CS_ONLINE, &cs->flags);
  
-       mutex_unlock(&cpuset_mutex);
+       percpu_up_write(&cpuset_rwsem);
+       put_online_cpus();
  }
  
  static void cpuset_css_free(struct cgroup_subsys_state *css)
@@ -2750,7 +2835,7 @@ static void cpuset_css_free(struct cgroup_subsys_state *css)
  
  static void cpuset_bind(struct cgroup_subsys_state *root_css)
  {
-       mutex_lock(&cpuset_mutex);
+       percpu_down_write(&cpuset_rwsem);
         spin_lock_irq(&callback_lock);
  
         if (is_in_v2_mode()) {
@@ -2763,7 +2848,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
         }
  
         spin_unlock_irq(&callback_lock);
-       mutex_unlock(&cpuset_mutex);
+       percpu_up_write(&cpuset_rwsem);
  }
  
  /*
@@ -2805,6 +2890,8 @@ struct cgroup_subsys cpuset_cgrp_subsys = {
  
  int __init cpuset_init(void)
  {
+       BUG_ON(percpu_init_rwsem(&cpuset_rwsem));
+
         BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL));
         BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL));
         BUG_ON(!zalloc_cpumask_var(&top_cpuset.subparts_cpus, GFP_KERNEL));
@@ -2876,7 +2963,7 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
         is_empty = cpumask_empty(cs->cpus_allowed) ||
                    nodes_empty(cs->mems_allowed);
  
-       mutex_unlock(&cpuset_mutex);
+       percpu_up_write(&cpuset_rwsem);
  
         /*
          * Move tasks to the nearest ancestor with execution resources,
@@ -2886,7 +2973,7 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
         if (is_empty)
                 remove_tasks_in_empty_cpuset(cs);
  
-       mutex_lock(&cpuset_mutex);
+       percpu_down_write(&cpuset_rwsem);
  }
  
  static void
@@ -2936,14 +3023,14 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
  retry:
         wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
  
-       mutex_lock(&cpuset_mutex);
+       percpu_down_write(&cpuset_rwsem);
  
         /*
          * We have raced with task attaching. We wait until attaching
          * is finished, so we won't attach a task to an empty cpuset.
          */
         if (cs->attach_in_progress) {
-               mutex_unlock(&cpuset_mutex);
+               percpu_up_write(&cpuset_rwsem);
                 goto retry;
         }
  
@@ -3011,7 +3098,7 @@ update_tasks:
                 hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems,
                                             cpus_updated, mems_updated);
  
-       mutex_unlock(&cpuset_mutex);
+       percpu_up_write(&cpuset_rwsem);
  }
  
  /**
@@ -3041,7 +3128,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
         if (on_dfl && !alloc_cpumasks(NULL, &tmp))
                 ptmp = &tmp;
  
-       mutex_lock(&cpuset_mutex);
+       percpu_down_write(&cpuset_rwsem);
  
         /* fetch the available cpus/mems and find out which changed how */
         cpumask_copy(&new_cpus, cpu_active_mask);
@@ -3091,7 +3178,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
                 update_tasks_nodemask(&top_cpuset);
         }
  
-       mutex_unlock(&cpuset_mutex);
+       percpu_up_write(&cpuset_rwsem);
  
         /* if cpus or mems changed, we need to propagate to descendants */
         if (cpus_updated || mems_updated) {
diff --git a/kernel/events/core.c b/kernel/events/core.c

index 2aad959e6def727accc954163a3d8f532e3fd83b..1c414b8866b454aed555aafdf34e823256f0c8ba 100644 (file)
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4174,10 +4174,8 @@ alloc_perf_context(struct pmu *pmu, struct task_struct *task)
                 return NULL;
  
         __perf_event_init_context(ctx);
-       if (task) {
-               ctx->task = task;
-               get_task_struct(task);
-       }
+       if (task)
+               ctx->task = get_task_struct(task);
         ctx->pmu = pmu;
  
         return ctx;
@@ -10440,8 +10438,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
                  * and we cannot use the ctx information because we need the
                  * pmu before we get a ctx.
                  */
-               get_task_struct(task);
-               event->hw.target = task;
+               event->hw.target = get_task_struct(task);
         }
  
         event->clock = &local_clock;
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c

index e8f7f179bf77e6a721deaaf349462edbc14ef8f3..9d50fbe5531a35a433d415bee40d391373890100 100644 (file)
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1255,8 +1255,7 @@ setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary)
          * the thread dies to avoid that the interrupt code
          * references an already freed task_struct.
          */
-       get_task_struct(t);
-       new->thread = t;
+       new->thread = get_task_struct(t);
         /*
          * Tell the thread to set its affinity. This is
          * important for shared interrupt handlers as we do
diff --git a/kernel/kprobes.c b/kernel/kprobes.c

index ebe8315a756a2593f0e9bab3f37efe7885a347ed..1b66ccbb744a6a991dbaa8670f0bf26ff826837a 100644 (file)
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1907,7 +1907,7 @@ int register_kretprobe(struct kretprobe *rp)
  
         /* Pre-allocate memory for max kretprobe instances */
         if (rp->maxactive <= 0) {
-#ifdef CONFIG_PREEMPT
+#ifdef CONFIG_PREEMPTION
                 rp->maxactive = max_t(unsigned int, 10, 2*num_possible_cpus());
  #else
                 rp->maxactive = num_possible_cpus();
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c

index fa83d36e30c661e8d9200d236dab9f8fc3501929..2874bf55616201d74ba1d4c36279d9df621113ff 100644 (file)
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -628,8 +628,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
                 }
  
                 /* [10] Grab the next task, i.e. owner of @lock */
-               task = rt_mutex_owner(lock);
-               get_task_struct(task);
+               task = get_task_struct(rt_mutex_owner(lock));
                 raw_spin_lock(&task->pi_lock);
  
                 /*
@@ -709,8 +708,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
         }
  
         /* [10] Grab the next task, i.e. the owner of @lock */
-       task = rt_mutex_owner(lock);
-       get_task_struct(task);
+       task = get_task_struct(rt_mutex_owner(lock));
         raw_spin_lock(&task->pi_lock);
  
         /* [11] requeue the pi waiters if necessary */
diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig

index 480edf328b51955760b3e74e6124ce5ca7edb7c8..7644eda17d624cae5a5ed836e8efd38aa3e69905 100644 (file)
--- a/kernel/rcu/Kconfig
+++ b/kernel/rcu/Kconfig
@@ -7,7 +7,7 @@ menu "RCU Subsystem"
  
  config TREE_RCU
         bool
-       default y if !PREEMPT && SMP
+       default y if !PREEMPTION && SMP
         help
           This option selects the RCU implementation that is
           designed for very large SMP system with hundreds or
@@ -16,7 +16,7 @@ config TREE_RCU
  
  config PREEMPT_RCU
         bool
-       default y if PREEMPT
+       default y if PREEMPTION
         help
           This option selects the RCU implementation that is
           designed for very large SMP systems with hundreds or
@@ -28,7 +28,7 @@ config PREEMPT_RCU
  
  config TINY_RCU
         bool
-       default y if !PREEMPT && !SMP
+       default y if !PREEMPTION && !SMP
         help
           This option selects the RCU implementation that is
           designed for UP systems from which real-time response
@@ -70,7 +70,7 @@ config TREE_SRCU
           This option selects the full-fledged version of SRCU.
  
  config TASKS_RCU
-       def_bool PREEMPT
+       def_bool PREEMPTION
         select SRCU
         help
           This option enables a task-based RCU implementation that uses
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c

index 71395e91b876809bdfabdcf9408a76bb5172b57c..81105141b6a823689254b5a9033cc7b62e330213 100644 (file)
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1912,7 +1912,7 @@ rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
         struct rcu_node *rnp_p;
  
         raw_lockdep_assert_held_rcu_node(rnp);
-       if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT)) ||
+       if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPTION)) ||
             WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)) ||
             rnp->qsmask != 0) {
                 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
@@ -2266,7 +2266,7 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp))
                 mask = 0;
                 raw_spin_lock_irqsave_rcu_node(rnp, flags);
                 if (rnp->qsmask == 0) {
-                       if (!IS_ENABLED(CONFIG_PREEMPT) ||
+                       if (!IS_ENABLED(CONFIG_PREEMPTION) ||
                             rcu_preempt_blocked_readers_cgp(rnp)) {
                                 /*
                                  * No point in scanning bits because they
@@ -2681,7 +2681,7 @@ static int rcu_blocking_is_gp(void)
  {
         int ret;
  
-       if (IS_ENABLED(CONFIG_PREEMPT))
+       if (IS_ENABLED(CONFIG_PREEMPTION))
                 return rcu_scheduler_active == RCU_SCHEDULER_INACTIVE;
         might_sleep();  /* Check for RCU read-side critical section. */
         preempt_disable();
@@ -3297,13 +3297,13 @@ static int __init rcu_spawn_gp_kthread(void)
         t = kthread_create(rcu_gp_kthread, NULL, "%s", rcu_state.name);
         if (WARN_ONCE(IS_ERR(t), "%s: Could not start grace-period kthread, OOM is now expected behavior\n", __func__))
                 return 0;
-       rnp = rcu_get_root();
-       raw_spin_lock_irqsave_rcu_node(rnp, flags);
-       rcu_state.gp_kthread = t;
         if (kthread_prio) {
                 sp.sched_priority = kthread_prio;
                 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
         }
+       rnp = rcu_get_root();
+       raw_spin_lock_irqsave_rcu_node(rnp, flags);
+       rcu_state.gp_kthread = t;
         raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
         wake_up_process(t);
         rcu_spawn_nocb_kthreads();
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h

index 841ab43f3e60d4b26df3a63477fe65814947a826..c0b8c458d8a6ad267151f6cbffc791c217aeefdf 100644 (file)
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -163,7 +163,7 @@ static void rcu_iw_handler(struct irq_work *iwp)
  //
  // Printing RCU CPU stall warnings
  
-#ifdef CONFIG_PREEMPT
+#ifdef CONFIG_PREEMPTION
  
  /*
   * Dump detailed information for all tasks blocking the current RCU
@@ -215,7 +215,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp)
         return ndetected;
  }
  
-#else /* #ifdef CONFIG_PREEMPT */
+#else /* #ifdef CONFIG_PREEMPTION */
  
  /*
   * Because preemptible RCU does not exist, we never have to check for
@@ -233,7 +233,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp)
  {
         return 0;
  }
-#endif /* #else #ifdef CONFIG_PREEMPT */
+#endif /* #else #ifdef CONFIG_PREEMPTION */
  
  /*
   * Dump stacks of all tasks running on stalled CPUs.  First try using
diff --git a/kernel/sched/core.c b/kernel/sched/core.c

index 7fa8e74ad2ab4003457d266df57373f41f0e0d2a..06961b997ed6d8c13ced5558520f75b07c85aedc 100644 (file)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -773,6 +773,18 @@ static void set_load_weight(struct task_struct *p, bool update_load)
  }
  
  #ifdef CONFIG_UCLAMP_TASK
+/*
+ * Serializes updates of utilization clamp values
+ *
+ * The (slow-path) user-space triggers utilization clamp value updates which
+ * can require updates on (fast-path) scheduler's data structures used to
+ * support enqueue/dequeue operations.
+ * While the per-CPU rq lock protects fast-path update operations, user-space
+ * requests are serialized using a mutex to reduce the risk of conflicting
+ * updates or API abuses.
+ */
+static DEFINE_MUTEX(uclamp_mutex);
+
  /* Max allowed minimum utilization */
  unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE;
  
@@ -798,7 +810,7 @@ static inline unsigned int uclamp_bucket_base_value(unsigned int clamp_value)
         return UCLAMP_BUCKET_DELTA * uclamp_bucket_id(clamp_value);
  }
  
-static inline unsigned int uclamp_none(int clamp_id)
+static inline unsigned int uclamp_none(enum uclamp_id clamp_id)
  {
         if (clamp_id == UCLAMP_MIN)
                 return 0;
@@ -814,7 +826,7 @@ static inline void uclamp_se_set(struct uclamp_se *uc_se,
  }
  
  static inline unsigned int
-uclamp_idle_value(struct rq *rq, unsigned int clamp_id,
+uclamp_idle_value(struct rq *rq, enum uclamp_id clamp_id,
                   unsigned int clamp_value)
  {
         /*
@@ -830,7 +842,7 @@ uclamp_idle_value(struct rq *rq, unsigned int clamp_id,
         return uclamp_none(UCLAMP_MIN);
  }
  
-static inline void uclamp_idle_reset(struct rq *rq, unsigned int clamp_id,
+static inline void uclamp_idle_reset(struct rq *rq, enum uclamp_id clamp_id,
                                      unsigned int clamp_value)
  {
         /* Reset max-clamp retention only on idle exit */
@@ -841,8 +853,8 @@ static inline void uclamp_idle_reset(struct rq *rq, unsigned int clamp_id,
  }
  
  static inline
-unsigned int uclamp_rq_max_value(struct rq *rq, unsigned int clamp_id,
-                                unsigned int clamp_value)
+unsigned int uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id,
+                                unsigned int clamp_value)
  {
         struct uclamp_bucket *bucket = rq->uclamp[clamp_id].bucket;
         int bucket_id = UCLAMP_BUCKETS - 1;
@@ -861,16 +873,42 @@ unsigned int uclamp_rq_max_value(struct rq *rq, unsigned int clamp_id,
         return uclamp_idle_value(rq, clamp_id, clamp_value);
  }
  
+/* Return the task's requested clamp, restricted by its task group's clamp. */
+static inline struct uclamp_se
+uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id)
+{
+       struct uclamp_se task_req = p->uclamp_req[clamp_id];
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+       struct task_group *tg = task_group(p);
+       struct uclamp_se tg_clamp;
+
+       /*
+        * Only the system defaults restrict tasks that sit in an
+        * autogroup or in the root task group.
+        */
+       if (task_group_is_autogroup(tg) || tg == &root_task_group)
+               return task_req;
+
+       tg_clamp = tg->uclamp[clamp_id];
+       if (!task_req.user_defined || task_req.value > tg_clamp.value)
+               return tg_clamp;
+#endif
+
+       return task_req;
+}
+
  /*
   * The effective clamp bucket index of a task depends on, by increasing
   * priority:
   * - the task specific clamp value, when explicitly requested from userspace
+ * - the task group effective clamp value, for tasks that are neither in
+ *   the root group nor in an autogroup
   * - the system default clamp value, defined by the sysadmin
   */
  static inline struct uclamp_se
-uclamp_eff_get(struct task_struct *p, unsigned int clamp_id)
+uclamp_eff_get(struct task_struct *p, enum uclamp_id clamp_id)
  {
-       struct uclamp_se uc_req = p->uclamp_req[clamp_id];
+       struct uclamp_se uc_req = uclamp_tg_restrict(p, clamp_id);
         struct uclamp_se uc_max = uclamp_default[clamp_id];
  
         /* System default restrictions always apply */
@@ -880,7 +918,7 @@ uclamp_eff_get(struct task_struct *p, unsigned int clamp_id)
         return uc_req;
  }
  
-unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id)
+enum uclamp_id uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id)
  {
         struct uclamp_se uc_eff;
  
@@ -904,7 +942,7 @@ unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id)
   * for each bucket when all its RUNNABLE tasks require the same clamp.
   */
  static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
-                                   unsigned int clamp_id)
+                                   enum uclamp_id clamp_id)
  {
         struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
         struct uclamp_se *uc_se = &p->uclamp[clamp_id];
@@ -942,7 +980,7 @@ static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
   * enforce the expected state and warn.
   */
  static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
-                                   unsigned int clamp_id)
+                                   enum uclamp_id clamp_id)
  {
         struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
         struct uclamp_se *uc_se = &p->uclamp[clamp_id];
@@ -981,7 +1019,7 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
  
  static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
  {
-       unsigned int clamp_id;
+       enum uclamp_id clamp_id;
  
         if (unlikely(!p->sched_class->uclamp_enabled))
                 return;
@@ -996,7 +1034,7 @@ static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
  
  static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
  {
-       unsigned int clamp_id;
+       enum uclamp_id clamp_id;
  
         if (unlikely(!p->sched_class->uclamp_enabled))
                 return;
@@ -1005,15 +1043,82 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
                 uclamp_rq_dec_id(rq, p, clamp_id);
  }
  
+/* Re-account @p in its rq's @clamp_id bucket, if it is accounted there. */
+static inline void
+uclamp_update_active(struct task_struct *p, enum uclamp_id clamp_id)
+{
+       struct rq_flags rf;
+       struct rq *rq;
+
+       /*
+        * Lock the task and the rq where the task is (or was) queued.
+        * We might lock the (previous) rq of a !RUNNABLE task, but that's the
+        * price to pay to safely serialize util_{min,max} updates with
+        * enqueues, dequeues and migration operations.
+        * This is the same locking schema used by __set_cpus_allowed_ptr().
+        */
+       rq = task_rq_lock(p, &rf);
+
+       /*
+        * Setting the clamp bucket is serialized by task_rq_lock().
+        * Only a task that is currently accounted in a clamp bucket
+        * (->active) must be re-accounted here; any other task will
+        * see the updated clamp value the next time it's enqueued.
+        */
+       if (p->uclamp[clamp_id].active) {
+               uclamp_rq_dec_id(rq, p, clamp_id);
+               uclamp_rq_inc_id(rq, p, clamp_id);
+       }
+
+       task_rq_unlock(rq, p, &rf);
+}
+
+static inline void
+uclamp_update_active_tasks(struct cgroup_subsys_state *css,
+                          unsigned int clamps)
+{
+       struct css_task_iter iter;
+       struct task_struct *task;
+       enum uclamp_id id;
+
+       css_task_iter_start(css, 0, &iter);
+       while ((task = css_task_iter_next(&iter))) {
+               /* Only touch the clamps whose bit is set in @clamps. */
+               for_each_clamp_id(id)
+                       if (clamps & (0x1 << id))
+                               uclamp_update_active(task, id);
+       }
+       css_task_iter_end(&iter);
+}
+
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+static void cpu_util_update_eff(struct cgroup_subsys_state *css);
+static void uclamp_update_root_tg(void)
+{
+       struct task_group *tg = &root_task_group;
+       /* Copy the sysctl min/max into the root group's requested clamps. */
+       uclamp_se_set(&tg->uclamp_req[UCLAMP_MIN],
+                     sysctl_sched_uclamp_util_min, false);
+       uclamp_se_set(&tg->uclamp_req[UCLAMP_MAX],
+                     sysctl_sched_uclamp_util_max, false);
+       /* Then let cpu_util_update_eff() process the root css under RCU. */
+       rcu_read_lock();
+       cpu_util_update_eff(&root_task_group.css);
+       rcu_read_unlock();
+}
+#else /* !CONFIG_UCLAMP_TASK_GROUP */
+static void uclamp_update_root_tg(void) { }
+#endif /* CONFIG_UCLAMP_TASK_GROUP */
+
  int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
                                 void __user *buffer, size_t *lenp,
                                 loff_t *ppos)
  {
+       bool update_root_tg = false;
         int old_min, old_max;
-       static DEFINE_MUTEX(mutex);
         int result;
  
-       mutex_lock(&mutex);
+       mutex_lock(&uclamp_mutex);
         old_min = sysctl_sched_uclamp_util_min;
         old_max = sysctl_sched_uclamp_util_max;
  
@@ -1032,23 +1137,30 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
         if (old_min != sysctl_sched_uclamp_util_min) {
                 uclamp_se_set(&uclamp_default[UCLAMP_MIN],
                               sysctl_sched_uclamp_util_min, false);
+               update_root_tg = true;
         }
         if (old_max != sysctl_sched_uclamp_util_max) {
                 uclamp_se_set(&uclamp_default[UCLAMP_MAX],
                               sysctl_sched_uclamp_util_max, false);
+               update_root_tg = true;
         }
  
+       if (update_root_tg)
+               uclamp_update_root_tg();
+
         /*
-        * Updating all the RUNNABLE task is expensive, keep it simple and do
-        * just a lazy update at each next enqueue time.
+        * We update all RUNNABLE tasks only when task groups are in use.
+        * Otherwise, keep it simple and do just a lazy update at each next
+        * task enqueue time.
          */
+
         goto done;
  
  undo:
         sysctl_sched_uclamp_util_min = old_min;
         sysctl_sched_uclamp_util_max = old_max;
  done:
-       mutex_unlock(&mutex);
+       mutex_unlock(&uclamp_mutex);
  
         return result;
  }
@@ -1075,7 +1187,7 @@ static int uclamp_validate(struct task_struct *p,
  static void __setscheduler_uclamp(struct task_struct *p,
                                   const struct sched_attr *attr)
  {
-       unsigned int clamp_id;
+       enum uclamp_id clamp_id;
  
         /*
          * On scheduling class change, reset to default clamps for tasks
@@ -1112,7 +1224,7 @@ static void __setscheduler_uclamp(struct task_struct *p,
  
  static void uclamp_fork(struct task_struct *p)
  {
-       unsigned int clamp_id;
+       enum uclamp_id clamp_id;
  
         for_each_clamp_id(clamp_id)
                 p->uclamp[clamp_id].active = false;
@@ -1134,9 +1246,11 @@ static void uclamp_fork(struct task_struct *p)
  static void __init init_uclamp(void)
  {
         struct uclamp_se uc_max = {};
-       unsigned int clamp_id;
+       enum uclamp_id clamp_id;
         int cpu;
  
+       mutex_init(&uclamp_mutex);
+
         for_each_possible_cpu(cpu) {
                 memset(&cpu_rq(cpu)->uclamp, 0, sizeof(struct uclamp_rq));
                 cpu_rq(cpu)->uclamp_flags = 0;
@@ -1149,8 +1263,13 @@ static void __init init_uclamp(void)
  
         /* System defaults allow max clamp values for both indexes */
         uclamp_se_set(&uc_max, uclamp_none(UCLAMP_MAX), false);
-       for_each_clamp_id(clamp_id)
+       for_each_clamp_id(clamp_id) {
                 uclamp_default[clamp_id] = uc_max;
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+               root_task_group.uclamp_req[clamp_id] = uc_max;
+               root_task_group.uclamp[clamp_id] = uc_max;
+#endif
+       }
  }
  
  #else /* CONFIG_UCLAMP_TASK */
@@ -1494,7 +1613,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
         if (queued)
                 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
         if (running)
-               set_curr_task(rq, p);
+               set_next_task(rq, p);
  }
  
  /*
@@ -3214,12 +3333,8 @@ static __always_inline struct rq *
  context_switch(struct rq *rq, struct task_struct *prev,
                struct task_struct *next, struct rq_flags *rf)
  {
-       struct mm_struct *mm, *oldmm;
-
         prepare_task_switch(rq, prev, next);
  
-       mm = next->mm;
-       oldmm = prev->active_mm;
         /*
          * For paravirt, this is coupled with an exit in switch_to to
          * combine the page table reload and the switch backend into
@@ -3228,22 +3343,37 @@ context_switch(struct rq *rq, struct task_struct *prev,
         arch_start_context_switch(prev);
  
         /*
-        * If mm is non-NULL, we pass through switch_mm(). If mm is
-        * NULL, we will pass through mmdrop() in finish_task_switch().
-        * Both of these contain the full memory barrier required by
-        * membarrier after storing to rq->curr, before returning to
-        * user-space.
+        * kernel -> kernel   lazy + transfer active
+        *   user -> kernel   lazy + mmgrab() active
+        *
+        * kernel ->   user   switch + mmdrop() active
+        *   user ->   user   switch
          */
-       if (!mm) {
-               next->active_mm = oldmm;
-               mmgrab(oldmm);
-               enter_lazy_tlb(oldmm, next);
-       } else
-               switch_mm_irqs_off(oldmm, mm, next);
+       if (!next->mm) {                                // to kernel
+               enter_lazy_tlb(prev->active_mm, next);
+
+               next->active_mm = prev->active_mm;
+               if (prev->mm)                           // from user
+                       mmgrab(prev->active_mm);
+               else
+                       prev->active_mm = NULL;
+       } else {                                        // to user
+               /*
+                * sys_membarrier() requires an smp_mb() between setting
+                * rq->curr and returning to userspace.
+                *
+                * The below provides this either through switch_mm(), or in
+                * case 'prev->active_mm == next->mm' through
+                * finish_task_switch()'s mmdrop().
+                */
+
+               switch_mm_irqs_off(prev->active_mm, next->mm, next);
  
-       if (!prev->mm) {
-               prev->active_mm = NULL;
-               rq->prev_mm = oldmm;
+               if (!prev->mm) {                        // from kernel
+                       /* will mmdrop() in finish_task_switch(). */
+                       rq->prev_mm = prev->active_mm;
+                       prev->active_mm = NULL;
+               }
         }
  
         rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
@@ -3622,7 +3752,7 @@ static inline void sched_tick_start(int cpu) { }
  static inline void sched_tick_stop(int cpu) { }
  #endif
  
-#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
+#if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \
                                 defined(CONFIG_TRACE_PREEMPT_TOGGLE))
  /*
   * If the value passed in is equal to the current preempt count
@@ -3780,7 +3910,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
  
                 p = fair_sched_class.pick_next_task(rq, prev, rf);
                 if (unlikely(p == RETRY_TASK))
-                       goto again;
+                       goto restart;
  
                 /* Assumes fair_sched_class->next == idle_sched_class */
                 if (unlikely(!p))
@@ -3789,14 +3919,19 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
                 return p;
         }
  
-again:
+restart:
+       /*
+        * Ensure that we put DL/RT tasks before the pick loop, such that they
+        * can PULL higher prio tasks when we lower the RQ 'priority'.
+        */
+       prev->sched_class->put_prev_task(rq, prev, rf);
+       if (!rq->nr_running)
+               newidle_balance(rq, rf);
+
         for_each_class(class) {
-               p = class->pick_next_task(rq, prev, rf);
-               if (p) {
-                       if (unlikely(p == RETRY_TASK))
-                               goto again;
+               p = class->pick_next_task(rq, NULL, NULL);
+               if (p)
                         return p;
-               }
         }
  
         /* The idle class should always have a runnable task: */
@@ -3823,7 +3958,7 @@ again:
   *      task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
   *      called on the nearest possible occasion:
   *
- *       - If the kernel is preemptible (CONFIG_PREEMPT=y):
+ *       - If the kernel is preemptible (CONFIG_PREEMPTION=y):
   *
   *         - in syscall or exception context, at the next outmost
   *           preempt_enable(). (this might be as soon as the wake_up()'s
@@ -3832,7 +3967,7 @@ again:
   *         - in IRQ context, return from interrupt-handler to
   *           preemptible context
   *
- *       - If the kernel is not preemptible (CONFIG_PREEMPT is not set)
+ *       - If the kernel is not preemptible (CONFIG_PREEMPTION is not set)
   *         then at the next:
   *
   *          - cond_resched() call
@@ -4077,7 +4212,7 @@ static void __sched notrace preempt_schedule_common(void)
         } while (need_resched());
  }
  
-#ifdef CONFIG_PREEMPT
+#ifdef CONFIG_PREEMPTION
  /*
   * this is the entry point to schedule() from in-kernel preemption
   * off of preempt_enable. Kernel preemptions off return from interrupt
@@ -4149,7 +4284,7 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
  }
  EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
  
-#endif /* CONFIG_PREEMPT */
+#endif /* CONFIG_PREEMPTION */
  
  /*
   * this is the entry point to schedule() from kernel preemption
@@ -4317,7 +4452,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
         if (queued)
                 enqueue_task(rq, p, queue_flag);
         if (running)
-               set_curr_task(rq, p);
+               set_next_task(rq, p);
  
         check_class_changed(rq, p, prev_class, oldprio);
  out_unlock:
@@ -4384,7 +4519,7 @@ void set_user_nice(struct task_struct *p, long nice)
                         resched_curr(rq);
         }
         if (running)
-               set_curr_task(rq, p);
+               set_next_task(rq, p);
  out_unlock:
         task_rq_unlock(rq, p, &rf);
  }
@@ -4701,6 +4836,9 @@ recheck:
                         return retval;
         }
  
+       if (pi)
+               cpuset_read_lock();
+
         /*
          * Make sure no PI-waiters arrive (or leave) while we are
          * changing the priority of the task:
@@ -4715,8 +4853,8 @@ recheck:
          * Changing the policy of the stop threads its a very bad idea:
          */
         if (p == rq->stop) {
-               task_rq_unlock(rq, p, &rf);
-               return -EINVAL;
+               retval = -EINVAL;
+               goto unlock;
         }
  
         /*
@@ -4734,8 +4872,8 @@ recheck:
                         goto change;
  
                 p->sched_reset_on_fork = reset_on_fork;
-               task_rq_unlock(rq, p, &rf);
-               return 0;
+               retval = 0;
+               goto unlock;
         }
  change:
  
@@ -4748,8 +4886,8 @@ change:
                 if (rt_bandwidth_enabled() && rt_policy(policy) &&
                                 task_group(p)->rt_bandwidth.rt_runtime == 0 &&
                                 !task_group_is_autogroup(task_group(p))) {
-                       task_rq_unlock(rq, p, &rf);
-                       return -EPERM;
+                       retval = -EPERM;
+                       goto unlock;
                 }
  #endif
  #ifdef CONFIG_SMP
@@ -4764,8 +4902,8 @@ change:
                          */
                         if (!cpumask_subset(span, p->cpus_ptr) ||
                             rq->rd->dl_bw.bw == 0) {
-                               task_rq_unlock(rq, p, &rf);
-                               return -EPERM;
+                               retval = -EPERM;
+                               goto unlock;
                         }
                 }
  #endif
@@ -4775,6 +4913,8 @@ change:
         if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
                 policy = oldpolicy = -1;
                 task_rq_unlock(rq, p, &rf);
+               if (pi)
+                       cpuset_read_unlock();
                 goto recheck;
         }
  
@@ -4784,8 +4924,8 @@ change:
          * is available.
          */
         if ((dl_policy(policy) || dl_task(p)) && sched_dl_overflow(p, policy, attr)) {
-               task_rq_unlock(rq, p, &rf);
-               return -EBUSY;
+               retval = -EBUSY;
+               goto unlock;
         }
  
         p->sched_reset_on_fork = reset_on_fork;
@@ -4827,7 +4967,7 @@ change:
                 enqueue_task(rq, p, queue_flags);
         }
         if (running)
-               set_curr_task(rq, p);
+               set_next_task(rq, p);
  
         check_class_changed(rq, p, prev_class, oldprio);
  
@@ -4835,14 +4975,22 @@ change:
         preempt_disable();
         task_rq_unlock(rq, p, &rf);
  
-       if (pi)
+       if (pi) {
+               cpuset_read_unlock();
                 rt_mutex_adjust_pi(p);
+       }
  
         /* Run balance callbacks after we've adjusted the PI chain: */
         balance_callback(rq);
         preempt_enable();
  
         return 0;
+
+unlock:
+       task_rq_unlock(rq, p, &rf);
+       if (pi)
+               cpuset_read_unlock();
+       return retval;
  }
  
  static int _sched_setscheduler(struct task_struct *p, int policy,
@@ -4926,10 +5074,15 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
         rcu_read_lock();
         retval = -ESRCH;
         p = find_process_by_pid(pid);
-       if (p != NULL)
-               retval = sched_setscheduler(p, policy, &lparam);
+       if (likely(p))
+               get_task_struct(p);
         rcu_read_unlock();
  
+       if (likely(p)) {
+               retval = sched_setscheduler(p, policy, &lparam);
+               put_task_struct(p);
+       }
+
         return retval;
  }
  
@@ -5460,7 +5613,7 @@ SYSCALL_DEFINE0(sched_yield)
         return 0;
  }
  
-#ifndef CONFIG_PREEMPT
+#ifndef CONFIG_PREEMPTION
  int __sched _cond_resched(void)
  {
         if (should_resched(0)) {
@@ -5477,7 +5630,7 @@ EXPORT_SYMBOL(_cond_resched);
   * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
   * call schedule, and on return reacquire the lock.
   *
- * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
+ * This works OK both with and without CONFIG_PREEMPTION. We do strange low-level
   * operations here to prevent schedule() from being called twice (once via
   * spin_unlock(), once by hand).
   */
@@ -6016,7 +6169,7 @@ void sched_setnuma(struct task_struct *p, int nid)
         if (queued)
                 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
         if (running)
-               set_curr_task(rq, p);
+               set_next_task(rq, p);
         task_rq_unlock(rq, p, &rf);
  }
  #endif /* CONFIG_NUMA_BALANCING */
@@ -6056,21 +6209,22 @@ static void calc_load_migrate(struct rq *rq)
                 atomic_long_add(delta, &calc_load_tasks);
  }
  
-static void put_prev_task_fake(struct rq *rq, struct task_struct *prev)
+static struct task_struct *__pick_migrate_task(struct rq *rq)
  {
-}
+       const struct sched_class *class;
+       struct task_struct *next;
  
-static const struct sched_class fake_sched_class = {
-       .put_prev_task = put_prev_task_fake,
-};
+       for_each_class(class) {
+               next = class->pick_next_task(rq, NULL, NULL);
+               if (next) {
+                       next->sched_class->put_prev_task(rq, next, NULL);
+                       return next;
+               }
+       }
  
-static struct task_struct fake_task = {
-       /*
-        * Avoid pull_{rt,dl}_task()
-        */
-       .prio = MAX_PRIO + 1,
-       .sched_class = &fake_sched_class,
-};
+       /* The idle class should always have a runnable task */
+       BUG();
+}
  
  /*
   * Migrate all tasks from the rq, sleeping tasks will be migrated by
@@ -6113,12 +6267,7 @@ static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf)
                 if (rq->nr_running == 1)
                         break;
  
-               /*
-                * pick_next_task() assumes pinned rq->lock:
-                */
-               next = pick_next_task(rq, &fake_task, rf);
-               BUG_ON(!next);
-               put_prev_task(rq, next);
+               next = __pick_migrate_task(rq);
  
                 /*
                  * Rules for changing task_struct::cpus_mask are holding
@@ -6415,19 +6564,19 @@ DECLARE_PER_CPU(cpumask_var_t, select_idle_mask);
  
  void __init sched_init(void)
  {
-       unsigned long alloc_size = 0, ptr;
+       unsigned long ptr = 0;
         int i;
  
         wait_bit_init();
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
-       alloc_size += 2 * nr_cpu_ids * sizeof(void **);
+       ptr += 2 * nr_cpu_ids * sizeof(void **);
  #endif
  #ifdef CONFIG_RT_GROUP_SCHED
-       alloc_size += 2 * nr_cpu_ids * sizeof(void **);
+       ptr += 2 * nr_cpu_ids * sizeof(void **);
  #endif
-       if (alloc_size) {
-               ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
+       if (ptr) {
+               ptr = (unsigned long)kzalloc(ptr, GFP_NOWAIT);
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
                 root_task_group.se = (struct sched_entity **)ptr;
@@ -6746,7 +6895,7 @@ struct task_struct *curr_task(int cpu)
  
  #ifdef CONFIG_IA64
  /**
- * set_curr_task - set the current task for a given CPU.
+ * ia64_set_curr_task - set the current task for a given CPU.
   * @cpu: the processor in question.
   * @p: the task pointer to set.
   *
@@ -6771,6 +6920,20 @@ void ia64_set_curr_task(int cpu, struct task_struct *p)
  /* task_group_lock serializes the addition/removal of task groups */
  static DEFINE_SPINLOCK(task_group_lock);
  
+/*
+ * Initialise the utilization clamps of a new task group: each requested
+ * clamp starts at uclamp_none() for its index, each effective clamp is
+ * copied from @parent. The body is empty without CONFIG_UCLAMP_TASK_GROUP.
+ */
+static inline void alloc_uclamp_sched_group(struct task_group *tg,
+                                           struct task_group *parent)
+{
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+       enum uclamp_id id;
+
+       for_each_clamp_id(id) {
+               struct uclamp_se *req = &tg->uclamp_req[id];
+
+               tg->uclamp[id] = parent->uclamp[id];
+               uclamp_se_set(req, uclamp_none(id), false);
+       }
+#endif
+}
+
  static void sched_free_group(struct task_group *tg)
  {
         free_fair_sched_group(tg);
@@ -6794,6 +6957,8 @@ struct task_group *sched_create_group(struct task_group *parent)
         if (!alloc_rt_sched_group(tg, parent))
                 goto err;
  
+       alloc_uclamp_sched_group(tg, parent);
+
         return tg;
  
  err:
@@ -6897,7 +7062,7 @@ void sched_move_task(struct task_struct *tsk)
         if (queued)
                 enqueue_task(rq, tsk, queue_flags);
         if (running)
-               set_curr_task(rq, tsk);
+               set_next_task(rq, tsk);
  
         task_rq_unlock(rq, tsk, &rf);
  }
@@ -6980,10 +7145,6 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
  #ifdef CONFIG_RT_GROUP_SCHED
                 if (!sched_rt_can_attach(css_tg(css), task))
                         return -EINVAL;
-#else
-               /* We don't support RT-tasks being in separate groups */
-               if (task->sched_class != &fair_sched_class)
-                       return -EINVAL;
  #endif
                 /*
                  * Serialize against wake_up_new_task() such that if its
@@ -7014,6 +7175,178 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset)
                 sched_move_task(task);
  }
  
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+/*
+ * Recompute the effective clamps of @css and of its descendants, visiting
+ * them with css_for_each_descendant_pre().
+ *
+ * For every visited group the effective value of each clamp starts from the
+ * requested value and is capped by the parent's effective value when the
+ * group has a parent; the effective minimum is then capped by the effective
+ * maximum. Clamp indexes whose effective value changed are collected in a
+ * bitmask and handed to uclamp_update_active_tasks().
+ *
+ * The callers visible in this file invoke it under rcu_read_lock(), with
+ * uclamp_mutex held further up the call chain.
+ */
+static void cpu_util_update_eff(struct cgroup_subsys_state *css)
+{
+       struct cgroup_subsys_state *top_css = css;
+       struct uclamp_se *uc_parent = NULL;
+       struct uclamp_se *uc_se = NULL;
+       unsigned int eff[UCLAMP_CNT];
+       enum uclamp_id clamp_id;
+       unsigned int clamps;
+
+       css_for_each_descendant_pre(css, top_css) {
+               uc_parent = css_tg(css)->parent
+                       ? css_tg(css)->parent->uclamp : NULL;
+
+               for_each_clamp_id(clamp_id) {
+                       /* Assume effective clamps matches requested clamps */
+                       eff[clamp_id] = css_tg(css)->uclamp_req[clamp_id].value;
+                       /* Cap effective clamps with parent's effective clamps */
+                       if (uc_parent &&
+                           eff[clamp_id] > uc_parent[clamp_id].value) {
+                               eff[clamp_id] = uc_parent[clamp_id].value;
+                       }
+               }
+               /* Ensure protection is always capped by limit */
+               eff[UCLAMP_MIN] = min(eff[UCLAMP_MIN], eff[UCLAMP_MAX]);
+
+               /* Propagate most restrictive effective clamps */
+               clamps = 0x0;
+               uc_se = css_tg(css)->uclamp;
+               for_each_clamp_id(clamp_id) {
+                       if (eff[clamp_id] == uc_se[clamp_id].value)
+                               continue;
+                       uc_se[clamp_id].value = eff[clamp_id];
+                       uc_se[clamp_id].bucket_id = uclamp_bucket_id(eff[clamp_id]);
+                       clamps |= (0x1 << clamp_id);
+               }
+               /*
+                * Nothing changed for this group: move the cursor to its
+                * rightmost descendant so that, presumably, the walk resumes
+                * after this subtree (NOTE(review): confirm against the
+                * css_rightmost_descendant() contract).
+                */
+               if (!clamps) {
+                       css = css_rightmost_descendant(css);
+                       continue;
+               }
+
+               /* Immediately update descendants RUNNABLE tasks */
+               uclamp_update_active_tasks(css, clamps);
+       }
+}
+
+/*
+ * Integer 10^N with a given N exponent by casting to integer the literal "1eN"
+ * C expression. Since there is no way to convert a macro argument (N) into a
+ * character constant, use two levels of macros.
+ */
+#define _POW10(exp) ((unsigned int)1e##exp)
+#define POW10(exp) _POW10(exp)
+
+/*
+ * Result of parsing a cpu.uclamp.{min,max} write, see
+ * capacity_from_percent().
+ */
+struct uclamp_request {
+/* Number of decimal digits kept in the fixed-point percentage */
+#define UCLAMP_PERCENT_SHIFT   2
+/* Fixed-point representation of 100%: 100 * 10^UCLAMP_PERCENT_SHIFT */
+#define UCLAMP_PERCENT_SCALE   (100 * POW10(UCLAMP_PERCENT_SHIFT))
+       /* Requested percentage, scaled by 10^UCLAMP_PERCENT_SHIFT */
+       s64 percent;
+       /* Same request converted to the SCHED_CAPACITY_SCALE range */
+       u64 util;
+       /* 0 on success, negative error code otherwise */
+       int ret;
+};
+
+/*
+ * Parse a cpu.uclamp.{min,max} write into a clamp request.
+ *
+ * @buf is trimmed in place. The literal "max" selects the defaults
+ * (UCLAMP_PERCENT_SCALE / SCHED_CAPACITY_SCALE); anything else is parsed by
+ * cgroup_parse_float() as a percentage with UCLAMP_PERCENT_SHIFT decimal
+ * digits and then scaled to the SCHED_CAPACITY_SCALE range.
+ *
+ * Returns the request with .ret set to 0 on success, to the parser's error
+ * code on a parse failure, or to -ERANGE when the percentage lies outside
+ * [0..100].
+ */
+static inline struct uclamp_request
+capacity_from_percent(char *buf)
+{
+       struct uclamp_request req = {
+               .percent = UCLAMP_PERCENT_SCALE,
+               .util = SCHED_CAPACITY_SCALE,
+               .ret = 0,
+       };
+
+       buf = strim(buf);
+       if (strcmp(buf, "max")) {
+               req.ret = cgroup_parse_float(buf, UCLAMP_PERCENT_SHIFT,
+                                            &req.percent);
+               if (req.ret)
+                       return req;
+               /*
+                * .percent is signed: without the explicit lower bound a
+                * negative input passes the upper-bound test and is then
+                * shifted into a huge utilization value below.
+                */
+               if (req.percent < 0 || req.percent > UCLAMP_PERCENT_SCALE) {
+                       req.ret = -ERANGE;
+                       return req;
+               }
+
+               req.util = req.percent << SCHED_CAPACITY_SHIFT;
+               req.util = DIV_ROUND_CLOSEST_ULL(req.util, UCLAMP_PERCENT_SCALE);
+       }
+
+       return req;
+}
+
+/*
+ * Common write handler for the cpu.uclamp.{min,max} cgroup files.
+ *
+ * Parses @buf with capacity_from_percent(), stores the result as the
+ * requested clamp @clamp_id of the task group behind @of, records the exact
+ * percentage written, and recomputes effective clamps from that group down.
+ *
+ * Returns @nbytes on success or the parser's negative error code. @off is
+ * not used.
+ */
+static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, char *buf,
+                               size_t nbytes, loff_t off,
+                               enum uclamp_id clamp_id)
+{
+       struct uclamp_request req;
+       struct task_group *tg;
+
+       req = capacity_from_percent(buf);
+       if (req.ret)
+               return req.ret;
+
+       /* uclamp_mutex is also taken by sysctl_sched_uclamp_handler() */
+       mutex_lock(&uclamp_mutex);
+       rcu_read_lock();
+
+       tg = css_tg(of_css(of));
+       if (tg->uclamp_req[clamp_id].value != req.util)
+               uclamp_se_set(&tg->uclamp_req[clamp_id], req.util, false);
+
+       /*
+        * Because of not recoverable conversion rounding we keep track of the
+        * exact requested value
+        */
+       tg->uclamp_pct[clamp_id] = req.percent;
+
+       /* Update effective clamps to track the most restrictive value */
+       cpu_util_update_eff(of_css(of));
+
+       rcu_read_unlock();
+       mutex_unlock(&uclamp_mutex);
+
+       return nbytes;
+}
+
+/* Write handler for "uclamp.min": cpu_uclamp_write() with UCLAMP_MIN. */
+static ssize_t cpu_uclamp_min_write(struct kernfs_open_file *of,
+                                   char *buf, size_t nbytes,
+                                   loff_t off)
+{
+       return cpu_uclamp_write(of, buf, nbytes, off, UCLAMP_MIN);
+}
+
+/* Write handler for "uclamp.max": cpu_uclamp_write() with UCLAMP_MAX. */
+static ssize_t cpu_uclamp_max_write(struct kernfs_open_file *of,
+                                   char *buf, size_t nbytes,
+                                   loff_t off)
+{
+       return cpu_uclamp_write(of, buf, nbytes, off, UCLAMP_MAX);
+}
+
+/*
+ * Common show helper for the cpu.uclamp.{min,max} cgroup files.
+ *
+ * Prints "max" when the requested clamp @clamp_id equals
+ * SCHED_CAPACITY_SCALE; otherwise prints the stored percentage with
+ * UCLAMP_PERCENT_SHIFT decimal digits.
+ */
+static inline void cpu_uclamp_print(struct seq_file *sf,
+                                   enum uclamp_id clamp_id)
+{
+       struct task_group *tg;
+       u64 util_clamp;
+       u64 percent;
+       u32 rem;
+
+       rcu_read_lock();
+       tg = css_tg(seq_css(sf));
+       util_clamp = tg->uclamp_req[clamp_id].value;
+       rcu_read_unlock();
+
+       if (util_clamp == SCHED_CAPACITY_SCALE) {
+               seq_puts(sf, "max\n");
+               return;
+       }
+
+       /*
+        * NOTE(review): tg is dereferenced here after rcu_read_unlock();
+        * presumably the open file keeps the css alive - confirm.
+        */
+       percent = tg->uclamp_pct[clamp_id];
+       /* Split the fixed-point value into integer part and remainder */
+       percent = div_u64_rem(percent, POW10(UCLAMP_PERCENT_SHIFT), &rem);
+       seq_printf(sf, "%llu.%0*u\n", percent, UCLAMP_PERCENT_SHIFT, rem);
+}
+
+/* seq_show handler for "uclamp.min"; @v is unused. Always returns 0. */
+static int cpu_uclamp_min_show(struct seq_file *sf, void *v)
+{
+       cpu_uclamp_print(sf, UCLAMP_MIN);
+       return 0;
+}
+
+/* seq_show handler for "uclamp.max"; @v is unused. Always returns 0. */
+static int cpu_uclamp_max_show(struct seq_file *sf, void *v)
+{
+       cpu_uclamp_print(sf, UCLAMP_MAX);
+       return 0;
+}
+#endif /* CONFIG_UCLAMP_TASK_GROUP */
+
  #ifdef CONFIG_FAIR_GROUP_SCHED
  static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
                                 struct cftype *cftype, u64 shareval)
@@ -7358,6 +7691,20 @@ static struct cftype cpu_legacy_files[] = {
                 .read_u64 = cpu_rt_period_read_uint,
                 .write_u64 = cpu_rt_period_write_uint,
         },
+#endif
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+       {
+               .name = "uclamp.min",
+               .flags = CFTYPE_NOT_ON_ROOT,
+               .seq_show = cpu_uclamp_min_show,
+               .write = cpu_uclamp_min_write,
+       },
+       {
+               .name = "uclamp.max",
+               .flags = CFTYPE_NOT_ON_ROOT,
+               .seq_show = cpu_uclamp_max_show,
+               .write = cpu_uclamp_max_write,
+       },
  #endif
         { }     /* Terminate */
  };
@@ -7525,6 +7872,20 @@ static struct cftype cpu_files[] = {
                 .seq_show = cpu_max_show,
                 .write = cpu_max_write,
         },
+#endif
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+       {
+               .name = "uclamp.min",
+               .flags = CFTYPE_NOT_ON_ROOT,
+               .seq_show = cpu_uclamp_min_show,
+               .write = cpu_uclamp_min_write,
+       },
+       {
+               .name = "uclamp.max",
+               .flags = CFTYPE_NOT_ON_ROOT,
+               .seq_show = cpu_uclamp_max_show,
+               .write = cpu_uclamp_max_write,
+       },
  #endif
         { }     /* terminate */
  };
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c

index 867b4bb6d4beb541d1d9eb087711d1e52a446416..fdce9cfaca05b802c87a232d77903d6731809716 100644 (file)
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -263,9 +263,9 @@ unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs,
          * irq metric. Because IRQ/steal time is hidden from the task clock we
          * need to scale the task numbers:
          *
-        *              1 - irq
-        *   U' = irq + ------- * U
-        *                max
+        *              max - irq
+        *   U' = irq + --------- * U
+        *                 max
          */
         util = scale_irq_capacity(util, irq, max);
         util += irq;
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c

index 46122edd8552c9abd7acb3cf665332d91746ed7d..39dc9f74f2898f13b56837f8073f49043275a5d2 100644 (file)
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -529,6 +529,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq);
  static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p)
  {
         struct rq *later_rq = NULL;
+       struct dl_bw *dl_b;
  
         later_rq = find_lock_later_rq(p, rq);
         if (!later_rq) {
@@ -557,6 +558,38 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p
                 double_lock_balance(rq, later_rq);
         }
  
+       if (p->dl.dl_non_contending || p->dl.dl_throttled) {
+               /*
+                * Inactive timer is armed (or callback is running, but
+                * waiting for us to release rq locks). In any case, when it
+                * will fire (or continue), it will see running_bw of this
+                * task migrated to later_rq (and correctly handle it).
+                */
+               sub_running_bw(&p->dl, &rq->dl);
+               sub_rq_bw(&p->dl, &rq->dl);
+
+               add_rq_bw(&p->dl, &later_rq->dl);
+               add_running_bw(&p->dl, &later_rq->dl);
+       } else {
+               sub_rq_bw(&p->dl, &rq->dl);
+               add_rq_bw(&p->dl, &later_rq->dl);
+       }
+
+       /*
+        * And we finally need to fixup root_domain(s) bandwidth accounting,
+        * since p is still hanging out in the old (now moved to default) root
+        * domain.
+        */
+       dl_b = &rq->rd->dl_bw;
+       raw_spin_lock(&dl_b->lock);
+       __dl_sub(dl_b, p->dl.dl_bw, cpumask_weight(rq->rd->span));
+       raw_spin_unlock(&dl_b->lock);
+
+       dl_b = &later_rq->rd->dl_bw;
+       raw_spin_lock(&dl_b->lock);
+       __dl_add(dl_b, p->dl.dl_bw, cpumask_weight(later_rq->rd->span));
+       raw_spin_unlock(&dl_b->lock);
+
         set_task_cpu(p, later_rq->cpu);
         double_unlock_balance(later_rq, rq);
  
@@ -1694,12 +1727,20 @@ static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
  }
  #endif
  
-static inline void set_next_task(struct rq *rq, struct task_struct *p)
+static void set_next_task_dl(struct rq *rq, struct task_struct *p)
  {
         p->se.exec_start = rq_clock_task(rq);
  
         /* You can't push away the running task */
         dequeue_pushable_dl_task(rq, p);
+
+       if (hrtick_enabled(rq))
+               start_hrtick_dl(rq, p);
+
+       if (rq->curr->sched_class != &dl_sched_class)
+               update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0);
+
+       deadline_queue_push_tasks(rq);
  }
  
  static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
@@ -1720,64 +1761,42 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
         struct task_struct *p;
         struct dl_rq *dl_rq;
  
-       dl_rq = &rq->dl;
+       WARN_ON_ONCE(prev || rf);
  
-       if (need_pull_dl_task(rq, prev)) {
-               /*
-                * This is OK, because current is on_cpu, which avoids it being
-                * picked for load-balance and preemption/IRQs are still
-                * disabled avoiding further scheduler activity on it and we're
-                * being very careful to re-start the picking loop.
-                */
-               rq_unpin_lock(rq, rf);
-               pull_dl_task(rq);
-               rq_repin_lock(rq, rf);
-               /*
-                * pull_dl_task() can drop (and re-acquire) rq->lock; this
-                * means a stop task can slip in, in which case we need to
-                * re-start task selection.
-                */
-               if (rq->stop && task_on_rq_queued(rq->stop))
-                       return RETRY_TASK;
-       }
-
-       /*
-        * When prev is DL, we may throttle it in put_prev_task().
-        * So, we update time before we check for dl_nr_running.
-        */
-       if (prev->sched_class == &dl_sched_class)
-               update_curr_dl(rq);
+       dl_rq = &rq->dl;
  
         if (unlikely(!dl_rq->dl_nr_running))
                 return NULL;
  
-       put_prev_task(rq, prev);
-
         dl_se = pick_next_dl_entity(rq, dl_rq);
         BUG_ON(!dl_se);
  
         p = dl_task_of(dl_se);
  
-       set_next_task(rq, p);
-
-       if (hrtick_enabled(rq))
-               start_hrtick_dl(rq, p);
-
-       deadline_queue_push_tasks(rq);
-
-       if (rq->curr->sched_class != &dl_sched_class)
-               update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0);
+       set_next_task_dl(rq, p);
  
         return p;
  }
  
-static void put_prev_task_dl(struct rq *rq, struct task_struct *p)
+static void put_prev_task_dl(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
  {
         update_curr_dl(rq);
  
         update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 1);
         if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1)
                 enqueue_pushable_dl_task(rq, p);
+
+       if (rf && !on_dl_rq(&p->dl) && need_pull_dl_task(rq, p)) {
+               /*
+                * This is OK, because current is on_cpu, which avoids it being
+                * picked for load-balance and preemption/IRQs are still
+                * disabled avoiding further scheduler activity on it and we've
+                * not yet started the picking loop.
+                */
+               rq_unpin_lock(rq, rf);
+               pull_dl_task(rq);
+               rq_repin_lock(rq, rf);
+       }
  }
  
  /*
@@ -1811,11 +1830,6 @@ static void task_fork_dl(struct task_struct *p)
          */
  }
  
-static void set_curr_task_dl(struct rq *rq)
-{
-       set_next_task(rq, rq->curr);
-}
-
  #ifdef CONFIG_SMP
  
  /* Only try algorithms three times */
@@ -2275,6 +2289,36 @@ void __init init_sched_dl_class(void)
                                         GFP_KERNEL, cpu_to_node(i));
  }
  
+void dl_add_task_root_domain(struct task_struct *p)    /* add @p's dl_bw to its rq's root domain */
+{
+       struct rq_flags rf;
+       struct rq *rq;
+       struct dl_bw *dl_b;
+
+       rq = task_rq_lock(p, &rf);
+       if (!dl_task(p))                /* tested only after task_rq_lock() */
+               goto unlock;            /* nothing to add; still reaches task_rq_unlock() */
+
+       dl_b = &rq->rd->dl_bw;
+       raw_spin_lock(&dl_b->lock);     /* taken inside the task_rq_lock() section */
+
+       __dl_add(dl_b, p->dl.dl_bw, cpumask_weight(rq->rd->span));      /* 3rd arg: weight of rd->span */
+
+       raw_spin_unlock(&dl_b->lock);
+
+unlock:
+       task_rq_unlock(rq, p, &rf);
+}
+
+void dl_clear_root_domain(struct root_domain *rd)      /* zero @rd's dl_bw.total_bw */
+{
+       unsigned long flags;            /* IRQ state for the irqsave/irqrestore pair */
+
+       raw_spin_lock_irqsave(&rd->dl_bw.lock, flags);
+       rd->dl_bw.total_bw = 0;         /* written while holding rd->dl_bw.lock */
+       raw_spin_unlock_irqrestore(&rd->dl_bw.lock, flags);
+}
+
  #endif /* CONFIG_SMP */
  
  static void switched_from_dl(struct rq *rq, struct task_struct *p)
@@ -2395,6 +2439,7 @@ const struct sched_class dl_sched_class = {
  
         .pick_next_task         = pick_next_task_dl,
         .put_prev_task          = put_prev_task_dl,
+       .set_next_task          = set_next_task_dl,
  
  #ifdef CONFIG_SMP
         .select_task_rq         = select_task_rq_dl,
@@ -2405,7 +2450,6 @@ const struct sched_class dl_sched_class = {
         .task_woken             = task_woken_dl,
  #endif
  
-       .set_curr_task          = set_curr_task_dl,
         .task_tick              = task_tick_dl,
         .task_fork              = task_fork_dl,
  
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c

index 500f5db0de0ba86a331586d4189e3b299cb6148e..d4bbf68c31611fcd6fa3da456ef435021cefae53 100644 (file)
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -96,12 +96,12 @@ int __weak arch_asym_cpu_priority(int cpu)
  }
  
  /*
- * The margin used when comparing utilization with CPU capacity:
- * util * margin < capacity * 1024
+ * The margin used when comparing utilization with CPU capacity.
   *
   * (default: ~20%)
   */
-static unsigned int capacity_margin                    = 1280;
+#define fits_capacity(cap, max)        ((cap) * 1280 < (max) * 1024)   /* i.e. cap < 1024/1280 (80%) of max */
+
  #endif
  
  #ifdef CONFIG_CFS_BANDWIDTH
@@ -1188,47 +1188,6 @@ static unsigned int task_scan_max(struct task_struct *p)
         return max(smin, smax);
  }
  
-void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
-{
-       int mm_users = 0;
-       struct mm_struct *mm = p->mm;
-
-       if (mm) {
-               mm_users = atomic_read(&mm->mm_users);
-               if (mm_users == 1) {
-                       mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
-                       mm->numa_scan_seq = 0;
-               }
-       }
-       p->node_stamp                   = 0;
-       p->numa_scan_seq                = mm ? mm->numa_scan_seq : 0;
-       p->numa_scan_period             = sysctl_numa_balancing_scan_delay;
-       p->numa_work.next               = &p->numa_work;
-       p->numa_faults                  = NULL;
-       RCU_INIT_POINTER(p->numa_group, NULL);
-       p->last_task_numa_placement     = 0;
-       p->last_sum_exec_runtime        = 0;
-
-       /* New address space, reset the preferred nid */
-       if (!(clone_flags & CLONE_VM)) {
-               p->numa_preferred_nid = NUMA_NO_NODE;
-               return;
-       }
-
-       /*
-        * New thread, keep existing numa_preferred_nid which should be copied
-        * already by arch_dup_task_struct but stagger when scans start.
-        */
-       if (mm) {
-               unsigned int delay;
-
-               delay = min_t(unsigned int, task_scan_max(current),
-                       current->numa_scan_period * mm_users * NSEC_PER_MSEC);
-               delay += 2 * TICK_NSEC;
-               p->node_stamp = delay;
-       }
-}
-
  static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
  {
         rq->nr_numa_running += (p->numa_preferred_nid != NUMA_NO_NODE);
@@ -2523,7 +2482,7 @@ static void reset_ptenuma_scan(struct task_struct *p)
   * The expensive part of numa migration is done from task_work context.
   * Triggered from task_tick_numa().
   */
-void task_numa_work(struct callback_head *work)
+static void task_numa_work(struct callback_head *work)
  {
         unsigned long migrate, next_scan, now = jiffies;
         struct task_struct *p = current;
@@ -2536,7 +2495,7 @@ void task_numa_work(struct callback_head *work)
  
         SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work));
  
-       work->next = work; /* protect against double add */
+       work->next = work;
         /*
          * Who cares about NUMA placement when they're dying.
          *
@@ -2665,6 +2624,50 @@ out:
         }
  }
  
+void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)     /* set up @p's NUMA-balancing fields */
+{
+       int mm_users = 0;               /* stays 0 when @p has no mm */
+       struct mm_struct *mm = p->mm;
+
+       if (mm) {
+               mm_users = atomic_read(&mm->mm_users);
+               if (mm_users == 1) {    /* sole user: restart the mm-wide scan state */
+                       mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
+                       mm->numa_scan_seq = 0;
+               }
+       }
+       p->node_stamp                   = 0;
+       p->numa_scan_seq                = mm ? mm->numa_scan_seq : 0;
+       p->numa_scan_period             = sysctl_numa_balancing_scan_delay;
+       /* Protect against double add, see task_tick_numa and task_numa_work */
+       p->numa_work.next               = &p->numa_work;
+       p->numa_faults                  = NULL;
+       RCU_INIT_POINTER(p->numa_group, NULL);
+       p->last_task_numa_placement     = 0;
+       p->last_sum_exec_runtime        = 0;
+
+       init_task_work(&p->numa_work, task_numa_work);  /* callback set once here; task_tick_numa() only queues the work */
+
+       /* New address space, reset the preferred nid */
+       if (!(clone_flags & CLONE_VM)) {
+               p->numa_preferred_nid = NUMA_NO_NODE;
+               return;
+       }
+
+       /*
+        * New thread, keep existing numa_preferred_nid which should be copied
+        * already by arch_dup_task_struct but stagger when scans start.
+        */
+       if (mm) {
+               unsigned int delay;     /* NOTE(review): min_t() below narrows both operands to unsigned int - confirm no truncation */
+
+               delay = min_t(unsigned int, task_scan_max(current),
+                       current->numa_scan_period * mm_users * NSEC_PER_MSEC);
+               delay += 2 * TICK_NSEC;
+               p->node_stamp = delay;
+       }
+}
+
  /*
   * Drive the periodic memory faults..
   */
@@ -2693,10 +2696,8 @@ static void task_tick_numa(struct rq *rq, struct task_struct *curr)
                         curr->numa_scan_period = task_scan_start(curr);
                 curr->node_stamp += period;
  
-               if (!time_before(jiffies, curr->mm->numa_next_scan)) {
-                       init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
+               if (!time_before(jiffies, curr->mm->numa_next_scan))
                         task_work_add(curr, work, true);
-               }
         }
  }
  
@@ -3689,8 +3690,6 @@ static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
         return cfs_rq->avg.load_avg;
  }
  
-static int idle_balance(struct rq *this_rq, struct rq_flags *rf);
-
  static inline unsigned long task_util(struct task_struct *p)
  {
         return READ_ONCE(p->se.avg.util_avg);
@@ -3807,7 +3806,7 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
  
  static inline int task_fits_capacity(struct task_struct *p, long capacity)
  {
-       return capacity * 1024 > task_util_est(p) * capacity_margin;
+       return fits_capacity(task_util_est(p), capacity);
  }
  
  static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
@@ -4370,8 +4369,6 @@ void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
  
         now = sched_clock_cpu(smp_processor_id());
         cfs_b->runtime = cfs_b->quota;
-       cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
-       cfs_b->expires_seq++;
  }
  
  static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
@@ -4393,8 +4390,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
  {
         struct task_group *tg = cfs_rq->tg;
         struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
-       u64 amount = 0, min_amount, expires;
-       int expires_seq;
+       u64 amount = 0, min_amount;
  
         /* note: this is a positive sum as runtime_remaining <= 0 */
         min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
@@ -4411,61 +4407,17 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
                         cfs_b->idle = 0;
                 }
         }
-       expires_seq = cfs_b->expires_seq;
-       expires = cfs_b->runtime_expires;
         raw_spin_unlock(&cfs_b->lock);
  
         cfs_rq->runtime_remaining += amount;
-       /*
-        * we may have advanced our local expiration to account for allowed
-        * spread between our sched_clock and the one on which runtime was
-        * issued.
-        */
-       if (cfs_rq->expires_seq != expires_seq) {
-               cfs_rq->expires_seq = expires_seq;
-               cfs_rq->runtime_expires = expires;
-       }
  
         return cfs_rq->runtime_remaining > 0;
  }
  
-/*
- * Note: This depends on the synchronization provided by sched_clock and the
- * fact that rq->clock snapshots this value.
- */
-static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
-{
-       struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
-
-       /* if the deadline is ahead of our clock, nothing to do */
-       if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0))
-               return;
-
-       if (cfs_rq->runtime_remaining < 0)
-               return;
-
-       /*
-        * If the local deadline has passed we have to consider the
-        * possibility that our sched_clock is 'fast' and the global deadline
-        * has not truly expired.
-        *
-        * Fortunately we can check determine whether this the case by checking
-        * whether the global deadline(cfs_b->expires_seq) has advanced.
-        */
-       if (cfs_rq->expires_seq == cfs_b->expires_seq) {
-               /* extend local deadline, drift is bounded above by 2 ticks */
-               cfs_rq->runtime_expires += TICK_NSEC;
-       } else {
-               /* global deadline is ahead, expiration has passed */
-               cfs_rq->runtime_remaining = 0;
-       }
-}
-
  static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
  {
         /* dock delta_exec before expiring quota (as it could span periods) */
         cfs_rq->runtime_remaining -= delta_exec;
-       expire_cfs_rq_runtime(cfs_rq);
  
         if (likely(cfs_rq->runtime_remaining > 0))
                 return;
@@ -4556,7 +4508,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
         struct rq *rq = rq_of(cfs_rq);
         struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
         struct sched_entity *se;
-       long task_delta, dequeue = 1;
+       long task_delta, idle_task_delta, dequeue = 1;
         bool empty;
  
         se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
@@ -4567,6 +4519,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
         rcu_read_unlock();
  
         task_delta = cfs_rq->h_nr_running;
+       idle_task_delta = cfs_rq->idle_h_nr_running;
         for_each_sched_entity(se) {
                 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
                 /* throttled entity or throttle-on-deactivate */
@@ -4576,6 +4529,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
                 if (dequeue)
                         dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
                 qcfs_rq->h_nr_running -= task_delta;
+               qcfs_rq->idle_h_nr_running -= idle_task_delta;
  
                 if (qcfs_rq->load.weight)
                         dequeue = 0;
@@ -4615,7 +4569,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
         struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
         struct sched_entity *se;
         int enqueue = 1;
-       long task_delta;
+       long task_delta, idle_task_delta;
  
         se = cfs_rq->tg->se[cpu_of(rq)];
  
@@ -4635,6 +4589,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
                 return;
  
         task_delta = cfs_rq->h_nr_running;
+       idle_task_delta = cfs_rq->idle_h_nr_running;
         for_each_sched_entity(se) {
                 if (se->on_rq)
                         enqueue = 0;
@@ -4643,6 +4598,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
                 if (enqueue)
                         enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
                 cfs_rq->h_nr_running += task_delta;
+               cfs_rq->idle_h_nr_running += idle_task_delta;
  
                 if (cfs_rq_throttled(cfs_rq))
                         break;
@@ -4658,8 +4614,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
                 resched_curr(rq);
  }
  
-static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
-               u64 remaining, u64 expires)
+static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, u64 remaining)
  {
         struct cfs_rq *cfs_rq;
         u64 runtime;
@@ -4684,7 +4639,6 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
                 remaining -= runtime;
  
                 cfs_rq->runtime_remaining += runtime;
-               cfs_rq->runtime_expires = expires;
  
                 /* we check whether we're throttled above */
                 if (cfs_rq->runtime_remaining > 0)
@@ -4709,7 +4663,7 @@ next:
   */
  static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, unsigned long flags)
  {
-       u64 runtime, runtime_expires;
+       u64 runtime;
         int throttled;
  
         /* no need to continue the timer with no bandwidth constraint */
@@ -4737,8 +4691,6 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u
         /* account preceding periods in which throttling occurred */
         cfs_b->nr_throttled += overrun;
  
-       runtime_expires = cfs_b->runtime_expires;
-
         /*
          * This check is repeated as we are holding onto the new bandwidth while
          * we unthrottle. This can potentially race with an unthrottled group
@@ -4751,8 +4703,7 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u
                 cfs_b->distribute_running = 1;
                 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
                 /* we can't nest cfs_b->lock while distributing bandwidth */
-               runtime = distribute_cfs_runtime(cfs_b, runtime,
-                                                runtime_expires);
+               runtime = distribute_cfs_runtime(cfs_b, runtime);
                 raw_spin_lock_irqsave(&cfs_b->lock, flags);
  
                 cfs_b->distribute_running = 0;
@@ -4834,8 +4785,7 @@ static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
                 return;
  
         raw_spin_lock(&cfs_b->lock);
-       if (cfs_b->quota != RUNTIME_INF &&
-           cfs_rq->runtime_expires == cfs_b->runtime_expires) {
+       if (cfs_b->quota != RUNTIME_INF) {
                 cfs_b->runtime += slack_runtime;
  
                 /* we are under rq->lock, defer unthrottling using a timer */
@@ -4868,7 +4818,6 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
  {
         u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
         unsigned long flags;
-       u64 expires;
  
         /* confirm we're still not at a refresh boundary */
         raw_spin_lock_irqsave(&cfs_b->lock, flags);
@@ -4886,7 +4835,6 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
         if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
                 runtime = cfs_b->runtime;
  
-       expires = cfs_b->runtime_expires;
         if (runtime)
                 cfs_b->distribute_running = 1;
  
@@ -4895,11 +4843,10 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
         if (!runtime)
                 return;
  
-       runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
+       runtime = distribute_cfs_runtime(cfs_b, runtime);
  
         raw_spin_lock_irqsave(&cfs_b->lock, flags);
-       if (expires == cfs_b->runtime_expires)
-               lsub_positive(&cfs_b->runtime, runtime);
+       lsub_positive(&cfs_b->runtime, runtime);
         cfs_b->distribute_running = 0;
         raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
  }
@@ -5056,8 +5003,6 @@ void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
  
         cfs_b->period_active = 1;
         overrun = hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
-       cfs_b->runtime_expires += (overrun + 1) * ktime_to_ns(cfs_b->period);
-       cfs_b->expires_seq++;
         hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
  }
  
@@ -5235,7 +5180,7 @@ static inline unsigned long cpu_util(int cpu);
  
  static inline bool cpu_overutilized(int cpu)
  {
-       return (capacity_of(cpu) * 1024) < (cpu_util(cpu) * capacity_margin);
+       return !fits_capacity(cpu_util(cpu), capacity_of(cpu));
  }
  
  static inline void update_overutilized_status(struct rq *rq)
@@ -5259,6 +5204,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
  {
         struct cfs_rq *cfs_rq;
         struct sched_entity *se = &p->se;
+       int idle_h_nr_running = task_has_idle_policy(p);
  
         /*
          * The code below (indirectly) updates schedutil which looks at
@@ -5291,6 +5237,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
                 if (cfs_rq_throttled(cfs_rq))
                         break;
                 cfs_rq->h_nr_running++;
+               cfs_rq->idle_h_nr_running += idle_h_nr_running;
  
                 flags = ENQUEUE_WAKEUP;
         }
@@ -5298,6 +5245,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
         for_each_sched_entity(se) {
                 cfs_rq = cfs_rq_of(se);
                 cfs_rq->h_nr_running++;
+               cfs_rq->idle_h_nr_running += idle_h_nr_running;
  
                 if (cfs_rq_throttled(cfs_rq))
                         break;
@@ -5359,6 +5307,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
         struct cfs_rq *cfs_rq;
         struct sched_entity *se = &p->se;
         int task_sleep = flags & DEQUEUE_SLEEP;
+       int idle_h_nr_running = task_has_idle_policy(p);
  
         for_each_sched_entity(se) {
                 cfs_rq = cfs_rq_of(se);
@@ -5373,6 +5322,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
                 if (cfs_rq_throttled(cfs_rq))
                         break;
                 cfs_rq->h_nr_running--;
+               cfs_rq->idle_h_nr_running -= idle_h_nr_running;
  
                 /* Don't dequeue parent if it has other entities besides us */
                 if (cfs_rq->load.weight) {
@@ -5392,6 +5342,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
         for_each_sched_entity(se) {
                 cfs_rq = cfs_rq_of(se);
                 cfs_rq->h_nr_running--;
+               cfs_rq->idle_h_nr_running -= idle_h_nr_running;
  
                 if (cfs_rq_throttled(cfs_rq))
                         break;
@@ -5425,6 +5376,15 @@ static struct {
  
  #endif /* CONFIG_NO_HZ_COMMON */
  
+/* CPU only has SCHED_IDLE tasks enqueued; returns 0 when nothing is enqueued at all */
+static int sched_idle_cpu(int cpu)
+{
+       struct rq *rq = cpu_rq(cpu);
+
+       return unlikely(rq->nr_running == rq->cfs.idle_h_nr_running &&  /* every runnable task is counted as idle-policy CFS */
+                       rq->nr_running);                                /* ...and the count is non-zero */
+}
+
  static unsigned long cpu_runnable_load(struct rq *rq)
  {
         return cfs_rq_runnable_load_avg(&rq->cfs);
@@ -5747,7 +5707,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
         unsigned int min_exit_latency = UINT_MAX;
         u64 latest_idle_timestamp = 0;
         int least_loaded_cpu = this_cpu;
-       int shallowest_idle_cpu = -1;
+       int shallowest_idle_cpu = -1, si_cpu = -1;
         int i;
  
         /* Check if we have any choice: */
@@ -5778,7 +5738,12 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
                                 latest_idle_timestamp = rq->idle_stamp;
                                 shallowest_idle_cpu = i;
                         }
-               } else if (shallowest_idle_cpu == -1) {
+               } else if (shallowest_idle_cpu == -1 && si_cpu == -1) {
+                       if (sched_idle_cpu(i)) {
+                               si_cpu = i;
+                               continue;
+                       }
+
                         load = cpu_runnable_load(cpu_rq(i));
                         if (load < min_load) {
                                 min_load = load;
@@ -5787,7 +5752,11 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
                 }
         }
  
-       return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
+       if (shallowest_idle_cpu != -1)
+               return shallowest_idle_cpu;
+       if (si_cpu != -1)
+               return si_cpu;
+       return least_loaded_cpu;
  }
  
  static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p,
@@ -5940,7 +5909,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
   */
  static int select_idle_smt(struct task_struct *p, int target)
  {
-       int cpu;
+       int cpu, si_cpu = -1;
  
         if (!static_branch_likely(&sched_smt_present))
                 return -1;
@@ -5950,9 +5919,11 @@ static int select_idle_smt(struct task_struct *p, int target)
                         continue;
                 if (available_idle_cpu(cpu))
                         return cpu;
+               if (si_cpu == -1 && sched_idle_cpu(cpu))
+                       si_cpu = cpu;
         }
  
-       return -1;
+       return si_cpu;
  }
  
  #else /* CONFIG_SCHED_SMT */
@@ -5980,8 +5951,8 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
         u64 avg_cost, avg_idle;
         u64 time, cost;
         s64 delta;
-       int cpu, nr = INT_MAX;
         int this = smp_processor_id();
+       int cpu, nr = INT_MAX, si_cpu = -1;
  
         this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
         if (!this_sd)
@@ -6009,11 +5980,13 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
  
         for_each_cpu_wrap(cpu, sched_domain_span(sd), target) {
                 if (!--nr)
-                       return -1;
+                       return si_cpu;
                 if (!cpumask_test_cpu(cpu, p->cpus_ptr))
                         continue;
                 if (available_idle_cpu(cpu))
                         break;
+               if (si_cpu == -1 && sched_idle_cpu(cpu))
+                       si_cpu = cpu;
         }
  
         time = cpu_clock(this) - time;
@@ -6032,13 +6005,14 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
         struct sched_domain *sd;
         int i, recent_used_cpu;
  
-       if (available_idle_cpu(target))
+       if (available_idle_cpu(target) || sched_idle_cpu(target))
                 return target;
  
         /*
          * If the previous CPU is cache affine and idle, don't be stupid:
          */
-       if (prev != target && cpus_share_cache(prev, target) && available_idle_cpu(prev))
+       if (prev != target && cpus_share_cache(prev, target) &&
+           (available_idle_cpu(prev) || sched_idle_cpu(prev)))
                 return prev;
  
         /* Check a recently used CPU as a potential idle candidate: */
@@ -6046,7 +6020,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
         if (recent_used_cpu != prev &&
             recent_used_cpu != target &&
             cpus_share_cache(recent_used_cpu, target) &&
-           available_idle_cpu(recent_used_cpu) &&
+           (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
             cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr)) {
                 /*
                  * Replace recent_used_cpu with prev as it is a potential
@@ -6282,69 +6256,55 @@ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
  }
  
  /*
- * compute_energy(): Estimates the energy that would be consumed if @p was
+ * compute_energy(): Estimates the energy that @pd would consume if @p was
   * migrated to @dst_cpu. compute_energy() predicts what will be the utilization
- * landscape of the * CPUs after the task migration, and uses the Energy Model
+ * landscape of @pd's CPUs after the task migration, and uses the Energy Model
   * to compute what would be the energy if we decided to actually migrate that
   * task.
   */
  static long
  compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
  {
-       unsigned int max_util, util_cfs, cpu_util, cpu_cap;
-       unsigned long sum_util, energy = 0;
-       struct task_struct *tsk;
+       struct cpumask *pd_mask = perf_domain_span(pd);
+       unsigned long cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask));
+       unsigned long max_util = 0, sum_util = 0;
         int cpu;
  
-       for (; pd; pd = pd->next) {
-               struct cpumask *pd_mask = perf_domain_span(pd);
+       /*
+        * The capacity state of CPUs of the current rd can be driven by CPUs
+        * of another rd if they belong to the same pd. So, account for the
+        * utilization of these CPUs too by masking pd with cpu_online_mask
+        * instead of the rd span.
+        *
+        * If an entire pd is outside of the current rd, it will not appear in
+        * its pd list and will not be accounted by compute_energy().
+        */
+       for_each_cpu_and(cpu, pd_mask, cpu_online_mask) {
+               unsigned long cpu_util, util_cfs = cpu_util_next(cpu, p, dst_cpu);
+               struct task_struct *tsk = cpu == dst_cpu ? p : NULL;
  
                 /*
-                * The energy model mandates all the CPUs of a performance
-                * domain have the same capacity.
+                * Busy time computation: utilization clamping is not
+                * required since the ratio (sum_util / cpu_capacity)
+                * is already enough to scale the EM reported power
+                * consumption at the (eventually clamped) cpu_capacity.
                  */
-               cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask));
-               max_util = sum_util = 0;
+               sum_util += schedutil_cpu_util(cpu, util_cfs, cpu_cap,
+                                              ENERGY_UTIL, NULL);
  
                 /*
-                * The capacity state of CPUs of the current rd can be driven by
-                * CPUs of another rd if they belong to the same performance
-                * domain. So, account for the utilization of these CPUs too
-                * by masking pd with cpu_online_mask instead of the rd span.
-                *
-                * If an entire performance domain is outside of the current rd,
-                * it will not appear in its pd list and will not be accounted
-                * by compute_energy().
+                * Performance domain frequency: utilization clamping
+                * must be considered since it affects the selection
+                * of the performance domain frequency.
+                * NOTE: in case RT tasks are running, by default the
+                * FREQUENCY_UTIL's utilization can be max OPP.
                  */
-               for_each_cpu_and(cpu, pd_mask, cpu_online_mask) {
-                       util_cfs = cpu_util_next(cpu, p, dst_cpu);
-
-                       /*
-                        * Busy time computation: utilization clamping is not
-                        * required since the ratio (sum_util / cpu_capacity)
-                        * is already enough to scale the EM reported power
-                        * consumption at the (eventually clamped) cpu_capacity.
-                        */
-                       sum_util += schedutil_cpu_util(cpu, util_cfs, cpu_cap,
-                                                      ENERGY_UTIL, NULL);
-
-                       /*
-                        * Performance domain frequency: utilization clamping
-                        * must be considered since it affects the selection
-                        * of the performance domain frequency.
-                        * NOTE: in case RT tasks are running, by default the
-                        * FREQUENCY_UTIL's utilization can be max OPP.
-                        */
-                       tsk = cpu == dst_cpu ? p : NULL;
-                       cpu_util = schedutil_cpu_util(cpu, util_cfs, cpu_cap,
-                                                     FREQUENCY_UTIL, tsk);
-                       max_util = max(max_util, cpu_util);
-               }
-
-               energy += em_pd_energy(pd->em_pd, max_util, sum_util);
+               cpu_util = schedutil_cpu_util(cpu, util_cfs, cpu_cap,
+                                             FREQUENCY_UTIL, tsk);
+               max_util = max(max_util, cpu_util);
         }
  
-       return energy;
+       return em_pd_energy(pd->em_pd, max_util, sum_util);
  }
  
  /*
@@ -6386,21 +6346,19 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
   * other use-cases too. So, until someone finds a better way to solve this,
   * let's keep things simple by re-using the existing slow path.
   */
-
  static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
  {
-       unsigned long prev_energy = ULONG_MAX, best_energy = ULONG_MAX;
+       unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX;
         struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
+       unsigned long cpu_cap, util, base_energy = 0;
         int cpu, best_energy_cpu = prev_cpu;
-       struct perf_domain *head, *pd;
-       unsigned long cpu_cap, util;
         struct sched_domain *sd;
+       struct perf_domain *pd;
  
         rcu_read_lock();
         pd = rcu_dereference(rd->pd);
         if (!pd || READ_ONCE(rd->overutilized))
                 goto fail;
-       head = pd;
  
         /*
          * Energy-aware wake-up happens on the lowest sched_domain starting
@@ -6417,9 +6375,14 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
                 goto unlock;
  
         for (; pd; pd = pd->next) {
-               unsigned long cur_energy, spare_cap, max_spare_cap = 0;
+               unsigned long cur_delta, spare_cap, max_spare_cap = 0;
+               unsigned long base_energy_pd;
                 int max_spare_cap_cpu = -1;
  
+               /* Compute the 'base' energy of the pd, without @p */
+               base_energy_pd = compute_energy(p, -1, pd);
+               base_energy += base_energy_pd;
+
                 for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) {
                         if (!cpumask_test_cpu(cpu, p->cpus_ptr))
                                 continue;
@@ -6427,14 +6390,14 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
                         /* Skip CPUs that will be overutilized. */
                         util = cpu_util_next(cpu, p, cpu);
                         cpu_cap = capacity_of(cpu);
-                       if (cpu_cap * 1024 < util * capacity_margin)
+                       if (!fits_capacity(util, cpu_cap))
                                 continue;
  
                         /* Always use prev_cpu as a candidate. */
                         if (cpu == prev_cpu) {
-                               prev_energy = compute_energy(p, prev_cpu, head);
-                               best_energy = min(best_energy, prev_energy);
-                               continue;
+                               prev_delta = compute_energy(p, prev_cpu, pd);
+                               prev_delta -= base_energy_pd;
+                               best_delta = min(best_delta, prev_delta);
                         }
  
                         /*
@@ -6450,9 +6413,10 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
  
                 /* Evaluate the energy impact of using this CPU. */
                 if (max_spare_cap_cpu >= 0) {
-                       cur_energy = compute_energy(p, max_spare_cap_cpu, head);
-                       if (cur_energy < best_energy) {
-                               best_energy = cur_energy;
+                       cur_delta = compute_energy(p, max_spare_cap_cpu, pd);
+                       cur_delta -= base_energy_pd;
+                       if (cur_delta < best_delta) {
+                               best_delta = cur_delta;
                                 best_energy_cpu = max_spare_cap_cpu;
                         }
                 }
@@ -6464,10 +6428,10 @@ unlock:
          * Pick the best CPU if prev_cpu cannot be used, or if it saves at
          * least 6% of the energy used by prev_cpu.
          */
-       if (prev_energy == ULONG_MAX)
+       if (prev_delta == ULONG_MAX)
                 return best_energy_cpu;
  
-       if ((prev_energy - best_energy) > (prev_energy >> 4))
+       if ((prev_delta - best_delta) > ((prev_delta + base_energy) >> 4))
                 return best_energy_cpu;
  
         return prev_cpu;
@@ -6801,7 +6765,7 @@ again:
                 goto idle;
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
-       if (prev->sched_class != &fair_sched_class)
+       if (!prev || prev->sched_class != &fair_sched_class)
                 goto simple;
  
         /*
@@ -6878,8 +6842,8 @@ again:
         goto done;
  simple:
  #endif
-
-       put_prev_task(rq, prev);
+       if (prev)
+               put_prev_task(rq, prev);
  
         do {
                 se = pick_next_entity(cfs_rq, NULL);
@@ -6907,11 +6871,13 @@ done: __maybe_unused;
         return p;
  
  idle:
-       update_misfit_status(NULL, rq);
-       new_tasks = idle_balance(rq, rf);
+       if (!rf)
+               return NULL;
+
+       new_tasks = newidle_balance(rq, rf);
  
         /*
-        * Because idle_balance() releases (and re-acquires) rq->lock, it is
+        * Because newidle_balance() releases (and re-acquires) rq->lock, it is
          * possible for any higher priority task to appear. In that case we
          * must re-start the pick_next_entity() loop.
          */
@@ -6933,7 +6899,7 @@ idle:
  /*
   * Account for a descheduled task:
   */
-static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
+static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
  {
         struct sched_entity *se = &prev->se;
         struct cfs_rq *cfs_rq;
@@ -7435,7 +7401,7 @@ static int detach_tasks(struct lb_env *env)
                 detached++;
                 env->imbalance -= load;
  
-#ifdef CONFIG_PREEMPT
+#ifdef CONFIG_PREEMPTION
                 /*
                  * NEWIDLE balancing is a source of latency, so preemptible
                  * kernels will stop after the first task is detached to minimize
@@ -7982,8 +7948,7 @@ group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
  static inline bool
  group_smaller_min_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
  {
-       return sg->sgc->min_capacity * capacity_margin <
-                                               ref->sgc->min_capacity * 1024;
+       return fits_capacity(sg->sgc->min_capacity, ref->sgc->min_capacity);
  }
  
  /*
@@ -7993,8 +7958,7 @@ group_smaller_min_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
  static inline bool
  group_smaller_max_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
  {
-       return sg->sgc->max_capacity * capacity_margin <
-                                               ref->sgc->max_capacity * 1024;
+       return fits_capacity(sg->sgc->max_capacity, ref->sgc->max_capacity);
  }
  
  static inline enum
@@ -9052,9 +9016,10 @@ more_balance:
  out_balanced:
         /*
          * We reach balance although we may have faced some affinity
-        * constraints. Clear the imbalance flag if it was set.
+        * constraints. Clear the imbalance flag only if other tasks got
+        * a chance to move and fix the imbalance.
          */
-       if (sd_parent) {
+       if (sd_parent && !(env.flags & LBF_ALL_PINNED)) {
                 int *group_imbalance = &sd_parent->groups->sgc->imbalance;
  
                 if (*group_imbalance)
@@ -9075,10 +9040,10 @@ out_one_pinned:
         ld_moved = 0;
  
         /*
-        * idle_balance() disregards balance intervals, so we could repeatedly
-        * reach this code, which would lead to balance_interval skyrocketting
-        * in a short amount of time. Skip the balance_interval increase logic
-        * to avoid that.
+        * newidle_balance() disregards balance intervals, so we could
+        * repeatedly reach this code, which would lead to balance_interval
+        * skyrocketing in a short amount of time. Skip the balance_interval
+        * increase logic to avoid that.
          */
         if (env.idle == CPU_NEWLY_IDLE)
                 goto out;
@@ -9788,7 +9753,7 @@ static inline void nohz_newidle_balance(struct rq *this_rq) { }
   * idle_balance is called by schedule() if this_cpu is about to become
   * idle. Attempts to pull tasks from other CPUs.
   */
-static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
+int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
  {
         unsigned long next_balance = jiffies + HZ;
         int this_cpu = this_rq->cpu;
@@ -9796,6 +9761,7 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
         int pulled_task = 0;
         u64 curr_cost = 0;
  
+       update_misfit_status(NULL, this_rq);
         /*
          * We must set idle_stamp _before_ calling idle_balance(), such that we
          * measure the duration of idle_balance() as idle time.
@@ -10180,9 +10146,19 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p)
   * This routine is mostly called to set cfs_rq->curr field when a task
   * migrates between groups/classes.
   */
-static void set_curr_task_fair(struct rq *rq)
+static void set_next_task_fair(struct rq *rq, struct task_struct *p)
  {
-       struct sched_entity *se = &rq->curr->se;
+       struct sched_entity *se = &p->se;
+
+#ifdef CONFIG_SMP
+       if (task_on_rq_queued(p)) {
+               /*
+                * Move the next running task to the front of the list, so our
+                * cfs_tasks list becomes MRU one.
+                */
+               list_move(&se->group_node, &rq->cfs_tasks);
+       }
+#endif
  
         for_each_sched_entity(se) {
                 struct cfs_rq *cfs_rq = cfs_rq_of(se);
@@ -10300,18 +10276,18 @@ err:
  void online_fair_sched_group(struct task_group *tg)
  {
         struct sched_entity *se;
+       struct rq_flags rf;
         struct rq *rq;
         int i;
  
         for_each_possible_cpu(i) {
                 rq = cpu_rq(i);
                 se = tg->se[i];
-
-               raw_spin_lock_irq(&rq->lock);
+               rq_lock_irq(rq, &rf);
                 update_rq_clock(rq);
                 attach_entity_cfs_rq(se);
                 sync_throttle(tg, i);
-               raw_spin_unlock_irq(&rq->lock);
+               rq_unlock_irq(rq, &rf);
         }
  }
  
@@ -10453,7 +10429,9 @@ const struct sched_class fair_sched_class = {
         .check_preempt_curr     = check_preempt_wakeup,
  
         .pick_next_task         = pick_next_task_fair,
+
         .put_prev_task          = put_prev_task_fair,
+       .set_next_task          = set_next_task_fair,
  
  #ifdef CONFIG_SMP
         .select_task_rq         = select_task_rq_fair,
@@ -10466,7 +10444,6 @@ const struct sched_class fair_sched_class = {
         .set_cpus_allowed       = set_cpus_allowed_common,
  #endif
  
-       .set_curr_task          = set_curr_task_fair,
         .task_tick              = task_tick_fair,
         .task_fork              = task_fork_fair,
  
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c

index e4bc4aa739b830c5236cf84445b6278aa3c0470b..8bfeb6395bddb9f5c3ccc1267475b55ae7c3d086 100644 (file)
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -375,14 +375,27 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
         resched_curr(rq);
  }
  
-static struct task_struct *
-pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+{
+}
+
+static void set_next_task_idle(struct rq *rq, struct task_struct *next)
  {
-       put_prev_task(rq, prev);
         update_idle_core(rq);
         schedstat_inc(rq->sched_goidle);
+}
+
+static struct task_struct *
+pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+{
+       struct task_struct *next = rq->idle;
+
+       if (prev)
+               put_prev_task(rq, prev);
+
+       set_next_task_idle(rq, next);
  
-       return rq->idle;
+       return next;
  }
  
  /*
@@ -398,10 +411,6 @@ dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
         raw_spin_lock_irq(&rq->lock);
  }
  
-static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
-{
-}
-
  /*
   * scheduler tick hitting a task of our scheduling class.
   *
@@ -414,10 +423,6 @@ static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
  {
  }
  
-static void set_curr_task_idle(struct rq *rq)
-{
-}
-
  static void switched_to_idle(struct rq *rq, struct task_struct *p)
  {
         BUG();
@@ -452,13 +457,13 @@ const struct sched_class idle_sched_class = {
  
         .pick_next_task         = pick_next_task_idle,
         .put_prev_task          = put_prev_task_idle,
+       .set_next_task          = set_next_task_idle,
  
  #ifdef CONFIG_SMP
         .select_task_rq         = select_task_rq_idle,
         .set_cpus_allowed       = set_cpus_allowed_common,
  #endif
  
-       .set_curr_task          = set_curr_task_idle,
         .task_tick              = task_tick_idle,
  
         .get_rr_interval        = get_rr_interval_idle,
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c

index ccb28085b11418f539766b96d32901bf05e0a7f6..9fcb2a695a41289fe490c64a0656300a74fefb99 100644 (file)
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -22,9 +22,17 @@ EXPORT_SYMBOL_GPL(housekeeping_enabled);
  
  int housekeeping_any_cpu(enum hk_flags flags)
  {
-       if (static_branch_unlikely(&housekeeping_overridden))
-               if (housekeeping_flags & flags)
+       int cpu;
+
+       if (static_branch_unlikely(&housekeeping_overridden)) {
+               if (housekeeping_flags & flags) {
+                       cpu = sched_numa_find_closest(housekeeping_mask, smp_processor_id());
+                       if (cpu < nr_cpu_ids)
+                               return cpu;
+
                         return cpumask_any_and(housekeeping_mask, cpu_online_mask);
+               }
+       }
         return smp_processor_id();
  }
  EXPORT_SYMBOL_GPL(housekeeping_any_cpu);
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c

index 6e52b67b420e7a3312f463f8d3bb6baad6576041..517e3719027e619e5c7b565d1de9294dfffb5a3c 100644 (file)
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -1198,7 +1198,7 @@ static ssize_t psi_write(struct file *file, const char __user *user_buf,
         if (static_branch_likely(&psi_disabled))
                 return -EOPNOTSUPP;
  
-       buf_size = min(nbytes, (sizeof(buf) - 1));
+       buf_size = min(nbytes, sizeof(buf));
         if (copy_from_user(buf, user_buf, buf_size))
                 return -EFAULT;
  
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c

index a532558a51768de0ec30f9492764fca7292c5058..858c4cc6f99bccd888b4388c87c217052b33560a 100644 (file)
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1498,12 +1498,22 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flag
  #endif
  }
  
-static inline void set_next_task(struct rq *rq, struct task_struct *p)
+static inline void set_next_task_rt(struct rq *rq, struct task_struct *p)
  {
         p->se.exec_start = rq_clock_task(rq);
  
         /* The running task is never eligible for pushing */
         dequeue_pushable_task(rq, p);
+
+       /*
+        * If prev task was rt, put_prev_task() has already updated the
+        * utilization. We only care about the case where we start to
+        * schedule an rt task.
+        */
+       if (rq->curr->sched_class != &rt_sched_class)
+               update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0);
+
+       rt_queue_push_tasks(rq);
  }
  
  static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
@@ -1543,56 +1553,19 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
         struct task_struct *p;
         struct rt_rq *rt_rq = &rq->rt;
  
-       if (need_pull_rt_task(rq, prev)) {
-               /*
-                * This is OK, because current is on_cpu, which avoids it being
-                * picked for load-balance and preemption/IRQs are still
-                * disabled avoiding further scheduler activity on it and we're
-                * being very careful to re-start the picking loop.
-                */
-               rq_unpin_lock(rq, rf);
-               pull_rt_task(rq);
-               rq_repin_lock(rq, rf);
-               /*
-                * pull_rt_task() can drop (and re-acquire) rq->lock; this
-                * means a dl or stop task can slip in, in which case we need
-                * to re-start task selection.
-                */
-               if (unlikely((rq->stop && task_on_rq_queued(rq->stop)) ||
-                            rq->dl.dl_nr_running))
-                       return RETRY_TASK;
-       }
-
-       /*
-        * We may dequeue prev's rt_rq in put_prev_task().
-        * So, we update time before rt_queued check.
-        */
-       if (prev->sched_class == &rt_sched_class)
-               update_curr_rt(rq);
+       WARN_ON_ONCE(prev || rf);
  
         if (!rt_rq->rt_queued)
                 return NULL;
  
-       put_prev_task(rq, prev);
-
         p = _pick_next_task_rt(rq);
  
-       set_next_task(rq, p);
-
-       rt_queue_push_tasks(rq);
-
-       /*
-        * If prev task was rt, put_prev_task() has already updated the
-        * utilization. We only care of the case where we start to schedule a
-        * rt task
-        */
-       if (rq->curr->sched_class != &rt_sched_class)
-               update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0);
+       set_next_task_rt(rq, p);
  
         return p;
  }
  
-static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
+static void put_prev_task_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
  {
         update_curr_rt(rq);
  
@@ -1604,6 +1577,18 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
          */
         if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1)
                 enqueue_pushable_task(rq, p);
+
+       if (rf && !on_rt_rq(&p->rt) && need_pull_rt_task(rq, p)) {
+               /*
+                * This is OK, because current is on_cpu, which avoids it being
+                * picked for load-balance and preemption/IRQs are still
+                * disabled avoiding further scheduler activity on it and we've
+                * not yet started the picking loop.
+                */
+               rq_unpin_lock(rq, rf);
+               pull_rt_task(rq);
+               rq_repin_lock(rq, rf);
+       }
  }
  
  #ifdef CONFIG_SMP
@@ -2354,11 +2339,6 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
         }
  }
  
-static void set_curr_task_rt(struct rq *rq)
-{
-       set_next_task(rq, rq->curr);
-}
-
  static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
  {
         /*
@@ -2380,6 +2360,7 @@ const struct sched_class rt_sched_class = {
  
         .pick_next_task         = pick_next_task_rt,
         .put_prev_task          = put_prev_task_rt,
+       .set_next_task          = set_next_task_rt,
  
  #ifdef CONFIG_SMP
         .select_task_rq         = select_task_rq_rt,
@@ -2391,7 +2372,6 @@ const struct sched_class rt_sched_class = {
         .switched_from          = switched_from_rt,
  #endif
  
-       .set_curr_task          = set_curr_task_rt,
         .task_tick              = task_tick_rt,
  
         .get_rr_interval        = get_rr_interval_rt,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h

index 802b1f3405f247897571aa041035cac97b4d7ddd..b3cb895d14a2088eef7a8853f00e7fc9b5823dd6 100644 (file)
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -335,8 +335,6 @@ struct cfs_bandwidth {
         u64                     quota;
         u64                     runtime;
         s64                     hierarchical_quota;
-       u64                     runtime_expires;
-       int                     expires_seq;
  
         u8                      idle;
         u8                      period_active;
@@ -393,6 +391,16 @@ struct task_group {
  #endif
  
         struct cfs_bandwidth    cfs_bandwidth;
+
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+       /* The two decimal precision [%] value requested from user-space */
+       unsigned int            uclamp_pct[UCLAMP_CNT];
+       /* Clamp values requested for a task group */
+       struct uclamp_se        uclamp_req[UCLAMP_CNT];
+       /* Effective clamp values used for a task group */
+       struct uclamp_se        uclamp[UCLAMP_CNT];
+#endif
+
  };
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -483,7 +491,8 @@ struct cfs_rq {
         struct load_weight      load;
         unsigned long           runnable_weight;
         unsigned int            nr_running;
-       unsigned int            h_nr_running;
+       unsigned int            h_nr_running;      /* SCHED_{NORMAL,BATCH,IDLE} */
+       unsigned int            idle_h_nr_running; /* SCHED_IDLE */
  
         u64                     exec_clock;
         u64                     min_vruntime;
@@ -556,8 +565,6 @@ struct cfs_rq {
  
  #ifdef CONFIG_CFS_BANDWIDTH
         int                     runtime_enabled;
-       int                     expires_seq;
-       u64                     runtime_expires;
         s64                     runtime_remaining;
  
         u64                     throttled_clock;
@@ -777,9 +784,6 @@ struct root_domain {
         struct perf_domain __rcu *pd;
  };
  
-extern struct root_domain def_root_domain;
-extern struct mutex sched_domains_mutex;
-
  extern void init_defrootdomain(void);
  extern int sched_init_domains(const struct cpumask *cpu_map);
  extern void rq_attach_root(struct rq *rq, struct root_domain *rd);
@@ -1261,16 +1265,18 @@ enum numa_topology_type {
  extern enum numa_topology_type sched_numa_topology_type;
  extern int sched_max_numa_distance;
  extern bool find_numa_distance(int distance);
-#endif
-
-#ifdef CONFIG_NUMA
  extern void sched_init_numa(void);
  extern void sched_domains_numa_masks_set(unsigned int cpu);
  extern void sched_domains_numa_masks_clear(unsigned int cpu);
+extern int sched_numa_find_closest(const struct cpumask *cpus, int cpu);
  #else
  static inline void sched_init_numa(void) { }
  static inline void sched_domains_numa_masks_set(unsigned int cpu) { }
  static inline void sched_domains_numa_masks_clear(unsigned int cpu) { }
+static inline int sched_numa_find_closest(const struct cpumask *cpus, int cpu)
+{
+       return nr_cpu_ids;
+}
  #endif
  
  #ifdef CONFIG_NUMA_BALANCING
@@ -1449,10 +1455,14 @@ static inline void unregister_sched_domain_sysctl(void)
  }
  #endif
  
+extern int newidle_balance(struct rq *this_rq, struct rq_flags *rf);
+
  #else
  
  static inline void sched_ttwu_pending(void) { }
  
+static inline int newidle_balance(struct rq *this_rq, struct rq_flags *rf) { return 0; }
+
  #endif /* CONFIG_SMP */
  
  #include "stats.h"
@@ -1700,17 +1710,21 @@ struct sched_class {
         void (*check_preempt_curr)(struct rq *rq, struct task_struct *p, int flags);
  
         /*
-        * It is the responsibility of the pick_next_task() method that will
-        * return the next task to call put_prev_task() on the @prev task or
-        * something equivalent.
+        * Both @prev and @rf are optional and may be NULL, in which case the
+        * caller must already have invoked put_prev_task(rq, prev, rf).
+        *
+        * Otherwise it is the responsibility of the pick_next_task() to call
+        * put_prev_task() on the @prev task or something equivalent, IFF it
+        * returns a next task.
          *
-        * May return RETRY_TASK when it finds a higher prio class has runnable
-        * tasks.
+        * In that case (@rf != NULL) it may return RETRY_TASK when it finds a
+        * higher prio class has runnable tasks.
          */
         struct task_struct * (*pick_next_task)(struct rq *rq,
                                                struct task_struct *prev,
                                                struct rq_flags *rf);
-       void (*put_prev_task)(struct rq *rq, struct task_struct *p);
+       void (*put_prev_task)(struct rq *rq, struct task_struct *p, struct rq_flags *rf);
+       void (*set_next_task)(struct rq *rq, struct task_struct *p);
  
  #ifdef CONFIG_SMP
         int  (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
@@ -1725,7 +1739,6 @@ struct sched_class {
         void (*rq_offline)(struct rq *rq);
  #endif
  
-       void (*set_curr_task)(struct rq *rq);
         void (*task_tick)(struct rq *rq, struct task_struct *p, int queued);
         void (*task_fork)(struct task_struct *p);
         void (*task_dead)(struct task_struct *p);
@@ -1755,12 +1768,14 @@ struct sched_class {
  
  static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
  {
-       prev->sched_class->put_prev_task(rq, prev);
+       WARN_ON_ONCE(rq->curr != prev);
+       prev->sched_class->put_prev_task(rq, prev, NULL);
  }
  
-static inline void set_curr_task(struct rq *rq, struct task_struct *curr)
+static inline void set_next_task(struct rq *rq, struct task_struct *next)
  {
-       curr->sched_class->set_curr_task(rq);
+       WARN_ON_ONCE(rq->curr != next);
+       next->sched_class->set_next_task(rq, next);
  }
  
  #ifdef CONFIG_SMP
@@ -1943,7 +1958,7 @@ unsigned long arch_scale_freq_capacity(int cpu)
  #endif
  
  #ifdef CONFIG_SMP
-#ifdef CONFIG_PREEMPT
+#ifdef CONFIG_PREEMPTION
  
  static inline void double_rq_lock(struct rq *rq1, struct rq *rq2);
  
@@ -1995,7 +2010,7 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
         return ret;
  }
  
-#endif /* CONFIG_PREEMPT */
+#endif /* CONFIG_PREEMPTION */
  
  /*
   * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
@@ -2266,7 +2281,7 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
  #endif /* CONFIG_CPU_FREQ */
  
  #ifdef CONFIG_UCLAMP_TASK
-unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id);
+enum uclamp_id uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id);
  
  static __always_inline
  unsigned int uclamp_util_with(struct rq *rq, unsigned int util,
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h

index aa0de240fb419c966281f7fa5972f4d76b435615..ba683fe81a6eb1d9d24f7897891f229f7073d23f 100644 (file)
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -157,9 +157,10 @@ static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t)
  {
         unsigned long long now = rq_clock(rq), delta = 0;
  
-       if (unlikely(sched_info_on()))
+       if (sched_info_on()) {
                 if (t->sched_info.last_queued)
                         delta = now - t->sched_info.last_queued;
+       }
         sched_info_reset_dequeued(t);
         t->sched_info.run_delay += delta;
  
@@ -192,7 +193,7 @@ static void sched_info_arrive(struct rq *rq, struct task_struct *t)
   */
  static inline void sched_info_queued(struct rq *rq, struct task_struct *t)
  {
-       if (unlikely(sched_info_on())) {
+       if (sched_info_on()) {
                 if (!t->sched_info.last_queued)
                         t->sched_info.last_queued = rq_clock(rq);
         }
@@ -239,7 +240,7 @@ __sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct
  static inline void
  sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next)
  {
-       if (unlikely(sched_info_on()))
+       if (sched_info_on())
                 __sched_info_switch(rq, prev, next);
  }
  
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c

index c183b790ca54a90151d60b4cf1866143b4f3cfb8..7e1cee4e65b211ece98bd640bf691a29de329ed1 100644 (file)
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -23,17 +23,22 @@ check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags)
         /* we're never preempted */
  }
  
+static void set_next_task_stop(struct rq *rq, struct task_struct *stop)
+{
+       stop->se.exec_start = rq_clock_task(rq);
+}
+
  static struct task_struct *
  pick_next_task_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
  {
         struct task_struct *stop = rq->stop;
  
+       WARN_ON_ONCE(prev || rf);
+
         if (!stop || !task_on_rq_queued(stop))
                 return NULL;
  
-       put_prev_task(rq, prev);
-
-       stop->se.exec_start = rq_clock_task(rq);
+       set_next_task_stop(rq, stop);
  
         return stop;
  }
@@ -55,7 +60,7 @@ static void yield_task_stop(struct rq *rq)
         BUG(); /* the stop task should never yield, its pointless. */
  }
  
-static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
+static void put_prev_task_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
  {
         struct task_struct *curr = rq->curr;
         u64 delta_exec;
@@ -86,13 +91,6 @@ static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued)
  {
  }
  
-static void set_curr_task_stop(struct rq *rq)
-{
-       struct task_struct *stop = rq->stop;
-
-       stop->se.exec_start = rq_clock_task(rq);
-}
-
  static void switched_to_stop(struct rq *rq, struct task_struct *p)
  {
         BUG(); /* its impossible to change to this class */
@@ -128,13 +126,13 @@ const struct sched_class stop_sched_class = {
  
         .pick_next_task         = pick_next_task_stop,
         .put_prev_task          = put_prev_task_stop,
+       .set_next_task          = set_next_task_stop,
  
  #ifdef CONFIG_SMP
         .select_task_rq         = select_task_rq_stop,
         .set_cpus_allowed       = set_cpus_allowed_common,
  #endif
  
-       .set_curr_task          = set_curr_task_stop,
         .task_tick              = task_tick_stop,
  
         .get_rr_interval        = get_rr_interval_stop,
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c

index f751ce0b783e57eff53f4d5feb73be1859d05cd6..b5667a273bf67e0718371a783b259ffb4c91cbc7 100644 (file)
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1284,6 +1284,7 @@ static int                        sched_domains_curr_level;
  int                            sched_max_numa_distance;
  static int                     *sched_domains_numa_distance;
  static struct cpumask          ***sched_domains_numa_masks;
+int __read_mostly              node_reclaim_distance = RECLAIM_DISTANCE;
  #endif
  
  /*
@@ -1402,7 +1403,7 @@ sd_init(struct sched_domain_topology_level *tl,
  
                 sd->flags &= ~SD_PREFER_SIBLING;
                 sd->flags |= SD_SERIALIZE;
-               if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) {
+               if (sched_domains_numa_distance[tl->numa_level] > node_reclaim_distance) {
                         sd->flags &= ~(SD_BALANCE_EXEC |
                                        SD_BALANCE_FORK |
                                        SD_WAKE_AFFINE);
@@ -1724,6 +1725,26 @@ void sched_domains_numa_masks_clear(unsigned int cpu)
         }
  }
  
+/*
+ * sched_numa_find_closest() - given the NUMA topology, find the cpu
+ *                             closest to @cpu from @cpus.
+ * @cpus: cpumask to find a cpu from
+ * @cpu: cpu to be close to
+ *
+ * returns: cpu, or nr_cpu_ids when nothing found.
+ */
+int sched_numa_find_closest(const struct cpumask *cpus, int cpu)
+{
+       int i, j = cpu_to_node(cpu);
+
+       for (i = 0; i < sched_domains_numa_levels; i++) {
+               cpu = cpumask_any_and(cpus, sched_domains_numa_masks[i][j]);
+               if (cpu < nr_cpu_ids)
+                       return cpu;
+       }
+       return nr_cpu_ids;
+}
+
  #endif /* CONFIG_NUMA */
  
  static int __sdt_alloc(const struct cpumask *cpu_map)
@@ -2149,16 +2170,16 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
   * ndoms_new == 0 is a special case for destroying existing domains,
   * and it will not create the default domain.
   *
- * Call with hotplug lock held
+ * Call with hotplug lock and sched_domains_mutex held
   */
-void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
-                            struct sched_domain_attr *dattr_new)
+void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[],
+                                   struct sched_domain_attr *dattr_new)
  {
         bool __maybe_unused has_eas = false;
         int i, j, n;
         int new_topology;
  
-       mutex_lock(&sched_domains_mutex);
+       lockdep_assert_held(&sched_domains_mutex);
  
         /* Always unregister in case we don't destroy any domains: */
         unregister_sched_domain_sysctl();
@@ -2183,8 +2204,19 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
         for (i = 0; i < ndoms_cur; i++) {
                 for (j = 0; j < n && !new_topology; j++) {
                         if (cpumask_equal(doms_cur[i], doms_new[j]) &&
-                           dattrs_equal(dattr_cur, i, dattr_new, j))
+                           dattrs_equal(dattr_cur, i, dattr_new, j)) {
+                               struct root_domain *rd;
+
+                               /*
+                                * This domain won't be destroyed and as such
+                                * its dl_bw->total_bw needs to be cleared.  It
+                                * will be recomputed in function
+                                * update_tasks_root_domain().
+                                */
+                               rd = cpu_rq(cpumask_any(doms_cur[i]))->rd;
+                               dl_clear_root_domain(rd);
                                 goto match1;
+                       }
                 }
                 /* No match - a current sched domain not in new doms_new[] */
                 detach_destroy_domains(doms_cur[i]);
@@ -2241,6 +2273,15 @@ match3:
         ndoms_cur = ndoms_new;
  
         register_sched_domain_sysctl();
+}
  
+/*
+ * Call with hotplug lock held
+ */
+void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
+                            struct sched_domain_attr *dattr_new)
+{
+       mutex_lock(&sched_domains_mutex);
+       partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
         mutex_unlock(&sched_domains_mutex);
  }
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c

index b4f83f7bdf86cdfa21c3d52563a9ad45d37a1164..c7031a22aa7bcb9dfd0f73da764108a321e99442 100644 (file)
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -383,6 +383,7 @@ static bool queue_stop_cpus_work(const struct cpumask *cpumask,
          */
         preempt_disable();
         stop_cpus_in_progress = true;
+       barrier();
         for_each_cpu(cpu, cpumask) {
                 work = &per_cpu(cpu_stopper.stop_work, cpu);
                 work->fn = fn;
@@ -391,6 +392,7 @@ static bool queue_stop_cpus_work(const struct cpumask *cpumask,
                 if (cpu_stop_queue_work(cpu, work))
                         queued = true;
         }
+       barrier();
         stop_cpus_in_progress = false;
         preempt_enable();
  
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig

index 98da8998c25ce406a2b0c9620f7295231120ae27..6a64d77728704c11e71798f6d26ed4cc7338f762 100644 (file)
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -146,7 +146,7 @@ config FUNCTION_TRACER
         select GENERIC_TRACER
         select CONTEXT_SWITCH_TRACER
         select GLOB
-       select TASKS_RCU if PREEMPT
+       select TASKS_RCU if PREEMPTION
         help
           Enable the kernel to trace every kernel function. This is done
           by using a compiler feature to insert a small, 5-byte No-Operation
@@ -179,7 +179,7 @@ config TRACE_PREEMPT_TOGGLE
  config PREEMPTIRQ_EVENTS
         bool "Enable trace events for preempt and irq disable/enable"
         select TRACE_IRQFLAGS
-       select TRACE_PREEMPT_TOGGLE if PREEMPT
+       select TRACE_PREEMPT_TOGGLE if PREEMPTION
         select GENERIC_TRACER
         default n
         help
@@ -214,7 +214,7 @@ config PREEMPT_TRACER
         bool "Preemption-off Latency Tracer"
         default n
         depends on !ARCH_USES_GETTIMEOFFSET
-       depends on PREEMPT
+       depends on PREEMPTION
         select GENERIC_TRACER
         select TRACER_MAX_TRACE
         select RING_BUFFER_ALLOW_SWAP
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c

index f9821a3374e9dd4b81ed0f7cfe8c436d7a58b4c5..356b848c697aa75b43a908de85c35a3658b8981e 100644 (file)
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2814,7 +2814,7 @@ int ftrace_shutdown(struct ftrace_ops *ops, int command)
                  * synchronize_rcu_tasks() will wait for those tasks to
                  * execute and either schedule voluntarily or enter user space.
                  */
-               if (IS_ENABLED(CONFIG_PREEMPT))
+               if (IS_ENABLED(CONFIG_PREEMPTION))
                         synchronize_rcu_tasks();
  
   free_ops:
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c

index 0564f6db05617ae0dcebbd82c6bd7af33ec62e44..09b0b49f346ecbcfa591be3d3a8b41d7c6d7d028 100644 (file)
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -267,7 +267,7 @@ static void ring_buffer_producer(void)
                 if (consumer && !(cnt % wakeup_interval))
                         wake_up_process(consumer);
  
-#ifndef CONFIG_PREEMPT
+#ifndef CONFIG_PREEMPTION
                 /*
                  * If we are a non preempt kernel, the 10 second run will
                  * stop everything while it runs. Instead, we will call
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c

index 648930823b571083c1a95000937962e565162eb2..b89cdfe20bc1626b1c4632c6bfcace5eedbfe1ef 100644 (file)
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -255,12 +255,12 @@ void *trace_event_buffer_reserve(struct trace_event_buffer *fbuffer,
         local_save_flags(fbuffer->flags);
         fbuffer->pc = preempt_count();
         /*
-        * If CONFIG_PREEMPT is enabled, then the tracepoint itself disables
+        * If CONFIG_PREEMPTION is enabled, then the tracepoint itself disables
          * preemption (adding one to the preempt_count). Since we are
          * interested in the preempt_count at the time the tracepoint was
          * hit, we need to subtract one to offset the increment.
          */
-       if (IS_ENABLED(CONFIG_PREEMPT))
+       if (IS_ENABLED(CONFIG_PREEMPTION))
                 fbuffer->pc--;
         fbuffer->trace_file = trace_file;
  
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c

index 743b2b520d34235135f2b85201b8a05622eb1c46..5e43b9664ecabb366dc8e81faa4d09d962084baa 100644 (file)
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -579,8 +579,7 @@ probe_wakeup(void *ignore, struct task_struct *p)
         else
                 tracing_dl = 0;
  
-       wakeup_task = p;
-       get_task_struct(wakeup_task);
+       wakeup_task = get_task_struct(p);
  
         local_save_flags(flags);
  
diff --git a/mm/khugepaged.c b/mm/khugepaged.c

index eaaa21b2321565decde43a03f73dffabd443eba3..ccede2425c3f88da0529a1040025771539748e8b 100644 (file)
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -710,7 +710,7 @@ static bool khugepaged_scan_abort(int nid)
         for (i = 0; i < MAX_NUMNODES; i++) {
                 if (!khugepaged_node_load[i])
                         continue;
-               if (node_distance(nid, i) > RECLAIM_DISTANCE)
+               if (node_distance(nid, i) > node_reclaim_distance)
                         return true;
         }
         return false;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c

index 9c9194959271cfc0d9214bf60bb09b96c5b1a96a..6991ccec9c322ffb843110bb69cf2326d64b266c 100644 (file)
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3511,7 +3511,7 @@ bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
  static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
  {
         return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <=
-                               RECLAIM_DISTANCE;
+                               node_reclaim_distance;
  }
  #else  /* CONFIG_NUMA */
  static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
author	Linus Torvalds <torvalds@linux-foundation.org>
	Tue, 17 Sep 2019 00:25:49 +0000 (17:25 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Tue, 17 Sep 2019 00:25:49 +0000 (17:25 -0700)
Documentation/admin-guide/cgroup-v2.rst		patch \| blob \| blame \| history
Documentation/scheduler/sched-bwc.rst		patch \| blob \| blame \| history
MAINTAINERS		patch \| blob \| blame \| history
arch/Kconfig		patch \| blob \| blame \| history
arch/ia64/Kconfig		patch \| blob \| blame \| history
arch/x86/entry/entry_32.S		patch \| blob \| blame \| history
arch/x86/entry/entry_64.S		patch \| blob \| blame \| history
arch/x86/entry/thunk_32.S		patch \| blob \| blame \| history
arch/x86/entry/thunk_64.S		patch \| blob \| blame \| history
arch/x86/include/asm/preempt.h		patch \| blob \| blame \| history
arch/x86/kernel/cpu/amd.c		patch \| blob \| blame \| history
arch/x86/kernel/dumpstack.c		patch \| blob \| blame \| history
arch/x86/kernel/kprobes/core.c		patch \| blob \| blame \| history
arch/x86/kernel/kvm.c		patch \| blob \| blame \| history
include/asm-generic/preempt.h		patch \| blob \| blame \| history
include/linux/cgroup.h		patch \| blob \| blame \| history
include/linux/cpuset.h		patch \| blob \| blame \| history
include/linux/preempt.h		patch \| blob \| blame \| history
include/linux/rcupdate.h		patch \| blob \| blame \| history
include/linux/rcutree.h		patch \| blob \| blame \| history
include/linux/sched.h		patch \| blob \| blame \| history
include/linux/sched/deadline.h		patch \| blob \| blame \| history
include/linux/sched/task.h		patch \| blob \| blame \| history
include/linux/sched/topology.h		patch \| blob \| blame \| history
include/linux/spinlock.h		patch \| blob \| blame \| history
include/linux/spinlock_api_smp.h		patch \| blob \| blame \| history
include/linux/topology.h		patch \| blob \| blame \| history
include/linux/torture.h		patch \| blob \| blame \| history
init/Kconfig		patch \| blob \| blame \| history
init/init_task.c		patch \| blob \| blame \| history
init/main.c		patch \| blob \| blame \| history
kernel/cgroup/cgroup.c		patch \| blob \| blame \| history
kernel/cgroup/cpuset.c		patch \| blob \| blame \| history
kernel/events/core.c		patch \| blob \| blame \| history
kernel/irq/manage.c		patch \| blob \| blame \| history
kernel/kprobes.c		patch \| blob \| blame \| history
kernel/locking/rtmutex.c		patch \| blob \| blame \| history
kernel/rcu/Kconfig		patch \| blob \| blame \| history
kernel/rcu/tree.c		patch \| blob \| blame \| history
kernel/rcu/tree_stall.h		patch \| blob \| blame \| history
kernel/sched/core.c		patch \| blob \| blame \| history
kernel/sched/cpufreq_schedutil.c		patch \| blob \| blame \| history
kernel/sched/deadline.c		patch \| blob \| blame \| history
kernel/sched/fair.c		patch \| blob \| blame \| history
kernel/sched/idle.c		patch \| blob \| blame \| history
kernel/sched/isolation.c		patch \| blob \| blame \| history
kernel/sched/psi.c		patch \| blob \| blame \| history
kernel/sched/rt.c		patch \| blob \| blame \| history
kernel/sched/sched.h		patch \| blob \| blame \| history
kernel/sched/stats.h		patch \| blob \| blame \| history
kernel/sched/stop_task.c		patch \| blob \| blame \| history
kernel/sched/topology.c		patch \| blob \| blame \| history
kernel/stop_machine.c		patch \| blob \| blame \| history
kernel/trace/Kconfig		patch \| blob \| blame \| history
kernel/trace/ftrace.c		patch \| blob \| blame \| history
kernel/trace/ring_buffer_benchmark.c		patch \| blob \| blame \| history
kernel/trace/trace_events.c		patch \| blob \| blame \| history
kernel/trace/trace_sched_wakeup.c		patch \| blob \| blame \| history
mm/khugepaged.c		patch \| blob \| blame \| history
mm/page_alloc.c		patch \| blob \| blame \| history