Merge branch 'sched/rt' into sched/core, to pick up -rt changes

author Ingo Molnar <mingo@kernel.org>

Mon, 16 Sep 2019 12:04:28 +0000 (14:04 +0200)

committer Ingo Molnar <mingo@kernel.org>

Mon, 16 Sep 2019 12:05:04 +0000 (14:05 +0200)
author Ingo Molnar <mingo@kernel.org>
Mon, 16 Sep 2019 12:04:28 +0000 (14:04 +0200)
committer Ingo Molnar <mingo@kernel.org>
Mon, 16 Sep 2019 12:05:04 +0000 (14:05 +0200)
diff --combined MAINTAINERS

index bab0ca437e31df37904117f3b752b2ae3d71dc8b,6426db5198f0537746c22d10f95ce4a5004fdde3..3a5ef62c9dd11a70d6ecc5da353b46bf11770ebd
--- 1/MAINTAINERS
--- 2/MAINTAINERS
+++ b/MAINTAINERS
@@@ -899,7 -899,7 +899,7 @@@ L: linux-iio@vger.kernel.or
   W:    http://ez.analog.com/community/linux-device-drivers
   S:    Supported
   F:    drivers/iio/adc/ad7124.c
- F:    Documentation/devicetree/bindings/iio/adc/adi,ad7124.txt
+ F:    Documentation/devicetree/bindings/iio/adc/adi,ad7124.yaml
   
   ANALOG DEVICES INC AD7606 DRIVER
   M:    Stefan Popa <stefan.popa@analog.com>
@@@ -1194,7 -1194,7 +1194,7 @@@ F:      include/uapi/linux/if_arcnet.
   
   ARM ARCHITECTED TIMER DRIVER
   M:    Mark Rutland <mark.rutland@arm.com>
- M:    Marc Zyngier <marc.zyngier@arm.com>
+ M:    Marc Zyngier <maz@kernel.org>
   L:    linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
   S:    Maintained
   F:    arch/arm/include/asm/arch_timer.h
@@@ -4190,7 -4190,7 +4190,7 @@@ M:      Jens Axboe <axboe@kernel.dk
   L:    cgroups@vger.kernel.org
   L:    linux-block@vger.kernel.org
   T:    git git://git.kernel.dk/linux-block
- F:    Documentation/cgroup-v1/blkio-controller.rst
+ F:    Documentation/admin-guide/cgroup-v1/blkio-controller.rst
   F:    block/blk-cgroup.c
   F:    include/linux/blk-cgroup.h
   F:    block/blk-throttle.c
@@@ -4469,7 -4469,7 +4469,7 @@@ F:      arch/powerpc/platforms/powernv/pci-c
   F:    drivers/misc/cxl/
   F:    include/misc/cxl*
   F:    include/uapi/misc/cxl.h
- F:    Documentation/powerpc/cxl.txt
+ F:    Documentation/powerpc/cxl.rst
   F:    Documentation/ABI/testing/sysfs-class-cxl
   
   CXLFLASH (IBM Coherent Accelerator Processor Interface CAPI Flash) SCSI DRIVER
@@@ -4480,7 -4480,7 +4480,7 @@@ L:      linux-scsi@vger.kernel.or
   S:    Supported
   F:    drivers/scsi/cxlflash/
   F:    include/uapi/scsi/cxlflash_ioctl.h
- F:    Documentation/powerpc/cxlflash.txt
+ F:    Documentation/powerpc/cxlflash.rst
   
   CYBERPRO FB DRIVER
   M:    Russell King <linux@armlinux.org.uk>
@@@ -6856,7 -6856,7 +6856,7 @@@ R:      Sagi Shahar <sagis@google.com
   R:    Jon Olson <jonolson@google.com>
   L:    netdev@vger.kernel.org
   S:    Supported
- F:    Documentation/networking/device_drivers/google/gve.txt
+ F:    Documentation/networking/device_drivers/google/gve.rst
   F:    drivers/net/ethernet/google
   
   GPD POCKET FAN DRIVER
@@@ -8490,7 -8490,7 +8490,7 @@@ S:      Obsolet
   F:    include/uapi/linux/ipx.h
   
   IRQ DOMAINS (IRQ NUMBER MAPPING LIBRARY)
- M:    Marc Zyngier <marc.zyngier@arm.com>
+ M:    Marc Zyngier <maz@kernel.org>
   S:    Maintained
   T:    git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git irq/core
   F:    Documentation/IRQ-domain.txt
@@@ -8508,7 -8508,7 +8508,7 @@@ F:      kernel/irq
   IRQCHIP DRIVERS
   M:    Thomas Gleixner <tglx@linutronix.de>
   M:    Jason Cooper <jason@lakedaemon.net>
- M:    Marc Zyngier <marc.zyngier@arm.com>
+ M:    Marc Zyngier <maz@kernel.org>
   L:    linux-kernel@vger.kernel.org
   S:    Maintained
   T:    git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git irq/core
@@@ -8808,7 -8808,7 +8808,7 @@@ L:      kvm@vger.kernel.or
   W:    http://www.linux-kvm.org
   T:    git git://git.kernel.org/pub/scm/virt/kvm/kvm.git
   S:    Supported
- F:    Documentation/virtual/kvm/
+ F:    Documentation/virt/kvm/
   F:    include/trace/events/kvm.h
   F:    include/uapi/asm-generic/kvm*
   F:    include/uapi/linux/kvm*
@@@ -8828,10 -8828,10 +8828,10 @@@ F:   arch/x86/include/asm/svm.
   F:    arch/x86/kvm/svm.c
   
   KERNEL VIRTUAL MACHINE FOR ARM/ARM64 (KVM/arm, KVM/arm64)
- M:    Marc Zyngier <marc.zyngier@arm.com>
+ M:    Marc Zyngier <maz@kernel.org>
   R:    James Morse <james.morse@arm.com>
- R:    Julien Thierry <julien.thierry@arm.com>
- R:    Suzuki K Pouloze <suzuki.poulose@arm.com>
+ R:    Julien Thierry <julien.thierry.kdev@gmail.com>
+ R:    Suzuki K Poulose <suzuki.poulose@arm.com>
   L:    linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
   L:    kvmarm@lists.cs.columbia.edu
   T:    git git://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm.git
@@@ -12137,7 -12137,7 +12137,7 @@@ M:   Thomas Hellstrom <thellstrom@vmware.
   M:    "VMware, Inc." <pv-drivers@vmware.com>
   L:    virtualization@lists.linux-foundation.org
   S:    Supported
- F:    Documentation/virtual/paravirt_ops.txt
+ F:    Documentation/virt/paravirt_ops.rst
   F:    arch/*/kernel/paravirt*
   F:    arch/*/include/asm/paravirt*.h
   F:    include/linux/hypervisor.h
@@@ -12394,7 -12394,7 +12394,7 @@@ F:   Documentation/PCI/pci-error-recovery
   F:    drivers/pci/pcie/aer.c
   F:    drivers/pci/pcie/dpc.c
   F:    drivers/pci/pcie/err.c
- F:    Documentation/powerpc/eeh-pci-error-recovery.txt
+ F:    Documentation/powerpc/eeh-pci-error-recovery.rst
   F:    arch/powerpc/kernel/eeh*.c
   F:    arch/powerpc/platforms/*/eeh*.c
   F:    arch/powerpc/include/*/eeh*.h
@@@ -12590,7 -12590,6 +12590,7 @@@ PERFORMANCE EVENTS SUBSYSTE
   M:    Peter Zijlstra <peterz@infradead.org>
   M:    Ingo Molnar <mingo@redhat.com>
   M:    Arnaldo Carvalho de Melo <acme@kernel.org>
+ +R:    Mark Rutland <mark.rutland@arm.com>
   R:    Alexander Shishkin <alexander.shishkin@linux.intel.com>
   R:    Jiri Olsa <jolsa@redhat.com>
   R:    Namhyung Kim <namhyung@kernel.org>
@@@ -13726,6 -13725,7 +13726,7 @@@ F:   drivers/mtd/nand/raw/r852.
   F:    drivers/mtd/nand/raw/r852.h
   
   RISC-V ARCHITECTURE
+ M:    Paul Walmsley <paul.walmsley@sifive.com>
   M:    Palmer Dabbelt <palmer@sifive.com>
   M:    Albert Ou <aou@eecs.berkeley.edu>
   L:    linux-riscv@lists.infradead.org
@@@ -13948,7 -13948,6 +13949,6 @@@ F:   drivers/pci/hotplug/s390_pci_hpc.
   
   S390 VFIO-CCW DRIVER
   M:    Cornelia Huck <cohuck@redhat.com>
- M:    Farhan Ali <alifm@linux.ibm.com>
   M:    Eric Farman <farman@linux.ibm.com>
   R:    Halil Pasic <pasic@linux.ibm.com>
   L:    linux-s390@vger.kernel.org
@@@ -14177,12 -14176,6 +14177,12 @@@ F: drivers/watchdog/sc1200wdt.
   SCHEDULER
   M:    Ingo Molnar <mingo@redhat.com>
   M:    Peter Zijlstra <peterz@infradead.org>
+ +M:    Juri Lelli <juri.lelli@redhat.com> (SCHED_DEADLINE)
+ +M:    Vincent Guittot <vincent.guittot@linaro.org> (SCHED_NORMAL)
+ +R:    Dietmar Eggemann <dietmar.eggemann@arm.com> (SCHED_NORMAL)
+ +R:    Steven Rostedt <rostedt@goodmis.org> (SCHED_FIFO/SCHED_RR)
+ +R:    Ben Segall <bsegall@google.com> (CONFIG_CFS_BANDWIDTH)
+ +R:    Mel Gorman <mgorman@suse.de> (CONFIG_NUMA_BALANCING)
   L:    linux-kernel@vger.kernel.org
   T:    git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched/core
   S:    Maintained
@@@ -16861,7 -16854,7 +16861,7 @@@ W:   http://user-mode-linux.sourceforge.n
   Q:    https://patchwork.ozlabs.org/project/linux-um/list/
   T:    git git://git.kernel.org/pub/scm/linux/kernel/git/rw/uml.git
   S:    Maintained
- F:    Documentation/virtual/uml/
+ F:    Documentation/virt/uml/
   F:    arch/um/
   F:    arch/x86/um/
   F:    fs/hostfs/
@@@ -17130,7 -17123,7 +17130,7 @@@ F:   drivers/virtio/virtio_input.
   F:    include/uapi/linux/virtio_input.h
   
   VIRTIO IOMMU DRIVER
- M:    Jean-Philippe Brucker <jean-philippe.brucker@arm.com>
+ M:    Jean-Philippe Brucker <jean-philippe@linaro.org>
   L:    virtualization@lists.linux-foundation.org
   S:    Maintained
   F:    drivers/iommu/virtio-iommu.c
@@@ -17179,7 -17172,6 +17179,6 @@@ F:   drivers/vme
   F:    include/linux/vme*
   
   VMWARE BALLOON DRIVER
- M:    Julien Freche <jfreche@vmware.com>
   M:    Nadav Amit <namit@vmware.com>
   M:    "VMware, Inc." <pv-drivers@vmware.com>
   L:    linux-kernel@vger.kernel.org
diff --combined include/linux/sched.h

index b94ad92dfbe6a11853d5aa1c12db709838e311a3,6947516a2d3ebd61c3d2cb8597f13335972d84fa..f0edee94834a8262db98f3abe61721121ff1c625
--- 1/include/linux/sched.h
--- 2/include/linux/sched.h
+++ b/include/linux/sched.h
@@@ -295,11 -295,6 +295,11 @@@ enum uclamp_id 
         UCLAMP_CNT
   };
   
+ +#ifdef CONFIG_SMP
+ +extern struct root_domain def_root_domain;
+ +extern struct mutex sched_domains_mutex;
+ +#endif
+ +
   struct sched_info {
   #ifdef CONFIG_SCHED_INFO
         /* Cumulative counters: */
@@@ -1772,7 -1767,7 +1772,7 @@@ static inline int test_tsk_need_resched
    * value indicates whether a reschedule was done in fact.
    * cond_resched_lock() will drop the spinlock before scheduling,
    */
- #ifndef CONFIG_PREEMPT
+ #ifndef CONFIG_PREEMPTION
   extern int _cond_resched(void);
   #else
   static inline int _cond_resched(void) { return 0; }
@@@ -1801,12 -1796,12 +1801,12 @@@ static inline void cond_resched_rcu(voi
   
   /*
    * Does a critical section need to be broken due to another
-  * task waiting?: (technically does not depend on CONFIG_PREEMPT,
+  * task waiting?: (technically does not depend on CONFIG_PREEMPTION,
    * but a general need for low latency)
    */
   static inline int spin_needbreak(spinlock_t *lock)
   {
- #ifdef CONFIG_PREEMPT
+ #ifdef CONFIG_PREEMPTION
         return spin_is_contended(lock);
   #else
         return 0;
diff --combined kernel/events/core.c

index ea5e8139fe6282a6623b0a590d4f0e02df6d936f,0463c1151baebb612b617cbf3b189fdde1990d28..c1f52a749db25cd89f5ccc7cf5e6d0237677adae
--- 1/kernel/events/core.c
--- 2/kernel/events/core.c
+++ b/kernel/events/core.c
@@@ -4089,8 -4089,10 +4089,8 @@@ alloc_perf_context(struct pmu *pmu, str
                 return NULL;
   
         __perf_event_init_context(ctx);
- -      if (task) {
- -              ctx->task = task;
- -              get_task_struct(task);
- -      }
+ +      if (task)
+ +              ctx->task = get_task_struct(task);
         ctx->pmu = pmu;
   
         return ctx;
@@@ -10353,7 -10355,8 +10353,7 @@@ perf_event_alloc(struct perf_event_att
                  * and we cannot use the ctx information because we need the
                  * pmu before we get a ctx.
                  */
- -              get_task_struct(task);
- -              event->hw.target = task;
+ +              event->hw.target = get_task_struct(task);
         }
   
         event->clock = &local_clock;
@@@ -11271,7 -11274,7 +11271,7 @@@ perf_event_create_kernel_counter(struc
                 goto err_unlock;
         }
   
-       perf_install_in_context(ctx, event, cpu);
+       perf_install_in_context(ctx, event, event->cpu);
         perf_unpin_context(ctx);
         mutex_unlock(&ctx->mutex);
   
diff --combined kernel/rcu/tree.c

index 5efdce756fdf041074e6632d6d49e83246487e28,5962636502bc4ad933a714f83989a4fc21838bb3..6a37afd5436c32a6635fee042c5562f10ed24baf
--- 1/kernel/rcu/tree.c
--- 2/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@@ -1881,7 -1881,7 +1881,7 @@@ rcu_report_unblock_qs_rnp(struct rcu_no
         struct rcu_node *rnp_p;
   
         raw_lockdep_assert_held_rcu_node(rnp);
-       if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT)) ||
+       if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPTION)) ||
             WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)) ||
             rnp->qsmask != 0) {
                 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
@@@ -2205,7 -2205,7 +2205,7 @@@ static void force_qs_rnp(int (*f)(struc
                 mask = 0;
                 raw_spin_lock_irqsave_rcu_node(rnp, flags);
                 if (rnp->qsmask == 0) {
-                       if (!IS_ENABLED(CONFIG_PREEMPT) ||
+                       if (!IS_ENABLED(CONFIG_PREEMPTION) ||
                             rcu_preempt_blocked_readers_cgp(rnp)) {
                                 /*
                                  * No point in scanning bits because they
@@@ -2622,7 -2622,7 +2622,7 @@@ static int rcu_blocking_is_gp(void
   {
         int ret;
   
-       if (IS_ENABLED(CONFIG_PREEMPT))
+       if (IS_ENABLED(CONFIG_PREEMPTION))
                 return rcu_scheduler_active == RCU_SCHEDULER_INACTIVE;
         might_sleep();  /* Check for RCU read-side critical section. */
         preempt_disable();
@@@ -3234,13 -3234,13 +3234,13 @@@ static int __init rcu_spawn_gp_kthread(
         t = kthread_create(rcu_gp_kthread, NULL, "%s", rcu_state.name);
         if (WARN_ONCE(IS_ERR(t), "%s: Could not start grace-period kthread, OOM is now expected behavior\n", __func__))
                 return 0;
- -      rnp = rcu_get_root();
- -      raw_spin_lock_irqsave_rcu_node(rnp, flags);
- -      rcu_state.gp_kthread = t;
         if (kthread_prio) {
                 sp.sched_priority = kthread_prio;
                 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
         }
+ +      rnp = rcu_get_root();
+ +      raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ +      rcu_state.gp_kthread = t;
         raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
         wake_up_process(t);
         rcu_spawn_nocb_kthreads();
diff --combined kernel/sched/core.c

index 3c7b90bcbe4e438d3b43738c02c0f9bb0e967c1d,604a5e137efeef856a62be9fdccafccae84f721a..87b84a726db448c76edd1fd46a387e392de9255c
--- 1/kernel/sched/core.c
--- 2/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@@ -773,18 -773,6 +773,18 @@@ static void set_load_weight(struct task
   }
   
   #ifdef CONFIG_UCLAMP_TASK
+ +/*
+ + * Serializes updates of utilization clamp values
+ + *
+ + * The (slow-path) user-space triggers utilization clamp value updates which
+ + * can require updates on (fast-path) scheduler's data structures used to
+ + * support enqueue/dequeue operations.
+ + * While the per-CPU rq lock protects fast-path update operations, user-space
+ + * requests are serialized using a mutex to reduce the risk of conflicting
+ + * updates or API abuses.
+ + */
+ +static DEFINE_MUTEX(uclamp_mutex);
+ +
   /* Max allowed minimum utilization */
   unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE;
   
@@@ -810,7 -798,7 +810,7 @@@ static inline unsigned int uclamp_bucke
         return UCLAMP_BUCKET_DELTA * uclamp_bucket_id(clamp_value);
   }
   
- -static inline unsigned int uclamp_none(int clamp_id)
+ +static inline enum uclamp_id uclamp_none(enum uclamp_id clamp_id)
   {
         if (clamp_id == UCLAMP_MIN)
                 return 0;
@@@ -826,7 -814,7 +826,7 @@@ static inline void uclamp_se_set(struc
   }
   
   static inline unsigned int
- -uclamp_idle_value(struct rq *rq, unsigned int clamp_id,
+ +uclamp_idle_value(struct rq *rq, enum uclamp_id clamp_id,
                   unsigned int clamp_value)
   {
         /*
@@@ -842,7 -830,7 +842,7 @@@
         return uclamp_none(UCLAMP_MIN);
   }
   
- -static inline void uclamp_idle_reset(struct rq *rq, unsigned int clamp_id,
+ +static inline void uclamp_idle_reset(struct rq *rq, enum uclamp_id clamp_id,
                                      unsigned int clamp_value)
   {
         /* Reset max-clamp retention only on idle exit */
@@@ -853,8 -841,8 +853,8 @@@
   }
   
   static inline
- -unsigned int uclamp_rq_max_value(struct rq *rq, unsigned int clamp_id,
- -                               unsigned int clamp_value)
+ +enum uclamp_id uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id,
+ +                                 unsigned int clamp_value)
   {
         struct uclamp_bucket *bucket = rq->uclamp[clamp_id].bucket;
         int bucket_id = UCLAMP_BUCKETS - 1;
@@@ -873,42 -861,16 +873,42 @@@
         return uclamp_idle_value(rq, clamp_id, clamp_value);
   }
   
+ +static inline struct uclamp_se
+ +uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id)
+ +{
+ +      struct uclamp_se uc_req = p->uclamp_req[clamp_id];
+ +#ifdef CONFIG_UCLAMP_TASK_GROUP
+ +      struct uclamp_se uc_max;
+ +
+ +      /*
+ +       * Tasks in autogroups or root task group will be
+ +       * restricted by system defaults.
+ +       */
+ +      if (task_group_is_autogroup(task_group(p)))
+ +              return uc_req;
+ +      if (task_group(p) == &root_task_group)
+ +              return uc_req;
+ +
+ +      uc_max = task_group(p)->uclamp[clamp_id];
+ +      if (uc_req.value > uc_max.value || !uc_req.user_defined)
+ +              return uc_max;
+ +#endif
+ +
+ +      return uc_req;
+ +}
+ +
   /*
    * The effective clamp bucket index of a task depends on, by increasing
    * priority:
    * - the task specific clamp value, when explicitly requested from userspace
+ + * - the task group effective clamp value, for tasks not either in the root
+ + *   group or in an autogroup
    * - the system default clamp value, defined by the sysadmin
    */
   static inline struct uclamp_se
- -uclamp_eff_get(struct task_struct *p, unsigned int clamp_id)
+ +uclamp_eff_get(struct task_struct *p, enum uclamp_id clamp_id)
   {
- -      struct uclamp_se uc_req = p->uclamp_req[clamp_id];
+ +      struct uclamp_se uc_req = uclamp_tg_restrict(p, clamp_id);
         struct uclamp_se uc_max = uclamp_default[clamp_id];
   
         /* System default restrictions always apply */
@@@ -918,7 -880,7 +918,7 @@@
         return uc_req;
   }
   
- -unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id)
+ +enum uclamp_id uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id)
   {
         struct uclamp_se uc_eff;
   
@@@ -942,7 -904,7 +942,7 @@@
    * for each bucket when all its RUNNABLE tasks require the same clamp.
    */
   static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
- -                                  unsigned int clamp_id)
+ +                                  enum uclamp_id clamp_id)
   {
         struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
         struct uclamp_se *uc_se = &p->uclamp[clamp_id];
@@@ -980,7 -942,7 +980,7 @@@
    * enforce the expected state and warn.
    */
   static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
- -                                  unsigned int clamp_id)
+ +                                  enum uclamp_id clamp_id)
   {
         struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
         struct uclamp_se *uc_se = &p->uclamp[clamp_id];
@@@ -1019,7 -981,7 +1019,7 @@@
   
   static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
   {
- -      unsigned int clamp_id;
+ +      enum uclamp_id clamp_id;
   
         if (unlikely(!p->sched_class->uclamp_enabled))
                 return;
@@@ -1034,7 -996,7 +1034,7 @@@
   
   static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
   {
- -      unsigned int clamp_id;
+ +      enum uclamp_id clamp_id;
   
         if (unlikely(!p->sched_class->uclamp_enabled))
                 return;
@@@ -1043,82 -1005,15 +1043,82 @@@
                 uclamp_rq_dec_id(rq, p, clamp_id);
   }
   
+ +static inline void
+ +uclamp_update_active(struct task_struct *p, enum uclamp_id clamp_id)
+ +{
+ +      struct rq_flags rf;
+ +      struct rq *rq;
+ +
+ +      /*
+ +       * Lock the task and the rq where the task is (or was) queued.
+ +       *
+ +       * We might lock the (previous) rq of a !RUNNABLE task, but that's the
+ +       * price to pay to safely serialize util_{min,max} updates with
+ +       * enqueues, dequeues and migration operations.
+ +       * This is the same locking schema used by __set_cpus_allowed_ptr().
+ +       */
+ +      rq = task_rq_lock(p, &rf);
+ +
+ +      /*
+ +       * Setting the clamp bucket is serialized by task_rq_lock().
+ +       * Only a task whose clamp is currently active needs the dec/inc
+ +       * refresh. Otherwise (not yet RUNNABLE, not affecting a valid clamp
+ +       * bucket) it will see the updated value the next time it's enqueued.
+ +       */
+ +      if (p->uclamp[clamp_id].active) {
+ +              uclamp_rq_dec_id(rq, p, clamp_id);
+ +              uclamp_rq_inc_id(rq, p, clamp_id);
+ +      }
+ +
+ +      task_rq_unlock(rq, p, &rf);
+ +}
+ +
+ +static inline void
+ +uclamp_update_active_tasks(struct cgroup_subsys_state *css,
+ +                         unsigned int clamps)
+ +{
+ +      enum uclamp_id clamp_id;
+ +      struct css_task_iter it;
+ +      struct task_struct *p;
+ +
+ +      css_task_iter_start(css, 0, &it);
+ +      while ((p = css_task_iter_next(&it))) {
+ +              for_each_clamp_id(clamp_id) {
+ +                      if ((0x1 << clamp_id) & clamps)
+ +                              uclamp_update_active(p, clamp_id);
+ +              }
+ +      }
+ +      css_task_iter_end(&it);
+ +}
+ +
+ +#ifdef CONFIG_UCLAMP_TASK_GROUP
+ +static void cpu_util_update_eff(struct cgroup_subsys_state *css);
+ +static void uclamp_update_root_tg(void)
+ +{
+ +      struct task_group *tg = &root_task_group;
+ +
+ +      uclamp_se_set(&tg->uclamp_req[UCLAMP_MIN],
+ +                    sysctl_sched_uclamp_util_min, false);
+ +      uclamp_se_set(&tg->uclamp_req[UCLAMP_MAX],
+ +                    sysctl_sched_uclamp_util_max, false);
+ +
+ +      rcu_read_lock();
+ +      cpu_util_update_eff(&root_task_group.css);
+ +      rcu_read_unlock();
+ +}
+ +#else
+ +static void uclamp_update_root_tg(void) { }
+ +#endif
+ +
   int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
                                 void __user *buffer, size_t *lenp,
                                 loff_t *ppos)
   {
+ +      bool update_root_tg = false;
         int old_min, old_max;
- -      static DEFINE_MUTEX(mutex);
         int result;
   
- -      mutex_lock(&mutex);
+ +      mutex_lock(&uclamp_mutex);
         old_min = sysctl_sched_uclamp_util_min;
         old_max = sysctl_sched_uclamp_util_max;
   
@@@ -1137,30 -1032,23 +1137,30 @@@
         if (old_min != sysctl_sched_uclamp_util_min) {
                 uclamp_se_set(&uclamp_default[UCLAMP_MIN],
                               sysctl_sched_uclamp_util_min, false);
+ +              update_root_tg = true;
         }
         if (old_max != sysctl_sched_uclamp_util_max) {
                 uclamp_se_set(&uclamp_default[UCLAMP_MAX],
                               sysctl_sched_uclamp_util_max, false);
+ +              update_root_tg = true;
         }
   
+ +      if (update_root_tg)
+ +              uclamp_update_root_tg();
+ +
         /*
- -       * Updating all the RUNNABLE task is expensive, keep it simple and do
- -       * just a lazy update at each next enqueue time.
+ +       * We update all RUNNABLE tasks only when task groups are in use.
+ +       * Otherwise, keep it simple and do just a lazy update at each next
+ +       * task enqueue time.
          */
+ +
         goto done;
   
   undo:
         sysctl_sched_uclamp_util_min = old_min;
         sysctl_sched_uclamp_util_max = old_max;
   done:
- -      mutex_unlock(&mutex);
+ +      mutex_unlock(&uclamp_mutex);
   
         return result;
   }
@@@ -1187,7 -1075,7 +1187,7 @@@ static int uclamp_validate(struct task_
   static void __setscheduler_uclamp(struct task_struct *p,
                                   const struct sched_attr *attr)
   {
- -      unsigned int clamp_id;
+ +      enum uclamp_id clamp_id;
   
         /*
          * On scheduling class change, reset to default clamps for tasks
@@@ -1224,7 -1112,7 +1224,7 @@@
   
   static void uclamp_fork(struct task_struct *p)
   {
- -      unsigned int clamp_id;
+ +      enum uclamp_id clamp_id;
   
         for_each_clamp_id(clamp_id)
                 p->uclamp[clamp_id].active = false;
@@@ -1246,11 -1134,9 +1246,11 @@@
   static void __init init_uclamp(void)
   {
         struct uclamp_se uc_max = {};
- -      unsigned int clamp_id;
+ +      enum uclamp_id clamp_id;
         int cpu;
   
+ +      mutex_init(&uclamp_mutex);
+ +
         for_each_possible_cpu(cpu) {
                 memset(&cpu_rq(cpu)->uclamp, 0, sizeof(struct uclamp_rq));
                 cpu_rq(cpu)->uclamp_flags = 0;
@@@ -1263,13 -1149,8 +1263,13 @@@
   
         /* System defaults allow max clamp values for both indexes */
         uclamp_se_set(&uc_max, uclamp_none(UCLAMP_MAX), false);
- -      for_each_clamp_id(clamp_id)
+ +      for_each_clamp_id(clamp_id) {
                 uclamp_default[clamp_id] = uc_max;
+ +#ifdef CONFIG_UCLAMP_TASK_GROUP
+ +              root_task_group.uclamp_req[clamp_id] = uc_max;
+ +              root_task_group.uclamp[clamp_id] = uc_max;
+ +#endif
+ +      }
   }
   
   #else /* CONFIG_UCLAMP_TASK */
@@@ -1613,7 -1494,7 +1613,7 @@@ void do_set_cpus_allowed(struct task_st
         if (queued)
                 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
         if (running)
- -              set_curr_task(rq, p);
+ +              set_next_task(rq, p);
   }
   
   /*
@@@ -3333,8 -3214,12 +3333,8 @@@ static __always_inline struct rq 
   context_switch(struct rq *rq, struct task_struct *prev,
                struct task_struct *next, struct rq_flags *rf)
   {
- -      struct mm_struct *mm, *oldmm;
- -
         prepare_task_switch(rq, prev, next);
   
- -      mm = next->mm;
- -      oldmm = prev->active_mm;
         /*
          * For paravirt, this is coupled with an exit in switch_to to
          * combine the page table reload and the switch backend into
@@@ -3343,37 -3228,22 +3343,37 @@@
         arch_start_context_switch(prev);
   
         /*
- -       * If mm is non-NULL, we pass through switch_mm(). If mm is
- -       * NULL, we will pass through mmdrop() in finish_task_switch().
- -       * Both of these contain the full memory barrier required by
- -       * membarrier after storing to rq->curr, before returning to
- -       * user-space.
+ +       * kernel -> kernel   lazy + transfer active
+ +       *   user -> kernel   lazy + mmgrab() active
+ +       *
+ +       * kernel ->   user   switch + mmdrop() active
+ +       *   user ->   user   switch
          */
- -      if (!mm) {
- -              next->active_mm = oldmm;
- -              mmgrab(oldmm);
- -              enter_lazy_tlb(oldmm, next);
- -      } else
- -              switch_mm_irqs_off(oldmm, mm, next);
+ +      if (!next->mm) {                                // to kernel
+ +              enter_lazy_tlb(prev->active_mm, next);
+ +
+ +              next->active_mm = prev->active_mm;
+ +              if (prev->mm)                           // from user
+ +                      mmgrab(prev->active_mm);
+ +              else
+ +                      prev->active_mm = NULL;
+ +      } else {                                        // to user
+ +              /*
+ +               * sys_membarrier() requires an smp_mb() between setting
+ +               * rq->curr and returning to userspace.
+ +               *
+ +               * The below provides this either through switch_mm(), or in
+ +               * case 'prev->active_mm == next->mm' through
+ +               * finish_task_switch()'s mmdrop().
+ +               */
+ +
+ +              switch_mm_irqs_off(prev->active_mm, next->mm, next);
   
- -      if (!prev->mm) {
- -              prev->active_mm = NULL;
- -              rq->prev_mm = oldmm;
+ +              if (!prev->mm) {                        // from kernel
+ +                      /* will mmdrop() in finish_task_switch(). */
+ +                      rq->prev_mm = prev->active_mm;
+ +                      prev->active_mm = NULL;
+ +              }
         }
   
         rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
@@@ -3616,36 -3486,8 +3616,36 @@@ void scheduler_tick(void
   
   struct tick_work {
         int                     cpu;
+ +      atomic_t                state;
         struct delayed_work     work;
   };
+ +/* Values for ->state, see diagram below. */
+ +#define TICK_SCHED_REMOTE_OFFLINE     0
+ +#define TICK_SCHED_REMOTE_OFFLINING   1
+ +#define TICK_SCHED_REMOTE_RUNNING     2
+ +
+ +/*
+ + * State diagram for ->state:
+ + *
+ + *
+ + *          TICK_SCHED_REMOTE_OFFLINE
+ + *                    |   ^
+ + *                    |   |
+ + *                    |   | sched_tick_remote()
+ + *                    |   |
+ + *                    |   |
+ + *                    +--TICK_SCHED_REMOTE_OFFLINING
+ + *                    |   ^
+ + *                    |   |
+ + * sched_tick_start() |   | sched_tick_stop()
+ + *                    |   |
+ + *                    V   |
+ + *          TICK_SCHED_REMOTE_RUNNING
+ + *
+ + *
+ + * Other transitions get WARN_ON_ONCE(), except that sched_tick_remote()
+ + * and sched_tick_start() are happy to leave the state in RUNNING.
+ + */
   
   static struct tick_work __percpu *tick_work_cpu;
   
@@@ -3658,7 -3500,6 +3658,7 @@@ static void sched_tick_remote(struct wo
         struct task_struct *curr;
         struct rq_flags rf;
         u64 delta;
+ +      int os;
   
         /*
          * Handle the tick only if it appears the remote CPU is running in full
@@@ -3672,7 -3513,7 +3672,7 @@@
   
         rq_lock_irq(rq, &rf);
         curr = rq->curr;
- -      if (is_idle_task(curr))
+ +      if (is_idle_task(curr) || cpu_is_offline(cpu))
                 goto out_unlock;
   
         update_rq_clock(rq);
@@@ -3692,18 -3533,13 +3692,18 @@@ out_requeue
         /*
          * Run the remote tick once per second (1Hz). This arbitrary
          * frequency is large enough to avoid overload but short enough
- -       * to keep scheduler internal stats reasonably up to date.
+ +       * to keep scheduler internal stats reasonably up to date.  But
+ +       * first update state to reflect hotplug activity if required.
          */
- -      queue_delayed_work(system_unbound_wq, dwork, HZ);
+ +      os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING);
+ +      WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE);
+ +      if (os == TICK_SCHED_REMOTE_RUNNING)
+ +              queue_delayed_work(system_unbound_wq, dwork, HZ);
   }
   
   static void sched_tick_start(int cpu)
   {
+ +      int os;
         struct tick_work *twork;
   
         if (housekeeping_cpu(cpu, HK_FLAG_TICK))
@@@ -3712,20 -3548,15 +3712,20 @@@
         WARN_ON_ONCE(!tick_work_cpu);
   
         twork = per_cpu_ptr(tick_work_cpu, cpu);
- -      twork->cpu = cpu;
- -      INIT_DELAYED_WORK(&twork->work, sched_tick_remote);
- -      queue_delayed_work(system_unbound_wq, &twork->work, HZ);
+ +      os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_RUNNING);
+ +      WARN_ON_ONCE(os == TICK_SCHED_REMOTE_RUNNING);
+ +      if (os == TICK_SCHED_REMOTE_OFFLINE) {
+ +              twork->cpu = cpu;
+ +              INIT_DELAYED_WORK(&twork->work, sched_tick_remote);
+ +              queue_delayed_work(system_unbound_wq, &twork->work, HZ);
+ +      }
   }
   
   #ifdef CONFIG_HOTPLUG_CPU
   static void sched_tick_stop(int cpu)
   {
         struct tick_work *twork;
+ +      int os;
   
         if (housekeeping_cpu(cpu, HK_FLAG_TICK))
                 return;
@@@ -3733,10 -3564,7 +3733,10 @@@
         WARN_ON_ONCE(!tick_work_cpu);
   
         twork = per_cpu_ptr(tick_work_cpu, cpu);
- -      cancel_delayed_work_sync(&twork->work);
+ +      /* There cannot be competing actions, but don't rely on stop-machine. */
+ +      os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_OFFLINING);
+ +      WARN_ON_ONCE(os != TICK_SCHED_REMOTE_RUNNING);
+ +      /* Don't cancel, as this would mess up the state machine. */
   }
   #endif /* CONFIG_HOTPLUG_CPU */
   
@@@ -3744,6 -3572,7 +3744,6 @@@ int __init sched_tick_offload_init(void
   {
         tick_work_cpu = alloc_percpu(struct tick_work);
         BUG_ON(!tick_work_cpu);
- -
         return 0;
   }
   
@@@ -3752,7 -3581,7 +3752,7 @@@ static inline void sched_tick_start(in
   static inline void sched_tick_stop(int cpu) { }
   #endif
   
- #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
+ #if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \
                                 defined(CONFIG_TRACE_PREEMPT_TOGGLE))
   /*
    * If the value passed in is equal to the current preempt count
@@@ -3910,7 -3739,7 +3910,7 @@@ pick_next_task(struct rq *rq, struct ta
   
                 p = fair_sched_class.pick_next_task(rq, prev, rf);
                 if (unlikely(p == RETRY_TASK))
- -                      goto again;
+ +                      goto restart;
   
                 /* Assumes fair_sched_class->next == idle_sched_class */
                 if (unlikely(!p))
@@@ -3919,19 -3748,14 +3919,19 @@@
                 return p;
         }
   
- -again:
+ +restart:
+ +      /*
+ +       * Ensure that we put DL/RT tasks before the pick loop, such that they
+ +       * can PULL higher prio tasks when we lower the RQ 'priority'.
+ +       */
+ +      prev->sched_class->put_prev_task(rq, prev, rf);
+ +      if (!rq->nr_running)
+ +              newidle_balance(rq, rf);
+ +
         for_each_class(class) {
- -              p = class->pick_next_task(rq, prev, rf);
- -              if (p) {
- -                      if (unlikely(p == RETRY_TASK))
- -                              goto again;
+ +              p = class->pick_next_task(rq, NULL, NULL);
+ +              if (p)
                         return p;
- -              }
         }
   
         /* The idle class should always have a runnable task: */
@@@ -3958,7 -3782,7 +3958,7 @@@
    *      task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
    *      called on the nearest possible occasion:
    *
-  *       - If the kernel is preemptible (CONFIG_PREEMPT=y):
+  *       - If the kernel is preemptible (CONFIG_PREEMPTION=y):
    *
    *         - in syscall or exception context, at the next outmost
    *           preempt_enable(). (this might be as soon as the wake_up()'s
@@@ -3967,7 -3791,7 +3967,7 @@@
    *         - in IRQ context, return from interrupt-handler to
    *           preemptible context
    *
-  *       - If the kernel is not preemptible (CONFIG_PREEMPT is not set)
+  *       - If the kernel is not preemptible (CONFIG_PREEMPTION is not set)
    *         then at the next:
    *
    *          - cond_resched() call
@@@ -4209,7 -4033,7 +4209,7 @@@ static void __sched notrace preempt_sch
         } while (need_resched());
   }
   
- #ifdef CONFIG_PREEMPT
+ #ifdef CONFIG_PREEMPTION
   /*
    * this is the entry point to schedule() from in-kernel preemption
    * off of preempt_enable. Kernel preemptions off return from interrupt
@@@ -4281,7 -4105,7 +4281,7 @@@ asmlinkage __visible void __sched notra
   }
   EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
   
- #endif /* CONFIG_PREEMPT */
+ #endif /* CONFIG_PREEMPTION */
   
   /*
    * this is the entry point to schedule() from kernel preemption
@@@ -4449,7 -4273,7 +4449,7 @@@ void rt_mutex_setprio(struct task_struc
         if (queued)
                 enqueue_task(rq, p, queue_flag);
         if (running)
- -              set_curr_task(rq, p);
+ +              set_next_task(rq, p);
   
         check_class_changed(rq, p, prev_class, oldprio);
   out_unlock:
@@@ -4516,7 -4340,7 +4516,7 @@@ void set_user_nice(struct task_struct *
                         resched_curr(rq);
         }
         if (running)
- -              set_curr_task(rq, p);
+ +              set_next_task(rq, p);
   out_unlock:
         task_rq_unlock(rq, p, &rf);
   }
@@@ -4833,9 -4657,6 +4833,9 @@@ recheck
                         return retval;
         }
   
+ +      if (pi)
+ +              cpuset_read_lock();
+ +
         /*
          * Make sure no PI-waiters arrive (or leave) while we are
          * changing the priority of the task:
@@@ -4850,8 -4671,8 +4850,8 @@@
          * Changing the policy of the stop threads its a very bad idea:
          */
         if (p == rq->stop) {
- -              task_rq_unlock(rq, p, &rf);
- -              return -EINVAL;
+ +              retval = -EINVAL;
+ +              goto unlock;
         }
   
         /*
@@@ -4869,8 -4690,8 +4869,8 @@@
                         goto change;
   
                 p->sched_reset_on_fork = reset_on_fork;
- -              task_rq_unlock(rq, p, &rf);
- -              return 0;
+ +              retval = 0;
+ +              goto unlock;
         }
   change:
   
@@@ -4883,8 -4704,8 +4883,8 @@@
                 if (rt_bandwidth_enabled() && rt_policy(policy) &&
                                 task_group(p)->rt_bandwidth.rt_runtime == 0 &&
                                 !task_group_is_autogroup(task_group(p))) {
- -                      task_rq_unlock(rq, p, &rf);
- -                      return -EPERM;
+ +                      retval = -EPERM;
+ +                      goto unlock;
                 }
   #endif
   #ifdef CONFIG_SMP
@@@ -4899,8 -4720,8 +4899,8 @@@
                          */
                         if (!cpumask_subset(span, p->cpus_ptr) ||
                             rq->rd->dl_bw.bw == 0) {
- -                              task_rq_unlock(rq, p, &rf);
- -                              return -EPERM;
+ +                              retval = -EPERM;
+ +                              goto unlock;
                         }
                 }
   #endif
@@@ -4910,8 -4731,6 +4910,8 @@@
         if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
                 policy = oldpolicy = -1;
                 task_rq_unlock(rq, p, &rf);
+ +              if (pi)
+ +                      cpuset_read_unlock();
                 goto recheck;
         }
   
@@@ -4921,8 -4740,8 +4921,8 @@@
          * is available.
          */
         if ((dl_policy(policy) || dl_task(p)) && sched_dl_overflow(p, policy, attr)) {
- -              task_rq_unlock(rq, p, &rf);
- -              return -EBUSY;
+ +              retval = -EBUSY;
+ +              goto unlock;
         }
   
         p->sched_reset_on_fork = reset_on_fork;
@@@ -4964,7 -4783,7 +4964,7 @@@
                 enqueue_task(rq, p, queue_flags);
         }
         if (running)
- -              set_curr_task(rq, p);
+ +              set_next_task(rq, p);
   
         check_class_changed(rq, p, prev_class, oldprio);
   
@@@ -4972,22 -4791,14 +4972,22 @@@
         preempt_disable();
         task_rq_unlock(rq, p, &rf);
   
- -      if (pi)
+ +      if (pi) {
+ +              cpuset_read_unlock();
                 rt_mutex_adjust_pi(p);
+ +      }
   
         /* Run balance callbacks after we've adjusted the PI chain: */
         balance_callback(rq);
         preempt_enable();
   
         return 0;
+ +
+ +unlock:
+ +      task_rq_unlock(rq, p, &rf);
+ +      if (pi)
+ +              cpuset_read_unlock();
+ +      return retval;
   }
   
   static int _sched_setscheduler(struct task_struct *p, int policy,
@@@ -5071,15 -4882,10 +5071,15 @@@ do_sched_setscheduler(pid_t pid, int po
         rcu_read_lock();
         retval = -ESRCH;
         p = find_process_by_pid(pid);
- -      if (p != NULL)
- -              retval = sched_setscheduler(p, policy, &lparam);
+ +      if (likely(p))
+ +              get_task_struct(p);
         rcu_read_unlock();
   
+ +      if (likely(p)) {
+ +              retval = sched_setscheduler(p, policy, &lparam);
+ +              put_task_struct(p);
+ +      }
+ +
         return retval;
   }
   
@@@ -5610,7 -5416,7 +5610,7 @@@ SYSCALL_DEFINE0(sched_yield
         return 0;
   }
   
- #ifndef CONFIG_PREEMPT
+ #ifndef CONFIG_PREEMPTION
   int __sched _cond_resched(void)
   {
         if (should_resched(0)) {
@@@ -5627,7 -5433,7 +5627,7 @@@ EXPORT_SYMBOL(_cond_resched)
    * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
    * call schedule, and on return reacquire the lock.
    *
-  * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
+  * This works OK both with and without CONFIG_PREEMPTION. We do strange low-level
    * operations here to prevent schedule() from being called twice (once via
    * spin_unlock(), once by hand).
    */
@@@ -6166,7 -5972,7 +6166,7 @@@ void sched_setnuma(struct task_struct *
         if (queued)
                 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
         if (running)
- -              set_curr_task(rq, p);
+ +              set_next_task(rq, p);
         task_rq_unlock(rq, p, &rf);
   }
   #endif /* CONFIG_NUMA_BALANCING */
@@@ -6206,22 -6012,21 +6206,22 @@@ static void calc_load_migrate(struct r
                 atomic_long_add(delta, &calc_load_tasks);
   }
   
- -static void put_prev_task_fake(struct rq *rq, struct task_struct *prev)
+ +static struct task_struct *__pick_migrate_task(struct rq *rq)
   {
- -}
+ +      const struct sched_class *class;
+ +      struct task_struct *next;
   
- -static const struct sched_class fake_sched_class = {
- -      .put_prev_task = put_prev_task_fake,
- -};
+ +      for_each_class(class) {
+ +              next = class->pick_next_task(rq, NULL, NULL);
+ +              if (next) {
+ +                      next->sched_class->put_prev_task(rq, next, NULL);
+ +                      return next;
+ +              }
+ +      }
   
- -static struct task_struct fake_task = {
- -      /*
- -       * Avoid pull_{rt,dl}_task()
- -       */
- -      .prio = MAX_PRIO + 1,
- -      .sched_class = &fake_sched_class,
- -};
+ +      /* The idle class should always have a runnable task */
+ +      BUG();
+ +}
   
   /*
    * Migrate all tasks from the rq, sleeping tasks will be migrated by
@@@ -6264,7 -6069,12 +6264,7 @@@ static void migrate_tasks(struct rq *de
                 if (rq->nr_running == 1)
                         break;
   
- -              /*
- -               * pick_next_task() assumes pinned rq->lock:
- -               */
- -              next = pick_next_task(rq, &fake_task, rf);
- -              BUG_ON(!next);
- -              put_prev_task(rq, next);
+ +              next = __pick_migrate_task(rq);
   
                 /*
                  * Rules for changing task_struct::cpus_mask are holding
@@@ -6561,19 -6371,19 +6561,19 @@@ DECLARE_PER_CPU(cpumask_var_t, select_i
   
   void __init sched_init(void)
   {
- -      unsigned long alloc_size = 0, ptr;
+ +      unsigned long ptr = 0;
         int i;
   
         wait_bit_init();
   
   #ifdef CONFIG_FAIR_GROUP_SCHED
- -      alloc_size += 2 * nr_cpu_ids * sizeof(void **);
+ +      ptr += 2 * nr_cpu_ids * sizeof(void **);
   #endif
   #ifdef CONFIG_RT_GROUP_SCHED
- -      alloc_size += 2 * nr_cpu_ids * sizeof(void **);
+ +      ptr += 2 * nr_cpu_ids * sizeof(void **);
   #endif
- -      if (alloc_size) {
- -              ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
+ +      if (ptr) {
+ +              ptr = (unsigned long)kzalloc(ptr, GFP_NOWAIT);
   
   #ifdef CONFIG_FAIR_GROUP_SCHED
                 root_task_group.se = (struct sched_entity **)ptr;
@@@ -6892,7 -6702,7 +6892,7 @@@ struct task_struct *curr_task(int cpu
   
   #ifdef CONFIG_IA64
   /**
- - * set_curr_task - set the current task for a given CPU.
+ + * ia64_set_curr_task - set the current task for a given CPU.
    * @cpu: the processor in question.
    * @p: the task pointer to set.
    *
@@@ -6917,20 -6727,6 +6917,20 @@@ void ia64_set_curr_task(int cpu, struc
   /* task_group_lock serializes the addition/removal of task groups */
   static DEFINE_SPINLOCK(task_group_lock);
   
+ +static inline void alloc_uclamp_sched_group(struct task_group *tg,
+ +                                          struct task_group *parent)
+ +{
+ +#ifdef CONFIG_UCLAMP_TASK_GROUP
+ +      enum uclamp_id clamp_id;
+ +
+ +      for_each_clamp_id(clamp_id) {
+ +              uclamp_se_set(&tg->uclamp_req[clamp_id],
+ +                            uclamp_none(clamp_id), false);
+ +              tg->uclamp[clamp_id] = parent->uclamp[clamp_id];
+ +      }
+ +#endif
+ +}
+ +
   static void sched_free_group(struct task_group *tg)
   {
         free_fair_sched_group(tg);
@@@ -6954,8 -6750,6 +6954,8 @@@ struct task_group *sched_create_group(s
         if (!alloc_rt_sched_group(tg, parent))
                 goto err;
   
+ +      alloc_uclamp_sched_group(tg, parent);
+ +
         return tg;
   
   err:
@@@ -7059,7 -6853,7 +7059,7 @@@ void sched_move_task(struct task_struc
         if (queued)
                 enqueue_task(rq, tsk, queue_flags);
         if (running)
- -              set_curr_task(rq, tsk);
+ +              set_next_task(rq, tsk);
   
         task_rq_unlock(rq, tsk, &rf);
   }
@@@ -7142,6 -6936,10 +7142,6 @@@ static int cpu_cgroup_can_attach(struc
   #ifdef CONFIG_RT_GROUP_SCHED
                 if (!sched_rt_can_attach(css_tg(css), task))
                         return -EINVAL;
- -#else
- -              /* We don't support RT-tasks being in separate groups */
- -              if (task->sched_class != &fair_sched_class)
- -                      return -EINVAL;
   #endif
                 /*
                  * Serialize against wake_up_new_task() such that if its
@@@ -7172,178 -6970,6 +7172,178 @@@ static void cpu_cgroup_attach(struct cg
                 sched_move_task(task);
   }
   
+ +#ifdef CONFIG_UCLAMP_TASK_GROUP
+ +static void cpu_util_update_eff(struct cgroup_subsys_state *css)
+ +{
+ +      struct cgroup_subsys_state *top_css = css;
+ +      struct uclamp_se *uc_parent = NULL;
+ +      struct uclamp_se *uc_se = NULL;
+ +      unsigned int eff[UCLAMP_CNT];
+ +      enum uclamp_id clamp_id;
+ +      unsigned int clamps;
+ +
+ +      css_for_each_descendant_pre(css, top_css) {
+ +              uc_parent = css_tg(css)->parent
+ +                      ? css_tg(css)->parent->uclamp : NULL;
+ +
+ +              for_each_clamp_id(clamp_id) {
+ +                      /* Assume effective clamps matches requested clamps */
+ +                      eff[clamp_id] = css_tg(css)->uclamp_req[clamp_id].value;
+ +                      /* Cap effective clamps with parent's effective clamps */
+ +                      if (uc_parent &&
+ +                          eff[clamp_id] > uc_parent[clamp_id].value) {
+ +                              eff[clamp_id] = uc_parent[clamp_id].value;
+ +                      }
+ +              }
+ +              /* Ensure protection is always capped by limit */
+ +              eff[UCLAMP_MIN] = min(eff[UCLAMP_MIN], eff[UCLAMP_MAX]);
+ +
+ +              /* Propagate most restrictive effective clamps */
+ +              clamps = 0x0;
+ +              uc_se = css_tg(css)->uclamp;
+ +              for_each_clamp_id(clamp_id) {
+ +                      if (eff[clamp_id] == uc_se[clamp_id].value)
+ +                              continue;
+ +                      uc_se[clamp_id].value = eff[clamp_id];
+ +                      uc_se[clamp_id].bucket_id = uclamp_bucket_id(eff[clamp_id]);
+ +                      clamps |= (0x1 << clamp_id);
+ +              }
+ +              if (!clamps) {
+ +                      css = css_rightmost_descendant(css);
+ +                      continue;
+ +              }
+ +
+ +              /* Immediately update descendants RUNNABLE tasks */
+ +              uclamp_update_active_tasks(css, clamps);
+ +      }
+ +}
+ +
+ +/*
+ + * Integer 10^N with a given N exponent by casting to integer the literal "1eN"
+ + * C expression. Since there is no way to convert a macro argument (N) into a
+ + * character constant, use two levels of macros.
+ + */
+ +#define _POW10(exp) ((unsigned int)1e##exp)
+ +#define POW10(exp) _POW10(exp)
+ +
+ +struct uclamp_request {
+ +#define UCLAMP_PERCENT_SHIFT  2
+ +#define UCLAMP_PERCENT_SCALE  (100 * POW10(UCLAMP_PERCENT_SHIFT))
+ +      s64 percent;
+ +      u64 util;
+ +      int ret;
+ +};
+ +
+ +static inline struct uclamp_request
+ +capacity_from_percent(char *buf)
+ +{
+ +      struct uclamp_request req = {
+ +              .percent = UCLAMP_PERCENT_SCALE,
+ +              .util = SCHED_CAPACITY_SCALE,
+ +              .ret = 0,
+ +      };
+ +
+ +      buf = strim(buf);
+ +      if (strcmp(buf, "max")) {
+ +              req.ret = cgroup_parse_float(buf, UCLAMP_PERCENT_SHIFT,
+ +                                           &req.percent);
+ +              if (req.ret)
+ +                      return req;
+ +              if (req.percent > UCLAMP_PERCENT_SCALE) {
+ +                      req.ret = -ERANGE;
+ +                      return req;
+ +              }
+ +
+ +              req.util = req.percent << SCHED_CAPACITY_SHIFT;
+ +              req.util = DIV_ROUND_CLOSEST_ULL(req.util, UCLAMP_PERCENT_SCALE);
+ +      }
+ +
+ +      return req;
+ +}
+ +
+ +static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, char *buf,
+ +                              size_t nbytes, loff_t off,
+ +                              enum uclamp_id clamp_id)
+ +{
+ +      struct uclamp_request req;
+ +      struct task_group *tg;
+ +
+ +      req = capacity_from_percent(buf);
+ +      if (req.ret)
+ +              return req.ret;
+ +
+ +      mutex_lock(&uclamp_mutex);
+ +      rcu_read_lock();
+ +
+ +      tg = css_tg(of_css(of));
+ +      if (tg->uclamp_req[clamp_id].value != req.util)
+ +              uclamp_se_set(&tg->uclamp_req[clamp_id], req.util, false);
+ +
+ +      /*
+ +       * Because of not recoverable conversion rounding we keep track of the
+ +       * exact requested value
+ +       */
+ +      tg->uclamp_pct[clamp_id] = req.percent;
+ +
+ +      /* Update effective clamps to track the most restrictive value */
+ +      cpu_util_update_eff(of_css(of));
+ +
+ +      rcu_read_unlock();
+ +      mutex_unlock(&uclamp_mutex);
+ +
+ +      return nbytes;
+ +}
+ +
+ +static ssize_t cpu_uclamp_min_write(struct kernfs_open_file *of,
+ +                                  char *buf, size_t nbytes,
+ +                                  loff_t off)
+ +{
+ +      return cpu_uclamp_write(of, buf, nbytes, off, UCLAMP_MIN);
+ +}
+ +
+ +static ssize_t cpu_uclamp_max_write(struct kernfs_open_file *of,
+ +                                  char *buf, size_t nbytes,
+ +                                  loff_t off)
+ +{
+ +      return cpu_uclamp_write(of, buf, nbytes, off, UCLAMP_MAX);
+ +}
+ +
+ +static inline void cpu_uclamp_print(struct seq_file *sf,
+ +                                  enum uclamp_id clamp_id)
+ +{
+ +      struct task_group *tg;
+ +      u64 util_clamp;
+ +      u64 percent;
+ +      u32 rem;
+ +
+ +      rcu_read_lock();
+ +      tg = css_tg(seq_css(sf));
+ +      util_clamp = tg->uclamp_req[clamp_id].value;
+ +      rcu_read_unlock();
+ +
+ +      if (util_clamp == SCHED_CAPACITY_SCALE) {
+ +              seq_puts(sf, "max\n");
+ +              return;
+ +      }
+ +
+ +      percent = tg->uclamp_pct[clamp_id];
+ +      percent = div_u64_rem(percent, POW10(UCLAMP_PERCENT_SHIFT), &rem);
+ +      seq_printf(sf, "%llu.%0*u\n", percent, UCLAMP_PERCENT_SHIFT, rem);
+ +}
+ +
+ +static int cpu_uclamp_min_show(struct seq_file *sf, void *v)
+ +{
+ +      cpu_uclamp_print(sf, UCLAMP_MIN);
+ +      return 0;
+ +}
+ +
+ +static int cpu_uclamp_max_show(struct seq_file *sf, void *v)
+ +{
+ +      cpu_uclamp_print(sf, UCLAMP_MAX);
+ +      return 0;
+ +}
+ +#endif /* CONFIG_UCLAMP_TASK_GROUP */
+ +
   #ifdef CONFIG_FAIR_GROUP_SCHED
   static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
                                 struct cftype *cftype, u64 shareval)
@@@ -7688,20 -7314,6 +7688,20 @@@ static struct cftype cpu_legacy_files[
                 .read_u64 = cpu_rt_period_read_uint,
                 .write_u64 = cpu_rt_period_write_uint,
         },
+ +#endif
+ +#ifdef CONFIG_UCLAMP_TASK_GROUP
+ +      {
+ +              .name = "uclamp.min",
+ +              .flags = CFTYPE_NOT_ON_ROOT,
+ +              .seq_show = cpu_uclamp_min_show,
+ +              .write = cpu_uclamp_min_write,
+ +      },
+ +      {
+ +              .name = "uclamp.max",
+ +              .flags = CFTYPE_NOT_ON_ROOT,
+ +              .seq_show = cpu_uclamp_max_show,
+ +              .write = cpu_uclamp_max_write,
+ +      },
   #endif
         { }     /* Terminate */
   };
@@@ -7869,20 -7481,6 +7869,20 @@@ static struct cftype cpu_files[] = 
                 .seq_show = cpu_max_show,
                 .write = cpu_max_write,
         },
+ +#endif
+ +#ifdef CONFIG_UCLAMP_TASK_GROUP
+ +      {
+ +              .name = "uclamp.min",
+ +              .flags = CFTYPE_NOT_ON_ROOT,
+ +              .seq_show = cpu_uclamp_min_show,
+ +              .write = cpu_uclamp_min_write,
+ +      },
+ +      {
+ +              .name = "uclamp.max",
+ +              .flags = CFTYPE_NOT_ON_ROOT,
+ +              .seq_show = cpu_uclamp_max_show,
+ +              .write = cpu_uclamp_max_write,
+ +      },
   #endif
         { }     /* terminate */
   };
diff --combined kernel/sched/fair.c

index 8b665110a44ad53025f40eb4ea72555639b2e873,aff9d76d8d650de151c20415f47cb76cc9bdd777..1f0a5e1a90faf36d18277401df7d321c1dbfdb26
--- 1/kernel/sched/fair.c
--- 2/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@@ -96,12 -96,12 +96,12 @@@ int __weak arch_asym_cpu_priority(int c
   }
   
   /*
- - * The margin used when comparing utilization with CPU capacity:
- - * util * margin < capacity * 1024
+ + * The margin used when comparing utilization with CPU capacity.
    *
    * (default: ~20%)
    */
- -static unsigned int capacity_margin                   = 1280;
+ +#define fits_capacity(cap, max)       ((cap) * 1280 < (max) * 1024)
+ +
   #endif
   
   #ifdef CONFIG_CFS_BANDWIDTH
@@@ -1188,6 -1188,47 +1188,6 @@@ static unsigned int task_scan_max(struc
         return max(smin, smax);
   }
   
- -void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
- -{
- -      int mm_users = 0;
- -      struct mm_struct *mm = p->mm;
- -
- -      if (mm) {
- -              mm_users = atomic_read(&mm->mm_users);
- -              if (mm_users == 1) {
- -                      mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
- -                      mm->numa_scan_seq = 0;
- -              }
- -      }
- -      p->node_stamp                   = 0;
- -      p->numa_scan_seq                = mm ? mm->numa_scan_seq : 0;
- -      p->numa_scan_period             = sysctl_numa_balancing_scan_delay;
- -      p->numa_work.next               = &p->numa_work;
- -      p->numa_faults                  = NULL;
- -      RCU_INIT_POINTER(p->numa_group, NULL);
- -      p->last_task_numa_placement     = 0;
- -      p->last_sum_exec_runtime        = 0;
- -
- -      /* New address space, reset the preferred nid */
- -      if (!(clone_flags & CLONE_VM)) {
- -              p->numa_preferred_nid = NUMA_NO_NODE;
- -              return;
- -      }
- -
- -      /*
- -       * New thread, keep existing numa_preferred_nid which should be copied
- -       * already by arch_dup_task_struct but stagger when scans start.
- -       */
- -      if (mm) {
- -              unsigned int delay;
- -
- -              delay = min_t(unsigned int, task_scan_max(current),
- -                      current->numa_scan_period * mm_users * NSEC_PER_MSEC);
- -              delay += 2 * TICK_NSEC;
- -              p->node_stamp = delay;
- -      }
- -}
- -
   static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
   {
         rq->nr_numa_running += (p->numa_preferred_nid != NUMA_NO_NODE);
@@@ -2482,7 -2523,7 +2482,7 @@@ static void reset_ptenuma_scan(struct t
    * The expensive part of numa migration is done from task_work context.
    * Triggered from task_tick_numa().
    */
- -void task_numa_work(struct callback_head *work)
+ +static void task_numa_work(struct callback_head *work)
   {
         unsigned long migrate, next_scan, now = jiffies;
         struct task_struct *p = current;
@@@ -2495,7 -2536,7 +2495,7 @@@
   
         SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work));
   
- -      work->next = work; /* protect against double add */
+ +      work->next = work;
         /*
          * Who cares about NUMA placement when they're dying.
          *
@@@ -2624,50 -2665,6 +2624,50 @@@ out
         }
   }
   
+ +void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
+ +{
+ +      int mm_users = 0;
+ +      struct mm_struct *mm = p->mm;
+ +
+ +      if (mm) {
+ +              mm_users = atomic_read(&mm->mm_users);
+ +              if (mm_users == 1) {
+ +                      mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
+ +                      mm->numa_scan_seq = 0;
+ +              }
+ +      }
+ +      p->node_stamp                   = 0;
+ +      p->numa_scan_seq                = mm ? mm->numa_scan_seq : 0;
+ +      p->numa_scan_period             = sysctl_numa_balancing_scan_delay;
+ +      /* Protect against double add, see task_tick_numa and task_numa_work */
+ +      p->numa_work.next               = &p->numa_work;
+ +      p->numa_faults                  = NULL;
+ +      RCU_INIT_POINTER(p->numa_group, NULL);
+ +      p->last_task_numa_placement     = 0;
+ +      p->last_sum_exec_runtime        = 0;
+ +
+ +      init_task_work(&p->numa_work, task_numa_work);
+ +
+ +      /* New address space, reset the preferred nid */
+ +      if (!(clone_flags & CLONE_VM)) {
+ +              p->numa_preferred_nid = NUMA_NO_NODE;
+ +              return;
+ +      }
+ +
+ +      /*
+ +       * New thread, keep existing numa_preferred_nid which should be copied
+ +       * already by arch_dup_task_struct but stagger when scans start.
+ +       */
+ +      if (mm) {
+ +              unsigned int delay;
+ +
+ +              delay = min_t(unsigned int, task_scan_max(current),
+ +                      current->numa_scan_period * mm_users * NSEC_PER_MSEC);
+ +              delay += 2 * TICK_NSEC;
+ +              p->node_stamp = delay;
+ +      }
+ +}
+ +
   /*
    * Drive the periodic memory faults..
    */
@@@ -2696,8 -2693,10 +2696,8 @@@ static void task_tick_numa(struct rq *r
                         curr->numa_scan_period = task_scan_start(curr);
                 curr->node_stamp += period;
   
- -              if (!time_before(jiffies, curr->mm->numa_next_scan)) {
- -                      init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
+ +              if (!time_before(jiffies, curr->mm->numa_next_scan))
                         task_work_add(curr, work, true);
- -              }
         }
   }
   
@@@ -3690,6 -3689,8 +3690,6 @@@ static inline unsigned long cfs_rq_load
         return cfs_rq->avg.load_avg;
   }
   
- -static int idle_balance(struct rq *this_rq, struct rq_flags *rf);
- -
   static inline unsigned long task_util(struct task_struct *p)
   {
         return READ_ONCE(p->se.avg.util_avg);
@@@ -3806,7 -3807,7 +3806,7 @@@ util_est_dequeue(struct cfs_rq *cfs_rq
   
   static inline int task_fits_capacity(struct task_struct *p, long capacity)
   {
- -      return capacity * 1024 > task_util_est(p) * capacity_margin;
+ +      return fits_capacity(task_util_est(p), capacity);
   }
   
   static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
@@@ -4369,6 -4370,8 +4369,6 @@@ void __refill_cfs_bandwidth_runtime(str
   
         now = sched_clock_cpu(smp_processor_id());
         cfs_b->runtime = cfs_b->quota;
- -      cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
- -      cfs_b->expires_seq++;
   }
   
   static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
@@@ -4390,7 -4393,8 +4390,7 @@@ static int assign_cfs_rq_runtime(struc
   {
         struct task_group *tg = cfs_rq->tg;
         struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
- -      u64 amount = 0, min_amount, expires;
- -      int expires_seq;
+ +      u64 amount = 0, min_amount;
   
         /* note: this is a positive sum as runtime_remaining <= 0 */
         min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
@@@ -4407,17 -4411,61 +4407,17 @@@
                         cfs_b->idle = 0;
                 }
         }
- -      expires_seq = cfs_b->expires_seq;
- -      expires = cfs_b->runtime_expires;
         raw_spin_unlock(&cfs_b->lock);
   
         cfs_rq->runtime_remaining += amount;
- -      /*
- -       * we may have advanced our local expiration to account for allowed
- -       * spread between our sched_clock and the one on which runtime was
- -       * issued.
- -       */
- -      if (cfs_rq->expires_seq != expires_seq) {
- -              cfs_rq->expires_seq = expires_seq;
- -              cfs_rq->runtime_expires = expires;
- -      }
   
         return cfs_rq->runtime_remaining > 0;
   }
   
- -/*
- - * Note: This depends on the synchronization provided by sched_clock and the
- - * fact that rq->clock snapshots this value.
- - */
- -static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
- -{
- -      struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
- -
- -      /* if the deadline is ahead of our clock, nothing to do */
- -      if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0))
- -              return;
- -
- -      if (cfs_rq->runtime_remaining < 0)
- -              return;
- -
- -      /*
- -       * If the local deadline has passed we have to consider the
- -       * possibility that our sched_clock is 'fast' and the global deadline
- -       * has not truly expired.
- -       *
- -       * Fortunately we can check determine whether this the case by checking
- -       * whether the global deadline(cfs_b->expires_seq) has advanced.
- -       */
- -      if (cfs_rq->expires_seq == cfs_b->expires_seq) {
- -              /* extend local deadline, drift is bounded above by 2 ticks */
- -              cfs_rq->runtime_expires += TICK_NSEC;
- -      } else {
- -              /* global deadline is ahead, expiration has passed */
- -              cfs_rq->runtime_remaining = 0;
- -      }
- -}
- -
   static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
   {
         /* dock delta_exec before expiring quota (as it could span periods) */
         cfs_rq->runtime_remaining -= delta_exec;
- -      expire_cfs_rq_runtime(cfs_rq);
   
         if (likely(cfs_rq->runtime_remaining > 0))
                 return;
@@@ -4506,7 -4554,7 +4506,7 @@@ static void throttle_cfs_rq(struct cfs_
         struct rq *rq = rq_of(cfs_rq);
         struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
         struct sched_entity *se;
- -      long task_delta, dequeue = 1;
+ +      long task_delta, idle_task_delta, dequeue = 1;
         bool empty;
   
         se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
@@@ -4517,7 -4565,6 +4517,7 @@@
         rcu_read_unlock();
   
         task_delta = cfs_rq->h_nr_running;
+ +      idle_task_delta = cfs_rq->idle_h_nr_running;
         for_each_sched_entity(se) {
                 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
                 /* throttled entity or throttle-on-deactivate */
@@@ -4527,7 -4574,6 +4527,7 @@@
                 if (dequeue)
                         dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
                 qcfs_rq->h_nr_running -= task_delta;
+ +              qcfs_rq->idle_h_nr_running -= idle_task_delta;
   
                 if (qcfs_rq->load.weight)
                         dequeue = 0;
@@@ -4567,7 -4613,7 +4567,7 @@@ void unthrottle_cfs_rq(struct cfs_rq *c
         struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
         struct sched_entity *se;
         int enqueue = 1;
- -      long task_delta;
+ +      long task_delta, idle_task_delta;
   
         se = cfs_rq->tg->se[cpu_of(rq)];
   
@@@ -4587,7 -4633,6 +4587,7 @@@
                 return;
   
         task_delta = cfs_rq->h_nr_running;
+ +      idle_task_delta = cfs_rq->idle_h_nr_running;
         for_each_sched_entity(se) {
                 if (se->on_rq)
                         enqueue = 0;
@@@ -4596,7 -4641,6 +4596,7 @@@
                 if (enqueue)
                         enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
                 cfs_rq->h_nr_running += task_delta;
+ +              cfs_rq->idle_h_nr_running += idle_task_delta;
   
                 if (cfs_rq_throttled(cfs_rq))
                         break;
@@@ -4612,7 -4656,8 +4612,7 @@@
                 resched_curr(rq);
   }
   
- -static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
- -              u64 remaining, u64 expires)
+ +static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, u64 remaining)
   {
         struct cfs_rq *cfs_rq;
         u64 runtime;
@@@ -4634,6 -4679,7 +4634,6 @@@
                 remaining -= runtime;
   
                 cfs_rq->runtime_remaining += runtime;
- -              cfs_rq->runtime_expires = expires;
   
                 /* we check whether we're throttled above */
                 if (cfs_rq->runtime_remaining > 0)
@@@ -4658,7 -4704,7 +4658,7 @@@ next
    */
   static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, unsigned long flags)
   {
- -      u64 runtime, runtime_expires;
+ +      u64 runtime;
         int throttled;
   
         /* no need to continue the timer with no bandwidth constraint */
@@@ -4686,6 -4732,8 +4686,6 @@@
         /* account preceding periods in which throttling occurred */
         cfs_b->nr_throttled += overrun;
   
- -      runtime_expires = cfs_b->runtime_expires;
- -
         /*
          * This check is repeated as we are holding onto the new bandwidth while
          * we unthrottle. This can potentially race with an unthrottled group
@@@ -4698,7 -4746,8 +4698,7 @@@
                 cfs_b->distribute_running = 1;
                 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
                 /* we can't nest cfs_b->lock while distributing bandwidth */
- -              runtime = distribute_cfs_runtime(cfs_b, runtime,
- -                                               runtime_expires);
+ +              runtime = distribute_cfs_runtime(cfs_b, runtime);
                 raw_spin_lock_irqsave(&cfs_b->lock, flags);
   
                 cfs_b->distribute_running = 0;
@@@ -4780,7 -4829,8 +4780,7 @@@ static void __return_cfs_rq_runtime(str
                 return;
   
         raw_spin_lock(&cfs_b->lock);
- -      if (cfs_b->quota != RUNTIME_INF &&
- -          cfs_rq->runtime_expires == cfs_b->runtime_expires) {
+ +      if (cfs_b->quota != RUNTIME_INF) {
                 cfs_b->runtime += slack_runtime;
   
                 /* we are under rq->lock, defer unthrottling using a timer */
@@@ -4813,6 -4863,7 +4813,6 @@@ static void do_sched_cfs_slack_timer(st
   {
         u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
         unsigned long flags;
- -      u64 expires;
   
         /* confirm we're still not at a refresh boundary */
         raw_spin_lock_irqsave(&cfs_b->lock, flags);
@@@ -4830,6 -4881,7 +4830,6 @@@
         if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
                 runtime = cfs_b->runtime;
   
- -      expires = cfs_b->runtime_expires;
         if (runtime)
                 cfs_b->distribute_running = 1;
   
@@@ -4838,10 -4890,11 +4838,10 @@@
         if (!runtime)
                 return;
   
- -      runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
+ +      runtime = distribute_cfs_runtime(cfs_b, runtime);
   
         raw_spin_lock_irqsave(&cfs_b->lock, flags);
- -      if (expires == cfs_b->runtime_expires)
- -              lsub_positive(&cfs_b->runtime, runtime);
+ +      lsub_positive(&cfs_b->runtime, runtime);
         cfs_b->distribute_running = 0;
         raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
   }
@@@ -4998,6 -5051,8 +4998,6 @@@ void start_cfs_bandwidth(struct cfs_ban
   
         cfs_b->period_active = 1;
         overrun = hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
- -      cfs_b->runtime_expires += (overrun + 1) * ktime_to_ns(cfs_b->period);
- -      cfs_b->expires_seq++;
         hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
   }
   
@@@ -5175,7 -5230,7 +5175,7 @@@ static inline unsigned long cpu_util(in
   
   static inline bool cpu_overutilized(int cpu)
   {
- -      return (capacity_of(cpu) * 1024) < (cpu_util(cpu) * capacity_margin);
+ +      return !fits_capacity(cpu_util(cpu), capacity_of(cpu));
   }
   
   static inline void update_overutilized_status(struct rq *rq)
@@@ -5199,7 -5254,6 +5199,7 @@@ enqueue_task_fair(struct rq *rq, struc
   {
         struct cfs_rq *cfs_rq;
         struct sched_entity *se = &p->se;
+ +      int idle_h_nr_running = task_has_idle_policy(p);
   
         /*
          * The code below (indirectly) updates schedutil which looks at
@@@ -5232,7 -5286,6 +5232,7 @@@
                 if (cfs_rq_throttled(cfs_rq))
                         break;
                 cfs_rq->h_nr_running++;
+ +              cfs_rq->idle_h_nr_running += idle_h_nr_running;
   
                 flags = ENQUEUE_WAKEUP;
         }
@@@ -5240,7 -5293,6 +5240,7 @@@
         for_each_sched_entity(se) {
                 cfs_rq = cfs_rq_of(se);
                 cfs_rq->h_nr_running++;
+ +              cfs_rq->idle_h_nr_running += idle_h_nr_running;
   
                 if (cfs_rq_throttled(cfs_rq))
                         break;
@@@ -5302,7 -5354,6 +5302,7 @@@ static void dequeue_task_fair(struct r
         struct cfs_rq *cfs_rq;
         struct sched_entity *se = &p->se;
         int task_sleep = flags & DEQUEUE_SLEEP;
+ +      int idle_h_nr_running = task_has_idle_policy(p);
   
         for_each_sched_entity(se) {
                 cfs_rq = cfs_rq_of(se);
@@@ -5317,7 -5368,6 +5317,7 @@@
                 if (cfs_rq_throttled(cfs_rq))
                         break;
                 cfs_rq->h_nr_running--;
+ +              cfs_rq->idle_h_nr_running -= idle_h_nr_running;
   
                 /* Don't dequeue parent if it has other entities besides us */
                 if (cfs_rq->load.weight) {
@@@ -5337,7 -5387,6 +5337,7 @@@
         for_each_sched_entity(se) {
                 cfs_rq = cfs_rq_of(se);
                 cfs_rq->h_nr_running--;
+ +              cfs_rq->idle_h_nr_running -= idle_h_nr_running;
   
                 if (cfs_rq_throttled(cfs_rq))
                         break;
@@@ -5371,15 -5420,6 +5371,15 @@@ static struct 
   
   #endif /* CONFIG_NO_HZ_COMMON */
   
+ +/* CPU only has SCHED_IDLE tasks enqueued */
+ +static int sched_idle_cpu(int cpu)
+ +{
+ +      struct rq *rq = cpu_rq(cpu);
+ +
+ +      return unlikely(rq->nr_running == rq->cfs.idle_h_nr_running &&
+ +                      rq->nr_running);
+ +}
+ +
   static unsigned long cpu_runnable_load(struct rq *rq)
   {
         return cfs_rq_runnable_load_avg(&rq->cfs);
@@@ -5702,7 -5742,7 +5702,7 @@@ find_idlest_group_cpu(struct sched_grou
         unsigned int min_exit_latency = UINT_MAX;
         u64 latest_idle_timestamp = 0;
         int least_loaded_cpu = this_cpu;
- -      int shallowest_idle_cpu = -1;
+ +      int shallowest_idle_cpu = -1, si_cpu = -1;
         int i;
   
         /* Check if we have any choice: */
@@@ -5733,12 -5773,7 +5733,12 @@@
                                 latest_idle_timestamp = rq->idle_stamp;
                                 shallowest_idle_cpu = i;
                         }
- -              } else if (shallowest_idle_cpu == -1) {
+ +              } else if (shallowest_idle_cpu == -1 && si_cpu == -1) {
+ +                      if (sched_idle_cpu(i)) {
+ +                              si_cpu = i;
+ +                              continue;
+ +                      }
+ +
                         load = cpu_runnable_load(cpu_rq(i));
                         if (load < min_load) {
                                 min_load = load;
@@@ -5747,11 -5782,7 +5747,11 @@@
                 }
         }
   
- -      return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
+ +      if (shallowest_idle_cpu != -1)
+ +              return shallowest_idle_cpu;
+ +      if (si_cpu != -1)
+ +              return si_cpu;
+ +      return least_loaded_cpu;
   }
   
   static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p,
@@@ -5904,7 -5935,7 +5904,7 @@@ static int select_idle_core(struct task
    */
   static int select_idle_smt(struct task_struct *p, int target)
   {
- -      int cpu;
+ +      int cpu, si_cpu = -1;
   
         if (!static_branch_likely(&sched_smt_present))
                 return -1;
@@@ -5914,11 -5945,9 +5914,11 @@@
                         continue;
                 if (available_idle_cpu(cpu))
                         return cpu;
+ +              if (si_cpu == -1 && sched_idle_cpu(cpu))
+ +                      si_cpu = cpu;
         }
   
- -      return -1;
+ +      return si_cpu;
   }
   
   #else /* CONFIG_SCHED_SMT */
@@@ -5946,8 -5975,8 +5946,8 @@@ static int select_idle_cpu(struct task_
         u64 avg_cost, avg_idle;
         u64 time, cost;
         s64 delta;
- -      int cpu, nr = INT_MAX;
         int this = smp_processor_id();
+ +      int cpu, nr = INT_MAX, si_cpu = -1;
   
         this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
         if (!this_sd)
@@@ -5975,13 -6004,11 +5975,13 @@@
   
         for_each_cpu_wrap(cpu, sched_domain_span(sd), target) {
                 if (!--nr)
- -                      return -1;
+ +                      return si_cpu;
                 if (!cpumask_test_cpu(cpu, p->cpus_ptr))
                         continue;
                 if (available_idle_cpu(cpu))
                         break;
+ +              if (si_cpu == -1 && sched_idle_cpu(cpu))
+ +                      si_cpu = cpu;
         }
   
         time = cpu_clock(this) - time;
@@@ -6000,14 -6027,13 +6000,14 @@@ static int select_idle_sibling(struct t
         struct sched_domain *sd;
         int i, recent_used_cpu;
   
- -      if (available_idle_cpu(target))
+ +      if (available_idle_cpu(target) || sched_idle_cpu(target))
                 return target;
   
         /*
          * If the previous CPU is cache affine and idle, don't be stupid:
          */
- -      if (prev != target && cpus_share_cache(prev, target) && available_idle_cpu(prev))
+ +      if (prev != target && cpus_share_cache(prev, target) &&
+ +          (available_idle_cpu(prev) || sched_idle_cpu(prev)))
                 return prev;
   
         /* Check a recently used CPU as a potential idle candidate: */
@@@ -6015,7 -6041,7 +6015,7 @@@
         if (recent_used_cpu != prev &&
             recent_used_cpu != target &&
             cpus_share_cache(recent_used_cpu, target) &&
- -          available_idle_cpu(recent_used_cpu) &&
+ +          (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
             cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr)) {
                 /*
                  * Replace recent_used_cpu with prev as it is a potential
@@@ -6251,55 -6277,69 +6251,55 @@@ static unsigned long cpu_util_next(int 
   }
   
   /*
- - * compute_energy(): Estimates the energy that would be consumed if @p was
+ + * compute_energy(): Estimates the energy that @pd would consume if @p was
    * migrated to @dst_cpu. compute_energy() predicts what will be the utilization
- - * landscape of the * CPUs after the task migration, and uses the Energy Model
+ + * landscape of @pd's CPUs after the task migration, and uses the Energy Model
    * to compute what would be the energy if we decided to actually migrate that
    * task.
    */
   static long
   compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
   {
- -      unsigned int max_util, util_cfs, cpu_util, cpu_cap;
- -      unsigned long sum_util, energy = 0;
- -      struct task_struct *tsk;
+ +      struct cpumask *pd_mask = perf_domain_span(pd);
+ +      unsigned long cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask));
+ +      unsigned long max_util = 0, sum_util = 0;
         int cpu;
   
- -      for (; pd; pd = pd->next) {
- -              struct cpumask *pd_mask = perf_domain_span(pd);
+ +      /*
+ +       * The capacity state of CPUs of the current rd can be driven by CPUs
+ +       * of another rd if they belong to the same pd. So, account for the
+ +       * utilization of these CPUs too by masking pd with cpu_online_mask
+ +       * instead of the rd span.
+ +       *
+ +       * If an entire pd is outside of the current rd, it will not appear in
+ +       * its pd list and will not be accounted by compute_energy().
+ +       */
+ +      for_each_cpu_and(cpu, pd_mask, cpu_online_mask) {
+ +              unsigned long cpu_util, util_cfs = cpu_util_next(cpu, p, dst_cpu);
+ +              struct task_struct *tsk = cpu == dst_cpu ? p : NULL;
   
                 /*
- -               * The energy model mandates all the CPUs of a performance
- -               * domain have the same capacity.
+ +               * Busy time computation: utilization clamping is not
+ +               * required since the ratio (sum_util / cpu_capacity)
+ +               * is already enough to scale the EM reported power
+ +               * consumption at the (eventually clamped) cpu_capacity.
                  */
- -              cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask));
- -              max_util = sum_util = 0;
+ +              sum_util += schedutil_cpu_util(cpu, util_cfs, cpu_cap,
+ +                                             ENERGY_UTIL, NULL);
   
                 /*
- -               * The capacity state of CPUs of the current rd can be driven by
- -               * CPUs of another rd if they belong to the same performance
- -               * domain. So, account for the utilization of these CPUs too
- -               * by masking pd with cpu_online_mask instead of the rd span.
- -               *
- -               * If an entire performance domain is outside of the current rd,
- -               * it will not appear in its pd list and will not be accounted
- -               * by compute_energy().
+ +               * Performance domain frequency: utilization clamping
+ +               * must be considered since it affects the selection
+ +               * of the performance domain frequency.
+ +               * NOTE: in case RT tasks are running, by default the
+ +               * FREQUENCY_UTIL's utilization can be max OPP.
                  */
- -              for_each_cpu_and(cpu, pd_mask, cpu_online_mask) {
- -                      util_cfs = cpu_util_next(cpu, p, dst_cpu);
- -
- -                      /*
- -                       * Busy time computation: utilization clamping is not
- -                       * required since the ratio (sum_util / cpu_capacity)
- -                       * is already enough to scale the EM reported power
- -                       * consumption at the (eventually clamped) cpu_capacity.
- -                       */
- -                      sum_util += schedutil_cpu_util(cpu, util_cfs, cpu_cap,
- -                                                     ENERGY_UTIL, NULL);
- -
- -                      /*
- -                       * Performance domain frequency: utilization clamping
- -                       * must be considered since it affects the selection
- -                       * of the performance domain frequency.
- -                       * NOTE: in case RT tasks are running, by default the
- -                       * FREQUENCY_UTIL's utilization can be max OPP.
- -                       */
- -                      tsk = cpu == dst_cpu ? p : NULL;
- -                      cpu_util = schedutil_cpu_util(cpu, util_cfs, cpu_cap,
- -                                                    FREQUENCY_UTIL, tsk);
- -                      max_util = max(max_util, cpu_util);
- -              }
- -
- -              energy += em_pd_energy(pd->em_pd, max_util, sum_util);
+ +              cpu_util = schedutil_cpu_util(cpu, util_cfs, cpu_cap,
+ +                                            FREQUENCY_UTIL, tsk);
+ +              max_util = max(max_util, cpu_util);
         }
   
- -      return energy;
+ +      return em_pd_energy(pd->em_pd, max_util, sum_util);
   }
   
   /*
@@@ -6341,19 -6381,21 +6341,19 @@@
    * other use-cases too. So, until someone finds a better way to solve this,
    * let's keep things simple by re-using the existing slow path.
    */
- -
   static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
   {
- -      unsigned long prev_energy = ULONG_MAX, best_energy = ULONG_MAX;
+ +      unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX;
         struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
+ +      unsigned long cpu_cap, util, base_energy = 0;
         int cpu, best_energy_cpu = prev_cpu;
- -      struct perf_domain *head, *pd;
- -      unsigned long cpu_cap, util;
         struct sched_domain *sd;
+ +      struct perf_domain *pd;
   
         rcu_read_lock();
         pd = rcu_dereference(rd->pd);
         if (!pd || READ_ONCE(rd->overutilized))
                 goto fail;
- -      head = pd;
   
         /*
          * Energy-aware wake-up happens on the lowest sched_domain starting
@@@ -6370,14 -6412,9 +6370,14 @@@
                 goto unlock;
   
         for (; pd; pd = pd->next) {
- -              unsigned long cur_energy, spare_cap, max_spare_cap = 0;
+ +              unsigned long cur_delta, spare_cap, max_spare_cap = 0;
+ +              unsigned long base_energy_pd;
                 int max_spare_cap_cpu = -1;
   
+ +              /* Compute the 'base' energy of the pd, without @p */
+ +              base_energy_pd = compute_energy(p, -1, pd);
+ +              base_energy += base_energy_pd;
+ +
                 for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) {
                         if (!cpumask_test_cpu(cpu, p->cpus_ptr))
                                 continue;
@@@ -6385,14 -6422,14 +6385,14 @@@
                         /* Skip CPUs that will be overutilized. */
                         util = cpu_util_next(cpu, p, cpu);
                         cpu_cap = capacity_of(cpu);
- -                      if (cpu_cap * 1024 < util * capacity_margin)
+ +                      if (!fits_capacity(util, cpu_cap))
                                 continue;
   
                         /* Always use prev_cpu as a candidate. */
                         if (cpu == prev_cpu) {
- -                              prev_energy = compute_energy(p, prev_cpu, head);
- -                              best_energy = min(best_energy, prev_energy);
- -                              continue;
+ +                              prev_delta = compute_energy(p, prev_cpu, pd);
+ +                              prev_delta -= base_energy_pd;
+ +                              best_delta = min(best_delta, prev_delta);
                         }
   
                         /*
@@@ -6408,10 -6445,9 +6408,10 @@@
   
                 /* Evaluate the energy impact of using this CPU. */
                 if (max_spare_cap_cpu >= 0) {
- -                      cur_energy = compute_energy(p, max_spare_cap_cpu, head);
- -                      if (cur_energy < best_energy) {
- -                              best_energy = cur_energy;
+ +                      cur_delta = compute_energy(p, max_spare_cap_cpu, pd);
+ +                      cur_delta -= base_energy_pd;
+ +                      if (cur_delta < best_delta) {
+ +                              best_delta = cur_delta;
                                 best_energy_cpu = max_spare_cap_cpu;
                         }
                 }
@@@ -6423,10 -6459,10 +6423,10 @@@ unlock
          * Pick the best CPU if prev_cpu cannot be used, or if it saves at
          * least 6% of the energy used by prev_cpu.
          */
- -      if (prev_energy == ULONG_MAX)
+ +      if (prev_delta == ULONG_MAX)
                 return best_energy_cpu;
   
- -      if ((prev_energy - best_energy) > (prev_energy >> 4))
+ +      if ((prev_delta - best_delta) > ((prev_delta + base_energy) >> 4))
                 return best_energy_cpu;
   
         return prev_cpu;
@@@ -6760,7 -6796,7 +6760,7 @@@ again
                 goto idle;
   
   #ifdef CONFIG_FAIR_GROUP_SCHED
- -      if (prev->sched_class != &fair_sched_class)
+ +      if (!prev || prev->sched_class != &fair_sched_class)
                 goto simple;
   
         /*
@@@ -6837,8 -6873,8 +6837,8 @@@
         goto done;
   simple:
   #endif
- -
- -      put_prev_task(rq, prev);
+ +      if (prev)
+ +              put_prev_task(rq, prev);
   
         do {
                 se = pick_next_entity(cfs_rq, NULL);
@@@ -6866,13 -6902,11 +6866,13 @@@ done: __maybe_unused
         return p;
   
   idle:
- -      update_misfit_status(NULL, rq);
- -      new_tasks = idle_balance(rq, rf);
+ +      if (!rf)
+ +              return NULL;
+ +
+ +      new_tasks = newidle_balance(rq, rf);
   
         /*
- -       * Because idle_balance() releases (and re-acquires) rq->lock, it is
+ +       * Because newidle_balance() releases (and re-acquires) rq->lock, it is
          * possible for any higher priority task to appear. In that case we
          * must re-start the pick_next_entity() loop.
          */
@@@ -6894,7 -6928,7 +6894,7 @@@
   /*
    * Account for a descheduled task:
    */
- -static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
+ +static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
   {
         struct sched_entity *se = &prev->se;
         struct cfs_rq *cfs_rq;
@@@ -7396,7 -7430,7 +7396,7 @@@ static int detach_tasks(struct lb_env *
                 detached++;
                 env->imbalance -= load;
   
- #ifdef CONFIG_PREEMPT
+ #ifdef CONFIG_PREEMPTION
                 /*
                  * NEWIDLE balancing is a source of latency, so preemptible
                  * kernels will stop after the first task is detached to minimize
@@@ -7943,7 -7977,8 +7943,7 @@@ group_is_overloaded(struct lb_env *env
   static inline bool
   group_smaller_min_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
   {
- -      return sg->sgc->min_capacity * capacity_margin <
- -                                              ref->sgc->min_capacity * 1024;
+ +      return fits_capacity(sg->sgc->min_capacity, ref->sgc->min_capacity);
   }
   
   /*
@@@ -7953,7 -7988,8 +7953,7 @@@
   static inline bool
   group_smaller_max_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
   {
- -      return sg->sgc->max_capacity * capacity_margin <
- -                                              ref->sgc->max_capacity * 1024;
+ +      return fits_capacity(sg->sgc->max_capacity, ref->sgc->max_capacity);
   }
   
   static inline enum
@@@ -9011,10 -9047,9 +9011,10 @@@ more_balance
   out_balanced:
         /*
          * We reach balance although we may have faced some affinity
- -       * constraints. Clear the imbalance flag if it was set.
+ +       * constraints. Clear the imbalance flag only if other tasks got
+ +       * a chance to move and fix the imbalance.
          */
- -      if (sd_parent) {
+ +      if (sd_parent && !(env.flags & LBF_ALL_PINNED)) {
                 int *group_imbalance = &sd_parent->groups->sgc->imbalance;
   
                 if (*group_imbalance)
@@@ -9035,10 -9070,10 +9035,10 @@@ out_one_pinned
         ld_moved = 0;
   
         /*
- -       * idle_balance() disregards balance intervals, so we could repeatedly
- -       * reach this code, which would lead to balance_interval skyrocketting
- -       * in a short amount of time. Skip the balance_interval increase logic
- -       * to avoid that.
+ +       * newidle_balance() disregards balance intervals, so we could
+ +       * repeatedly reach this code, which would lead to balance_interval
+ +       * skyrocketting in a short amount of time. Skip the balance_interval
+ +       * increase logic to avoid that.
          */
         if (env.idle == CPU_NEWLY_IDLE)
                 goto out;
@@@ -9748,7 -9783,7 +9748,7 @@@ static inline void nohz_newidle_balance
    * idle_balance is called by schedule() if this_cpu is about to become
    * idle. Attempts to pull tasks from other CPUs.
    */
- -static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
+ +int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
   {
         unsigned long next_balance = jiffies + HZ;
         int this_cpu = this_rq->cpu;
@@@ -9756,7 -9791,6 +9756,7 @@@
         int pulled_task = 0;
         u64 curr_cost = 0;
   
+ +      update_misfit_status(NULL, this_rq);
         /*
          * We must set idle_stamp _before_ calling idle_balance(), such that we
          * measure the duration of idle_balance() as idle time.
@@@ -10141,19 -10175,9 +10141,19 @@@ static void switched_to_fair(struct rq 
    * This routine is mostly called to set cfs_rq->curr field when a task
    * migrates between groups/classes.
    */
- -static void set_curr_task_fair(struct rq *rq)
+ +static void set_next_task_fair(struct rq *rq, struct task_struct *p)
   {
- -      struct sched_entity *se = &rq->curr->se;
+ +      struct sched_entity *se = &p->se;
+ +
+ +#ifdef CONFIG_SMP
+ +      if (task_on_rq_queued(p)) {
+ +              /*
+ +               * Move the next running task to the front of the list, so our
+ +               * cfs_tasks list becomes MRU one.
+ +               */
+ +              list_move(&se->group_node, &rq->cfs_tasks);
+ +      }
+ +#endif
   
         for_each_sched_entity(se) {
                 struct cfs_rq *cfs_rq = cfs_rq_of(se);
@@@ -10271,18 -10295,18 +10271,18 @@@ err
   void online_fair_sched_group(struct task_group *tg)
   {
         struct sched_entity *se;
+ +      struct rq_flags rf;
         struct rq *rq;
         int i;
   
         for_each_possible_cpu(i) {
                 rq = cpu_rq(i);
                 se = tg->se[i];
- -
- -              raw_spin_lock_irq(&rq->lock);
+ +              rq_lock_irq(rq, &rf);
                 update_rq_clock(rq);
                 attach_entity_cfs_rq(se);
                 sync_throttle(tg, i);
- -              raw_spin_unlock_irq(&rq->lock);
+ +              rq_unlock_irq(rq, &rf);
         }
   }
   
@@@ -10424,9 -10448,7 +10424,9 @@@ const struct sched_class fair_sched_cla
         .check_preempt_curr     = check_preempt_wakeup,
   
         .pick_next_task         = pick_next_task_fair,
+ +
         .put_prev_task          = put_prev_task_fair,
+ +      .set_next_task          = set_next_task_fair,
   
   #ifdef CONFIG_SMP
         .select_task_rq         = select_task_rq_fair,
@@@ -10439,6 -10461,7 +10439,6 @@@
         .set_cpus_allowed       = set_cpus_allowed_common,
   #endif
   
- -      .set_curr_task          = set_curr_task_fair,
         .task_tick              = task_tick_fair,
         .task_fork              = task_fork_fair,
   
diff --combined kernel/sched/sched.h

index 00ff5b57e9cd7a70ec350479818dc663f8ed65a9,f2ce6ba1c5d5681d99ded1f51c2fe3a7940bc2b5..b3cb895d14a2088eef7a8853f00e7fc9b5823dd6
--- 1/kernel/sched/sched.h
--- 2/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@@ -335,6 -335,8 +335,6 @@@ struct cfs_bandwidth 
         u64                     quota;
         u64                     runtime;
         s64                     hierarchical_quota;
- -      u64                     runtime_expires;
- -      int                     expires_seq;
   
         u8                      idle;
         u8                      period_active;
@@@ -391,16 -393,6 +391,16 @@@ struct task_group 
   #endif
   
         struct cfs_bandwidth    cfs_bandwidth;
+ +
+ +#ifdef CONFIG_UCLAMP_TASK_GROUP
+ +      /* The two decimal precision [%] value requested from user-space */
+ +      unsigned int            uclamp_pct[UCLAMP_CNT];
+ +      /* Clamp values requested for a task group */
+ +      struct uclamp_se        uclamp_req[UCLAMP_CNT];
+ +      /* Effective clamp values used for a task group */
+ +      struct uclamp_se        uclamp[UCLAMP_CNT];
+ +#endif
+ +
   };
   
   #ifdef CONFIG_FAIR_GROUP_SCHED
@@@ -491,8 -483,7 +491,8 @@@ struct cfs_rq 
         struct load_weight      load;
         unsigned long           runnable_weight;
         unsigned int            nr_running;
- -      unsigned int            h_nr_running;
+ +      unsigned int            h_nr_running;      /* SCHED_{NORMAL,BATCH,IDLE} */
+ +      unsigned int            idle_h_nr_running; /* SCHED_IDLE */
   
         u64                     exec_clock;
         u64                     min_vruntime;
@@@ -565,6 -556,8 +565,6 @@@
   
   #ifdef CONFIG_CFS_BANDWIDTH
         int                     runtime_enabled;
- -      int                     expires_seq;
- -      u64                     runtime_expires;
         s64                     runtime_remaining;
   
         u64                     throttled_clock;
@@@ -784,6 -777,9 +784,6 @@@ struct root_domain 
         struct perf_domain __rcu *pd;
   };
   
- -extern struct root_domain def_root_domain;
- -extern struct mutex sched_domains_mutex;
- -
   extern void init_defrootdomain(void);
   extern int sched_init_domains(const struct cpumask *cpu_map);
   extern void rq_attach_root(struct rq *rq, struct root_domain *rd);
@@@ -1265,18 -1261,16 +1265,18 @@@ enum numa_topology_type 
   extern enum numa_topology_type sched_numa_topology_type;
   extern int sched_max_numa_distance;
   extern bool find_numa_distance(int distance);
- -#endif
- -
- -#ifdef CONFIG_NUMA
   extern void sched_init_numa(void);
   extern void sched_domains_numa_masks_set(unsigned int cpu);
   extern void sched_domains_numa_masks_clear(unsigned int cpu);
+ +extern int sched_numa_find_closest(const struct cpumask *cpus, int cpu);
   #else
   static inline void sched_init_numa(void) { }
   static inline void sched_domains_numa_masks_set(unsigned int cpu) { }
   static inline void sched_domains_numa_masks_clear(unsigned int cpu) { }
+ +static inline int sched_numa_find_closest(const struct cpumask *cpus, int cpu)
+ +{
+ +      return nr_cpu_ids;
+ +}
   #endif
   
   #ifdef CONFIG_NUMA_BALANCING
@@@ -1455,14 -1449,10 +1455,14 @@@ static inline void unregister_sched_dom
   }
   #endif
   
+ +extern int newidle_balance(struct rq *this_rq, struct rq_flags *rf);
+ +
   #else
   
   static inline void sched_ttwu_pending(void) { }
   
+ +static inline int newidle_balance(struct rq *this_rq, struct rq_flags *rf) { return 0; }
+ +
   #endif /* CONFIG_SMP */
   
   #include "stats.h"
@@@ -1710,21 -1700,17 +1710,21 @@@ struct sched_class 
         void (*check_preempt_curr)(struct rq *rq, struct task_struct *p, int flags);
   
         /*
- -       * It is the responsibility of the pick_next_task() method that will
- -       * return the next task to call put_prev_task() on the @prev task or
- -       * something equivalent.
+ +       * Both @prev and @rf are optional and may be NULL, in which case the
+ +       * caller must already have invoked put_prev_task(rq, prev, rf).
+ +       *
+ +       * Otherwise it is the responsibility of the pick_next_task() to call
+ +       * put_prev_task() on the @prev task or something equivalent, IFF it
+ +       * returns a next task.
          *
- -       * May return RETRY_TASK when it finds a higher prio class has runnable
- -       * tasks.
+ +       * In that case (@rf != NULL) it may return RETRY_TASK when it finds a
+ +       * higher prio class has runnable tasks.
          */
         struct task_struct * (*pick_next_task)(struct rq *rq,
                                                struct task_struct *prev,
                                                struct rq_flags *rf);
- -      void (*put_prev_task)(struct rq *rq, struct task_struct *p);
+ +      void (*put_prev_task)(struct rq *rq, struct task_struct *p, struct rq_flags *rf);
+ +      void (*set_next_task)(struct rq *rq, struct task_struct *p);
   
   #ifdef CONFIG_SMP
         int  (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
@@@ -1739,6 -1725,7 +1739,6 @@@
         void (*rq_offline)(struct rq *rq);
   #endif
   
- -      void (*set_curr_task)(struct rq *rq);
         void (*task_tick)(struct rq *rq, struct task_struct *p, int queued);
         void (*task_fork)(struct task_struct *p);
         void (*task_dead)(struct task_struct *p);
@@@ -1768,14 -1755,12 +1768,14 @@@
   
   static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
   {
- -      prev->sched_class->put_prev_task(rq, prev);
+ +      WARN_ON_ONCE(rq->curr != prev);
+ +      prev->sched_class->put_prev_task(rq, prev, NULL);
   }
   
- -static inline void set_curr_task(struct rq *rq, struct task_struct *curr)
+ +static inline void set_next_task(struct rq *rq, struct task_struct *next)
   {
- -      curr->sched_class->set_curr_task(rq);
+ +      WARN_ON_ONCE(rq->curr != next);
+ +      next->sched_class->set_next_task(rq, next);
   }
   
   #ifdef CONFIG_SMP
@@@ -1958,7 -1943,7 +1958,7 @@@ unsigned long arch_scale_freq_capacity(
   #endif
   
   #ifdef CONFIG_SMP
- #ifdef CONFIG_PREEMPT
+ #ifdef CONFIG_PREEMPTION
   
   static inline void double_rq_lock(struct rq *rq1, struct rq *rq2);
   
@@@ -2010,7 -1995,7 +2010,7 @@@ static inline int _double_lock_balance(
         return ret;
   }
   
- #endif /* CONFIG_PREEMPT */
+ #endif /* CONFIG_PREEMPTION */
   
   /*
    * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
@@@ -2281,7 -2266,7 +2281,7 @@@ static inline void cpufreq_update_util(
   #endif /* CONFIG_CPU_FREQ */
   
   #ifdef CONFIG_UCLAMP_TASK
- -unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id);
+ +enum uclamp_id uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id);
   
   static __always_inline
   unsigned int uclamp_util_with(struct rq *rq, unsigned int util,
author	Ingo Molnar <mingo@kernel.org>
	Mon, 16 Sep 2019 12:04:28 +0000 (14:04 +0200)
committer	Ingo Molnar <mingo@kernel.org>
	Mon, 16 Sep 2019 12:05:04 +0000 (14:05 +0200)
		1	2
MAINTAINERS	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/sched.h	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/events/core.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/rcu/tree.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sched/core.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sched/fair.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sched/sched.h	patch \|	diff1 \|	diff2 \|	blob \| history