Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel...
author     Linus Torvalds <torvalds@linux-foundation.org>
           Tue, 17 Sep 2019 00:25:49 +0000 (17:25 -0700)
committer  Linus Torvalds <torvalds@linux-foundation.org>
           Tue, 17 Sep 2019 00:25:49 +0000 (17:25 -0700)
Pull scheduler updates from Ingo Molnar:

 - MAINTAINERS: Add Mark Rutland as perf submaintainer, Juri Lelli and
   Vincent Guittot as scheduler submaintainers. Add Dietmar Eggemann,
   Steven Rostedt, Ben Segall and Mel Gorman as scheduler reviewers.

   As perf and the scheduler are getting bigger and more complex,
   document the status quo of current responsibilities and interests,
   and spread the review pain^H^H^H^H fun via an increase in the Cc:
   linecount generated by scripts/get_maintainer.pl. :-)

 - Add another series of patches that brings the -rt (PREEMPT_RT) tree
   closer to mainline: split the monolithic CONFIG_PREEMPT dependencies
   into a new CONFIG_PREEMPTION category that will allow the eventual
   introduction of CONFIG_PREEMPT_RT. Still a few more hundred patches
   to go though.

 - Extend the CPU cgroup controller with uclamp.min and uclamp.max to
   allow the finer shaping of CPU bandwidth usage. (A usage sketch
   follows this list.)

 - Speed up energy-aware wake-ups from O(CPUS^2) to O(CPUS).

 - Improve the behavior of high CPU count, high thread count
   applications running under cpu.cfs_quota_us constraints.

 - Improve balancing with SCHED_IDLE (SCHED_BATCH) tasks present.

 - Improve the NUMA locality of housekeeping CPU allocation when CPU
   isolation is used.

 - Fix deadline scheduler bandwidth calculations and logic when cpusets
   rebuild the topology, or when a task gets deadline-throttled while
   its CPU is being offlined.

 - Convert the cpuset_mutex to percpu_rwsem, to allow it to be used from
   setscheduler() system calls without creating global serialization.
   Add new synchronization between cpuset topology-changing events and
   the deadline acceptance tests in setscheduler(), which were broken
   before. (A locking sketch follows this list.)

 - Rework the active_mm state machine to be less confusing and more
   optimal.

 - Rework (simplify) the pick_next_task() slowpath.

 - Improve load-balancing on AMD EPYC systems.

 - ... and misc cleanups, smaller fixes and improvements - please see
   the Git log for more details.
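
   Usage sketch for the new CPU-controller clamp files (referenced from
   the uclamp bullet above). This is a minimal, illustrative C program,
   not part of the series: the cgroup path /sys/fs/cgroup/myapp and the
   example percentages are assumptions, and it presumes a cgroup v2
   hierarchy with the cpu controller enabled.

	#include <stdio.h>

	/* Write a value into one of the cgroup's cpu.uclamp.* files. */
	static int write_clamp(const char *cgroup, const char *file,
			       const char *val)
	{
		char path[256];
		FILE *f;

		snprintf(path, sizeof(path), "%s/%s", cgroup, file);
		f = fopen(path, "w");
		if (!f)
			return -1;
		fprintf(f, "%s\n", val);
		return fclose(f);
	}

	int main(void)
	{
		const char *cg = "/sys/fs/cgroup/myapp";  /* assumed to exist */

		/* Request at least 20% of CPU capacity for the group's tasks... */
		write_clamp(cg, "cpu.uclamp.min", "20");
		/* ...and cap their apparent utilization at 80%. */
		write_clamp(cg, "cpu.uclamp.max", "80");
		return 0;
	}

   Per-task clamps set via sched_setattr() remain available; with
   CONFIG_UCLAMP_TASK_GROUP enabled they are further restricted by the
   group-level values above ("Use TG's clamps to restrict TASK's clamps"
   in the shortlog below).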

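   Locking sketch for the cpuset_mutex -> percpu_rwsem conversion
   (referenced from the cpuset bullet above). This is only a schematic
   illustration of the percpu_rwsem API shape, with hypothetical names,
   not the actual cpuset code:

	#include <linux/percpu-rwsem.h>

	/* Hypothetical stand-in for the converted cpuset lock. */
	static DEFINE_STATIC_PERCPU_RWSEM(example_rwsem);

	/* Hot path (e.g. a setscheduler() caller doing deadline admission
	 * checks): readers take a cheap per-CPU read lock and do not
	 * globally serialize against each other. */
	static void example_read_side(void)
	{
		percpu_down_read(&example_rwsem);
		/* ... validate bandwidth against the current topology ... */
		percpu_up_read(&example_rwsem);
	}

	/* Slow path (e.g. a cpuset topology change): the writer waits for
	 * all readers to drain and then excludes them. */
	static void example_write_side(void)
	{
		percpu_down_write(&example_rwsem);
		/* ... rebuild scheduling domains ... */
		percpu_up_write(&example_rwsem);
	}
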
* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (53 commits)
  sched/psi: Correct overly pessimistic size calculation
  sched/fair: Speed-up energy-aware wake-ups
  sched/uclamp: Always use 'enum uclamp_id' for clamp_id values
  sched/uclamp: Update CPU's refcount on TG's clamp changes
  sched/uclamp: Use TG's clamps to restrict TASK's clamps
  sched/uclamp: Propagate system defaults to the root group
  sched/uclamp: Propagate parent clamps
  sched/uclamp: Extend CPU's cgroup controller
  sched/topology: Improve load balancing on AMD EPYC systems
  arch, ia64: Make NUMA select SMP
  sched, perf: MAINTAINERS update, add submaintainers and reviewers
  sched/fair: Use rq_lock/unlock in online_fair_sched_group
  cpufreq: schedutil: fix equation in comment
  sched: Rework pick_next_task() slow-path
  sched: Allow put_prev_task() to drop rq->lock
  sched/fair: Expose newidle_balance()
  sched: Add task_struct pointer to sched_class::set_curr_task
  sched: Rework CPU hotplug task selection
  sched/{rt,deadline}: Fix set_next_task vs pick_next_task
  sched: Fix kerneldoc comment for ia64_set_curr_task
  ...

23 files changed:
MAINTAINERS
arch/Kconfig
arch/ia64/Kconfig
arch/x86/entry/entry_64.S
arch/x86/kernel/cpu/amd.c
arch/x86/kernel/kvm.c
include/linux/rcupdate.h
include/linux/topology.h
init/Kconfig
kernel/cgroup/cgroup.c
kernel/events/core.c
kernel/kprobes.c
kernel/rcu/tree.c
kernel/rcu/tree_stall.h
kernel/sched/core.c
kernel/sched/cpufreq_schedutil.c
kernel/sched/deadline.c
kernel/sched/fair.c
kernel/sched/idle.c
kernel/sched/psi.c
kernel/trace/ftrace.c
kernel/trace/trace_events.c
mm/page_alloc.c

diff --combined MAINTAINERS
index cbe625343277ea0277e470e413c9cdbf79cabc04,3a5ef62c9dd11a70d6ecc5da353b46bf11770ebd..49f75d1b7b51a95d1177f6c207b9ecb0e3a1b8dc
@@@ -183,7 -183,7 +183,7 @@@ M: Realtek linux nic maintainers <nic_s
  M:    Heiner Kallweit <hkallweit1@gmail.com>
  L:    netdev@vger.kernel.org
  S:    Maintained
 -F:    drivers/net/ethernet/realtek/r8169.c
 +F:    drivers/net/ethernet/realtek/r8169*
  
  8250/16?50 (AND CLONE UARTS) SERIAL DRIVER
  M:    Greg Kroah-Hartman <gregkh@linuxfoundation.org>
@@@ -517,6 -517,14 +517,6 @@@ W:        http://ez.analog.com/community/linux
  S:    Supported
  F:    drivers/video/backlight/adp8860_bl.c
  
 -ADS1015 HARDWARE MONITOR DRIVER
 -M:    Dirk Eibach <eibach@gdsys.de>
 -L:    linux-hwmon@vger.kernel.org
 -S:    Maintained
 -F:    Documentation/hwmon/ads1015.rst
 -F:    drivers/hwmon/ads1015.c
 -F:    include/linux/platform_data/ads1015.h
 -
  ADT746X FAN DRIVER
  M:    Colin Leroy <colin@colino.net>
  S:    Maintained
@@@ -675,7 -683,7 +675,7 @@@ S: Maintaine
  F:    drivers/crypto/sunxi-ss/
  
  ALLWINNER VPU DRIVER
 -M:    Maxime Ripard <maxime.ripard@bootlin.com>
 +M:    Maxime Ripard <mripard@kernel.org>
  M:    Paul Kocialkowski <paul.kocialkowski@bootlin.com>
  L:    linux-media@vger.kernel.org
  S:    Maintained
@@@ -1342,7 -1350,8 +1342,7 @@@ M:      Will Deacon <will@kernel.org
  R:    Robin Murphy <robin.murphy@arm.com>
  L:    linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
  S:    Maintained
 -F:    drivers/iommu/arm-smmu.c
 -F:    drivers/iommu/arm-smmu-v3.c
 +F:    drivers/iommu/arm-smmu*
  F:    drivers/iommu/io-pgtable-arm.c
  F:    drivers/iommu/io-pgtable-arm-v7s.c
  
@@@ -1399,7 -1408,7 +1399,7 @@@ S:      Maintaine
  F:    drivers/clk/sunxi/
  
  ARM/Allwinner sunXi SoC support
 -M:    Maxime Ripard <maxime.ripard@bootlin.com>
 +M:    Maxime Ripard <mripard@kernel.org>
  M:    Chen-Yu Tsai <wens@csie.org>
  L:    linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
  S:    Maintained
@@@ -1617,21 -1626,6 +1617,21 @@@ F:    drivers/clocksource/timer-atlas7.
  N:    [^a-z]sirf
  X:    drivers/gnss
  
 +ARM/CZ.NIC TURRIS MOX SUPPORT
 +M:    Marek Behun <marek.behun@nic.cz>
 +W:    http://mox.turris.cz
 +S:    Maintained
 +F:    Documentation/ABI/testing/debugfs-moxtet
 +F:    Documentation/ABI/testing/sysfs-bus-moxtet-devices
 +F:    Documentation/ABI/testing/sysfs-firmware-turris-mox-rwtm
 +F:    Documentation/devicetree/bindings/bus/moxtet.txt
 +F:    Documentation/devicetree/bindings/firmware/cznic,turris-mox-rwtm.txt
 +F:    Documentation/devicetree/bindings/gpio/gpio-moxtet.txt
 +F:    include/linux/moxtet.h
 +F:    drivers/bus/moxtet.c
 +F:    drivers/firmware/turris-mox-rwtm.c
 +F:    drivers/gpio/gpio-moxtet.c
 +
  ARM/EBSA110 MACHINE SUPPORT
  M:    Russell King <linux@armlinux.org.uk>
  L:    linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
@@@ -1755,11 -1749,20 +1755,11 @@@ L:   linux-arm-kernel@lists.infradead.or
  S:    Maintained
  F:    arch/arm/mach-pxa/colibri-pxa270-income.c
  
 -ARM/INTEL IOP13XX ARM ARCHITECTURE
 -M:    Lennert Buytenhek <kernel@wantstofly.org>
 -L:    linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
 -S:    Maintained
 -
  ARM/INTEL IOP32X ARM ARCHITECTURE
  M:    Lennert Buytenhek <kernel@wantstofly.org>
  L:    linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
  S:    Maintained
  
 -ARM/INTEL IOP33X ARM ARCHITECTURE
 -L:    linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
 -S:    Orphan
 -
  ARM/INTEL IQ81342EX MACHINE SUPPORT
  M:    Lennert Buytenhek <kernel@wantstofly.org>
  L:    linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
@@@ -1918,6 -1921,12 +1918,6 @@@ S:     Maintaine
  F:    drivers/phy/mediatek/
  F:    Documentation/devicetree/bindings/phy/phy-mtk-*
  
 -ARM/MICREL KS8695 ARCHITECTURE
 -M:    Greg Ungerer <gerg@uclinux.org>
 -L:    linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
 -F:    arch/arm/mach-ks8695/
 -S:    Odd Fixes
 -
  ARM/Microchip (AT91) SoC support
  M:    Nicolas Ferre <nicolas.ferre@microchip.com>
  M:    Alexandre Belloni <alexandre.belloni@bootlin.com>
@@@ -1959,7 -1968,6 +1959,7 @@@ F:      Documentation/devicetree/bindings/i2
  F:    arch/arm/mach-nomadik/
  F:    arch/arm/mach-u300/
  F:    arch/arm/mach-ux500/
 +F:    drivers/soc/ux500/
  F:    arch/arm/boot/dts/ste-*
  F:    drivers/clk/clk-nomadik.c
  F:    drivers/clk/clk-u300.c
@@@ -2003,6 -2011,22 +2003,6 @@@ F:     drivers/*/*npcm
  F:    Documentation/devicetree/bindings/*/*npcm*
  F:    Documentation/devicetree/bindings/*/*/*npcm*
  
 -ARM/NUVOTON W90X900 ARM ARCHITECTURE
 -M:    Wan ZongShun <mcuos.com@gmail.com>
 -L:    linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
 -W:    http://www.mcuos.com
 -S:    Maintained
 -F:    arch/arm/mach-w90x900/
 -F:    drivers/input/keyboard/w90p910_keypad.c
 -F:    drivers/input/touchscreen/w90p910_ts.c
 -F:    drivers/watchdog/nuc900_wdt.c
 -F:    drivers/net/ethernet/nuvoton/w90p910_ether.c
 -F:    drivers/mtd/nand/raw/nuc900_nand.c
 -F:    drivers/rtc/rtc-nuc900.c
 -F:    drivers/spi/spi-nuc900.c
 -F:    drivers/usb/host/ehci-w90x900.c
 -F:    drivers/video/fbdev/nuc900fb.c
 -
  ARM/OPENMOKO NEO FREERUNNER (GTA02) MACHINE SUPPORT
  L:    openmoko-kernel@lists.openmoko.org (subscribers-only)
  W:    http://wiki.openmoko.org/wiki/Neo_FreeRunner
@@@ -2131,12 -2155,10 +2131,12 @@@ F:   Documentation/devicetree/bindings/ar
  
  ARM/RENESAS ARM64 ARCHITECTURE
  M:    Simon Horman <horms@verge.net.au>
 +M:    Geert Uytterhoeven <geert+renesas@glider.be>
  M:    Magnus Damm <magnus.damm@gmail.com>
  L:    linux-renesas-soc@vger.kernel.org
  Q:    http://patchwork.kernel.org/project/linux-renesas-soc/list/
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/horms/renesas.git next
 +T:    git git://git.kernel.org/pub/scm/linux/kernel/git/geert/renesas-devel.git next
  S:    Supported
  F:    arch/arm64/boot/dts/renesas/
  F:    Documentation/devicetree/bindings/arm/renesas.yaml
@@@ -2195,9 -2217,8 +2195,9 @@@ F:      drivers/*/*s3c24
  F:    drivers/*/*/*s3c24*
  F:    drivers/*/*s3c64xx*
  F:    drivers/*/*s5pv210*
 -F:    drivers/memory/samsung/*
 -F:    drivers/soc/samsung/*
 +F:    drivers/memory/samsung/
 +F:    drivers/soc/samsung/
 +F:    include/linux/soc/samsung/
  F:    Documentation/arm/samsung/
  F:    Documentation/devicetree/bindings/arm/samsung/
  F:    Documentation/devicetree/bindings/sram/samsung-sram.txt
@@@ -2248,12 -2269,10 +2248,12 @@@ F:   drivers/media/platform/s5p-mfc
  
  ARM/SHMOBILE ARM ARCHITECTURE
  M:    Simon Horman <horms@verge.net.au>
 +M:    Geert Uytterhoeven <geert+renesas@glider.be>
  M:    Magnus Damm <magnus.damm@gmail.com>
  L:    linux-renesas-soc@vger.kernel.org
  Q:    http://patchwork.kernel.org/project/linux-renesas-soc/list/
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/horms/renesas.git next
 +T:    git git://git.kernel.org/pub/scm/linux/kernel/git/geert/renesas-devel.git next
  S:    Supported
  F:    arch/arm/boot/dts/emev2*
  F:    arch/arm/boot/dts/gr-peach*
@@@ -3554,7 -3573,7 +3554,7 @@@ F:      Documentation/filesystems/caching/ca
  F:    fs/cachefiles/
  
  CADENCE MIPI-CSI2 BRIDGES
 -M:    Maxime Ripard <maxime.ripard@bootlin.com>
 +M:    Maxime Ripard <mripard@kernel.org>
  L:    linux-media@vger.kernel.org
  S:    Maintained
  F:    Documentation/devicetree/bindings/media/cdns,*.txt
@@@ -4267,14 -4286,6 +4267,14 @@@ S:    Supporte
  F:    drivers/cpuidle/cpuidle-exynos.c
  F:    arch/arm/mach-exynos/pm.c
  
 +CPUIDLE DRIVER - ARM PSCI
 +M:    Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
 +M:    Sudeep Holla <sudeep.holla@arm.com>
 +L:    linux-pm@vger.kernel.org
 +L:    linux-arm-kernel@lists.infradead.org
 +S:    Supported
 +F:    drivers/cpuidle/cpuidle-psci.c
 +
  CPU IDLE TIME MANAGEMENT FRAMEWORK
  M:    "Rafael J. Wysocki" <rjw@rjwysocki.net>
  M:    Daniel Lezcano <daniel.lezcano@linaro.org>
@@@ -5280,7 -5291,7 +5280,7 @@@ F:      include/linux/vga
  
  DRM DRIVERS AND MISC GPU PATCHES
  M:    Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
 -M:    Maxime Ripard <maxime.ripard@bootlin.com>
 +M:    Maxime Ripard <mripard@kernel.org>
  M:    Sean Paul <sean@poorly.run>
  W:    https://01.org/linuxgraphics/gfx-docs/maintainer-tools/drm-misc.html
  S:    Maintained
@@@ -5293,7 -5304,7 +5293,7 @@@ F:      include/uapi/drm/drm
  F:    include/linux/vga*
  
  DRM DRIVERS FOR ALLWINNER A10
 -M:    Maxime Ripard  <maxime.ripard@bootlin.com>
 +M:    Maxime Ripard <mripard@kernel.org>
  L:    dri-devel@lists.freedesktop.org
  S:    Supported
  F:    drivers/gpu/drm/sun4i/
@@@ -5746,11 -5757,6 +5746,11 @@@ S:    Supporte
  F:    drivers/edac/aspeed_edac.c
  F:    Documentation/devicetree/bindings/edac/aspeed-sdram-edac.txt
  
 +EDAC-BLUEFIELD
 +M:    Shravan Kumar Ramani <sramani@mellanox.com>
 +S:    Supported
 +F:    drivers/edac/bluefield_edac.c
 +
  EDAC-CALXEDA
  M:    Robert Richter <rric@kernel.org>
  L:    linux-edac@vger.kernel.org
@@@ -5775,11 -5781,10 +5775,11 @@@ F:   drivers/edac/thunderx_edac
  EDAC-CORE
  M:    Borislav Petkov <bp@alien8.de>
  M:    Mauro Carvalho Chehab <mchehab@kernel.org>
 +M:    Tony Luck <tony.luck@intel.com>
  R:    James Morse <james.morse@arm.com>
 +R:    Robert Richter <rrichter@marvell.com>
  L:    linux-edac@vger.kernel.org
 -T:    git git://git.kernel.org/pub/scm/linux/kernel/git/bp/bp.git for-next
 -T:    git git://git.kernel.org/pub/scm/linux/kernel/git/mchehab/linux-edac.git linux_next
 +T:    git git://git.kernel.org/pub/scm/linux/kernel/git/ras/ras.git edac-for-next
  S:    Supported
  F:    Documentation/admin-guide/ras.rst
  F:    Documentation/driver-api/edac.rst
@@@ -6056,7 -6061,7 +6056,7 @@@ M:      Florian Fainelli <f.fainelli@gmail.c
  M:    Heiner Kallweit <hkallweit1@gmail.com>
  L:    netdev@vger.kernel.org
  S:    Maintained
 -F:    Documentation/ABI/testing/sysfs-bus-mdio
 +F:    Documentation/ABI/testing/sysfs-class-net-phydev
  F:    Documentation/devicetree/bindings/net/ethernet-phy.yaml
  F:    Documentation/devicetree/bindings/net/mdio*
  F:    Documentation/networking/phy.rst
@@@ -6317,16 -6322,24 +6317,16 @@@ F:   Documentation/devicetree/bindings/co
  F:    drivers/counter/ftm-quaddec.c
  
  FLOPPY DRIVER
 -S:    Orphan
 +M:    Denis Efremov <efremov@linux.com>
 +S:    Odd Fixes
  L:    linux-block@vger.kernel.org
  F:    drivers/block/floppy.c
  
 -FMC SUBSYSTEM
 -M:    Alessandro Rubini <rubini@gnudd.com>
 -W:    http://www.ohwr.org/projects/fmc-bus
 -S:    Supported
 -F:    drivers/fmc/
 -F:    include/linux/fmc*.h
 -F:    include/linux/ipmi-fru.h
 -K:    fmc_d.*register
 -
  FPGA MANAGER FRAMEWORK
  M:    Moritz Fischer <mdf@kernel.org>
  L:    linux-fpga@vger.kernel.org
  S:    Maintained
 -T:    git git://git.kernel.org/pub/scm/linux/kernel/git/atull/linux-fpga.git
 +T:    git git://git.kernel.org/pub/scm/linux/kernel/git/mdf/linux-fpga.git
  Q:    http://patchwork.kernel.org/project/linux-fpga/list/
  F:    Documentation/fpga/
  F:    Documentation/driver-api/fpga/
@@@ -6359,7 -6372,7 +6359,7 @@@ FRAMEBUFFER LAYE
  M:    Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
  L:    dri-devel@lists.freedesktop.org
  L:    linux-fbdev@vger.kernel.org
 -T:    git git://github.com/bzolnier/linux.git
 +T:    git git://anongit.freedesktop.org/drm/drm-misc
  Q:    http://patchwork.kernel.org/project/linux-fbdev/list/
  S:    Maintained
  F:    Documentation/fb/
@@@ -6421,17 -6434,8 +6421,17 @@@ M:    Frank Li <Frank.li@nxp.com
  L:    linux-arm-kernel@lists.infradead.org
  S:    Maintained
  F:    drivers/perf/fsl_imx8_ddr_perf.c
 +F:    Documentation/admin-guide/perf/imx-ddr.rst
  F:    Documentation/devicetree/bindings/perf/fsl-imx-ddr.txt
  
 +FREESCALE IMX I2C DRIVER
 +M:    Oleksij Rempel <o.rempel@pengutronix.de>
 +R:    Pengutronix Kernel Team <kernel@pengutronix.de>
 +L:    linux-i2c@vger.kernel.org
 +S:    Maintained
 +F:    drivers/i2c/busses/i2c-imx.c
 +F:    Documentation/devicetree/bindings/i2c/i2c-imx.txt
 +
  FREESCALE IMX LPI2C DRIVER
  M:    Dong Aisheng <aisheng.dong@nxp.com>
  L:    linux-i2c@vger.kernel.org
@@@ -6715,13 -6719,6 +6715,13 @@@ W:    https://linuxtv.or
  S:    Maintained
  F:    drivers/media/radio/radio-gemtek*
  
 +GENERIC ARCHITECTURE TOPOLOGY
 +M:    Sudeep Holla <sudeep.holla@arm.com>
 +L:    linux-kernel@vger.kernel.org
 +S:    Maintained
 +F:    drivers/base/arch_topology.c
 +F:    include/linux/arch_topology.h
 +
  GENERIC GPIO I2C DRIVER
  M:    Wolfram Sang <wsa+renesas@sang-engineering.com>
  S:    Supported
@@@ -6825,6 -6822,13 +6825,6 @@@ F:     Documentation/filesystems/gfs2*.tx
  F:    fs/gfs2/
  F:    include/uapi/linux/gfs2_ondisk.h
  
 -GIGASET ISDN DRIVERS
 -M:    Paul Bolle <pebolle@tiscali.nl>
 -L:    gigaset307x-common@lists.sourceforge.net
 -W:    http://gigaset307x.sourceforge.net/
 -S:    Odd Fixes
 -F:    drivers/staging/isdn/gigaset/
 -
  GNSS SUBSYSTEM
  M:    Johan Hovold <johan@kernel.org>
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/johan/gnss.git
@@@ -7450,7 -7454,7 +7450,7 @@@ F:      drivers/net/hyperv
  F:    drivers/scsi/storvsc_drv.c
  F:    drivers/uio/uio_hv_generic.c
  F:    drivers/video/fbdev/hyperv_fb.c
 -F:    drivers/iommu/hyperv_iommu.c
 +F:    drivers/iommu/hyperv-iommu.c
  F:    net/vmw_vsock/hyperv_transport.c
  F:    include/clocksource/hyperv_timer.h
  F:    include/linux/hyperv.h
@@@ -7503,7 -7507,7 +7503,7 @@@ I2C MV64XXX MARVELL AND ALLWINNER DRIVE
  M:    Gregory CLEMENT <gregory.clement@bootlin.com>
  L:    linux-i2c@vger.kernel.org
  S:    Maintained
 -F:    Documentation/devicetree/bindings/i2c/i2c-mv64xxx.txt
 +F:    Documentation/devicetree/bindings/i2c/marvell,mv64xxx-i2c.yaml
  F:    drivers/i2c/busses/i2c-mv64xxx.c
  
  I2C OVER PARALLEL PORT
@@@ -8040,7 -8044,6 +8040,7 @@@ S:      Maintaine
  F:    drivers/video/fbdev/i810/
  
  INTEL ASoC DRIVERS
 +M:    Cezary Rojewski <cezary.rojewski@intel.com>
  M:    Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
  M:    Liam Girdwood <liam.r.girdwood@linux.intel.com>
  M:    Jie Yang <yang.jie@linux.intel.com>
@@@ -8062,13 -8065,6 +8062,13 @@@ T:    git git://git.code.sf.net/p/intel-sa
  S:    Supported
  F:    drivers/scsi/isci/
  
 +INTEL CPU family model numbers
 +M:    Tony Luck <tony.luck@intel.com>
 +M:    x86@kernel.org
 +L:    linux-kernel@vger.kernel.org
 +S:    Supported
 +F:    arch/x86/include/asm/intel-family.h
 +
  INTEL DRM DRIVERS (excluding Poulsbo, Moorestown and derivative chipsets)
  M:    Jani Nikula <jani.nikula@linux.intel.com>
  M:    Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
@@@ -8375,6 -8371,12 +8375,6 @@@ F:     Documentation/x86/intel_txt.rs
  F:    include/linux/tboot.h
  F:    arch/x86/kernel/tboot.c
  
 -INTEL-MID GPIO DRIVER
 -M:    David Cohen <david.a.cohen@linux.intel.com>
 -L:    linux-gpio@vger.kernel.org
 -S:    Maintained
 -F:    drivers/gpio/gpio-intel-mid.c
 -
  INTERCONNECT API
  M:    Georgi Djakov <georgi.djakov@linaro.org>
  L:    linux-pm@vger.kernel.org
@@@ -8399,6 -8401,12 +8399,6 @@@ L:     linux-mips@vger.kernel.or
  S:    Maintained
  F:    drivers/net/ethernet/sgi/ioc3-eth.c
  
 -IOC3 SERIAL DRIVER
 -M:    Pat Gefre <pfg@sgi.com>
 -L:    linux-serial@vger.kernel.org
 -S:    Maintained
 -F:    drivers/tty/serial/ioc3_serial.c
 -
  IOMAP FILESYSTEM LIBRARY
  M:    Christoph Hellwig <hch@infradead.org>
  M:    Darrick J. Wong <darrick.wong@oracle.com>
@@@ -8408,6 -8416,7 +8408,6 @@@ L:      linux-xfs@vger.kernel.or
  L:    linux-fsdevel@vger.kernel.org
  T:    git git://git.kernel.org/pub/scm/fs/xfs/xfs-linux.git
  S:    Supported
 -F:    fs/iomap.c
  F:    fs/iomap/
  F:    include/linux/iomap.h
  
@@@ -8432,6 -8441,11 +8432,6 @@@ S:     Maintaine
  F:    fs/io_uring.c
  F:    include/uapi/linux/io_uring.h
  
 -IP MASQUERADING
 -M:    Juanjo Ciarlante <jjciarla@raiz.uncu.edu.ar>
 -S:    Maintained
 -F:    net/ipv4/netfilter/ipt_MASQUERADE.c
 -
  IPMI SUBSYSTEM
  M:    Corey Minyard <minyard@acm.org>
  L:    openipmi-developer@lists.sourceforge.net (moderated for non-subscribers)
@@@ -8805,6 -8819,14 +8805,6 @@@ F:     virt/kvm/
  F:    tools/kvm/
  F:    tools/testing/selftests/kvm/
  
 -KERNEL VIRTUAL MACHINE FOR AMD-V (KVM/amd)
 -M:    Joerg Roedel <joro@8bytes.org>
 -L:    kvm@vger.kernel.org
 -W:    http://www.linux-kvm.org/
 -S:    Maintained
 -F:    arch/x86/include/asm/svm.h
 -F:    arch/x86/kvm/svm.c
 -
  KERNEL VIRTUAL MACHINE FOR ARM/ARM64 (KVM/arm, KVM/arm64)
  M:    Marc Zyngier <maz@kernel.org>
  R:    James Morse <james.morse@arm.com>
@@@ -8847,7 -8869,7 +8847,7 @@@ M:      Christian Borntraeger <borntraeger@d
  M:    Janosch Frank <frankja@linux.ibm.com>
  R:    David Hildenbrand <david@redhat.com>
  R:    Cornelia Huck <cohuck@redhat.com>
 -L:    linux-s390@vger.kernel.org
 +L:    kvm@vger.kernel.org
  W:    http://www.ibm.com/developerworks/linux/linux390/
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/kvms390/linux.git
  S:    Supported
@@@ -8862,11 -8884,6 +8862,11 @@@ F:    tools/testing/selftests/kvm/*/s390x
  KERNEL VIRTUAL MACHINE FOR X86 (KVM/x86)
  M:    Paolo Bonzini <pbonzini@redhat.com>
  M:    Radim Krčmář <rkrcmar@redhat.com>
 +R:    Sean Christopherson <sean.j.christopherson@intel.com>
 +R:    Vitaly Kuznetsov <vkuznets@redhat.com>
 +R:    Wanpeng Li <wanpengli@tencent.com>
 +R:    Jim Mattson <jmattson@google.com>
 +R:    Joerg Roedel <joro@8bytes.org>
  L:    kvm@vger.kernel.org
  W:    http://www.linux-kvm.org
  T:    git git://git.kernel.org/pub/scm/virt/kvm/kvm.git
@@@ -8874,12 -8891,8 +8874,12 @@@ S:    Supporte
  F:    arch/x86/kvm/
  F:    arch/x86/kvm/*/
  F:    arch/x86/include/uapi/asm/kvm*
 +F:    arch/x86/include/uapi/asm/vmx.h
 +F:    arch/x86/include/uapi/asm/svm.h
  F:    arch/x86/include/asm/kvm*
  F:    arch/x86/include/asm/pvclock-abi.h
 +F:    arch/x86/include/asm/svm.h
 +F:    arch/x86/include/asm/vmx.h
  F:    arch/x86/kernel/kvm.c
  F:    arch/x86/kernel/kvmclock.c
  
@@@ -8911,7 -8924,7 +8911,7 @@@ F:      security/keys/encrypted-keys
  
  KEYS-TRUSTED
  M:    James Bottomley <jejb@linux.ibm.com>
 -M:      Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
 +M:    Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
  M:    Mimi Zohar <zohar@linux.ibm.com>
  L:    linux-integrity@vger.kernel.org
  L:    keyrings@vger.kernel.org
@@@ -9207,18 -9220,6 +9207,18 @@@ F:    include/linux/nd.
  F:    include/linux/libnvdimm.h
  F:    include/uapi/linux/ndctl.h
  
 +LICENSES and SPDX stuff
 +M:    Thomas Gleixner <tglx@linutronix.de>
 +M:    Greg Kroah-Hartman <gregkh@linuxfoundation.org>
 +L:    linux-spdx@vger.kernel.org
 +S:    Maintained
 +T:    git git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/spdx.git
 +F:    COPYING
 +F:    Documentation/process/license-rules.rst
 +F:    LICENSES/
 +F:    scripts/spdxcheck-test.sh
 +F:    scripts/spdxcheck.py
 +
  LIGHTNVM PLATFORM SUPPORT
  M:    Matias Bjorling <mb@lightnvm.io>
  W:    http://github/OpenChannelSSD
@@@ -9325,7 -9326,7 +9325,7 @@@ F:      drivers/misc/lkdtm/
  
  LINUX KERNEL MEMORY CONSISTENCY MODEL (LKMM)
  M:    Alan Stern <stern@rowland.harvard.edu>
 -M:    Andrea Parri <andrea.parri@amarulasolutions.com>
 +M:    Andrea Parri <parri.andrea@gmail.com>
  M:    Will Deacon <will@kernel.org>
  M:    Peter Zijlstra <peterz@infradead.org>
  M:    Boqun Feng <boqun.feng@gmail.com>
@@@ -9333,7 -9334,7 +9333,7 @@@ M:      Nicholas Piggin <npiggin@gmail.com
  M:    David Howells <dhowells@redhat.com>
  M:    Jade Alglave <j.alglave@ucl.ac.uk>
  M:    Luc Maranget <luc.maranget@inria.fr>
 -M:    "Paul E. McKenney" <paulmck@linux.ibm.com>
 +M:    "Paul E. McKenney" <paulmck@kernel.org>
  R:    Akira Yokosawa <akiyks@gmail.com>
  R:    Daniel Lustig <dlustig@nvidia.com>
  L:    linux-kernel@vger.kernel.org
@@@ -10016,8 -10017,8 +10016,8 @@@ L:   linux-media@vger.kernel.or
  L:    linux-renesas-soc@vger.kernel.org
  T:    git git://linuxtv.org/media_tree.git
  S:    Supported
 -F:    Documentation/devicetree/bindings/media/renesas,rcar-csi2.txt
 -F:    Documentation/devicetree/bindings/media/rcar_vin.txt
 +F:    Documentation/devicetree/bindings/media/renesas,csi2.txt
 +F:    Documentation/devicetree/bindings/media/renesas,vin.txt
  F:    drivers/media/platform/rcar-vin/
  
  MEDIA DRIVERS FOR RENESAS - VSP1
@@@ -10362,7 -10363,7 +10362,7 @@@ F:   drivers/platform/x86/mlx-platform.
  
  MEMBARRIER SUPPORT
  M:    Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
 -M:    "Paul E. McKenney" <paulmck@linux.ibm.com>
 +M:    "Paul E. McKenney" <paulmck@kernel.org>
  L:    linux-kernel@vger.kernel.org
  S:    Supported
  F:    kernel/sched/membarrier.c
@@@ -10614,6 -10615,12 +10614,6 @@@ M:  Nicolas Ferre <nicolas.ferre@microch
  S:    Supported
  F:    drivers/power/reset/at91-sama5d2_shdwc.c
  
 -MICROCHIP SAMA5D2-COMPATIBLE PIOBU GPIO
 -M:    Andrei Stefanescu <andrei.stefanescu@microchip.com>
 -L:    linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
 -L:    linux-gpio@vger.kernel.org
 -F:    drivers/gpio/gpio-sama5d2-piobu.c
 -
  MICROCHIP SPI DRIVER
  M:    Nicolas Ferre <nicolas.ferre@microchip.com>
  S:    Supported
@@@ -10626,6 -10633,13 +10626,6 @@@ S:  Supporte
  F:    drivers/misc/atmel-ssc.c
  F:    include/linux/atmel-ssc.h
  
 -MICROCHIP TIMER COUNTER (TC) AND CLOCKSOURCE DRIVERS
 -M:    Nicolas Ferre <nicolas.ferre@microchip.com>
 -L:    linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
 -S:    Supported
 -F:    drivers/misc/atmel_tclib.c
 -F:    drivers/clocksource/tcb_clksrc.c
 -
  MICROCHIP USBA UDC DRIVER
  M:    Cristian Birsan <cristian.birsan@microchip.com>
  L:    linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
@@@ -11058,7 -11072,7 +11058,7 @@@ NET_FAILOVER MODUL
  M:    Sridhar Samudrala <sridhar.samudrala@intel.com>
  L:    netdev@vger.kernel.org
  S:    Supported
 -F:    driver/net/net_failover.c
 +F:    drivers/net/net_failover.c
  F:    include/net/net_failover.h
  F:    Documentation/networking/net_failover.rst
  
@@@ -11130,7 -11144,6 +11130,7 @@@ L:   netdev@vger.kernel.or
  S:    Maintained
  W:    https://fedorahosted.org/dropwatch/
  F:    net/core/drop_monitor.c
 +F:    include/uapi/linux/net_dropmon.h
  
  NETWORKING DRIVERS
  M:    "David S. Miller" <davem@davemloft.net>
@@@ -11269,7 -11282,6 +11269,7 @@@ M:   Aviad Yehezkel <aviadye@mellanox.com
  M:    Dave Watson <davejwatson@fb.com>
  M:    John Fastabend <john.fastabend@gmail.com>
  M:    Daniel Borkmann <daniel@iogearbox.net>
 +M:    Jakub Kicinski <jakub.kicinski@netronome.com>
  L:    netdev@vger.kernel.org
  S:    Maintained
  F:    net/tls/*
@@@ -12578,6 -12590,7 +12578,7 @@@ PERFORMANCE EVENTS SUBSYSTE
  M:    Peter Zijlstra <peterz@infradead.org>
  M:    Ingo Molnar <mingo@redhat.com>
  M:    Arnaldo Carvalho de Melo <acme@kernel.org>
+ R:    Mark Rutland <mark.rutland@arm.com>
  R:    Alexander Shishkin <alexander.shishkin@linux.intel.com>
  R:    Jiri Olsa <jolsa@redhat.com>
  R:    Namhyung Kim <namhyung@kernel.org>
@@@ -12667,7 -12680,6 +12668,7 @@@ L:   linux-arm-kernel@lists.infradead.or
  L:    linux-gpio@vger.kernel.org
  S:    Supported
  F:    drivers/pinctrl/pinctrl-at91*
 +F:    drivers/gpio/gpio-sama5d2-piobu.c
  
  PIN CONTROLLER - FREESCALE
  M:    Dong Aisheng <aisheng.dong@nxp.com>
@@@ -13465,7 -13477,7 +13466,7 @@@ S:   Orpha
  F:    drivers/net/wireless/ray*
  
  RCUTORTURE TEST FRAMEWORK
 -M:    "Paul E. McKenney" <paulmck@linux.ibm.com>
 +M:    "Paul E. McKenney" <paulmck@kernel.org>
  M:    Josh Triplett <josh@joshtriplett.org>
  R:    Steven Rostedt <rostedt@goodmis.org>
  R:    Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
@@@ -13512,7 -13524,7 +13513,7 @@@ F:   arch/x86/include/asm/resctrl_sched.
  F:    Documentation/x86/resctrl*
  
  READ-COPY UPDATE (RCU)
 -M:    "Paul E. McKenney" <paulmck@linux.ibm.com>
 +M:    "Paul E. McKenney" <paulmck@kernel.org>
  M:    Josh Triplett <josh@joshtriplett.org>
  R:    Steven Rostedt <rostedt@goodmis.org>
  R:    Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
@@@ -13670,7 -13682,7 +13671,7 @@@ F:   include/linux/reset-controller.
  RESTARTABLE SEQUENCES SUPPORT
  M:    Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
  M:    Peter Zijlstra <peterz@infradead.org>
 -M:    "Paul E. McKenney" <paulmck@linux.ibm.com>
 +M:    "Paul E. McKenney" <paulmck@kernel.org>
  M:    Boqun Feng <boqun.feng@gmail.com>
  L:    linux-kernel@vger.kernel.org
  S:    Supported
@@@ -14005,12 -14017,6 +14006,12 @@@ F: drivers/media/common/saa7146
  F:    drivers/media/pci/saa7146/
  F:    include/media/drv-intf/saa7146*
  
 +SAFESETID SECURITY MODULE
 +M:     Micah Morton <mortonm@chromium.org>
 +S:     Supported
 +F:     security/safesetid/
 +F:     Documentation/admin-guide/LSM/SafeSetID.rst
 +
  SAMSUNG AUDIO (ASoC) DRIVERS
  M:    Krzysztof Kozlowski <krzk@kernel.org>
  M:    Sangbeom Kim <sbkim73@samsung.com>
@@@ -14101,8 -14107,6 +14102,8 @@@ M:   Kamil Konieczny <k.konieczny@partner
  L:    linux-crypto@vger.kernel.org
  L:    linux-samsung-soc@vger.kernel.org
  S:    Maintained
 +F:    Documentation/devicetree/bindings/crypto/samsung-slimsss.txt
 +F:    Documentation/devicetree/bindings/crypto/samsung-sss.txt
  F:    drivers/crypto/s5p-sss.c
  
  SAMSUNG S5P/EXYNOS4 SOC SERIES CAMERA SUBSYSTEM DRIVERS
@@@ -14123,8 -14127,6 +14124,8 @@@ T:   git git://git.kernel.org/pub/scm/lin
  F:    drivers/clk/samsung/
  F:    include/dt-bindings/clock/exynos*.h
  F:    Documentation/devicetree/bindings/clock/exynos*.txt
 +F:    Documentation/devicetree/bindings/clock/samsung,s3c*
 +F:    Documentation/devicetree/bindings/clock/samsung,s5p*
  
  SAMSUNG SPI DRIVERS
  M:    Kukjin Kim <kgene@kernel.org>
@@@ -14175,6 -14177,12 +14176,12 @@@ F: drivers/watchdog/sc1200wdt.
  SCHEDULER
  M:    Ingo Molnar <mingo@redhat.com>
  M:    Peter Zijlstra <peterz@infradead.org>
+ M:    Juri Lelli <juri.lelli@redhat.com> (SCHED_DEADLINE)
+ M:    Vincent Guittot <vincent.guittot@linaro.org> (SCHED_NORMAL)
+ R:    Dietmar Eggemann <dietmar.eggemann@arm.com> (SCHED_NORMAL)
+ R:    Steven Rostedt <rostedt@goodmis.org> (SCHED_FIFO/SCHED_RR)
+ R:    Ben Segall <bsegall@google.com> (CONFIG_CFS_BANDWIDTH)
+ R:    Mel Gorman <mgorman@suse.de> (CONFIG_NUMA_BALANCING)
  L:    linux-kernel@vger.kernel.org
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched/core
  S:    Maintained
@@@ -14455,7 -14463,6 +14462,7 @@@ F:   drivers/net/phy/phylink.
  F:    drivers/net/phy/sfp*
  F:    include/linux/phylink.h
  F:    include/linux/sfp.h
 +K:    phylink
  
  SGI GRU DRIVER
  M:    Dimitri Sivanich <sivanich@sgi.com>
@@@ -14710,7 -14717,7 +14717,7 @@@ F:   mm/sl?b
  
  SLEEPABLE READ-COPY UPDATE (SRCU)
  M:    Lai Jiangshan <jiangshanlai@gmail.com>
 -M:    "Paul E. McKenney" <paulmck@linux.ibm.com>
 +M:    "Paul E. McKenney" <paulmck@kernel.org>
  M:    Josh Triplett <josh@joshtriplett.org>
  R:    Steven Rostedt <rostedt@goodmis.org>
  R:    Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
@@@ -14861,9 -14868,9 +14868,9 @@@ F:   include/linux/arm_sdei.
  F:    include/uapi/linux/arm_sdei.h
  
  SOFTWARE RAID (Multiple Disks) SUPPORT
 -M:    Shaohua Li <shli@kernel.org>
 +M:    Song Liu <song@kernel.org>
  L:    linux-raid@vger.kernel.org
 -T:    git git://git.kernel.org/pub/scm/linux/kernel/git/shli/md.git
 +T:    git git://git.kernel.org/pub/scm/linux/kernel/git/song/md.git
  S:    Supported
  F:    drivers/md/Makefile
  F:    drivers/md/Kconfig
@@@ -15545,7 -15552,6 +15552,7 @@@ F:   drivers/clk/clk-sc[mp]i.
  F:    drivers/cpufreq/sc[mp]i-cpufreq.c
  F:    drivers/firmware/arm_scpi.c
  F:    drivers/firmware/arm_scmi/
 +F:    drivers/reset/reset-scmi.c
  F:    include/linux/sc[mp]i_protocol.h
  
  SYSTEM RESET/SHUTDOWN DRIVERS
@@@ -15854,7 -15860,6 +15861,7 @@@ F:   drivers/firmware/ti_sci
  F:    include/linux/soc/ti/ti_sci_protocol.h
  F:    Documentation/devicetree/bindings/soc/ti/sci-pm-domain.txt
  F:    drivers/soc/ti/ti_sci_pm_domains.c
 +F:    include/dt-bindings/soc/ti,sci_pm_domain.h
  F:    Documentation/devicetree/bindings/reset/ti,sci-reset.txt
  F:    Documentation/devicetree/bindings/clock/ti,sci-clk.txt
  F:    drivers/clk/keystone/sci-clk.c
@@@ -16080,7 -16085,7 +16087,7 @@@ S:   Maintaine
  F:    drivers/net/ethernet/ti/netcp*
  
  TI PCM3060 ASoC CODEC DRIVER
 -M:    Kirill Marinushkin <kmarinushkin@birdec.tech>
 +M:    Kirill Marinushkin <kmarinushkin@birdec.com>
  L:    alsa-devel@alsa-project.org (moderated for non-subscribers)
  S:    Maintained
  F:    Documentation/devicetree/bindings/sound/pcm3060.txt
@@@ -16209,7 -16214,7 +16216,7 @@@ F:   drivers/platform/x86/topstar-laptop.
  
  TORTURE-TEST MODULES
  M:    Davidlohr Bueso <dave@stgolabs.net>
 -M:    "Paul E. McKenney" <paulmck@linux.ibm.com>
 +M:    "Paul E. McKenney" <paulmck@kernel.org>
  M:    Josh Triplett <josh@joshtriplett.org>
  L:    linux-kernel@vger.kernel.org
  S:    Supported
@@@ -17235,7 -17240,6 +17242,7 @@@ F:   Documentation/power/regulator
  F:    drivers/regulator/
  F:    include/dt-bindings/regulator/
  F:    include/linux/regulator/
 +K:    regulator_get_optional
  
  VRF
  M:    David Ahern <dsa@cumulusnetworks.com>
@@@ -17557,6 -17561,7 +17564,6 @@@ M:   Jakub Kicinski <jakub.kicinski@netro
  M:    Jesper Dangaard Brouer <hawk@kernel.org>
  M:    John Fastabend <john.fastabend@gmail.com>
  L:    netdev@vger.kernel.org
 -L:    xdp-newbies@vger.kernel.org
  L:    bpf@vger.kernel.org
  S:    Supported
  F:    net/core/xdp.c
@@@ -17672,7 -17677,8 +17679,7 @@@ F:   include/uapi/linux/dqblk_xfs.
  F:    include/uapi/linux/fsmap.h
  
  XILINX AXI ETHERNET DRIVER
 -M:    Anirudha Sarangi <anirudh@xilinx.com>
 -M:    John Linn <John.Linn@xilinx.com>
 +M:    Radhey Shyam Pandey <radhey.shyam.pandey@xilinx.com>
  S:    Maintained
  F:    drivers/net/ethernet/xilinx/xilinx_axienet*
  
diff --combined arch/Kconfig
index 71d9ae0c0ea16ea8990e1a81841d5bb31b77a07d,c7efbc018f4fa800ca08c752ea630c2238eeccae..6baedab10dcaa14130b15907313200a6f04b38b4
@@@ -18,9 -18,6 +18,9 @@@ config KEXEC_COR
        select CRASH_CORE
        bool
  
 +config KEXEC_ELF
 +      bool
 +
  config HAVE_IMA_KEXEC
        bool
  
@@@ -106,7 -103,7 +106,7 @@@ config STATIC_KEYS_SELFTES
  config OPTPROBES
        def_bool y
        depends on KPROBES && HAVE_OPTPROBES
-       select TASKS_RCU if PREEMPT
+       select TASKS_RCU if PREEMPTION
  
  config KPROBES_ON_FTRACE
        def_bool y
@@@ -928,20 -925,6 +928,20 @@@ config LOCK_EVENT_COUNT
          the chance of application behavior change because of timing
          differences. The counts are reported via debugfs.
  
 +# Select if the architecture has support for applying RELR relocations.
 +config ARCH_HAS_RELR
 +      bool
 +
 +config RELR
 +      bool "Use RELR relocation packing"
 +      depends on ARCH_HAS_RELR && TOOLS_SUPPORT_RELR
 +      default y
 +      help
 +        Store the kernel's dynamic relocations in the RELR relocation packing
 +        format. Requires a compatible linker (LLD supports this feature), as
 +        well as compatible NM and OBJCOPY utilities (llvm-nm and llvm-objcopy
 +        are compatible).
 +
  source "kernel/gcov/Kconfig"
  
  source "scripts/gcc-plugins/Kconfig"
diff --combined arch/ia64/Kconfig
index 13d49c232556ce9e3bbdf1862fc3ca388d1b2e6a,997baba02b70e7e427511c324e4f8cb9dae5fef9..9711cf73092948678423b9d474030eecd24d4032
@@@ -10,14 -10,12 +10,14 @@@ config IA6
        bool
        select ARCH_MIGHT_HAVE_PC_PARPORT
        select ARCH_MIGHT_HAVE_PC_SERIO
 -      select ACPI if (!IA64_HP_SIM)
 -      select ARCH_SUPPORTS_ACPI if (!IA64_HP_SIM)
 +      select ACPI
 +      select ACPI_NUMA if NUMA
 +      select ARCH_SUPPORTS_ACPI
        select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI
        select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI
 -      select FORCE_PCI if (!IA64_HP_SIM)
 +      select FORCE_PCI
        select PCI_DOMAINS if PCI
 +      select PCI_MSI
        select PCI_SYSCALL if PCI
        select HAVE_UNSTABLE_SCHED_CLOCK
        select HAVE_EXIT_THREAD
@@@ -32,8 -30,8 +32,8 @@@
        select HAVE_ARCH_TRACEHOOK
        select HAVE_MEMBLOCK_NODE_MAP
        select HAVE_VIRT_CPU_ACCOUNTING
 -      select ARCH_HAS_DMA_COHERENT_TO_PFN if SWIOTLB
 -      select ARCH_HAS_SYNC_DMA_FOR_CPU if SWIOTLB
 +      select ARCH_HAS_DMA_COHERENT_TO_PFN
 +      select ARCH_HAS_SYNC_DMA_FOR_CPU
        select VIRT_TO_BUS
        select GENERIC_IRQ_PROBE
        select GENERIC_PENDING_IRQ if SMP
@@@ -47,7 -45,6 +47,7 @@@
        select ARCH_THREAD_STACK_ALLOCATOR
        select ARCH_CLOCKSOURCE_DATA
        select GENERIC_TIME_VSYSCALL
 +      select SWIOTLB
        select SYSCTL_ARCH_UNALIGN_NO_WARN
        select HAVE_MOD_ARCH_SPECIFIC
        select MODULES_USE_ELF_RELA
@@@ -55,7 -52,6 +55,7 @@@
        select HAVE_ARCH_AUDITSYSCALL
        select NEED_DMA_MAP_STATE
        select NEED_SG_DMA_LENGTH
 +      select NUMA if !FLATMEM
        default y
        help
          The Itanium Processor Family is Intel's 64-bit successor to
@@@ -70,6 -66,7 +70,6 @@@ config 64BI
  
  config ZONE_DMA32
        def_bool y
 -      depends on !IA64_SGI_SN2
  
  config QUICKLIST
        bool
@@@ -123,6 -120,87 +123,6 @@@ config AUDIT_ARC
        bool
        default y
  
 -choice
 -      prompt "System type"
 -      default IA64_GENERIC
 -
 -config IA64_GENERIC
 -      bool "generic"
 -      select NUMA
 -      select ACPI_NUMA
 -      select SWIOTLB
 -      select PCI_MSI
 -      help
 -        This selects the system type of your hardware.  A "generic" kernel
 -        will run on any supported IA-64 system.  However, if you configure
 -        a kernel for your specific system, it will be faster and smaller.
 -
 -        generic               For any supported IA-64 system
 -        DIG-compliant         For DIG ("Developer's Interface Guide") compliant systems
 -        DIG+Intel+IOMMU       For DIG systems with Intel IOMMU
 -        HP-zx1/sx1000         For HP systems
 -        HP-zx1/sx1000+swiotlb For HP systems with (broken) DMA-constrained devices.
 -        SGI-SN2               For SGI Altix systems
 -        SGI-UV                For SGI UV systems
 -        Ski-simulator         For the HP simulator <http://www.hpl.hp.com/research/linux/ski/>
 -
 -        If you don't know what to do, choose "generic".
 -
 -config IA64_DIG
 -      bool "DIG-compliant"
 -      select SWIOTLB
 -
 -config IA64_DIG_VTD
 -      bool "DIG+Intel+IOMMU"
 -      select INTEL_IOMMU
 -      select PCI_MSI
 -
 -config IA64_HP_ZX1
 -      bool "HP-zx1/sx1000"
 -      help
 -        Build a kernel that runs on HP zx1 and sx1000 systems.  This adds
 -        support for the HP I/O MMU.
 -
 -config IA64_HP_ZX1_SWIOTLB
 -      bool "HP-zx1/sx1000 with software I/O TLB"
 -      select SWIOTLB
 -      help
 -        Build a kernel that runs on HP zx1 and sx1000 systems even when they
 -        have broken PCI devices which cannot DMA to full 32 bits.  Apart
 -        from support for the HP I/O MMU, this includes support for the software
 -        I/O TLB, which allows supporting the broken devices at the expense of
 -        wasting some kernel memory (about 2MB by default).
 -
 -config IA64_SGI_SN2
 -      bool "SGI-SN2"
 -      select NUMA
 -      select ACPI_NUMA
 -      help
 -        Selecting this option will optimize the kernel for use on sn2 based
 -        systems, but the resulting kernel binary will not run on other
 -        types of ia64 systems.  If you have an SGI Altix system, it's safe
 -        to select this option.  If in doubt, select ia64 generic support
 -        instead.
 -
 -config IA64_SGI_UV
 -      bool "SGI-UV"
 -      select NUMA
 -      select ACPI_NUMA
 -      select SWIOTLB
 -      help
 -        Selecting this option will optimize the kernel for use on UV based
 -        systems, but the resulting kernel binary will not run on other
 -        types of ia64 systems.  If you have an SGI UV system, it's safe
 -        to select this option.  If in doubt, select ia64 generic support
 -        instead.
 -
 -config IA64_HP_SIM
 -      bool "Ski-simulator"
 -      select SWIOTLB
 -      depends on !PM
 -
 -endchoice
 -
  choice
        prompt "Processor type"
        default ITANIUM
@@@ -174,7 -252,14 +174,7 @@@ config IA64_PAGE_SIZE_64K
  
  endchoice
  
 -if IA64_HP_SIM
 -config HZ
 -      default 32
 -endif
 -
 -if !IA64_HP_SIM
  source "kernel/Kconfig.hz"
 -endif
  
  config IA64_BRL_EMU
        bool
@@@ -187,26 -272,17 +187,26 @@@ config IA64_L1_CACHE_SHIF
        default "7" if MCKINLEY
        default "6" if ITANIUM
  
 +config IA64_SGI_UV
 +      bool "SGI-UV support"
 +      help
 +        Selecting this option will add specific support for running on SGI
 +        UV based systems.  If you have an SGI UV system or are building a
 +        distro kernel, select this option.
 +
 +config IA64_HP_SBA_IOMMU
 +      bool "HP SBA IOMMU support"
 +      default y
 +      help
 +        Say Y here to add support for the SBA IOMMU found on HP zx1 and
 +        sx1000 systems.  If you're unsure, answer Y.
 +
  config IA64_CYCLONE
        bool "Cyclone (EXA) Time Source support"
        help
          Say Y here to enable support for IBM EXA Cyclone time source.
          If you're unsure, answer N.
  
 -config IOSAPIC
 -      bool
 -      depends on !IA64_HP_SIM
 -      default y
 -
  config FORCE_MAX_ZONEORDER
        int "MAX_ORDER (11 - 17)"  if !HUGETLB_PAGE
        range 11 17  if !HUGETLB_PAGE
@@@ -305,12 -381,15 +305,13 @@@ config ARCH_SPARSEMEM_ENABL
        select SPARSEMEM_VMEMMAP_ENABLE
  
  config ARCH_DISCONTIGMEM_DEFAULT
 -      def_bool y if (IA64_SGI_SN2 || IA64_GENERIC || IA64_HP_ZX1 || IA64_HP_ZX1_SWIOTLB)
 +      def_bool y
        depends on ARCH_DISCONTIGMEM_ENABLE
  
  config NUMA
        bool "NUMA support"
 -      depends on !IA64_HP_SIM && !FLATMEM
 -      default y if IA64_SGI_SN2
 -      select ACPI_NUMA if ACPI
 +      depends on !FLATMEM
+       select SMP
        help
          Say Y to compile the kernel to support NUMA (Non-Uniform Memory
          Access).  This option is for configuring high-end multiprocessor
@@@ -331,7 -410,7 +332,7 @@@ config NODES_SHIF
  config VIRTUAL_MEM_MAP
        bool "Virtual mem map"
        depends on !SPARSEMEM
 -      default y if !IA64_HP_SIM
 +      default y
        help
          Say Y to compile the kernel with support for a virtual mem map.
          This code also only takes effect if a memory hole of greater than
@@@ -394,6 -473,9 +395,6 @@@ config IA64_MC_ERR_INJEC
  
          If you're unsure, do not select this option.
  
 -config SGI_SN
 -      def_bool y if (IA64_SGI_SN2 || IA64_GENERIC)
 -
  config IA64_ESI
        bool "ESI (Extensible SAL Interface) support"
        help
@@@ -412,9 -494,11 +413,9 @@@ config IA64_HP_AML_NF
          the "force" module parameter, e.g., with the "aml_nfw.force"
          kernel command line option.
  
 -source "drivers/sn/Kconfig"
 -
  config KEXEC
        bool "kexec system call"
 -      depends on !IA64_HP_SIM && (!SMP || HOTPLUG_CPU)
 +      depends on !SMP || HOTPLUG_CPU
        select KEXEC_CORE
        help
          kexec is a system call that implements the ability to shutdown your
  
  config CRASH_DUMP
          bool "kernel crash dumps"
 -        depends on IA64_MCA_RECOVERY && !IA64_HP_SIM && (!SMP || HOTPLUG_CPU)
 +        depends on IA64_MCA_RECOVERY && (!SMP || HOTPLUG_CPU)
          help
            Generate crash dump after being started by kexec.
  
@@@ -454,6 -538,8 +455,6 @@@ endi
  
  endmenu
  
 -source "arch/ia64/hp/sim/Kconfig"
 -
  config MSPEC
        tristate "Memory special operations driver"
        depends on IA64
diff --combined arch/x86/entry/entry_64.S
index be9ca198c581aea7ed29f4417aae9c1c1b835473,9701464341e49b3af4a51c9505388510babb7dff..af077ded196966256792af01507427fc800cf32e
@@@ -519,7 -519,7 +519,7 @@@ ENTRY(interrupt_entry
        testb   $3, CS-ORIG_RAX+8(%rsp)
        jz      1f
        SWAPGS
 -
 +      FENCE_SWAPGS_USER_ENTRY
        /*
         * Switch to the thread stack. The IRET frame and orig_ax are
         * on the stack, as well as the return address. RDI..R12 are
        UNWIND_HINT_FUNC
  
        movq    (%rdi), %rdi
 +      jmp     2f
  1:
 -
 +      FENCE_SWAPGS_KERNEL_ENTRY
 +2:
        PUSH_AND_CLEAR_REGS save_ret=1
        ENCODE_FRAME_POINTER 8
  
@@@ -664,7 -662,7 +664,7 @@@ GLOBAL(swapgs_restore_regs_and_return_t
  
  /* Returning to kernel space */
  retint_kernel:
- #ifdef CONFIG_PREEMPT
+ #ifdef CONFIG_PREEMPTION
        /* Interrupts are off */
        /* Check if we need preemption */
        btl     $9, EFLAGS(%rsp)                /* were interrupts off? */
@@@ -1115,7 -1113,7 +1115,7 @@@ ENTRY(xen_do_hypervisor_callback)               /* d
        call    xen_evtchn_do_upcall
        LEAVE_IRQ_STACK
  
- #ifndef CONFIG_PREEMPT
+ #ifndef CONFIG_PREEMPTION
        call    xen_maybe_preempt_hcall
  #endif
        jmp     error_exit
@@@ -1240,13 -1238,6 +1240,13 @@@ ENTRY(paranoid_entry
         */
        SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14
  
 +      /*
 +       * The above SAVE_AND_SWITCH_TO_KERNEL_CR3 macro doesn't do an
 +       * unconditional CR3 write, even in the PTI case.  So do an lfence
 +       * to prevent GS speculation, regardless of whether PTI is enabled.
 +       */
 +      FENCE_SWAPGS_KERNEL_ENTRY
 +
        ret
  END(paranoid_entry)
  
@@@ -1297,7 -1288,6 +1297,7 @@@ ENTRY(error_entry
         * from user mode due to an IRET fault.
         */
        SWAPGS
 +      FENCE_SWAPGS_USER_ENTRY
        /* We have user CR3.  Change to kernel CR3. */
        SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
  
        pushq   %r12
        ret
  
 +.Lerror_entry_done_lfence:
 +      FENCE_SWAPGS_KERNEL_ENTRY
  .Lerror_entry_done:
        ret
  
        cmpq    %rax, RIP+8(%rsp)
        je      .Lbstep_iret
        cmpq    $.Lgs_change, RIP+8(%rsp)
 -      jne     .Lerror_entry_done
 +      jne     .Lerror_entry_done_lfence
  
        /*
         * hack: .Lgs_change can fail with user gsbase.  If this happens, fix up
         * .Lgs_change's error handler with kernel gsbase.
         */
        SWAPGS
 +      FENCE_SWAPGS_USER_ENTRY
        SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
        jmp .Lerror_entry_done
  
         * gsbase and CR3.  Switch to kernel gsbase and CR3:
         */
        SWAPGS
 +      FENCE_SWAPGS_USER_ENTRY
        SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
  
        /*
@@@ -1445,7 -1431,6 +1445,7 @@@ ENTRY(nmi
  
        swapgs
        cld
 +      FENCE_SWAPGS_USER_ENTRY
        SWITCH_TO_KERNEL_CR3 scratch_reg=%rdx
        movq    %rsp, %rdx
        movq    PER_CPU_VAR(cpu_current_top_of_stack), %rsp
diff --combined arch/x86/kernel/cpu/amd.c
index 68c363c341bf2a3794ec26f29df4dd28281d4dc5,ceeb8afc7cf3a43d998ce581849729d235847497..7d6e0efcc2db3cf909c2fd868311ac496b9a1070
@@@ -8,6 -8,7 +8,7 @@@
  #include <linux/sched.h>
  #include <linux/sched/clock.h>
  #include <linux/random.h>
+ #include <linux/topology.h>
  #include <asm/processor.h>
  #include <asm/apic.h>
  #include <asm/cacheinfo.h>
@@@ -804,64 -805,6 +805,64 @@@ static void init_amd_ln(struct cpuinfo_
        msr_set_bit(MSR_AMD64_DE_CFG, 31);
  }
  
 +static bool rdrand_force;
 +
 +static int __init rdrand_cmdline(char *str)
 +{
 +      if (!str)
 +              return -EINVAL;
 +
 +      if (!strcmp(str, "force"))
 +              rdrand_force = true;
 +      else
 +              return -EINVAL;
 +
 +      return 0;
 +}
 +early_param("rdrand", rdrand_cmdline);
 +
 +static void clear_rdrand_cpuid_bit(struct cpuinfo_x86 *c)
 +{
 +      /*
 +       * Saving of the MSR used to hide the RDRAND support during
 +       * suspend/resume is done by arch/x86/power/cpu.c, which is
 +       * dependent on CONFIG_PM_SLEEP.
 +       */
 +      if (!IS_ENABLED(CONFIG_PM_SLEEP))
 +              return;
 +
 +      /*
 +       * The nordrand option can clear X86_FEATURE_RDRAND, so check for
 +       * RDRAND support using the CPUID function directly.
 +       */
 +      if (!(cpuid_ecx(1) & BIT(30)) || rdrand_force)
 +              return;
 +
 +      msr_clear_bit(MSR_AMD64_CPUID_FN_1, 62);
 +
 +      /*
 +       * Verify that the CPUID change has occurred in case the kernel is
 +       * running virtualized and the hypervisor doesn't support the MSR.
 +       */
 +      if (cpuid_ecx(1) & BIT(30)) {
 +              pr_info_once("BIOS may not properly restore RDRAND after suspend, but hypervisor does not support hiding RDRAND via CPUID.\n");
 +              return;
 +      }
 +
 +      clear_cpu_cap(c, X86_FEATURE_RDRAND);
 +      pr_info_once("BIOS may not properly restore RDRAND after suspend, hiding RDRAND via CPUID. Use rdrand=force to reenable.\n");
 +}
 +
 +static void init_amd_jg(struct cpuinfo_x86 *c)
 +{
 +      /*
 +       * Some BIOS implementations do not restore proper RDRAND support
 +       * across suspend and resume. Check on whether to hide the RDRAND
 +       * instruction support via CPUID.
 +       */
 +      clear_rdrand_cpuid_bit(c);
 +}
 +
  static void init_amd_bd(struct cpuinfo_x86 *c)
  {
        u64 value;
                        wrmsrl_safe(MSR_F15H_IC_CFG, value);
                }
        }
 +
 +      /*
 +       * Some BIOS implementations do not restore proper RDRAND support
 +       * across suspend and resume. Check on whether to hide the RDRAND
 +       * instruction support via CPUID.
 +       */
 +      clear_rdrand_cpuid_bit(c);
  }
  
  static void init_amd_zn(struct cpuinfo_x86 *c)
  {
        set_cpu_cap(c, X86_FEATURE_ZEN);
  
+ #ifdef CONFIG_NUMA
+       node_reclaim_distance = 32;
+ #endif
        /*
         * Fix erratum 1076: CPB feature bit not being set in CPUID.
         * Always set it, except when running under a hypervisor.
@@@ -925,7 -865,6 +930,7 @@@ static void init_amd(struct cpuinfo_x8
        case 0x10: init_amd_gh(c); break;
        case 0x12: init_amd_ln(c); break;
        case 0x15: init_amd_bd(c); break;
 +      case 0x16: init_amd_jg(c); break;
        case 0x17: init_amd_zn(c); break;
        }
  
diff --combined arch/x86/kernel/kvm.c
index 4ab377c9fffede8af8c93b620bdb9d90803fd353,3d07f84c4846943f122d0345e540565e7954f207..4cc967178bf952ca32b01059505b588c3439db1d
@@@ -308,10 -308,13 +308,10 @@@ static notrace void kvm_guest_apic_eoi_
  
  static void kvm_guest_cpu_init(void)
  {
 -      if (!kvm_para_available())
 -              return;
 -
        if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) {
                u64 pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason));
  
- #ifdef CONFIG_PREEMPT
+ #ifdef CONFIG_PREEMPTION
                pa |= KVM_ASYNC_PF_SEND_ALWAYS;
  #endif
                pa |= KVM_ASYNC_PF_ENABLED;
@@@ -622,6 -625,9 +622,6 @@@ static void __init kvm_guest_init(void
  {
        int i;
  
 -      if (!kvm_para_available())
 -              return;
 -
        paravirt_ops_setup();
        register_reboot_notifier(&kvm_pv_reboot_nb);
        for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++)
@@@ -842,6 -848,8 +842,6 @@@ asm
   */
  void __init kvm_spinlock_init(void)
  {
 -      if (!kvm_para_available())
 -              return;
        /* Does host kernel support KVM_FEATURE_PV_UNHALT? */
        if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT))
                return;
diff --combined include/linux/rcupdate.h
index 80d6056f58556590af7adb5978daf40692cca87e,c4f76a310443f306e930a2da55e4604b021e6263..75a2eded7aa2ce6973622ecfd5a2a00772f07270
@@@ -221,7 -221,6 +221,7 @@@ int debug_lockdep_rcu_enabled(void)
  int rcu_read_lock_held(void);
  int rcu_read_lock_bh_held(void);
  int rcu_read_lock_sched_held(void);
 +int rcu_read_lock_any_held(void);
  
  #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
  
@@@ -242,12 -241,6 +242,12 @@@ static inline int rcu_read_lock_sched_h
  {
        return !preemptible();
  }
 +
 +static inline int rcu_read_lock_any_held(void)
 +{
 +      return !preemptible();
 +}
 +
  #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
  
  #ifdef CONFIG_PROVE_RCU
@@@ -483,7 -476,7 +483,7 @@@ do {                                                                             
   * The no-tracing version of rcu_dereference_raw() must not call
   * rcu_read_lock_held().
   */
 -#define rcu_dereference_raw_notrace(p) __rcu_dereference_check((p), 1, __rcu)
 +#define rcu_dereference_raw_check(p) __rcu_dereference_check((p), 1, __rcu)
  
  /**
   * rcu_dereference_protected() - fetch RCU pointer when updates prevented
   *
   * In non-preemptible RCU implementations (TREE_RCU and TINY_RCU),
   * it is illegal to block while in an RCU read-side critical section.
-  * In preemptible RCU implementations (PREEMPT_RCU) in CONFIG_PREEMPT
+  * In preemptible RCU implementations (PREEMPT_RCU) in CONFIG_PREEMPTION
   * kernel builds, RCU read-side critical sections may be preempted,
   * but explicit blocking is illegal.  Finally, in preemptible RCU
   * implementations in real-time (with -rt patchset) kernel builds, RCU
diff --combined include/linux/topology.h
index 2a19d196af2886899a552dda7c68edaf1015dc66,579522ec446c3aa76a9c5f0d5c2f90543b9d407d..eb2fe6edd73c80ad16ddad96fd9c10777b72d051
@@@ -27,7 -27,6 +27,7 @@@
  #ifndef _LINUX_TOPOLOGY_H
  #define _LINUX_TOPOLOGY_H
  
 +#include <linux/arch_topology.h>
  #include <linux/cpumask.h>
  #include <linux/bitops.h>
  #include <linux/mmzone.h>
@@@ -60,6 -59,20 +60,20 @@@ int arch_update_cpu_topology(void)
   */
  #define RECLAIM_DISTANCE 30
  #endif
+ /*
+  * The following tunable allows platforms to override the default node
+  * reclaim distance (RECLAIM_DISTANCE) if remote memory accesses are
+  * sufficiently fast that the default value actually hurts
+  * performance.
+  *
+  * AMD EPYC machines use this because even though the 2-hop distance
+  * is 32 (3.2x slower than a local memory access) performance actually
+  * *improves* if allowed to reclaim memory and load balance tasks
+  * between NUMA nodes 2-hops apart.
+  */
+ extern int __read_mostly node_reclaim_distance;
  #ifndef PENALTY_FOR_NODE_WITH_CPUS
  #define PENALTY_FOR_NODE_WITH_CPUS    (1)
  #endif
diff --combined init/Kconfig
index d96127ebc44e08526f0be1586098d2ecd52e7104,ac285cfa78b6ca22254bcb6868e9abb3eacd7bdb..ec1021fd33712afdc98b5ff454518320d858d8f5
@@@ -30,9 -30,6 +30,9 @@@ config CC_CAN_LIN
  config CC_HAS_ASM_GOTO
        def_bool $(success,$(srctree)/scripts/gcc-goto.sh $(CC))
  
 +config TOOLS_SUPPORT_RELR
 +      def_bool $(success,env "CC=$(CC)" "LD=$(LD)" "NM=$(NM)" "OBJCOPY=$(OBJCOPY)" $(srctree)/scripts/tools-support-relr.sh)
 +
  config CC_HAS_WARN_MAYBE_UNINITIALIZED
        def_bool $(cc-option,-Wmaybe-uninitialized)
        help
@@@ -931,6 -928,28 +931,28 @@@ config RT_GROUP_SCHE
  
  endif #CGROUP_SCHED
  
+ config UCLAMP_TASK_GROUP
+       bool "Utilization clamping per group of tasks"
+       depends on CGROUP_SCHED
+       depends on UCLAMP_TASK
+       default n
+       help
+         This feature enables the scheduler to track the clamped utilization
+         of each CPU based on RUNNABLE tasks currently scheduled on that CPU.
+         When this option is enabled, the user can specify a min and max
+         CPU bandwidth which is allowed for each single task in a group.
+         The max bandwidth allows to clamp the maximum frequency a task
+         can use, while the min bandwidth allows to define a minimum
+         frequency a task will always use.
+         When task group based utilization clamping is enabled, an eventually
+         specified task-specific clamp value is constrained by the cgroup
+         specified clamp value. Both minimum and maximum task clamping cannot
+         be bigger than the corresponding clamping defined at task group level.
+         If in doubt, say N.
  config CGROUP_PIDS
        bool "PIDs controller"
        help
diff --combined kernel/cgroup/cgroup.c
index 8be1da1ebd9a4f3d4ee3f6038a85e18e8d5fa685,4b5bc452176ca83f807fb5055ecff7ea15b01c64..a7ce73a2c40198e8cdd53df154db0d16af89daaa
@@@ -1891,7 -1891,7 +1891,7 @@@ static int cgroup_reconfigure(struct fs
   */
  static bool use_task_css_set_links __read_mostly;
  
- static void cgroup_enable_task_cg_lists(void)
+ void cgroup_enable_task_cg_lists(void)
  {
        struct task_struct *p, *g;
  
@@@ -5255,16 -5255,8 +5255,16 @@@ static struct cgroup *cgroup_create(str
         * if the parent has to be frozen, the child has too.
         */
        cgrp->freezer.e_freeze = parent->freezer.e_freeze;
 -      if (cgrp->freezer.e_freeze)
 +      if (cgrp->freezer.e_freeze) {
 +              /*
 +               * Set the CGRP_FREEZE flag, so when a process will be
 +               * attached to the child cgroup, it will become frozen.
 +               * At this point the new cgroup is unpopulated, so we can
 +               * consider it frozen immediately.
 +               */
 +              set_bit(CGRP_FREEZE, &cgrp->flags);
                set_bit(CGRP_FROZEN, &cgrp->flags);
 +      }
  
        spin_lock_irq(&css_set_lock);
        for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
diff --combined kernel/events/core.c
index 2aad959e6def727accc954163a3d8f532e3fd83b,c1f52a749db25cd89f5ccc7cf5e6d0237677adae..1c414b8866b454aed555aafdf34e823256f0c8ba
@@@ -1887,89 -1887,6 +1887,89 @@@ list_del_event(struct perf_event *event
        ctx->generation++;
  }
  
 +static int
 +perf_aux_output_match(struct perf_event *event, struct perf_event *aux_event)
 +{
 +      if (!has_aux(aux_event))
 +              return 0;
 +
 +      if (!event->pmu->aux_output_match)
 +              return 0;
 +
 +      return event->pmu->aux_output_match(aux_event);
 +}
 +
 +static void put_event(struct perf_event *event);
 +static void event_sched_out(struct perf_event *event,
 +                          struct perf_cpu_context *cpuctx,
 +                          struct perf_event_context *ctx);
 +
 +static void perf_put_aux_event(struct perf_event *event)
 +{
 +      struct perf_event_context *ctx = event->ctx;
 +      struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
 +      struct perf_event *iter;
 +
 +      /*
 +       * If the event uses an aux_event, tear down the link
 +       */
 +      if (event->aux_event) {
 +              iter = event->aux_event;
 +              event->aux_event = NULL;
 +              put_event(iter);
 +              return;
 +      }
 +
 +      /*
 +       * If the event is an aux_event, tear down all links to
 +       * it from other events.
 +       */
 +      for_each_sibling_event(iter, event->group_leader) {
 +              if (iter->aux_event != event)
 +                      continue;
 +
 +              iter->aux_event = NULL;
 +              put_event(event);
 +
 +              /*
 +               * If it's ACTIVE, schedule it out and put it into ERROR
 +               * state so that we don't try to schedule it again. Note
 +               * that perf_event_enable() will clear the ERROR status.
 +               */
 +              event_sched_out(iter, cpuctx, ctx);
 +              perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
 +      }
 +}
 +
 +static int perf_get_aux_event(struct perf_event *event,
 +                            struct perf_event *group_leader)
 +{
 +      /*
 +       * Our group leader must be an aux event if we want to be
 +       * an aux_output. This way, the aux event will precede its
 +       * aux_output events in the group, and therefore will always
 +       * schedule first.
 +       */
 +      if (!group_leader)
 +              return 0;
 +
 +      if (!perf_aux_output_match(event, group_leader))
 +              return 0;
 +
 +      if (!atomic_long_inc_not_zero(&group_leader->refcount))
 +              return 0;
 +
 +      /*
 +       * Link aux_outputs to their aux event; this is undone in
 +       * perf_group_detach() by perf_put_aux_event(). When the
 +       * group is torn down, the aux_output events lose their
 +       * link to the aux_event and can't schedule any more.
 +       */
 +      event->aux_event = group_leader;
 +
 +      return 1;
 +}
 +
  static void perf_group_detach(struct perf_event *event)
  {
        struct perf_event *sibling, *tmp;
  
        event->attach_state &= ~PERF_ATTACH_GROUP;
  
 +      perf_put_aux_event(event);
 +
        /*
         * If this is a sibling, remove it from its group.
         */
@@@ -4174,10 -4089,8 +4174,8 @@@ alloc_perf_context(struct pmu *pmu, str
                return NULL;
  
        __perf_event_init_context(ctx);
-       if (task) {
-               ctx->task = task;
-               get_task_struct(task);
-       }
+       if (task)
+               ctx->task = get_task_struct(task);
        ctx->pmu = pmu;
  
        return ctx;
@@@ -10440,8 -10353,7 +10438,7 @@@ perf_event_alloc(struct perf_event_att
                 * and we cannot use the ctx information because we need the
                 * pmu before we get a ctx.
                 */
-               get_task_struct(task);
-               event->hw.target = task;
+               event->hw.target = get_task_struct(task);
        }
  
        event->clock = &local_clock;
                goto err_ns;
        }
  
 +      if (event->attr.aux_output &&
 +          !(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT)) {
 +              err = -EOPNOTSUPP;
 +              goto err_pmu;
 +      }
 +
        err = exclusive_event_init(event);
        if (err)
                goto err_pmu;
@@@ -11173,8 -11079,6 +11170,8 @@@ SYSCALL_DEFINE5(perf_event_open
                }
        }
  
 +      if (event->attr.aux_output && !perf_get_aux_event(event, group_leader))
 +              goto err_locked;
  
        /*
         * Must be under the same ctx::mutex as perf_install_in_context(),
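
For reference, a sketch of how a tool might request the aux_output link that perf_get_aux_event() above validates: the group leader must be an AUX-capable event and the aux_output event is opened into its group. The PMU type values below are placeholders, not real identifiers; real tools discover them via sysfs:

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <unistd.h>

static int perf_open(struct perf_event_attr *attr, int group_fd)
{
	return syscall(__NR_perf_event_open, attr, 0 /* self */,
		       -1 /* any CPU */, group_fd, 0 /* flags */);
}

int open_aux_output_pair(void)
{
	struct perf_event_attr leader = { .size = sizeof(leader) };
	struct perf_event_attr output = { .size = sizeof(output) };
	int leader_fd;

	leader.type = 6;	/* placeholder: an AUX-capable PMU type */
	output.type = 7;	/* placeholder: the PMU generating the records */
	output.aux_output = 1;	/* ask the kernel to link us to the AUX leader */

	leader_fd = perf_open(&leader, -1);
	if (leader_fd < 0)
		return -1;

	/* May fail (e.g. -EOPNOTSUPP) if the PMUs cannot be paired. */
	return perf_open(&output, leader_fd);
}
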
diff --combined kernel/kprobes.c
index ebe8315a756a2593f0e9bab3f37efe7885a347ed,8bc5f1ffd68e4351ac472dae4fcdca2862f88bcd..1b66ccbb744a6a991dbaa8670f0bf26ff826837a
@@@ -470,7 -470,6 +470,7 @@@ static DECLARE_DELAYED_WORK(optimizing_
   */
  static void do_optimize_kprobes(void)
  {
 +      lockdep_assert_held(&text_mutex);
        /*
         * The optimization/unoptimization refers online_cpus via
         * stop_machine() and cpu-hotplug modifies online_cpus.
            list_empty(&optimizing_list))
                return;
  
 -      mutex_lock(&text_mutex);
        arch_optimize_kprobes(&optimizing_list);
 -      mutex_unlock(&text_mutex);
  }
  
  /*
@@@ -499,7 -500,6 +499,7 @@@ static void do_unoptimize_kprobes(void
  {
        struct optimized_kprobe *op, *tmp;
  
 +      lockdep_assert_held(&text_mutex);
        /* See comment in do_optimize_kprobes() */
        lockdep_assert_cpus_held();
  
        if (list_empty(&unoptimizing_list))
                return;
  
 -      mutex_lock(&text_mutex);
        arch_unoptimize_kprobes(&unoptimizing_list, &freeing_list);
        /* Loop free_list for disarming */
        list_for_each_entry_safe(op, tmp, &freeing_list, list) {
                } else
                        list_del_init(&op->list);
        }
 -      mutex_unlock(&text_mutex);
  }
  
  /* Reclaim all kprobes on the free_list */
@@@ -554,7 -556,6 +554,7 @@@ static void kprobe_optimizer(struct wor
  {
        mutex_lock(&kprobe_mutex);
        cpus_read_lock();
 +      mutex_lock(&text_mutex);
        /* Lock modules while optimizing kprobes */
        mutex_lock(&module_mutex);
  
        do_free_cleaned_kprobes();
  
        mutex_unlock(&module_mutex);
 +      mutex_unlock(&text_mutex);
        cpus_read_unlock();
        mutex_unlock(&kprobe_mutex);
  
@@@ -1514,8 -1514,7 +1514,8 @@@ static int check_kprobe_address_safe(st
        /* Ensure it is not in reserved area nor out of text */
        if (!kernel_text_address((unsigned long) p->addr) ||
            within_kprobe_blacklist((unsigned long) p->addr) ||
 -          jump_label_text_reserved(p->addr, p->addr)) {
 +          jump_label_text_reserved(p->addr, p->addr) ||
 +          find_bug((unsigned long)p->addr)) {
                ret = -EINVAL;
                goto out;
        }
@@@ -1907,7 -1906,7 +1907,7 @@@ int register_kretprobe(struct kretprob
  
        /* Pre-allocate memory for max kretprobe instances */
        if (rp->maxactive <= 0) {
- #ifdef CONFIG_PREEMPT
+ #ifdef CONFIG_PREEMPTION
                rp->maxactive = max_t(unsigned int, 10, 2*num_possible_cpus());
  #else
                rp->maxactive = num_possible_cpus();
diff --combined kernel/rcu/tree.c
index 71395e91b876809bdfabdcf9408a76bb5172b57c,6a37afd5436c32a6635fee042c5562f10ed24baf..81105141b6a823689254b5a9033cc7b62e330213
@@@ -56,7 -56,6 +56,7 @@@
  #include <linux/smpboot.h>
  #include <linux/jiffies.h>
  #include <linux/sched/isolation.h>
 +#include <linux/sched/clock.h>
  #include "../time/tick-internal.h"
  
  #include "tree.h"
@@@ -211,9 -210,9 +211,9 @@@ static long rcu_get_n_cbs_cpu(int cpu
  {
        struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
  
 -      if (rcu_segcblist_is_enabled(&rdp->cblist)) /* Online normal CPU? */
 +      if (rcu_segcblist_is_enabled(&rdp->cblist))
                return rcu_segcblist_n_cbs(&rdp->cblist);
 -      return rcu_get_n_cbs_nocb_cpu(rdp); /* Works for offline, too. */
 +      return 0;
  }
  
  void rcu_softirq_qs(void)
@@@ -417,12 -416,6 +417,12 @@@ module_param(qlowmark, long, 0444)
  static ulong jiffies_till_first_fqs = ULONG_MAX;
  static ulong jiffies_till_next_fqs = ULONG_MAX;
  static bool rcu_kick_kthreads;
 +static int rcu_divisor = 7;
 +module_param(rcu_divisor, int, 0644);
 +
 +/* Force an exit from rcu_do_batch() after 3 milliseconds. */
 +static long rcu_resched_ns = 3 * NSEC_PER_MSEC;
 +module_param(rcu_resched_ns, long, 0644);
  
  /*
   * How long the grace period must be before we start recruiting
@@@ -1258,7 -1251,6 +1258,7 @@@ static bool rcu_accelerate_cbs(struct r
        unsigned long gp_seq_req;
        bool ret = false;
  
 +      rcu_lockdep_assert_cblist_protected(rdp);
        raw_lockdep_assert_held_rcu_node(rnp);
  
        /* If no pending (not yet ready to invoke) callbacks, nothing to do. */
@@@ -1300,7 -1292,7 +1300,7 @@@ static void rcu_accelerate_cbs_unlocked
        unsigned long c;
        bool needwake;
  
 -      lockdep_assert_irqs_disabled();
 +      rcu_lockdep_assert_cblist_protected(rdp);
        c = rcu_seq_snap(&rcu_state.gp_seq);
        if (!rdp->gpwrap && ULONG_CMP_GE(rdp->gp_seq_needed, c)) {
                /* Old request still live, so mark recent callbacks. */
   */
  static bool rcu_advance_cbs(struct rcu_node *rnp, struct rcu_data *rdp)
  {
 +      rcu_lockdep_assert_cblist_protected(rdp);
        raw_lockdep_assert_held_rcu_node(rnp);
  
        /* If no pending (not yet ready to invoke) callbacks, nothing to do. */
        return rcu_accelerate_cbs(rnp, rdp);
  }
  
 +/*
 + * Move and classify callbacks, but only if doing so won't require
 + * that the RCU grace-period kthread be awakened.
 + */
 +static void __maybe_unused rcu_advance_cbs_nowake(struct rcu_node *rnp,
 +                                                struct rcu_data *rdp)
 +{
 +      rcu_lockdep_assert_cblist_protected(rdp);
 +      if (!rcu_seq_state(rcu_seq_current(&rnp->gp_seq)) ||
 +          !raw_spin_trylock_rcu_node(rnp))
 +              return;
 +      WARN_ON_ONCE(rcu_advance_cbs(rnp, rdp));
 +      raw_spin_unlock_rcu_node(rnp);
 +}
 +
  /*
   * Update CPU-local rcu_data state to record the beginnings and ends of
   * grace periods.  The caller must hold the ->lock of the leaf rcu_node
   */
  static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp)
  {
 -      bool ret;
 +      bool ret = false;
        bool need_gp;
 +      const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
 +                             rcu_segcblist_is_offloaded(&rdp->cblist);
  
        raw_lockdep_assert_held_rcu_node(rnp);
  
        /* Handle the ends of any preceding grace periods first. */
        if (rcu_seq_completed_gp(rdp->gp_seq, rnp->gp_seq) ||
            unlikely(READ_ONCE(rdp->gpwrap))) {
 -              ret = rcu_advance_cbs(rnp, rdp); /* Advance callbacks. */
 +              if (!offloaded)
 +                      ret = rcu_advance_cbs(rnp, rdp); /* Advance CBs. */
                trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuend"));
        } else {
 -              ret = rcu_accelerate_cbs(rnp, rdp); /* Recent callbacks. */
 +              if (!offloaded)
 +                      ret = rcu_accelerate_cbs(rnp, rdp); /* Recent CBs. */
        }
  
        /* Now handle the beginnings of any new-to-this-CPU grace periods. */
@@@ -1685,7 -1657,6 +1685,7 @@@ static void rcu_gp_cleanup(void
        unsigned long gp_duration;
        bool needgp = false;
        unsigned long new_gp_seq;
 +      bool offloaded;
        struct rcu_data *rdp;
        struct rcu_node *rnp = rcu_get_root();
        struct swait_queue_head *sq;
                needgp = true;
        }
        /* Advance CBs to reduce false positives below. */
 -      if (!rcu_accelerate_cbs(rnp, rdp) && needgp) {
 +      offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
 +                  rcu_segcblist_is_offloaded(&rdp->cblist);
 +      if ((offloaded || !rcu_accelerate_cbs(rnp, rdp)) && needgp) {
                WRITE_ONCE(rcu_state.gp_flags, RCU_GP_FLAG_INIT);
                rcu_state.gp_req_activity = jiffies;
                trace_rcu_grace_period(rcu_state.name,
@@@ -1912,7 -1881,7 +1912,7 @@@ rcu_report_unblock_qs_rnp(struct rcu_no
        struct rcu_node *rnp_p;
  
        raw_lockdep_assert_held_rcu_node(rnp);
-       if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT)) ||
+       if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPTION)) ||
            WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)) ||
            rnp->qsmask != 0) {
                raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
@@@ -1947,9 -1916,7 +1947,9 @@@ rcu_report_qs_rdp(int cpu, struct rcu_d
  {
        unsigned long flags;
        unsigned long mask;
 -      bool needwake;
 +      bool needwake = false;
 +      const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
 +                             rcu_segcblist_is_offloaded(&rdp->cblist);
        struct rcu_node *rnp;
  
        rnp = rdp->mynode;
                 * This GP can't end until cpu checks in, so all of our
                 * callbacks can be processed during the next GP.
                 */
 -              needwake = rcu_accelerate_cbs(rnp, rdp);
 +              if (!offloaded)
 +                      needwake = rcu_accelerate_cbs(rnp, rdp);
  
                rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);
                /* ^^^ Released rnp->lock */
@@@ -2111,12 -2077,9 +2111,12 @@@ int rcutree_dead_cpu(unsigned int cpu
  static void rcu_do_batch(struct rcu_data *rdp)
  {
        unsigned long flags;
 +      const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
 +                             rcu_segcblist_is_offloaded(&rdp->cblist);
        struct rcu_head *rhp;
        struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl);
        long bl, count;
 +      long pending, tlimit = 0;
  
        /* If no callbacks are ready, just return. */
        if (!rcu_segcblist_ready_cbs(&rdp->cblist)) {
         * callback counts, as rcu_barrier() needs to be conservative.
         */
        local_irq_save(flags);
 +      rcu_nocb_lock(rdp);
        WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
 -      bl = rdp->blimit;
 +      pending = rcu_segcblist_n_cbs(&rdp->cblist);
 +      bl = max(rdp->blimit, pending >> rcu_divisor);
 +      if (unlikely(bl > 100))
 +              tlimit = local_clock() + rcu_resched_ns;
        trace_rcu_batch_start(rcu_state.name,
                              rcu_segcblist_n_lazy_cbs(&rdp->cblist),
                              rcu_segcblist_n_cbs(&rdp->cblist), bl);
        rcu_segcblist_extract_done_cbs(&rdp->cblist, &rcl);
 -      local_irq_restore(flags);
 +      if (offloaded)
 +              rdp->qlen_last_fqs_check = rcu_segcblist_n_cbs(&rdp->cblist);
 +      rcu_nocb_unlock_irqrestore(rdp, flags);
  
        /* Invoke callbacks. */
        rhp = rcu_cblist_dequeue(&rcl);
                 * Stop only if limit reached and CPU has something to do.
                 * Note: The rcl structure counts down from zero.
                 */
 -              if (-rcl.len >= bl &&
 +              if (-rcl.len >= bl && !offloaded &&
                    (need_resched() ||
                     (!is_idle_task(current) && !rcu_is_callbacks_kthread())))
                        break;
 +              if (unlikely(tlimit)) {
 +                      /* only call local_clock() every 32 callbacks */
 +                      if (likely((-rcl.len & 31) || local_clock() < tlimit))
 +                              continue;
 +                      /* Exceeded the time limit, so leave. */
 +                      break;
 +              }
 +              if (offloaded) {
 +                      WARN_ON_ONCE(in_serving_softirq());
 +                      local_bh_enable();
 +                      lockdep_assert_irqs_enabled();
 +                      cond_resched_tasks_rcu_qs();
 +                      lockdep_assert_irqs_enabled();
 +                      local_bh_disable();
 +              }
        }
  
        local_irq_save(flags);
 +      rcu_nocb_lock(rdp);
        count = -rcl.len;
        trace_rcu_batch_end(rcu_state.name, count, !!rcl.head, need_resched(),
                            is_idle_task(current), rcu_is_callbacks_kthread());
         * The following usually indicates a double call_rcu().  To track
         * this down, try building with CONFIG_DEBUG_OBJECTS_RCU_HEAD=y.
         */
 -      WARN_ON_ONCE(rcu_segcblist_empty(&rdp->cblist) != (count == 0));
 +      WARN_ON_ONCE(count == 0 && !rcu_segcblist_empty(&rdp->cblist));
 +      WARN_ON_ONCE(!IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
 +                   count != 0 && rcu_segcblist_empty(&rdp->cblist));
  
 -      local_irq_restore(flags);
 +      rcu_nocb_unlock_irqrestore(rdp, flags);
  
        /* Re-invoke RCU core processing if there are callbacks remaining. */
 -      if (rcu_segcblist_ready_cbs(&rdp->cblist))
 +      if (!offloaded && rcu_segcblist_ready_cbs(&rdp->cblist))
                invoke_rcu_core();
  }
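
A standalone sketch of the batching heuristic added above, using the rcu_divisor and rcu_resched_ns defaults from this hunk: the per-pass callback budget scales with the backlog, and the time budget is only armed once that budget is large:

#include <stdio.h>

#define NSEC_PER_MSEC 1000000L

static const int rcu_divisor = 7;
static const long rcu_resched_ns = 3 * NSEC_PER_MSEC;

static long batch_limit(long blimit, long pending)
{
	long from_backlog = pending >> rcu_divisor;

	return blimit > from_backlog ? blimit : from_backlog;
}

int main(void)
{
	long pending = 100000;			/* example callback backlog */
	long bl = batch_limit(10, pending);	/* 10 mirrors the usual blimit */

	printf("batch limit: %ld\n", bl);	/* 100000 >> 7 == 781 */
	if (bl > 100)
		printf("time budget armed: %ld ns\n", rcu_resched_ns);
	return 0;
}
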
  
@@@ -2266,7 -2205,7 +2266,7 @@@ static void force_qs_rnp(int (*f)(struc
                mask = 0;
                raw_spin_lock_irqsave_rcu_node(rnp, flags);
                if (rnp->qsmask == 0) {
-                       if (!IS_ENABLED(CONFIG_PREEMPT) ||
+                       if (!IS_ENABLED(CONFIG_PREEMPTION) ||
                            rcu_preempt_blocked_readers_cgp(rnp)) {
                                /*
                                 * No point in scanning bits because they
@@@ -2341,8 -2280,6 +2341,8 @@@ static __latent_entropy void rcu_core(v
        unsigned long flags;
        struct rcu_data *rdp = raw_cpu_ptr(&rcu_data);
        struct rcu_node *rnp = rdp->mynode;
 +      const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
 +                             rcu_segcblist_is_offloaded(&rdp->cblist);
  
        if (cpu_is_offline(smp_processor_id()))
                return;
  
        /* No grace period and unregistered callbacks? */
        if (!rcu_gp_in_progress() &&
 -          rcu_segcblist_is_enabled(&rdp->cblist)) {
 +          rcu_segcblist_is_enabled(&rdp->cblist) && !offloaded) {
                local_irq_save(flags);
                if (!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL))
                        rcu_accelerate_cbs_unlocked(rnp, rdp);
        rcu_check_gp_start_stall(rnp, rdp, rcu_jiffies_till_stall_check());
  
        /* If there are callbacks ready, invoke them. */
 -      if (rcu_segcblist_ready_cbs(&rdp->cblist) &&
 +      if (!offloaded && rcu_segcblist_ready_cbs(&rdp->cblist) &&
            likely(READ_ONCE(rcu_scheduler_fully_active)))
                rcu_do_batch(rdp);
  
@@@ -2552,11 -2489,10 +2552,11 @@@ static void rcu_leak_callback(struct rc
   * is expected to specify a CPU.
   */
  static void
 -__call_rcu(struct rcu_head *head, rcu_callback_t func, int cpu, bool lazy)
 +__call_rcu(struct rcu_head *head, rcu_callback_t func, bool lazy)
  {
        unsigned long flags;
        struct rcu_data *rdp;
 +      bool was_alldone;
  
        /* Misaligned rcu_head! */
        WARN_ON_ONCE((unsigned long)head & (sizeof(void *) - 1));
        rdp = this_cpu_ptr(&rcu_data);
  
        /* Add the callback to our list. */
 -      if (unlikely(!rcu_segcblist_is_enabled(&rdp->cblist)) || cpu != -1) {
 -              int offline;
 -
 -              if (cpu != -1)
 -                      rdp = per_cpu_ptr(&rcu_data, cpu);
 -              if (likely(rdp->mynode)) {
 -                      /* Post-boot, so this should be for a no-CBs CPU. */
 -                      offline = !__call_rcu_nocb(rdp, head, lazy, flags);
 -                      WARN_ON_ONCE(offline);
 -                      /* Offline CPU, _call_rcu() illegal, leak callback.  */
 -                      local_irq_restore(flags);
 -                      return;
 -              }
 -              /*
 -               * Very early boot, before rcu_init().  Initialize if needed
 -               * and then drop through to queue the callback.
 -               */
 -              WARN_ON_ONCE(cpu != -1);
 +      if (unlikely(!rcu_segcblist_is_enabled(&rdp->cblist))) {
 +              // This can trigger due to call_rcu() from offline CPU:
 +              WARN_ON_ONCE(rcu_scheduler_active != RCU_SCHEDULER_INACTIVE);
                WARN_ON_ONCE(!rcu_is_watching());
 +              // Very early boot, before rcu_init().  Initialize if needed
 +              // and then drop through to queue the callback.
                if (rcu_segcblist_empty(&rdp->cblist))
                        rcu_segcblist_init(&rdp->cblist);
        }
 +      if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags))
 +              return; // Enqueued onto ->nocb_bypass, so just leave.
 +      /* If we get here, rcu_nocb_try_bypass() acquired ->nocb_lock. */
        rcu_segcblist_enqueue(&rdp->cblist, head, lazy);
        if (__is_kfree_rcu_offset((unsigned long)func))
                trace_rcu_kfree_callback(rcu_state.name, head,
                                   rcu_segcblist_n_cbs(&rdp->cblist));
  
        /* Go handle any RCU core processing required. */
 -      __call_rcu_core(rdp, head, flags);
 -      local_irq_restore(flags);
 +      if (IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
 +          unlikely(rcu_segcblist_is_offloaded(&rdp->cblist))) {
 +              __call_rcu_nocb_wake(rdp, was_alldone, flags); /* unlocks */
 +      } else {
 +              __call_rcu_core(rdp, head, flags);
 +              local_irq_restore(flags);
 +      }
  }
  
  /**
   */
  void call_rcu(struct rcu_head *head, rcu_callback_t func)
  {
 -      __call_rcu(head, func, -1, 0);
 +      __call_rcu(head, func, 0);
  }
  EXPORT_SYMBOL_GPL(call_rcu);
  
   */
  void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
  {
 -      __call_rcu(head, func, -1, 1);
 +      __call_rcu(head, func, 1);
  }
  EXPORT_SYMBOL_GPL(kfree_call_rcu);
  
@@@ -2681,7 -2622,7 +2681,7 @@@ static int rcu_blocking_is_gp(void
  {
        int ret;
  
-       if (IS_ENABLED(CONFIG_PREEMPT))
+       if (IS_ENABLED(CONFIG_PREEMPTION))
                return rcu_scheduler_active == RCU_SCHEDULER_INACTIVE;
        might_sleep();  /* Check for RCU read-side critical section. */
        preempt_disable();
@@@ -2794,10 -2735,6 +2794,10 @@@ static int rcu_pending(void
        /* Check for CPU stalls, if enabled. */
        check_cpu_stall(rdp);
  
 +      /* Does this CPU need a deferred NOCB wakeup? */
 +      if (rcu_nocb_need_deferred_wakeup(rdp))
 +              return 1;
 +
        /* Is this CPU a NO_HZ_FULL CPU that should ignore RCU? */
        if (rcu_nohz_full_cpu())
                return 0;
        /* Has RCU gone idle with this CPU needing another grace period? */
        if (!rcu_gp_in_progress() &&
            rcu_segcblist_is_enabled(&rdp->cblist) &&
 +          (!IS_ENABLED(CONFIG_RCU_NOCB_CPU) ||
 +           !rcu_segcblist_is_offloaded(&rdp->cblist)) &&
            !rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL))
                return 1;
  
            unlikely(READ_ONCE(rdp->gpwrap))) /* outside lock */
                return 1;
  
 -      /* Does this CPU need a deferred NOCB wakeup? */
 -      if (rcu_nocb_need_deferred_wakeup(rdp))
 -              return 1;
 -
        /* nothing to do */
        return 0;
  }
@@@ -2862,8 -2801,6 +2862,8 @@@ static void rcu_barrier_func(void *unus
        rcu_barrier_trace(TPS("IRQ"), -1, rcu_state.barrier_sequence);
        rdp->barrier_head.func = rcu_barrier_callback;
        debug_rcu_head_queue(&rdp->barrier_head);
 +      rcu_nocb_lock(rdp);
 +      WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies));
        if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head, 0)) {
                atomic_inc(&rcu_state.barrier_cpu_count);
        } else {
                rcu_barrier_trace(TPS("IRQNQ"), -1,
                                   rcu_state.barrier_sequence);
        }
 +      rcu_nocb_unlock(rdp);
  }
  
  /**
@@@ -2922,11 -2858,22 +2922,11 @@@ void rcu_barrier(void
         * corresponding CPU's preceding callbacks have been invoked.
         */
        for_each_possible_cpu(cpu) {
 -              if (!cpu_online(cpu) && !rcu_is_nocb_cpu(cpu))
 -                      continue;
                rdp = per_cpu_ptr(&rcu_data, cpu);
 -              if (rcu_is_nocb_cpu(cpu)) {
 -                      if (!rcu_nocb_cpu_needs_barrier(cpu)) {
 -                              rcu_barrier_trace(TPS("OfflineNoCB"), cpu,
 -                                                 rcu_state.barrier_sequence);
 -                      } else {
 -                              rcu_barrier_trace(TPS("OnlineNoCB"), cpu,
 -                                                 rcu_state.barrier_sequence);
 -                              smp_mb__before_atomic();
 -                              atomic_inc(&rcu_state.barrier_cpu_count);
 -                              __call_rcu(&rdp->barrier_head,
 -                                         rcu_barrier_callback, cpu, 0);
 -                      }
 -              } else if (rcu_segcblist_n_cbs(&rdp->cblist)) {
 +              if (!cpu_online(cpu) &&
 +                  !rcu_segcblist_is_offloaded(&rdp->cblist))
 +                      continue;
 +              if (rcu_segcblist_n_cbs(&rdp->cblist)) {
                        rcu_barrier_trace(TPS("OnlineQ"), cpu,
                                           rcu_state.barrier_sequence);
                        smp_call_function_single(cpu, rcu_barrier_func, NULL, 1);
@@@ -3011,8 -2958,7 +3011,8 @@@ rcu_boot_init_percpu_data(int cpu
   * Initializes a CPU's per-CPU RCU data.  Note that only one online or
   * offline event can be happening at a given time.  Note also that we can
   * accept some slop in the rsp->gp_seq access due to the fact that this
 - * CPU cannot possibly have any RCU callbacks in flight yet.
 + * CPU cannot possibly have any non-offloaded RCU callbacks in flight yet.
 + * And any offloaded callbacks are being numbered elsewhere.
   */
  int rcutree_prepare_cpu(unsigned int cpu)
  {
        rdp->n_force_qs_snap = rcu_state.n_force_qs;
        rdp->blimit = blimit;
        if (rcu_segcblist_empty(&rdp->cblist) && /* No early-boot CBs? */
 -          !init_nocb_callback_list(rdp))
 +          !rcu_segcblist_is_offloaded(&rdp->cblist))
                rcu_segcblist_init(&rdp->cblist);  /* Re-enable callbacks. */
        rdp->dynticks_nesting = 1;      /* CPU not up, no tearing. */
        rcu_dynticks_eqs_online();
@@@ -3205,38 -3151,29 +3205,38 @@@ void rcutree_migrate_callbacks(int cpu
  {
        unsigned long flags;
        struct rcu_data *my_rdp;
 +      struct rcu_node *my_rnp;
        struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
 -      struct rcu_node *rnp_root = rcu_get_root();
        bool needwake;
  
 -      if (rcu_is_nocb_cpu(cpu) || rcu_segcblist_empty(&rdp->cblist))
 +      if (rcu_segcblist_is_offloaded(&rdp->cblist) ||
 +          rcu_segcblist_empty(&rdp->cblist))
                return;  /* No callbacks to migrate. */
  
        local_irq_save(flags);
        my_rdp = this_cpu_ptr(&rcu_data);
 -      if (rcu_nocb_adopt_orphan_cbs(my_rdp, rdp, flags)) {
 -              local_irq_restore(flags);
 -              return;
 -      }
 -      raw_spin_lock_rcu_node(rnp_root); /* irqs already disabled. */
 +      my_rnp = my_rdp->mynode;
 +      rcu_nocb_lock(my_rdp); /* irqs already disabled. */
 +      WARN_ON_ONCE(!rcu_nocb_flush_bypass(my_rdp, NULL, jiffies));
 +      raw_spin_lock_rcu_node(my_rnp); /* irqs already disabled. */
        /* Leverage recent GPs and set GP for new callbacks. */
 -      needwake = rcu_advance_cbs(rnp_root, rdp) ||
 -                 rcu_advance_cbs(rnp_root, my_rdp);
 +      needwake = rcu_advance_cbs(my_rnp, rdp) ||
 +                 rcu_advance_cbs(my_rnp, my_rdp);
        rcu_segcblist_merge(&my_rdp->cblist, &rdp->cblist);
 +      needwake = needwake || rcu_advance_cbs(my_rnp, my_rdp);
 +      rcu_segcblist_disable(&rdp->cblist);
        WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) !=
                     !rcu_segcblist_n_cbs(&my_rdp->cblist));
 -      raw_spin_unlock_irqrestore_rcu_node(rnp_root, flags);
 +      if (rcu_segcblist_is_offloaded(&my_rdp->cblist)) {
 +              raw_spin_unlock_rcu_node(my_rnp); /* irqs remain disabled. */
 +              __call_rcu_nocb_wake(my_rdp, true, flags);
 +      } else {
 +              rcu_nocb_unlock(my_rdp); /* irqs remain disabled. */
 +              raw_spin_unlock_irqrestore_rcu_node(my_rnp, flags);
 +      }
        if (needwake)
                rcu_gp_kthread_wake();
 +      lockdep_assert_irqs_enabled();
        WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != 0 ||
                  !rcu_segcblist_empty(&rdp->cblist),
                  "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, 1stCB=%p\n",
@@@ -3297,13 -3234,13 +3297,13 @@@ static int __init rcu_spawn_gp_kthread(
        t = kthread_create(rcu_gp_kthread, NULL, "%s", rcu_state.name);
        if (WARN_ONCE(IS_ERR(t), "%s: Could not start grace-period kthread, OOM is now expected behavior\n", __func__))
                return 0;
-       rnp = rcu_get_root();
-       raw_spin_lock_irqsave_rcu_node(rnp, flags);
-       rcu_state.gp_kthread = t;
        if (kthread_prio) {
                sp.sched_priority = kthread_prio;
                sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
        }
+       rnp = rcu_get_root();
+       raw_spin_lock_irqsave_rcu_node(rnp, flags);
+       rcu_state.gp_kthread = t;
        raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
        wake_up_process(t);
        rcu_spawn_nocb_kthreads();
diff --combined kernel/rcu/tree_stall.h
index 841ab43f3e60d4b26df3a63477fe65814947a826,9b92bf18b737e2dbf2702d8be91b924e60082bb1..c0b8c458d8a6ad267151f6cbffc791c217aeefdf
@@@ -163,7 -163,7 +163,7 @@@ static void rcu_iw_handler(struct irq_w
  //
  // Printing RCU CPU stall warnings
  
- #ifdef CONFIG_PREEMPT
+ #ifdef CONFIG_PREEMPTION
  
  /*
   * Dump detailed information for all tasks blocking the current RCU
@@@ -215,7 -215,7 +215,7 @@@ static int rcu_print_task_stall(struct 
        return ndetected;
  }
  
- #else /* #ifdef CONFIG_PREEMPT */
+ #else /* #ifdef CONFIG_PREEMPTION */
  
  /*
   * Because preemptible RCU does not exist, we never have to check for
@@@ -233,7 -233,7 +233,7 @@@ static int rcu_print_task_stall(struct 
  {
        return 0;
  }
- #endif /* #else #ifdef CONFIG_PREEMPT */
+ #endif /* #else #ifdef CONFIG_PREEMPTION */
  
  /*
   * Dump stacks of all tasks running on stalled CPUs.  First try using
@@@ -527,8 -527,6 +527,8 @@@ static void check_cpu_stall(struct rcu_
  
                /* We haven't checked in, so go dump stack. */
                print_cpu_stall();
 +              if (rcu_cpu_stall_ftrace_dump)
 +                      rcu_ftrace_dump(DUMP_ALL);
  
        } else if (rcu_gp_in_progress() &&
                   ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY) &&
  
                /* They had a few time units to dump stack, so complain. */
                print_other_cpu_stall(gs2);
 +              if (rcu_cpu_stall_ftrace_dump)
 +                      rcu_ftrace_dump(DUMP_ALL);
        }
  }
  
@@@ -589,11 -585,6 +589,11 @@@ void show_rcu_gp_kthreads(void
                                cpu, (long)rdp->gp_seq_needed);
                }
        }
 +      for_each_possible_cpu(cpu) {
 +              rdp = per_cpu_ptr(&rcu_data, cpu);
 +              if (rcu_segcblist_is_offloaded(&rdp->cblist))
 +                      show_rcu_nocb_state(rdp);
 +      }
        /* sched_show_task(rcu_state.gp_kthread); */
  }
  EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads);
diff --combined kernel/sched/core.c
index 7fa8e74ad2ab4003457d266df57373f41f0e0d2a,87b84a726db448c76edd1fd46a387e392de9255c..06961b997ed6d8c13ced5558520f75b07c85aedc
@@@ -773,6 -773,18 +773,18 @@@ static void set_load_weight(struct task
  }
  
  #ifdef CONFIG_UCLAMP_TASK
+ /*
+  * Serializes updates of utilization clamp values
+  *
+  * The (slow-path) user-space triggers utilization clamp value updates which
+  * can require updates on (fast-path) scheduler's data structures used to
+  * support enqueue/dequeue operations.
+  * While the per-CPU rq lock protects fast-path update operations, user-space
+  * requests are serialized using a mutex to reduce the risk of conflicting
+  * updates or API abuses.
+  */
+ static DEFINE_MUTEX(uclamp_mutex);
+
  /* Max allowed minimum utilization */
  unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE;
  
@@@ -798,7 -810,7 +810,7 @@@ static inline unsigned int uclamp_bucke
        return UCLAMP_BUCKET_DELTA * uclamp_bucket_id(clamp_value);
  }
  
- static inline unsigned int uclamp_none(int clamp_id)
+ static inline enum uclamp_id uclamp_none(enum uclamp_id clamp_id)
  {
        if (clamp_id == UCLAMP_MIN)
                return 0;
@@@ -814,7 -826,7 +826,7 @@@ static inline void uclamp_se_set(struc
  }
  
  static inline unsigned int
- uclamp_idle_value(struct rq *rq, unsigned int clamp_id,
+ uclamp_idle_value(struct rq *rq, enum uclamp_id clamp_id,
                  unsigned int clamp_value)
  {
        /*
        return uclamp_none(UCLAMP_MIN);
  }
  
- static inline void uclamp_idle_reset(struct rq *rq, unsigned int clamp_id,
+ static inline void uclamp_idle_reset(struct rq *rq, enum uclamp_id clamp_id,
                                     unsigned int clamp_value)
  {
        /* Reset max-clamp retention only on idle exit */
  }
  
  static inline
- unsigned int uclamp_rq_max_value(struct rq *rq, unsigned int clamp_id,
-                                unsigned int clamp_value)
+ enum uclamp_id uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id,
+                                  unsigned int clamp_value)
  {
        struct uclamp_bucket *bucket = rq->uclamp[clamp_id].bucket;
        int bucket_id = UCLAMP_BUCKETS - 1;
        return uclamp_idle_value(rq, clamp_id, clamp_value);
  }
  
+ static inline struct uclamp_se
+ uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id)
+ {
+       struct uclamp_se uc_req = p->uclamp_req[clamp_id];
+ #ifdef CONFIG_UCLAMP_TASK_GROUP
+       struct uclamp_se uc_max;
+       /*
+        * Tasks in autogroups or root task group will be
+        * restricted by system defaults.
+        */
+       if (task_group_is_autogroup(task_group(p)))
+               return uc_req;
+       if (task_group(p) == &root_task_group)
+               return uc_req;
+       uc_max = task_group(p)->uclamp[clamp_id];
+       if (uc_req.value > uc_max.value || !uc_req.user_defined)
+               return uc_max;
+ #endif
+       return uc_req;
+ }
+
  /*
   * The effective clamp bucket index of a task depends on, by increasing
   * priority:
   * - the task specific clamp value, when explicitly requested from userspace
+  * - the task group effective clamp value, for tasks not either in the root
+  *   group or in an autogroup
   * - the system default clamp value, defined by the sysadmin
   */
  static inline struct uclamp_se
- uclamp_eff_get(struct task_struct *p, unsigned int clamp_id)
+ uclamp_eff_get(struct task_struct *p, enum uclamp_id clamp_id)
  {
-       struct uclamp_se uc_req = p->uclamp_req[clamp_id];
+       struct uclamp_se uc_req = uclamp_tg_restrict(p, clamp_id);
        struct uclamp_se uc_max = uclamp_default[clamp_id];
  
        /* System default restrictions always apply */
        return uc_req;
  }
  
- unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id)
+ enum uclamp_id uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id)
  {
        struct uclamp_se uc_eff;
  
   * for each bucket when all its RUNNABLE tasks require the same clamp.
   */
  static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
-                                   unsigned int clamp_id)
+                                   enum uclamp_id clamp_id)
  {
        struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
        struct uclamp_se *uc_se = &p->uclamp[clamp_id];
   * enforce the expected state and warn.
   */
  static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
-                                   unsigned int clamp_id)
+                                   enum uclamp_id clamp_id)
  {
        struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
        struct uclamp_se *uc_se = &p->uclamp[clamp_id];
  
  static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
  {
-       unsigned int clamp_id;
+       enum uclamp_id clamp_id;
  
        if (unlikely(!p->sched_class->uclamp_enabled))
                return;
  
  static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
  {
-       unsigned int clamp_id;
+       enum uclamp_id clamp_id;
  
        if (unlikely(!p->sched_class->uclamp_enabled))
                return;
                uclamp_rq_dec_id(rq, p, clamp_id);
  }
  
+ static inline void
+ uclamp_update_active(struct task_struct *p, enum uclamp_id clamp_id)
+ {
+       struct rq_flags rf;
+       struct rq *rq;
+       /*
+        * Lock the task and the rq where the task is (or was) queued.
+        *
+        * We might lock the (previous) rq of a !RUNNABLE task, but that's the
+        * price to pay to safely serialize util_{min,max} updates with
+        * enqueues, dequeues and migration operations.
+        * This is the same locking schema used by __set_cpus_allowed_ptr().
+        */
+       rq = task_rq_lock(p, &rf);
+       /*
+        * Setting the clamp bucket is serialized by task_rq_lock().
+        * If the task is not yet RUNNABLE and its task_struct is not
+        * affecting a valid clamp bucket, the next time it's enqueued,
+        * it will already see the updated clamp bucket value.
+        */
+       if (p->uclamp[clamp_id].active) {
+               uclamp_rq_dec_id(rq, p, clamp_id);
+               uclamp_rq_inc_id(rq, p, clamp_id);
+       }
+       task_rq_unlock(rq, p, &rf);
+ }
+
+ static inline void
+ uclamp_update_active_tasks(struct cgroup_subsys_state *css,
+                          unsigned int clamps)
+ {
+       enum uclamp_id clamp_id;
+       struct css_task_iter it;
+       struct task_struct *p;
+       css_task_iter_start(css, 0, &it);
+       while ((p = css_task_iter_next(&it))) {
+               for_each_clamp_id(clamp_id) {
+                       if ((0x1 << clamp_id) & clamps)
+                               uclamp_update_active(p, clamp_id);
+               }
+       }
+       css_task_iter_end(&it);
+ }
+
+ #ifdef CONFIG_UCLAMP_TASK_GROUP
+ static void cpu_util_update_eff(struct cgroup_subsys_state *css);
+ static void uclamp_update_root_tg(void)
+ {
+       struct task_group *tg = &root_task_group;
+       uclamp_se_set(&tg->uclamp_req[UCLAMP_MIN],
+                     sysctl_sched_uclamp_util_min, false);
+       uclamp_se_set(&tg->uclamp_req[UCLAMP_MAX],
+                     sysctl_sched_uclamp_util_max, false);
+       rcu_read_lock();
+       cpu_util_update_eff(&root_task_group.css);
+       rcu_read_unlock();
+ }
+ #else
+ static void uclamp_update_root_tg(void) { }
+ #endif
+
  int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
                                void __user *buffer, size_t *lenp,
                                loff_t *ppos)
  {
+       bool update_root_tg = false;
        int old_min, old_max;
-       static DEFINE_MUTEX(mutex);
        int result;
  
-       mutex_lock(&mutex);
+       mutex_lock(&uclamp_mutex);
        old_min = sysctl_sched_uclamp_util_min;
        old_max = sysctl_sched_uclamp_util_max;
  
        if (old_min != sysctl_sched_uclamp_util_min) {
                uclamp_se_set(&uclamp_default[UCLAMP_MIN],
                              sysctl_sched_uclamp_util_min, false);
+               update_root_tg = true;
        }
        if (old_max != sysctl_sched_uclamp_util_max) {
                uclamp_se_set(&uclamp_default[UCLAMP_MAX],
                              sysctl_sched_uclamp_util_max, false);
+               update_root_tg = true;
        }
  
+       if (update_root_tg)
+               uclamp_update_root_tg();
+
        /*
-        * Updating all the RUNNABLE task is expensive, keep it simple and do
-        * just a lazy update at each next enqueue time.
+        * We update all RUNNABLE tasks only when task groups are in use.
+        * Otherwise, keep it simple and do just a lazy update at each next
+        * task enqueue time.
         */
        goto done;
  
  undo:
        sysctl_sched_uclamp_util_min = old_min;
        sysctl_sched_uclamp_util_max = old_max;
  done:
-       mutex_unlock(&mutex);
+       mutex_unlock(&uclamp_mutex);
  
        return result;
  }
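
The handler above now pushes changed system defaults down to the root task group. A sketch of driving it from user space; the procfs entry names (sched_util_clamp_min/max) and the 0..1024 capacity scale are assumptions about the sysctl interface backing sysctl_sched_uclamp_util_{min,max}:

#include <stdio.h>

static void set_sysctl(const char *name, const char *val)
{
	char path[128];
	FILE *f;

	snprintf(path, sizeof(path), "/proc/sys/kernel/%s", name);
	f = fopen(path, "w");
	if (f) {
		fputs(val, f);
		fclose(f);
	}
}

int main(void)
{
	set_sysctl("sched_util_clamp_min", "128");	/* default boost floor */
	set_sysctl("sched_util_clamp_max", "1024");	/* no clamp on max    */
	return 0;
}
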
@@@ -1075,7 -1187,7 +1187,7 @@@ static int uclamp_validate(struct task_
  static void __setscheduler_uclamp(struct task_struct *p,
                                  const struct sched_attr *attr)
  {
-       unsigned int clamp_id;
+       enum uclamp_id clamp_id;
  
        /*
         * On scheduling class change, reset to default clamps for tasks
  
  static void uclamp_fork(struct task_struct *p)
  {
-       unsigned int clamp_id;
+       enum uclamp_id clamp_id;
  
        for_each_clamp_id(clamp_id)
                p->uclamp[clamp_id].active = false;
  static void __init init_uclamp(void)
  {
        struct uclamp_se uc_max = {};
-       unsigned int clamp_id;
+       enum uclamp_id clamp_id;
        int cpu;
  
+       mutex_init(&uclamp_mutex);
        for_each_possible_cpu(cpu) {
                memset(&cpu_rq(cpu)->uclamp, 0, sizeof(struct uclamp_rq));
                cpu_rq(cpu)->uclamp_flags = 0;
  
        /* System defaults allow max clamp values for both indexes */
        uclamp_se_set(&uc_max, uclamp_none(UCLAMP_MAX), false);
-       for_each_clamp_id(clamp_id)
+       for_each_clamp_id(clamp_id) {
                uclamp_default[clamp_id] = uc_max;
+ #ifdef CONFIG_UCLAMP_TASK_GROUP
+               root_task_group.uclamp_req[clamp_id] = uc_max;
+               root_task_group.uclamp[clamp_id] = uc_max;
+ #endif
+       }
  }
  
  #else /* CONFIG_UCLAMP_TASK */
@@@ -1494,7 -1613,7 +1613,7 @@@ void do_set_cpus_allowed(struct task_st
        if (queued)
                enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
        if (running)
-               set_curr_task(rq, p);
+               set_next_task(rq, p);
  }
  
  /*
@@@ -3214,12 -3333,8 +3333,8 @@@ static __always_inline struct rq 
  context_switch(struct rq *rq, struct task_struct *prev,
               struct task_struct *next, struct rq_flags *rf)
  {
-       struct mm_struct *mm, *oldmm;
        prepare_task_switch(rq, prev, next);
  
-       mm = next->mm;
-       oldmm = prev->active_mm;
        /*
         * For paravirt, this is coupled with an exit in switch_to to
         * combine the page table reload and the switch backend into
        arch_start_context_switch(prev);
  
        /*
-        * If mm is non-NULL, we pass through switch_mm(). If mm is
-        * NULL, we will pass through mmdrop() in finish_task_switch().
-        * Both of these contain the full memory barrier required by
-        * membarrier after storing to rq->curr, before returning to
-        * user-space.
+        * kernel -> kernel   lazy + transfer active
+        *   user -> kernel   lazy + mmgrab() active
+        *
+        * kernel ->   user   switch + mmdrop() active
+        *   user ->   user   switch
         */
-       if (!mm) {
-               next->active_mm = oldmm;
-               mmgrab(oldmm);
-               enter_lazy_tlb(oldmm, next);
-       } else
-               switch_mm_irqs_off(oldmm, mm, next);
+       if (!next->mm) {                                // to kernel
+               enter_lazy_tlb(prev->active_mm, next);
+               next->active_mm = prev->active_mm;
+               if (prev->mm)                           // from user
+                       mmgrab(prev->active_mm);
+               else
+                       prev->active_mm = NULL;
+       } else {                                        // to user
+               /*
+                * sys_membarrier() requires an smp_mb() between setting
+                * rq->curr and returning to userspace.
+                *
+                * The below provides this either through switch_mm(), or in
+                * case 'prev->active_mm == next->mm' through
+                * finish_task_switch()'s mmdrop().
+                */
+               switch_mm_irqs_off(prev->active_mm, next->mm, next);
  
-       if (!prev->mm) {
-               prev->active_mm = NULL;
-               rq->prev_mm = oldmm;
+               if (!prev->mm) {                        // from kernel
+                       /* will mmdrop() in finish_task_switch(). */
+                       rq->prev_mm = prev->active_mm;
+                       prev->active_mm = NULL;
+               }
        }
  
        rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
@@@ -3622,7 -3752,7 +3752,7 @@@ static inline void sched_tick_start(in
  static inline void sched_tick_stop(int cpu) { }
  #endif
  
- #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
+ #if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \
                                defined(CONFIG_TRACE_PREEMPT_TOGGLE))
  /*
   * If the value passed in is equal to the current preempt count
@@@ -3780,7 -3910,7 +3910,7 @@@ pick_next_task(struct rq *rq, struct ta
  
                p = fair_sched_class.pick_next_task(rq, prev, rf);
                if (unlikely(p == RETRY_TASK))
-                       goto again;
+                       goto restart;
  
                /* Assumes fair_sched_class->next == idle_sched_class */
                if (unlikely(!p))
                return p;
        }
  
- again:
+ restart:
+       /*
+        * Ensure that we put DL/RT tasks before the pick loop, such that they
+        * can PULL higher prio tasks when we lower the RQ 'priority'.
+        */
+       prev->sched_class->put_prev_task(rq, prev, rf);
+       if (!rq->nr_running)
+               newidle_balance(rq, rf);
        for_each_class(class) {
-               p = class->pick_next_task(rq, prev, rf);
-               if (p) {
-                       if (unlikely(p == RETRY_TASK))
-                               goto again;
+               p = class->pick_next_task(rq, NULL, NULL);
+               if (p)
                        return p;
-               }
        }
  
        /* The idle class should always have a runnable task: */
   *      task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
   *      called on the nearest possible occasion:
   *
-  *       - If the kernel is preemptible (CONFIG_PREEMPT=y):
+  *       - If the kernel is preemptible (CONFIG_PREEMPTION=y):
   *
   *         - in syscall or exception context, at the next outmost
   *           preempt_enable(). (this might be as soon as the wake_up()'s
   *         - in IRQ context, return from interrupt-handler to
   *           preemptible context
   *
-  *       - If the kernel is not preemptible (CONFIG_PREEMPT is not set)
+  *       - If the kernel is not preemptible (CONFIG_PREEMPTION is not set)
   *         then at the next:
   *
   *          - cond_resched() call
@@@ -3945,7 -4080,7 +4080,7 @@@ void __noreturn do_task_dead(void
  
  static inline void sched_submit_work(struct task_struct *tsk)
  {
 -      if (!tsk->state || tsk_is_pi_blocked(tsk))
 +      if (!tsk->state)
                return;
  
        /*
                preempt_enable_no_resched();
        }
  
 +      if (tsk_is_pi_blocked(tsk))
 +              return;
 +
        /*
         * If we are going to sleep and we have plugged IO queued,
         * make sure to submit it to avoid deadlocks.
@@@ -4077,7 -4209,7 +4212,7 @@@ static void __sched notrace preempt_sch
        } while (need_resched());
  }
  
- #ifdef CONFIG_PREEMPT
+ #ifdef CONFIG_PREEMPTION
  /*
   * this is the entry point to schedule() from in-kernel preemption
   * off of preempt_enable. Kernel preemptions off return from interrupt
@@@ -4149,7 -4281,7 +4284,7 @@@ asmlinkage __visible void __sched notra
  }
  EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
  
- #endif /* CONFIG_PREEMPT */
+ #endif /* CONFIG_PREEMPTION */
  
  /*
   * this is the entry point to schedule() from kernel preemption
@@@ -4317,7 -4449,7 +4452,7 @@@ void rt_mutex_setprio(struct task_struc
        if (queued)
                enqueue_task(rq, p, queue_flag);
        if (running)
-               set_curr_task(rq, p);
+               set_next_task(rq, p);
  
        check_class_changed(rq, p, prev_class, oldprio);
  out_unlock:
@@@ -4384,7 -4516,7 +4519,7 @@@ void set_user_nice(struct task_struct *
                        resched_curr(rq);
        }
        if (running)
-               set_curr_task(rq, p);
+               set_next_task(rq, p);
  out_unlock:
        task_rq_unlock(rq, p, &rf);
  }
@@@ -4701,6 -4833,9 +4836,9 @@@ recheck
                        return retval;
        }
  
+       if (pi)
+               cpuset_read_lock();
        /*
         * Make sure no PI-waiters arrive (or leave) while we are
         * changing the priority of the task:
         * Changing the policy of the stop threads its a very bad idea:
         */
        if (p == rq->stop) {
-               task_rq_unlock(rq, p, &rf);
-               return -EINVAL;
+               retval = -EINVAL;
+               goto unlock;
        }
  
        /*
                        goto change;
  
                p->sched_reset_on_fork = reset_on_fork;
-               task_rq_unlock(rq, p, &rf);
-               return 0;
+               retval = 0;
+               goto unlock;
        }
  change:
  
                if (rt_bandwidth_enabled() && rt_policy(policy) &&
                                task_group(p)->rt_bandwidth.rt_runtime == 0 &&
                                !task_group_is_autogroup(task_group(p))) {
-                       task_rq_unlock(rq, p, &rf);
-                       return -EPERM;
+                       retval = -EPERM;
+                       goto unlock;
                }
  #endif
  #ifdef CONFIG_SMP
                         */
                        if (!cpumask_subset(span, p->cpus_ptr) ||
                            rq->rd->dl_bw.bw == 0) {
-                               task_rq_unlock(rq, p, &rf);
-                               return -EPERM;
+                               retval = -EPERM;
+                               goto unlock;
                        }
                }
  #endif
        if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
                policy = oldpolicy = -1;
                task_rq_unlock(rq, p, &rf);
+               if (pi)
+                       cpuset_read_unlock();
                goto recheck;
        }
  
         * is available.
         */
        if ((dl_policy(policy) || dl_task(p)) && sched_dl_overflow(p, policy, attr)) {
-               task_rq_unlock(rq, p, &rf);
-               return -EBUSY;
+               retval = -EBUSY;
+               goto unlock;
        }
  
        p->sched_reset_on_fork = reset_on_fork;
                enqueue_task(rq, p, queue_flags);
        }
        if (running)
-               set_curr_task(rq, p);
+               set_next_task(rq, p);
  
        check_class_changed(rq, p, prev_class, oldprio);
  
        preempt_disable();
        task_rq_unlock(rq, p, &rf);
  
-       if (pi)
+       if (pi) {
+               cpuset_read_unlock();
                rt_mutex_adjust_pi(p);
+       }
  
        /* Run balance callbacks after we've adjusted the PI chain: */
        balance_callback(rq);
        preempt_enable();
  
        return 0;
+ unlock:
+       task_rq_unlock(rq, p, &rf);
+       if (pi)
+               cpuset_read_unlock();
+       return retval;
  }
  
  static int _sched_setscheduler(struct task_struct *p, int policy,
@@@ -4926,10 -5071,15 +5074,15 @@@ do_sched_setscheduler(pid_t pid, int po
        rcu_read_lock();
        retval = -ESRCH;
        p = find_process_by_pid(pid);
-       if (p != NULL)
-               retval = sched_setscheduler(p, policy, &lparam);
+       if (likely(p))
+               get_task_struct(p);
        rcu_read_unlock();
  
+       if (likely(p)) {
+               retval = sched_setscheduler(p, policy, &lparam);
+               put_task_struct(p);
+       }
        return retval;
  }
  
@@@ -5146,40 -5296,37 +5299,40 @@@ out_unlock
        return retval;
  }
  
 -static int sched_read_attr(struct sched_attr __user *uattr,
 -                         struct sched_attr *attr,
 -                         unsigned int usize)
 +/*
 + * Copy the kernel size attribute structure (which might be larger
 + * than what user-space knows about) to user-space.
 + *
 + * Note that all cases are valid: user-space buffer can be larger or
 + * smaller than the kernel-space buffer. The usual case is that both
 + * have the same size.
 + */
 +static int
 +sched_attr_copy_to_user(struct sched_attr __user *uattr,
 +                      struct sched_attr *kattr,
 +                      unsigned int usize)
  {
 -      int ret;
 +      unsigned int ksize = sizeof(*kattr);
  
        if (!access_ok(uattr, usize))
                return -EFAULT;
  
        /*
 -       * If we're handed a smaller struct than we know of,
 -       * ensure all the unknown bits are 0 - i.e. old
 -       * user-space does not get uncomplete information.
 +       * sched_getattr() ABI forwards and backwards compatibility:
 +       *
 +       * If usize == ksize then we just copy everything to user-space and all is good.
 +       *
 +       * If usize < ksize then we only copy as much as user-space has space for,
 +       * this keeps ABI compatibility as well. We skip the rest.
 +       *
 +       * If usize > ksize then user-space is using a newer version of the ABI,
 +       * parts of which the kernel doesn't know about. Just ignore it - tooling can
 +       * detect the kernel's knowledge of attributes from the attr->size value
 +       * which is set to ksize in this case.
         */
 -      if (usize < sizeof(*attr)) {
 -              unsigned char *addr;
 -              unsigned char *end;
 -
 -              addr = (void *)attr + usize;
 -              end  = (void *)attr + sizeof(*attr);
 +      kattr->size = min(usize, ksize);
  
 -              for (; addr < end; addr++) {
 -                      if (*addr)
 -                              return -EFBIG;
 -              }
 -
 -              attr->size = usize;
 -      }
 -
 -      ret = copy_to_user(uattr, attr, attr->size);
 -      if (ret)
 +      if (copy_to_user(uattr, kattr, kattr->size))
                return -EFAULT;
  
        return 0;
   * sys_sched_getattr - similar to sched_getparam, but with sched_attr
   * @pid: the pid in question.
   * @uattr: structure containing the extended parameters.
 - * @size: sizeof(attr) for fwd/bwd comp.
 + * @usize: sizeof(attr) that user-space knows about, for forwards and backwards compatibility.
   * @flags: for future extension.
   */
  SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
 -              unsigned int, size, unsigned int, flags)
 +              unsigned int, usize, unsigned int, flags)
  {
 -      struct sched_attr attr = {
 -              .size = sizeof(struct sched_attr),
 -      };
 +      struct sched_attr kattr = { };
        struct task_struct *p;
        int retval;
  
 -      if (!uattr || pid < 0 || size > PAGE_SIZE ||
 -          size < SCHED_ATTR_SIZE_VER0 || flags)
 +      if (!uattr || pid < 0 || usize > PAGE_SIZE ||
 +          usize < SCHED_ATTR_SIZE_VER0 || flags)
                return -EINVAL;
  
        rcu_read_lock();
        if (retval)
                goto out_unlock;
  
 -      attr.sched_policy = p->policy;
 +      kattr.sched_policy = p->policy;
        if (p->sched_reset_on_fork)
 -              attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
 +              kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
        if (task_has_dl_policy(p))
 -              __getparam_dl(p, &attr);
 +              __getparam_dl(p, &kattr);
        else if (task_has_rt_policy(p))
 -              attr.sched_priority = p->rt_priority;
 +              kattr.sched_priority = p->rt_priority;
        else
 -              attr.sched_nice = task_nice(p);
 +              kattr.sched_nice = task_nice(p);
  
  #ifdef CONFIG_UCLAMP_TASK
 -      attr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
 -      attr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
 +      kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
 +      kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
  #endif
  
        rcu_read_unlock();
  
 -      retval = sched_read_attr(uattr, &attr, size);
 -      return retval;
 +      return sched_attr_copy_to_user(uattr, &kattr, usize);
  
  out_unlock:
        rcu_read_unlock();
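
Since sched_attr_copy_to_user() now copies min(usize, ksize) bytes and reports ksize back through attr->size, tooling built against a newer ABI keeps working on older kernels and can tell how much the kernel actually filled in. A minimal user-space sketch of that probing pattern (the raw syscall(2) call via SYS_sched_getattr, the oversized probe struct and its trailing "future" fields are illustrative assumptions, not part of this diff):

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/syscall.h>

    /* Oversized probe: pretends to be a newer ABI than the kernel knows about. */
    struct sched_attr_probe {
            unsigned int size;
            unsigned int sched_policy;
            unsigned long long sched_flags;
            int sched_nice;
            unsigned int sched_priority;
            unsigned long long sched_runtime;
            unsigned long long sched_deadline;
            unsigned long long sched_period;
            unsigned int sched_util_min;
            unsigned int sched_util_max;
            unsigned int future_fields[8];      /* hypothetical later additions */
    };

    int main(void)
    {
            struct sched_attr_probe attr;

            memset(&attr, 0, sizeof(attr));
            /* usize > ksize is fine: the kernel reports what it filled in attr.size. */
            if (syscall(SYS_sched_getattr, 0, &attr, sizeof(attr), 0) == 0)
                    printf("kernel filled %u bytes of sched_attr\n", attr.size);
            return 0;
    }

On a kernel with the uclamp fields the reported size should be 56, while an older kernel reports 48 (SCHED_ATTR_SIZE_VER0) and simply leaves the extra probe fields untouched.
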
@@@ -5460,7 -5610,7 +5613,7 @@@ SYSCALL_DEFINE0(sched_yield
        return 0;
  }
  
- #ifndef CONFIG_PREEMPT
+ #ifndef CONFIG_PREEMPTION
  int __sched _cond_resched(void)
  {
        if (should_resched(0)) {
@@@ -5477,7 -5627,7 +5630,7 @@@ EXPORT_SYMBOL(_cond_resched)
   * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
   * call schedule, and on return reacquire the lock.
   *
-  * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
+  * This works OK both with and without CONFIG_PREEMPTION. We do strange low-level
   * operations here to prevent schedule() from being called twice (once via
   * spin_unlock(), once by hand).
   */
@@@ -6016,7 -6166,7 +6169,7 @@@ void sched_setnuma(struct task_struct *
        if (queued)
                enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
        if (running)
-               set_curr_task(rq, p);
+               set_next_task(rq, p);
        task_rq_unlock(rq, p, &rf);
  }
  #endif /* CONFIG_NUMA_BALANCING */
@@@ -6056,21 -6206,22 +6209,22 @@@ static void calc_load_migrate(struct r
                atomic_long_add(delta, &calc_load_tasks);
  }
  
- static void put_prev_task_fake(struct rq *rq, struct task_struct *prev)
+ static struct task_struct *__pick_migrate_task(struct rq *rq)
  {
- }
+       const struct sched_class *class;
+       struct task_struct *next;
  
- static const struct sched_class fake_sched_class = {
-       .put_prev_task = put_prev_task_fake,
- };
+       for_each_class(class) {
+               next = class->pick_next_task(rq, NULL, NULL);
+               if (next) {
+                       next->sched_class->put_prev_task(rq, next, NULL);
+                       return next;
+               }
+       }
  
- static struct task_struct fake_task = {
-       /*
-        * Avoid pull_{rt,dl}_task()
-        */
-       .prio = MAX_PRIO + 1,
-       .sched_class = &fake_sched_class,
- };
+       /* The idle class should always have a runnable task */
+       BUG();
+ }
  
  /*
   * Migrate all tasks from the rq, sleeping tasks will be migrated by
@@@ -6113,12 -6264,7 +6267,7 @@@ static void migrate_tasks(struct rq *de
                if (rq->nr_running == 1)
                        break;
  
-               /*
-                * pick_next_task() assumes pinned rq->lock:
-                */
-               next = pick_next_task(rq, &fake_task, rf);
-               BUG_ON(!next);
-               put_prev_task(rq, next);
+               next = __pick_migrate_task(rq);
  
                /*
                 * Rules for changing task_struct::cpus_mask are holding
@@@ -6415,19 -6561,19 +6564,19 @@@ DECLARE_PER_CPU(cpumask_var_t, select_i
  
  void __init sched_init(void)
  {
-       unsigned long alloc_size = 0, ptr;
+       unsigned long ptr = 0;
        int i;
  
        wait_bit_init();
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
-       alloc_size += 2 * nr_cpu_ids * sizeof(void **);
+       ptr += 2 * nr_cpu_ids * sizeof(void **);
  #endif
  #ifdef CONFIG_RT_GROUP_SCHED
-       alloc_size += 2 * nr_cpu_ids * sizeof(void **);
+       ptr += 2 * nr_cpu_ids * sizeof(void **);
  #endif
-       if (alloc_size) {
-               ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
+       if (ptr) {
+               ptr = (unsigned long)kzalloc(ptr, GFP_NOWAIT);
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
                root_task_group.se = (struct sched_entity **)ptr;
@@@ -6746,7 -6892,7 +6895,7 @@@ struct task_struct *curr_task(int cpu
  
  #ifdef CONFIG_IA64
  /**
-  * set_curr_task - set the current task for a given CPU.
+  * ia64_set_curr_task - set the current task for a given CPU.
   * @cpu: the processor in question.
   * @p: the task pointer to set.
   *
@@@ -6771,6 -6917,20 +6920,20 @@@ void ia64_set_curr_task(int cpu, struc
  /* task_group_lock serializes the addition/removal of task groups */
  static DEFINE_SPINLOCK(task_group_lock);
  
+ static inline void alloc_uclamp_sched_group(struct task_group *tg,
+                                           struct task_group *parent)
+ {
+ #ifdef CONFIG_UCLAMP_TASK_GROUP
+       enum uclamp_id clamp_id;
+       for_each_clamp_id(clamp_id) {
+               uclamp_se_set(&tg->uclamp_req[clamp_id],
+                             uclamp_none(clamp_id), false);
+               tg->uclamp[clamp_id] = parent->uclamp[clamp_id];
+       }
+ #endif
+ }
  static void sched_free_group(struct task_group *tg)
  {
        free_fair_sched_group(tg);
@@@ -6794,6 -6954,8 +6957,8 @@@ struct task_group *sched_create_group(s
        if (!alloc_rt_sched_group(tg, parent))
                goto err;
  
+       alloc_uclamp_sched_group(tg, parent);
        return tg;
  
  err:
@@@ -6897,7 -7059,7 +7062,7 @@@ void sched_move_task(struct task_struc
        if (queued)
                enqueue_task(rq, tsk, queue_flags);
        if (running)
-               set_curr_task(rq, tsk);
+               set_next_task(rq, tsk);
  
        task_rq_unlock(rq, tsk, &rf);
  }
@@@ -6980,10 -7142,6 +7145,6 @@@ static int cpu_cgroup_can_attach(struc
  #ifdef CONFIG_RT_GROUP_SCHED
                if (!sched_rt_can_attach(css_tg(css), task))
                        return -EINVAL;
- #else
-               /* We don't support RT-tasks being in separate groups */
-               if (task->sched_class != &fair_sched_class)
-                       return -EINVAL;
  #endif
                /*
                 * Serialize against wake_up_new_task() such that if its
@@@ -7014,6 -7172,178 +7175,178 @@@ static void cpu_cgroup_attach(struct cg
                sched_move_task(task);
  }
  
+ #ifdef CONFIG_UCLAMP_TASK_GROUP
+ static void cpu_util_update_eff(struct cgroup_subsys_state *css)
+ {
+       struct cgroup_subsys_state *top_css = css;
+       struct uclamp_se *uc_parent = NULL;
+       struct uclamp_se *uc_se = NULL;
+       unsigned int eff[UCLAMP_CNT];
+       enum uclamp_id clamp_id;
+       unsigned int clamps;
+       css_for_each_descendant_pre(css, top_css) {
+               uc_parent = css_tg(css)->parent
+                       ? css_tg(css)->parent->uclamp : NULL;
+               for_each_clamp_id(clamp_id) {
+                       /* Assume effective clamps match requested clamps */
+                       eff[clamp_id] = css_tg(css)->uclamp_req[clamp_id].value;
+                       /* Cap effective clamps with parent's effective clamps */
+                       if (uc_parent &&
+                           eff[clamp_id] > uc_parent[clamp_id].value) {
+                               eff[clamp_id] = uc_parent[clamp_id].value;
+                       }
+               }
+               /* Ensure protection is always capped by limit */
+               eff[UCLAMP_MIN] = min(eff[UCLAMP_MIN], eff[UCLAMP_MAX]);
+               /* Propagate most restrictive effective clamps */
+               clamps = 0x0;
+               uc_se = css_tg(css)->uclamp;
+               for_each_clamp_id(clamp_id) {
+                       if (eff[clamp_id] == uc_se[clamp_id].value)
+                               continue;
+                       uc_se[clamp_id].value = eff[clamp_id];
+                       uc_se[clamp_id].bucket_id = uclamp_bucket_id(eff[clamp_id]);
+                       clamps |= (0x1 << clamp_id);
+               }
+               if (!clamps) {
+                       css = css_rightmost_descendant(css);
+                       continue;
+               }
+               /* Immediately update descendants' RUNNABLE tasks */
+               uclamp_update_active_tasks(css, clamps);
+       }
+ }
+ /*
+  * Compute the integer 10^N for a given exponent N by casting the literal "1eN"
+  * C expression to an integer. Since there is no way to convert a macro
+  * argument (N) into a character constant, use two levels of macros.
+  */
+ #define _POW10(exp) ((unsigned int)1e##exp)
+ #define POW10(exp) _POW10(exp)
+ struct uclamp_request {
+ #define UCLAMP_PERCENT_SHIFT  2
+ #define UCLAMP_PERCENT_SCALE  (100 * POW10(UCLAMP_PERCENT_SHIFT))
+       s64 percent;
+       u64 util;
+       int ret;
+ };
+ static inline struct uclamp_request
+ capacity_from_percent(char *buf)
+ {
+       struct uclamp_request req = {
+               .percent = UCLAMP_PERCENT_SCALE,
+               .util = SCHED_CAPACITY_SCALE,
+               .ret = 0,
+       };
+       buf = strim(buf);
+       if (strcmp(buf, "max")) {
+               req.ret = cgroup_parse_float(buf, UCLAMP_PERCENT_SHIFT,
+                                            &req.percent);
+               if (req.ret)
+                       return req;
+               if (req.percent > UCLAMP_PERCENT_SCALE) {
+                       req.ret = -ERANGE;
+                       return req;
+               }
+               req.util = req.percent << SCHED_CAPACITY_SHIFT;
+               req.util = DIV_ROUND_CLOSEST_ULL(req.util, UCLAMP_PERCENT_SCALE);
+       }
+       return req;
+ }
+ static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, char *buf,
+                               size_t nbytes, loff_t off,
+                               enum uclamp_id clamp_id)
+ {
+       struct uclamp_request req;
+       struct task_group *tg;
+       req = capacity_from_percent(buf);
+       if (req.ret)
+               return req.ret;
+       mutex_lock(&uclamp_mutex);
+       rcu_read_lock();
+       tg = css_tg(of_css(of));
+       if (tg->uclamp_req[clamp_id].value != req.util)
+               uclamp_se_set(&tg->uclamp_req[clamp_id], req.util, false);
+       /*
+        * Because the conversion rounding is not recoverable, keep track of the
+        * exact requested value.
+        */
+       tg->uclamp_pct[clamp_id] = req.percent;
+       /* Update effective clamps to track the most restrictive value */
+       cpu_util_update_eff(of_css(of));
+       rcu_read_unlock();
+       mutex_unlock(&uclamp_mutex);
+       return nbytes;
+ }
+ static ssize_t cpu_uclamp_min_write(struct kernfs_open_file *of,
+                                   char *buf, size_t nbytes,
+                                   loff_t off)
+ {
+       return cpu_uclamp_write(of, buf, nbytes, off, UCLAMP_MIN);
+ }
+ static ssize_t cpu_uclamp_max_write(struct kernfs_open_file *of,
+                                   char *buf, size_t nbytes,
+                                   loff_t off)
+ {
+       return cpu_uclamp_write(of, buf, nbytes, off, UCLAMP_MAX);
+ }
+ static inline void cpu_uclamp_print(struct seq_file *sf,
+                                   enum uclamp_id clamp_id)
+ {
+       struct task_group *tg;
+       u64 util_clamp;
+       u64 percent;
+       u32 rem;
+       rcu_read_lock();
+       tg = css_tg(seq_css(sf));
+       util_clamp = tg->uclamp_req[clamp_id].value;
+       rcu_read_unlock();
+       if (util_clamp == SCHED_CAPACITY_SCALE) {
+               seq_puts(sf, "max\n");
+               return;
+       }
+       percent = tg->uclamp_pct[clamp_id];
+       percent = div_u64_rem(percent, POW10(UCLAMP_PERCENT_SHIFT), &rem);
+       seq_printf(sf, "%llu.%0*u\n", percent, UCLAMP_PERCENT_SHIFT, rem);
+ }
+ static int cpu_uclamp_min_show(struct seq_file *sf, void *v)
+ {
+       cpu_uclamp_print(sf, UCLAMP_MIN);
+       return 0;
+ }
+ static int cpu_uclamp_max_show(struct seq_file *sf, void *v)
+ {
+       cpu_uclamp_print(sf, UCLAMP_MAX);
+       return 0;
+ }
+ #endif /* CONFIG_UCLAMP_TASK_GROUP */
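
cpu_util_update_eff() keeps every group's effective clamp at the most restrictive of its own request and its parent's effective value, and then caps the protection (uclamp.min) by the limit (uclamp.max). A standalone sketch of that arithmetic on made-up percentages (the helper and values below are illustrative, not kernel code):

    #include <stdio.h>

    #define UCLAMP_MIN 0
    #define UCLAMP_MAX 1

    static unsigned int min_u(unsigned int a, unsigned int b)
    {
            return a < b ? a : b;
    }

    int main(void)
    {
            /* Made-up clamps, expressed as percentages of CPU capacity. */
            unsigned int parent_eff[2] = { 70, 80 };    /* parent effective min/max */
            unsigned int child_req[2]  = { 60, 50 };    /* child requested min/max  */
            unsigned int eff[2];
            int id;

            /* Cap each requested clamp with the parent's effective clamp. */
            for (id = 0; id < 2; id++)
                    eff[id] = min_u(child_req[id], parent_eff[id]);

            /* The protection (min) can never exceed the limit (max). */
            eff[UCLAMP_MIN] = min_u(eff[UCLAMP_MIN], eff[UCLAMP_MAX]);

            /* Prints: child effective clamps: min 50% max 50% */
            printf("child effective clamps: min %u%% max %u%%\n",
                   eff[UCLAMP_MIN], eff[UCLAMP_MAX]);
            return 0;
    }
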
  #ifdef CONFIG_FAIR_GROUP_SCHED
  static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
                                struct cftype *cftype, u64 shareval)
@@@ -7358,6 -7688,20 +7691,20 @@@ static struct cftype cpu_legacy_files[
                .read_u64 = cpu_rt_period_read_uint,
                .write_u64 = cpu_rt_period_write_uint,
        },
+ #endif
+ #ifdef CONFIG_UCLAMP_TASK_GROUP
+       {
+               .name = "uclamp.min",
+               .flags = CFTYPE_NOT_ON_ROOT,
+               .seq_show = cpu_uclamp_min_show,
+               .write = cpu_uclamp_min_write,
+       },
+       {
+               .name = "uclamp.max",
+               .flags = CFTYPE_NOT_ON_ROOT,
+               .seq_show = cpu_uclamp_max_show,
+               .write = cpu_uclamp_max_write,
+       },
  #endif
        { }     /* Terminate */
  };
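
The strings written to cpu.uclamp.min and cpu.uclamp.max are parsed by capacity_from_percent() with two decimal places (or the special value "max") and scaled to the 1024-wide capacity range, so writing "20" requests 20.00% and becomes a clamp value of 205. A standalone sketch of that conversion (the constants mirror the diff; DIV_ROUND_CLOSEST_ULL() is open-coded here as an assumption):

    #include <stdio.h>

    /* Mirrors UCLAMP_PERCENT_SHIFT == 2 and SCHED_CAPACITY_SCALE == 1024. */
    #define PERCENT_SCALE   (100 * 100)     /* 100.00% with two decimal places */
    #define CAPACITY_SCALE  1024

    static unsigned long long percent_to_util(unsigned long long percent)
    {
            /* Open-coded DIV_ROUND_CLOSEST_ULL(). */
            return (percent * CAPACITY_SCALE + PERCENT_SCALE / 2) / PERCENT_SCALE;
    }

    int main(void)
    {
            /* "20" written to cpu.uclamp.min parses to 2000, i.e. 20.00%. */
            printf("20.00%%  -> clamp %llu\n", percent_to_util(2000));   /* 205  */
            printf("100.00%% -> clamp %llu\n", percent_to_util(10000));  /* 1024 */
            return 0;
    }
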
@@@ -7525,6 -7869,20 +7872,20 @@@ static struct cftype cpu_files[] = 
                .seq_show = cpu_max_show,
                .write = cpu_max_write,
        },
+ #endif
+ #ifdef CONFIG_UCLAMP_TASK_GROUP
+       {
+               .name = "uclamp.min",
+               .flags = CFTYPE_NOT_ON_ROOT,
+               .seq_show = cpu_uclamp_min_show,
+               .write = cpu_uclamp_min_write,
+       },
+       {
+               .name = "uclamp.max",
+               .flags = CFTYPE_NOT_ON_ROOT,
+               .seq_show = cpu_uclamp_max_show,
+               .write = cpu_uclamp_max_write,
+       },
  #endif
        { }     /* terminate */
  };
index 867b4bb6d4beb541d1d9eb087711d1e52a446416,e127d89d5974499bfbcba34e6d0a456c0d9ac6d3..fdce9cfaca05b802c87a232d77903d6731809716
@@@ -40,7 -40,6 +40,7 @@@ struct sugov_policy 
        struct task_struct      *thread;
        bool                    work_in_progress;
  
 +      bool                    limits_changed;
        bool                    need_freq_update;
  };
  
@@@ -90,11 -89,8 +90,11 @@@ static bool sugov_should_update_freq(st
            !cpufreq_this_cpu_can_update(sg_policy->policy))
                return false;
  
 -      if (unlikely(sg_policy->need_freq_update))
 +      if (unlikely(sg_policy->limits_changed)) {
 +              sg_policy->limits_changed = false;
 +              sg_policy->need_freq_update = true;
                return true;
 +      }
  
        delta_ns = time - sg_policy->last_freq_update_time;
  
@@@ -263,9 -259,9 +263,9 @@@ unsigned long schedutil_cpu_util(int cp
         * irq metric. Because IRQ/steal time is hidden from the task clock we
         * need to scale the task numbers:
         *
-        *              1 - irq
-        *   U' = irq + ------- * U
-        *                max
+        *              max - irq
+        *   U' = irq + --------- * U
+        *                 max
         */
        util = scale_irq_capacity(util, irq, max);
        util += irq;
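
The corrected comment now matches what scale_irq_capacity() computes: the rq utilization is scaled by the capacity left over after IRQ/steal time, and the IRQ utilization is then added back. A quick standalone check with made-up numbers (plain C, not kernel code):

    #include <stdio.h>

    int main(void)
    {
            /* Made-up values on a CPU with capacity 1024. */
            unsigned long max = 1024, irq = 256, util = 512;

            /* U' = irq + (max - irq) / max * U, as in the corrected comment. */
            util = util * (max - irq) / max;    /* 512 * 768 / 1024 = 384 */
            util += irq;                        /* 384 + 256 = 640 */
            printf("U' = %lu\n", util);
            return 0;
    }
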
@@@ -441,7 -437,7 +441,7 @@@ static inline bool sugov_cpu_is_busy(st
  static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu, struct sugov_policy *sg_policy)
  {
        if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_dl)
 -              sg_policy->need_freq_update = true;
 +              sg_policy->limits_changed = true;
  }
  
  static void sugov_update_single(struct update_util_data *hook, u64 time,
        if (!sugov_should_update_freq(sg_policy, time))
                return;
  
 -      busy = sugov_cpu_is_busy(sg_cpu);
 +      /* Limits may have changed, so don't skip the frequency update */
 +      busy = !sg_policy->need_freq_update && sugov_cpu_is_busy(sg_cpu);
  
        util = sugov_get_util(sg_cpu);
        max = sg_cpu->max;
@@@ -836,7 -831,6 +836,7 @@@ static int sugov_start(struct cpufreq_p
        sg_policy->last_freq_update_time        = 0;
        sg_policy->next_freq                    = 0;
        sg_policy->work_in_progress             = false;
 +      sg_policy->limits_changed               = false;
        sg_policy->need_freq_update             = false;
        sg_policy->cached_raw_freq              = 0;
  
@@@ -885,7 -879,7 +885,7 @@@ static void sugov_limits(struct cpufreq
                mutex_unlock(&sg_policy->work_lock);
        }
  
 -      sg_policy->need_freq_update = true;
 +      sg_policy->limits_changed = true;
  }
  
  struct cpufreq_governor schedutil_gov = {
diff --combined kernel/sched/deadline.c
index 46122edd8552c9abd7acb3cf665332d91746ed7d,0b9cbfb2b1d4fcfd339c20c6e8ecb29d78444032..39dc9f74f2898f13b56837f8073f49043275a5d2
@@@ -529,6 -529,7 +529,7 @@@ static struct rq *find_lock_later_rq(st
  static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p)
  {
        struct rq *later_rq = NULL;
+       struct dl_bw *dl_b;
  
        later_rq = find_lock_later_rq(p, rq);
        if (!later_rq) {
                double_lock_balance(rq, later_rq);
        }
  
+       if (p->dl.dl_non_contending || p->dl.dl_throttled) {
+               /*
+                * Inactive timer is armed (or callback is running, but
+                * waiting for us to release rq locks). In any case, when it
+                * waiting for us to release rq locks). In any case, when it
+                * fires (or continues), it will see the running_bw of this
+                */
+               sub_running_bw(&p->dl, &rq->dl);
+               sub_rq_bw(&p->dl, &rq->dl);
+               add_rq_bw(&p->dl, &later_rq->dl);
+               add_running_bw(&p->dl, &later_rq->dl);
+       } else {
+               sub_rq_bw(&p->dl, &rq->dl);
+               add_rq_bw(&p->dl, &later_rq->dl);
+       }
+       /*
+        * And we finally need to fix up the root_domain(s) bandwidth accounting,
+        * since p is still hanging out in the old (now moved to default) root
+        * domain.
+        */
+       dl_b = &rq->rd->dl_bw;
+       raw_spin_lock(&dl_b->lock);
+       __dl_sub(dl_b, p->dl.dl_bw, cpumask_weight(rq->rd->span));
+       raw_spin_unlock(&dl_b->lock);
+       dl_b = &later_rq->rd->dl_bw;
+       raw_spin_lock(&dl_b->lock);
+       __dl_add(dl_b, p->dl.dl_bw, cpumask_weight(later_rq->rd->span));
+       raw_spin_unlock(&dl_b->lock);
        set_task_cpu(p, later_rq->cpu);
        double_unlock_balance(later_rq, rq);
  
@@@ -1694,12 -1727,20 +1727,20 @@@ static void start_hrtick_dl(struct rq *
  }
  #endif
  
- static inline void set_next_task(struct rq *rq, struct task_struct *p)
+ static void set_next_task_dl(struct rq *rq, struct task_struct *p)
  {
        p->se.exec_start = rq_clock_task(rq);
  
        /* You can't push away the running task */
        dequeue_pushable_dl_task(rq, p);
+       if (hrtick_enabled(rq))
+               start_hrtick_dl(rq, p);
+       if (rq->curr->sched_class != &dl_sched_class)
+               update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0);
+       deadline_queue_push_tasks(rq);
  }
  
  static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
@@@ -1720,64 -1761,42 +1761,42 @@@ pick_next_task_dl(struct rq *rq, struc
        struct task_struct *p;
        struct dl_rq *dl_rq;
  
-       dl_rq = &rq->dl;
-       if (need_pull_dl_task(rq, prev)) {
-               /*
-                * This is OK, because current is on_cpu, which avoids it being
-                * picked for load-balance and preemption/IRQs are still
-                * disabled avoiding further scheduler activity on it and we're
-                * being very careful to re-start the picking loop.
-                */
-               rq_unpin_lock(rq, rf);
-               pull_dl_task(rq);
-               rq_repin_lock(rq, rf);
-               /*
-                * pull_dl_task() can drop (and re-acquire) rq->lock; this
-                * means a stop task can slip in, in which case we need to
-                * re-start task selection.
-                */
-               if (rq->stop && task_on_rq_queued(rq->stop))
-                       return RETRY_TASK;
-       }
+       WARN_ON_ONCE(prev || rf);
  
-       /*
-        * When prev is DL, we may throttle it in put_prev_task().
-        * So, we update time before we check for dl_nr_running.
-        */
-       if (prev->sched_class == &dl_sched_class)
-               update_curr_dl(rq);
+       dl_rq = &rq->dl;
  
        if (unlikely(!dl_rq->dl_nr_running))
                return NULL;
  
-       put_prev_task(rq, prev);
        dl_se = pick_next_dl_entity(rq, dl_rq);
        BUG_ON(!dl_se);
  
        p = dl_task_of(dl_se);
  
-       set_next_task(rq, p);
-       if (hrtick_enabled(rq))
-               start_hrtick_dl(rq, p);
-       deadline_queue_push_tasks(rq);
-       if (rq->curr->sched_class != &dl_sched_class)
-               update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0);
+       set_next_task_dl(rq, p);
  
        return p;
  }
  
- static void put_prev_task_dl(struct rq *rq, struct task_struct *p)
+ static void put_prev_task_dl(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
  {
        update_curr_dl(rq);
  
        update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 1);
        if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1)
                enqueue_pushable_dl_task(rq, p);
+       if (rf && !on_dl_rq(&p->dl) && need_pull_dl_task(rq, p)) {
+               /*
+                * This is OK, because current is on_cpu, which avoids it being
+                * picked for load-balance and preemption/IRQs are still
+                * disabled avoiding further scheduler activity on it and we've
+                * not yet started the picking loop.
+                */
+               rq_unpin_lock(rq, rf);
+               pull_dl_task(rq);
+               rq_repin_lock(rq, rf);
+       }
  }
  
  /*
@@@ -1811,11 -1830,6 +1830,6 @@@ static void task_fork_dl(struct task_st
         */
  }
  
- static void set_curr_task_dl(struct rq *rq)
- {
-       set_next_task(rq, rq->curr);
- }
  #ifdef CONFIG_SMP
  
  /* Only try algorithms three times */
@@@ -2088,13 -2102,17 +2102,13 @@@ retry
        }
  
        deactivate_task(rq, next_task, 0);
 -      sub_running_bw(&next_task->dl, &rq->dl);
 -      sub_rq_bw(&next_task->dl, &rq->dl);
        set_task_cpu(next_task, later_rq->cpu);
 -      add_rq_bw(&next_task->dl, &later_rq->dl);
  
        /*
         * Update the later_rq clock here, because the clock is used
         * by the cpufreq_update_util() inside __add_running_bw().
         */
        update_rq_clock(later_rq);
 -      add_running_bw(&next_task->dl, &later_rq->dl);
        activate_task(later_rq, next_task, ENQUEUE_NOCLOCK);
        ret = 1;
  
@@@ -2182,7 -2200,11 +2196,7 @@@ static void pull_dl_task(struct rq *thi
                        resched = true;
  
                        deactivate_task(src_rq, p, 0);
 -                      sub_running_bw(&p->dl, &src_rq->dl);
 -                      sub_rq_bw(&p->dl, &src_rq->dl);
                        set_task_cpu(p, this_cpu);
 -                      add_rq_bw(&p->dl, &this_rq->dl);
 -                      add_running_bw(&p->dl, &this_rq->dl);
                        activate_task(this_rq, p, 0);
                        dmin = p->dl.deadline;
  
@@@ -2275,6 -2297,36 +2289,36 @@@ void __init init_sched_dl_class(void
                                        GFP_KERNEL, cpu_to_node(i));
  }
  
+ void dl_add_task_root_domain(struct task_struct *p)
+ {
+       struct rq_flags rf;
+       struct rq *rq;
+       struct dl_bw *dl_b;
+       rq = task_rq_lock(p, &rf);
+       if (!dl_task(p))
+               goto unlock;
+       dl_b = &rq->rd->dl_bw;
+       raw_spin_lock(&dl_b->lock);
+       __dl_add(dl_b, p->dl.dl_bw, cpumask_weight(rq->rd->span));
+       raw_spin_unlock(&dl_b->lock);
+ unlock:
+       task_rq_unlock(rq, p, &rf);
+ }
+ void dl_clear_root_domain(struct root_domain *rd)
+ {
+       unsigned long flags;
+       raw_spin_lock_irqsave(&rd->dl_bw.lock, flags);
+       rd->dl_bw.total_bw = 0;
+       raw_spin_unlock_irqrestore(&rd->dl_bw.lock, flags);
+ }
  #endif /* CONFIG_SMP */
  
  static void switched_from_dl(struct rq *rq, struct task_struct *p)
@@@ -2395,6 -2447,7 +2439,7 @@@ const struct sched_class dl_sched_clas
  
        .pick_next_task         = pick_next_task_dl,
        .put_prev_task          = put_prev_task_dl,
+       .set_next_task          = set_next_task_dl,
  
  #ifdef CONFIG_SMP
        .select_task_rq         = select_task_rq_dl,
        .task_woken             = task_woken_dl,
  #endif
  
-       .set_curr_task          = set_curr_task_dl,
        .task_tick              = task_tick_dl,
        .task_fork              = task_fork_dl,
  
diff --combined kernel/sched/fair.c
index 500f5db0de0ba86a331586d4189e3b299cb6148e,1f0a5e1a90faf36d18277401df7d321c1dbfdb26..d4bbf68c31611fcd6fa3da456ef435021cefae53
@@@ -96,12 -96,12 +96,12 @@@ int __weak arch_asym_cpu_priority(int c
  }
  
  /*
-  * The margin used when comparing utilization with CPU capacity:
-  * util * margin < capacity * 1024
+  * The margin used when comparing utilization with CPU capacity.
   *
   * (default: ~20%)
   */
- static unsigned int capacity_margin                   = 1280;
+ #define fits_capacity(cap, max)       ((cap) * 1280 < (max) * 1024)
  #endif
  
  #ifdef CONFIG_CFS_BANDWIDTH
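
fits_capacity() folds the old capacity_margin comparisons into one helper: a utilization fits a capacity only if it leaves roughly 20% headroom, since cap * 1280 < max * 1024 is equivalent to cap < 0.8 * max. A standalone sanity check of the boundary (the macro is copied here purely for illustration):

    #include <assert.h>

    #define fits_capacity(cap, max) ((cap) * 1280 < (max) * 1024)

    int main(void)
    {
            assert(fits_capacity(819, 1024));   /* 819 * 1280 = 1048320 < 1048576 */
            assert(!fits_capacity(820, 1024));  /* 820 * 1280 = 1049600 > 1048576 */
            return 0;
    }
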
@@@ -1188,47 -1188,6 +1188,6 @@@ static unsigned int task_scan_max(struc
        return max(smin, smax);
  }
  
- void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
- {
-       int mm_users = 0;
-       struct mm_struct *mm = p->mm;
-       if (mm) {
-               mm_users = atomic_read(&mm->mm_users);
-               if (mm_users == 1) {
-                       mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
-                       mm->numa_scan_seq = 0;
-               }
-       }
-       p->node_stamp                   = 0;
-       p->numa_scan_seq                = mm ? mm->numa_scan_seq : 0;
-       p->numa_scan_period             = sysctl_numa_balancing_scan_delay;
-       p->numa_work.next               = &p->numa_work;
-       p->numa_faults                  = NULL;
-       RCU_INIT_POINTER(p->numa_group, NULL);
-       p->last_task_numa_placement     = 0;
-       p->last_sum_exec_runtime        = 0;
-       /* New address space, reset the preferred nid */
-       if (!(clone_flags & CLONE_VM)) {
-               p->numa_preferred_nid = NUMA_NO_NODE;
-               return;
-       }
-       /*
-        * New thread, keep existing numa_preferred_nid which should be copied
-        * already by arch_dup_task_struct but stagger when scans start.
-        */
-       if (mm) {
-               unsigned int delay;
-               delay = min_t(unsigned int, task_scan_max(current),
-                       current->numa_scan_period * mm_users * NSEC_PER_MSEC);
-               delay += 2 * TICK_NSEC;
-               p->node_stamp = delay;
-       }
- }
  static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
  {
        rq->nr_numa_running += (p->numa_preferred_nid != NUMA_NO_NODE);
@@@ -2523,7 -2482,7 +2482,7 @@@ static void reset_ptenuma_scan(struct t
   * The expensive part of numa migration is done from task_work context.
   * Triggered from task_tick_numa().
   */
- void task_numa_work(struct callback_head *work)
+ static void task_numa_work(struct callback_head *work)
  {
        unsigned long migrate, next_scan, now = jiffies;
        struct task_struct *p = current;
  
        SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work));
  
-       work->next = work; /* protect against double add */
+       work->next = work;
        /*
         * Who cares about NUMA placement when they're dying.
         *
        }
  }
  
+ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
+ {
+       int mm_users = 0;
+       struct mm_struct *mm = p->mm;
+       if (mm) {
+               mm_users = atomic_read(&mm->mm_users);
+               if (mm_users == 1) {
+                       mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
+                       mm->numa_scan_seq = 0;
+               }
+       }
+       p->node_stamp                   = 0;
+       p->numa_scan_seq                = mm ? mm->numa_scan_seq : 0;
+       p->numa_scan_period             = sysctl_numa_balancing_scan_delay;
+       /* Protect against double add, see task_tick_numa and task_numa_work */
+       p->numa_work.next               = &p->numa_work;
+       p->numa_faults                  = NULL;
+       RCU_INIT_POINTER(p->numa_group, NULL);
+       p->last_task_numa_placement     = 0;
+       p->last_sum_exec_runtime        = 0;
+       init_task_work(&p->numa_work, task_numa_work);
+       /* New address space, reset the preferred nid */
+       if (!(clone_flags & CLONE_VM)) {
+               p->numa_preferred_nid = NUMA_NO_NODE;
+               return;
+       }
+       /*
+        * New thread, keep existing numa_preferred_nid which should be copied
+        * already by arch_dup_task_struct but stagger when scans start.
+        */
+       if (mm) {
+               unsigned int delay;
+               delay = min_t(unsigned int, task_scan_max(current),
+                       current->numa_scan_period * mm_users * NSEC_PER_MSEC);
+               delay += 2 * TICK_NSEC;
+               p->node_stamp = delay;
+       }
+ }
  /*
   * Drive the periodic memory faults..
   */
@@@ -2693,10 -2696,8 +2696,8 @@@ static void task_tick_numa(struct rq *r
                        curr->numa_scan_period = task_scan_start(curr);
                curr->node_stamp += period;
  
-               if (!time_before(jiffies, curr->mm->numa_next_scan)) {
-                       init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
+               if (!time_before(jiffies, curr->mm->numa_next_scan))
                        task_work_add(curr, work, true);
-               }
        }
  }
  
@@@ -3689,8 -3690,6 +3690,6 @@@ static inline unsigned long cfs_rq_load
        return cfs_rq->avg.load_avg;
  }
  
- static int idle_balance(struct rq *this_rq, struct rq_flags *rf);
  static inline unsigned long task_util(struct task_struct *p)
  {
        return READ_ONCE(p->se.avg.util_avg);
@@@ -3807,7 -3806,7 +3806,7 @@@ util_est_dequeue(struct cfs_rq *cfs_rq
  
  static inline int task_fits_capacity(struct task_struct *p, long capacity)
  {
-       return capacity * 1024 > task_util_est(p) * capacity_margin;
+       return fits_capacity(task_util_est(p), capacity);
  }
  
  static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
@@@ -4370,8 -4369,6 +4369,6 @@@ void __refill_cfs_bandwidth_runtime(str
  
        now = sched_clock_cpu(smp_processor_id());
        cfs_b->runtime = cfs_b->quota;
-       cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
-       cfs_b->expires_seq++;
  }
  
  static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
@@@ -4393,8 -4390,7 +4390,7 @@@ static int assign_cfs_rq_runtime(struc
  {
        struct task_group *tg = cfs_rq->tg;
        struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
-       u64 amount = 0, min_amount, expires;
-       int expires_seq;
+       u64 amount = 0, min_amount;
  
        /* note: this is a positive sum as runtime_remaining <= 0 */
        min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
                        cfs_b->idle = 0;
                }
        }
-       expires_seq = cfs_b->expires_seq;
-       expires = cfs_b->runtime_expires;
        raw_spin_unlock(&cfs_b->lock);
  
        cfs_rq->runtime_remaining += amount;
-       /*
-        * we may have advanced our local expiration to account for allowed
-        * spread between our sched_clock and the one on which runtime was
-        * issued.
-        */
-       if (cfs_rq->expires_seq != expires_seq) {
-               cfs_rq->expires_seq = expires_seq;
-               cfs_rq->runtime_expires = expires;
-       }
  
        return cfs_rq->runtime_remaining > 0;
  }
  
- /*
-  * Note: This depends on the synchronization provided by sched_clock and the
-  * fact that rq->clock snapshots this value.
-  */
- static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
- {
-       struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
-       /* if the deadline is ahead of our clock, nothing to do */
-       if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0))
-               return;
-       if (cfs_rq->runtime_remaining < 0)
-               return;
-       /*
-        * If the local deadline has passed we have to consider the
-        * possibility that our sched_clock is 'fast' and the global deadline
-        * has not truly expired.
-        *
-        * Fortunately we can check determine whether this the case by checking
-        * whether the global deadline(cfs_b->expires_seq) has advanced.
-        */
-       if (cfs_rq->expires_seq == cfs_b->expires_seq) {
-               /* extend local deadline, drift is bounded above by 2 ticks */
-               cfs_rq->runtime_expires += TICK_NSEC;
-       } else {
-               /* global deadline is ahead, expiration has passed */
-               cfs_rq->runtime_remaining = 0;
-       }
- }
  static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
  {
        /* dock delta_exec before expiring quota (as it could span periods) */
        cfs_rq->runtime_remaining -= delta_exec;
-       expire_cfs_rq_runtime(cfs_rq);
  
        if (likely(cfs_rq->runtime_remaining > 0))
                return;
  
 +      if (cfs_rq->throttled)
 +              return;
        /*
         * if we're unable to extend our runtime we resched so that the active
         * hierarchy can be throttled
@@@ -4556,7 -4506,7 +4508,7 @@@ static void throttle_cfs_rq(struct cfs_
        struct rq *rq = rq_of(cfs_rq);
        struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
        struct sched_entity *se;
-       long task_delta, dequeue = 1;
+       long task_delta, idle_task_delta, dequeue = 1;
        bool empty;
  
        se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
        rcu_read_unlock();
  
        task_delta = cfs_rq->h_nr_running;
+       idle_task_delta = cfs_rq->idle_h_nr_running;
        for_each_sched_entity(se) {
                struct cfs_rq *qcfs_rq = cfs_rq_of(se);
                /* throttled entity or throttle-on-deactivate */
                if (dequeue)
                        dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
                qcfs_rq->h_nr_running -= task_delta;
+               qcfs_rq->idle_h_nr_running -= idle_task_delta;
  
                if (qcfs_rq->load.weight)
                        dequeue = 0;
@@@ -4615,7 -4567,7 +4569,7 @@@ void unthrottle_cfs_rq(struct cfs_rq *c
        struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
        struct sched_entity *se;
        int enqueue = 1;
-       long task_delta;
+       long task_delta, idle_task_delta;
  
        se = cfs_rq->tg->se[cpu_of(rq)];
  
                return;
  
        task_delta = cfs_rq->h_nr_running;
+       idle_task_delta = cfs_rq->idle_h_nr_running;
        for_each_sched_entity(se) {
                if (se->on_rq)
                        enqueue = 0;
                if (enqueue)
                        enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
                cfs_rq->h_nr_running += task_delta;
+               cfs_rq->idle_h_nr_running += idle_task_delta;
  
                if (cfs_rq_throttled(cfs_rq))
                        break;
                resched_curr(rq);
  }
  
- static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
-               u64 remaining, u64 expires)
+ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, u64 remaining)
  {
        struct cfs_rq *cfs_rq;
        u64 runtime;
                if (!cfs_rq_throttled(cfs_rq))
                        goto next;
  
 +              /* By the above check, this should never be true */
 +              SCHED_WARN_ON(cfs_rq->runtime_remaining > 0);
 +
                runtime = -cfs_rq->runtime_remaining + 1;
                if (runtime > remaining)
                        runtime = remaining;
                remaining -= runtime;
  
                cfs_rq->runtime_remaining += runtime;
-               cfs_rq->runtime_expires = expires;
  
                /* we check whether we're throttled above */
                if (cfs_rq->runtime_remaining > 0)
@@@ -4709,7 -4658,7 +4663,7 @@@ next
   */
  static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, unsigned long flags)
  {
-       u64 runtime, runtime_expires;
+       u64 runtime;
        int throttled;
  
        /* no need to continue the timer with no bandwidth constraint */
        /* account preceding periods in which throttling occurred */
        cfs_b->nr_throttled += overrun;
  
-       runtime_expires = cfs_b->runtime_expires;
        /*
         * This check is repeated as we are holding onto the new bandwidth while
         * we unthrottle. This can potentially race with an unthrottled group
                cfs_b->distribute_running = 1;
                raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
                /* we can't nest cfs_b->lock while distributing bandwidth */
-               runtime = distribute_cfs_runtime(cfs_b, runtime,
-                                                runtime_expires);
+               runtime = distribute_cfs_runtime(cfs_b, runtime);
                raw_spin_lock_irqsave(&cfs_b->lock, flags);
  
                cfs_b->distribute_running = 0;
@@@ -4834,8 -4780,7 +4785,7 @@@ static void __return_cfs_rq_runtime(str
                return;
  
        raw_spin_lock(&cfs_b->lock);
-       if (cfs_b->quota != RUNTIME_INF &&
-           cfs_rq->runtime_expires == cfs_b->runtime_expires) {
+       if (cfs_b->quota != RUNTIME_INF) {
                cfs_b->runtime += slack_runtime;
  
                /* we are under rq->lock, defer unthrottling using a timer */
@@@ -4868,7 -4813,6 +4818,6 @@@ static void do_sched_cfs_slack_timer(st
  {
        u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
        unsigned long flags;
-       u64 expires;
  
        /* confirm we're still not at a refresh boundary */
        raw_spin_lock_irqsave(&cfs_b->lock, flags);
        if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
                runtime = cfs_b->runtime;
  
-       expires = cfs_b->runtime_expires;
        if (runtime)
                cfs_b->distribute_running = 1;
  
        if (!runtime)
                return;
  
-       runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
+       runtime = distribute_cfs_runtime(cfs_b, runtime);
  
        raw_spin_lock_irqsave(&cfs_b->lock, flags);
-       if (expires == cfs_b->runtime_expires)
-               lsub_positive(&cfs_b->runtime, runtime);
+       lsub_positive(&cfs_b->runtime, runtime);
        cfs_b->distribute_running = 0;
        raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
  }
@@@ -5056,8 -4998,6 +5003,6 @@@ void start_cfs_bandwidth(struct cfs_ban
  
        cfs_b->period_active = 1;
        overrun = hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
-       cfs_b->runtime_expires += (overrun + 1) * ktime_to_ns(cfs_b->period);
-       cfs_b->expires_seq++;
        hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
  }
  
@@@ -5235,7 -5175,7 +5180,7 @@@ static inline unsigned long cpu_util(in
  
  static inline bool cpu_overutilized(int cpu)
  {
-       return (capacity_of(cpu) * 1024) < (cpu_util(cpu) * capacity_margin);
+       return !fits_capacity(cpu_util(cpu), capacity_of(cpu));
  }
  
  static inline void update_overutilized_status(struct rq *rq)
@@@ -5259,6 -5199,7 +5204,7 @@@ enqueue_task_fair(struct rq *rq, struc
  {
        struct cfs_rq *cfs_rq;
        struct sched_entity *se = &p->se;
+       int idle_h_nr_running = task_has_idle_policy(p);
  
        /*
         * The code below (indirectly) updates schedutil which looks at
                if (cfs_rq_throttled(cfs_rq))
                        break;
                cfs_rq->h_nr_running++;
+               cfs_rq->idle_h_nr_running += idle_h_nr_running;
  
                flags = ENQUEUE_WAKEUP;
        }
        for_each_sched_entity(se) {
                cfs_rq = cfs_rq_of(se);
                cfs_rq->h_nr_running++;
+               cfs_rq->idle_h_nr_running += idle_h_nr_running;
  
                if (cfs_rq_throttled(cfs_rq))
                        break;
@@@ -5359,6 -5302,7 +5307,7 @@@ static void dequeue_task_fair(struct r
        struct cfs_rq *cfs_rq;
        struct sched_entity *se = &p->se;
        int task_sleep = flags & DEQUEUE_SLEEP;
+       int idle_h_nr_running = task_has_idle_policy(p);
  
        for_each_sched_entity(se) {
                cfs_rq = cfs_rq_of(se);
                if (cfs_rq_throttled(cfs_rq))
                        break;
                cfs_rq->h_nr_running--;
+               cfs_rq->idle_h_nr_running -= idle_h_nr_running;
  
                /* Don't dequeue parent if it has other entities besides us */
                if (cfs_rq->load.weight) {
        for_each_sched_entity(se) {
                cfs_rq = cfs_rq_of(se);
                cfs_rq->h_nr_running--;
+               cfs_rq->idle_h_nr_running -= idle_h_nr_running;
  
                if (cfs_rq_throttled(cfs_rq))
                        break;
@@@ -5425,6 -5371,15 +5376,15 @@@ static struct 
  
  #endif /* CONFIG_NO_HZ_COMMON */
  
+ /* CPU only has SCHED_IDLE tasks enqueued */
+ static int sched_idle_cpu(int cpu)
+ {
+       struct rq *rq = cpu_rq(cpu);
+       return unlikely(rq->nr_running == rq->cfs.idle_h_nr_running &&
+                       rq->nr_running);
+ }
  static unsigned long cpu_runnable_load(struct rq *rq)
  {
        return cfs_rq_runnable_load_avg(&rq->cfs);
@@@ -5747,7 -5702,7 +5707,7 @@@ find_idlest_group_cpu(struct sched_grou
        unsigned int min_exit_latency = UINT_MAX;
        u64 latest_idle_timestamp = 0;
        int least_loaded_cpu = this_cpu;
-       int shallowest_idle_cpu = -1;
+       int shallowest_idle_cpu = -1, si_cpu = -1;
        int i;
  
        /* Check if we have any choice: */
                                latest_idle_timestamp = rq->idle_stamp;
                                shallowest_idle_cpu = i;
                        }
-               } else if (shallowest_idle_cpu == -1) {
+               } else if (shallowest_idle_cpu == -1 && si_cpu == -1) {
+                       if (sched_idle_cpu(i)) {
+                               si_cpu = i;
+                               continue;
+                       }
                        load = cpu_runnable_load(cpu_rq(i));
                        if (load < min_load) {
                                min_load = load;
                }
        }
  
-       return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
+       if (shallowest_idle_cpu != -1)
+               return shallowest_idle_cpu;
+       if (si_cpu != -1)
+               return si_cpu;
+       return least_loaded_cpu;
  }
  
  static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p,
@@@ -5940,7 -5904,7 +5909,7 @@@ static int select_idle_core(struct task
   */
  static int select_idle_smt(struct task_struct *p, int target)
  {
-       int cpu;
+       int cpu, si_cpu = -1;
  
        if (!static_branch_likely(&sched_smt_present))
                return -1;
                        continue;
                if (available_idle_cpu(cpu))
                        return cpu;
+               if (si_cpu == -1 && sched_idle_cpu(cpu))
+                       si_cpu = cpu;
        }
  
-       return -1;
+       return si_cpu;
  }
  
  #else /* CONFIG_SCHED_SMT */
@@@ -5980,8 -5946,8 +5951,8 @@@ static int select_idle_cpu(struct task_
        u64 avg_cost, avg_idle;
        u64 time, cost;
        s64 delta;
-       int cpu, nr = INT_MAX;
        int this = smp_processor_id();
+       int cpu, nr = INT_MAX, si_cpu = -1;
  
        this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
        if (!this_sd)
  
        for_each_cpu_wrap(cpu, sched_domain_span(sd), target) {
                if (!--nr)
-                       return -1;
+                       return si_cpu;
                if (!cpumask_test_cpu(cpu, p->cpus_ptr))
                        continue;
                if (available_idle_cpu(cpu))
                        break;
+               if (si_cpu == -1 && sched_idle_cpu(cpu))
+                       si_cpu = cpu;
        }
  
        time = cpu_clock(this) - time;
@@@ -6032,13 -6000,14 +6005,14 @@@ static int select_idle_sibling(struct t
        struct sched_domain *sd;
        int i, recent_used_cpu;
  
-       if (available_idle_cpu(target))
+       if (available_idle_cpu(target) || sched_idle_cpu(target))
                return target;
  
        /*
         * If the previous CPU is cache affine and idle, don't be stupid:
         */
-       if (prev != target && cpus_share_cache(prev, target) && available_idle_cpu(prev))
+       if (prev != target && cpus_share_cache(prev, target) &&
+           (available_idle_cpu(prev) || sched_idle_cpu(prev)))
                return prev;
  
        /* Check a recently used CPU as a potential idle candidate: */
        if (recent_used_cpu != prev &&
            recent_used_cpu != target &&
            cpus_share_cache(recent_used_cpu, target) &&
-           available_idle_cpu(recent_used_cpu) &&
+           (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
            cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr)) {
                /*
                 * Replace recent_used_cpu with prev as it is a potential
@@@ -6282,69 -6251,55 +6256,55 @@@ static unsigned long cpu_util_next(int 
  }
  
  /*
-  * compute_energy(): Estimates the energy that would be consumed if @p was
+  * compute_energy(): Estimates the energy that @pd would consume if @p was
   * migrated to @dst_cpu. compute_energy() predicts what will be the utilization
-  * landscape of the * CPUs after the task migration, and uses the Energy Model
+  * landscape of @pd's CPUs after the task migration, and uses the Energy Model
   * to compute what would be the energy if we decided to actually migrate that
   * task.
   */
  static long
  compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
  {
-       unsigned int max_util, util_cfs, cpu_util, cpu_cap;
-       unsigned long sum_util, energy = 0;
-       struct task_struct *tsk;
+       struct cpumask *pd_mask = perf_domain_span(pd);
+       unsigned long cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask));
+       unsigned long max_util = 0, sum_util = 0;
        int cpu;
  
-       for (; pd; pd = pd->next) {
-               struct cpumask *pd_mask = perf_domain_span(pd);
+       /*
+        * The capacity state of CPUs of the current rd can be driven by CPUs
+        * of another rd if they belong to the same pd. So, account for the
+        * utilization of these CPUs too by masking pd with cpu_online_mask
+        * instead of the rd span.
+        *
+        * If an entire pd is outside of the current rd, it will not appear in
+        * its pd list and will not be accounted by compute_energy().
+        */
+       for_each_cpu_and(cpu, pd_mask, cpu_online_mask) {
+               unsigned long cpu_util, util_cfs = cpu_util_next(cpu, p, dst_cpu);
+               struct task_struct *tsk = cpu == dst_cpu ? p : NULL;
  
                /*
-                * The energy model mandates all the CPUs of a performance
-                * domain have the same capacity.
+                * Busy time computation: utilization clamping is not
+                * required since the ratio (sum_util / cpu_capacity)
+                * is already enough to scale the EM reported power
+                * consumption at the (eventually clamped) cpu_capacity.
                 */
-               cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask));
-               max_util = sum_util = 0;
+               sum_util += schedutil_cpu_util(cpu, util_cfs, cpu_cap,
+                                              ENERGY_UTIL, NULL);
  
                /*
-                * The capacity state of CPUs of the current rd can be driven by
-                * CPUs of another rd if they belong to the same performance
-                * domain. So, account for the utilization of these CPUs too
-                * by masking pd with cpu_online_mask instead of the rd span.
-                *
-                * If an entire performance domain is outside of the current rd,
-                * it will not appear in its pd list and will not be accounted
-                * by compute_energy().
+                * Performance domain frequency: utilization clamping
+                * must be considered since it affects the selection
+                * of the performance domain frequency.
+                * NOTE: in case RT tasks are running, by default the
+                * FREQUENCY_UTIL's utilization can be max OPP.
                 */
-               for_each_cpu_and(cpu, pd_mask, cpu_online_mask) {
-                       util_cfs = cpu_util_next(cpu, p, dst_cpu);
-                       /*
-                        * Busy time computation: utilization clamping is not
-                        * required since the ratio (sum_util / cpu_capacity)
-                        * is already enough to scale the EM reported power
-                        * consumption at the (eventually clamped) cpu_capacity.
-                        */
-                       sum_util += schedutil_cpu_util(cpu, util_cfs, cpu_cap,
-                                                      ENERGY_UTIL, NULL);
-                       /*
-                        * Performance domain frequency: utilization clamping
-                        * must be considered since it affects the selection
-                        * of the performance domain frequency.
-                        * NOTE: in case RT tasks are running, by default the
-                        * FREQUENCY_UTIL's utilization can be max OPP.
-                        */
-                       tsk = cpu == dst_cpu ? p : NULL;
-                       cpu_util = schedutil_cpu_util(cpu, util_cfs, cpu_cap,
-                                                     FREQUENCY_UTIL, tsk);
-                       max_util = max(max_util, cpu_util);
-               }
-               energy += em_pd_energy(pd->em_pd, max_util, sum_util);
+               cpu_util = schedutil_cpu_util(cpu, util_cfs, cpu_cap,
+                                             FREQUENCY_UTIL, tsk);
+               max_util = max(max_util, cpu_util);
        }
  
-       return energy;
+       return em_pd_energy(pd->em_pd, max_util, sum_util);
  }
  
  /*
   * other use-cases too. So, until someone finds a better way to solve this,
   * let's keep things simple by re-using the existing slow path.
   */
  static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
  {
-       unsigned long prev_energy = ULONG_MAX, best_energy = ULONG_MAX;
+       unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX;
        struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
+       unsigned long cpu_cap, util, base_energy = 0;
        int cpu, best_energy_cpu = prev_cpu;
-       struct perf_domain *head, *pd;
-       unsigned long cpu_cap, util;
        struct sched_domain *sd;
+       struct perf_domain *pd;
  
        rcu_read_lock();
        pd = rcu_dereference(rd->pd);
        if (!pd || READ_ONCE(rd->overutilized))
                goto fail;
-       head = pd;
  
        /*
         * Energy-aware wake-up happens on the lowest sched_domain starting
                goto unlock;
  
        for (; pd; pd = pd->next) {
-               unsigned long cur_energy, spare_cap, max_spare_cap = 0;
+               unsigned long cur_delta, spare_cap, max_spare_cap = 0;
+               unsigned long base_energy_pd;
                int max_spare_cap_cpu = -1;
  
+               /* Compute the 'base' energy of the pd, without @p */
+               base_energy_pd = compute_energy(p, -1, pd);
+               base_energy += base_energy_pd;
                for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) {
                        if (!cpumask_test_cpu(cpu, p->cpus_ptr))
                                continue;
                        /* Skip CPUs that will be overutilized. */
                        util = cpu_util_next(cpu, p, cpu);
                        cpu_cap = capacity_of(cpu);
-                       if (cpu_cap * 1024 < util * capacity_margin)
+                       if (!fits_capacity(util, cpu_cap))
                                continue;
  
                        /* Always use prev_cpu as a candidate. */
                        if (cpu == prev_cpu) {
-                               prev_energy = compute_energy(p, prev_cpu, head);
-                               best_energy = min(best_energy, prev_energy);
-                               continue;
+                               prev_delta = compute_energy(p, prev_cpu, pd);
+                               prev_delta -= base_energy_pd;
+                               best_delta = min(best_delta, prev_delta);
                        }
  
                        /*
  
                /* Evaluate the energy impact of using this CPU. */
                if (max_spare_cap_cpu >= 0) {
-                       cur_energy = compute_energy(p, max_spare_cap_cpu, head);
-                       if (cur_energy < best_energy) {
-                               best_energy = cur_energy;
+                       cur_delta = compute_energy(p, max_spare_cap_cpu, pd);
+                       cur_delta -= base_energy_pd;
+                       if (cur_delta < best_delta) {
+                               best_delta = cur_delta;
                                best_energy_cpu = max_spare_cap_cpu;
                        }
                }
@@@ -6464,10 -6423,10 +6428,10 @@@ unlock
         * Pick the best CPU if prev_cpu cannot be used, or if it saves at
         * least 6% of the energy used by prev_cpu.
         */
-       if (prev_energy == ULONG_MAX)
+       if (prev_delta == ULONG_MAX)
                return best_energy_cpu;
  
-       if ((prev_energy - best_energy) > (prev_energy >> 4))
+       if ((prev_delta - best_delta) > ((prev_delta + base_energy) >> 4))
                return best_energy_cpu;
  
        return prev_cpu;
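
With the per-performance-domain base energies, the final test compares energy deltas: the best candidate is picked only if it saves more than 1/16 (about 6%) of the total estimated energy with the task left on prev_cpu, i.e. prev_delta + base_energy. A worked example with made-up Energy Model numbers (abstract units, illustrative only):

    #include <stdio.h>

    int main(void)
    {
            /* Made-up Energy Model values, all in the same abstract unit. */
            unsigned long base_energy = 1000;   /* pd energies without the task       */
            unsigned long prev_delta  = 100;    /* extra energy if it stays on prev   */
            unsigned long best_delta  = 20;     /* extra energy on the best candidate */

            /* Saving of 80 vs threshold (100 + 1000) >> 4 == 68: worth migrating. */
            if ((prev_delta - best_delta) > ((prev_delta + base_energy) >> 4))
                    printf("pick the energy-efficient candidate\n");
            else
                    printf("stay on prev_cpu\n");
            return 0;
    }
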
@@@ -6801,7 -6760,7 +6765,7 @@@ again
                goto idle;
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
-       if (prev->sched_class != &fair_sched_class)
+       if (!prev || prev->sched_class != &fair_sched_class)
                goto simple;
  
        /*
        goto done;
  simple:
  #endif
-       put_prev_task(rq, prev);
+       if (prev)
+               put_prev_task(rq, prev);
  
        do {
                se = pick_next_entity(cfs_rq, NULL);
@@@ -6907,11 -6866,13 +6871,13 @@@ done: __maybe_unused
        return p;
  
  idle:
-       update_misfit_status(NULL, rq);
-       new_tasks = idle_balance(rq, rf);
+       if (!rf)
+               return NULL;
+       new_tasks = newidle_balance(rq, rf);
  
        /*
-        * Because idle_balance() releases (and re-acquires) rq->lock, it is
+        * Because newidle_balance() releases (and re-acquires) rq->lock, it is
         * possible for any higher priority task to appear. In that case we
         * must re-start the pick_next_entity() loop.
         */
  /*
   * Account for a descheduled task:
   */
- static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
+ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
  {
        struct sched_entity *se = &prev->se;
        struct cfs_rq *cfs_rq;
@@@ -7435,7 -7396,7 +7401,7 @@@ static int detach_tasks(struct lb_env *
                detached++;
                env->imbalance -= load;
  
- #ifdef CONFIG_PREEMPT
+ #ifdef CONFIG_PREEMPTION
                /*
                 * NEWIDLE balancing is a source of latency, so preemptible
                 * kernels will stop after the first task is detached to minimize
@@@ -7982,8 -7943,7 +7948,7 @@@ group_is_overloaded(struct lb_env *env
  static inline bool
  group_smaller_min_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
  {
-       return sg->sgc->min_capacity * capacity_margin <
-                                               ref->sgc->min_capacity * 1024;
+       return fits_capacity(sg->sgc->min_capacity, ref->sgc->min_capacity);
  }
  
  /*
  static inline bool
  group_smaller_max_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
  {
-       return sg->sgc->max_capacity * capacity_margin <
-                                               ref->sgc->max_capacity * 1024;
+       return fits_capacity(sg->sgc->max_capacity, ref->sgc->max_capacity);
  }
  
  static inline enum
@@@ -9052,9 -9011,10 +9016,10 @@@ more_balance
  out_balanced:
        /*
         * We reach balance although we may have faced some affinity
-        * constraints. Clear the imbalance flag if it was set.
+        * constraints. Clear the imbalance flag only if other tasks got
+        * a chance to move and fix the imbalance.
         */
-       if (sd_parent) {
+       if (sd_parent && !(env.flags & LBF_ALL_PINNED)) {
                int *group_imbalance = &sd_parent->groups->sgc->imbalance;
  
                if (*group_imbalance)
@@@ -9075,10 -9035,10 +9040,10 @@@ out_one_pinned
        ld_moved = 0;
  
        /*
-        * idle_balance() disregards balance intervals, so we could repeatedly
-        * reach this code, which would lead to balance_interval skyrocketting
-        * in a short amount of time. Skip the balance_interval increase logic
-        * to avoid that.
+        * newidle_balance() disregards balance intervals, so we could
+        * repeatedly reach this code, which would lead to balance_interval
+        * skyrocketing in a short amount of time. Skip the balance_interval
+        * increase logic to avoid that.
         */
        if (env.idle == CPU_NEWLY_IDLE)
                goto out;
@@@ -9788,7 -9748,7 +9753,7 @@@ static inline void nohz_newidle_balance
   * idle_balance is called by schedule() if this_cpu is about to become
   * idle. Attempts to pull tasks from other CPUs.
   */
- static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
+ int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
  {
        unsigned long next_balance = jiffies + HZ;
        int this_cpu = this_rq->cpu;
        int pulled_task = 0;
        u64 curr_cost = 0;
  
+       update_misfit_status(NULL, this_rq);
        /*
         * We must set idle_stamp _before_ calling idle_balance(), such that we
         * measure the duration of idle_balance() as idle time.
@@@ -10180,9 -10141,19 +10146,19 @@@ static void switched_to_fair(struct rq 
   * This routine is mostly called to set cfs_rq->curr field when a task
   * migrates between groups/classes.
   */
- static void set_curr_task_fair(struct rq *rq)
+ static void set_next_task_fair(struct rq *rq, struct task_struct *p)
  {
-       struct sched_entity *se = &rq->curr->se;
+       struct sched_entity *se = &p->se;
+ #ifdef CONFIG_SMP
+       if (task_on_rq_queued(p)) {
+               /*
+                * Move the next running task to the front of the list, so our
+                * cfs_tasks list becomes an MRU one.
+                */
+               list_move(&se->group_node, &rq->cfs_tasks);
+       }
+ #endif
  
        for_each_sched_entity(se) {
                struct cfs_rq *cfs_rq = cfs_rq_of(se);
  void online_fair_sched_group(struct task_group *tg)
  {
        struct sched_entity *se;
+       struct rq_flags rf;
        struct rq *rq;
        int i;
  
        for_each_possible_cpu(i) {
                rq = cpu_rq(i);
                se = tg->se[i];
-               raw_spin_lock_irq(&rq->lock);
+               rq_lock_irq(rq, &rf);
                update_rq_clock(rq);
                attach_entity_cfs_rq(se);
                sync_throttle(tg, i);
-               raw_spin_unlock_irq(&rq->lock);
+               rq_unlock_irq(rq, &rf);
        }
  }
  
@@@ -10453,7 -10424,9 +10429,9 @@@ const struct sched_class fair_sched_cla
        .check_preempt_curr     = check_preempt_wakeup,
  
        .pick_next_task         = pick_next_task_fair,
        .put_prev_task          = put_prev_task_fair,
+       .set_next_task          = set_next_task_fair,
  
  #ifdef CONFIG_SMP
        .select_task_rq         = select_task_rq_fair,
        .set_cpus_allowed       = set_cpus_allowed_common,
  #endif
  
-       .set_curr_task          = set_curr_task_fair,
        .task_tick              = task_tick_fair,
        .task_fork              = task_fork_fair,
  
diff --combined kernel/sched/idle.c
index e4bc4aa739b830c5236cf84445b6278aa3c0470b,7c54550dda6a6b09ecdff57b17a32232039fb3b7..8bfeb6395bddb9f5c3ccc1267475b55ae7c3d086
@@@ -241,14 -241,13 +241,14 @@@ static void do_idle(void
                check_pgt_cache();
                rmb();
  
 +              local_irq_disable();
 +
                if (cpu_is_offline(cpu)) {
 -                      tick_nohz_idle_stop_tick_protected();
 +                      tick_nohz_idle_stop_tick();
                        cpuhp_report_idle_dead();
                        arch_cpu_idle_dead();
                }
  
 -              local_irq_disable();
                arch_cpu_idle_enter();
  
                /*
@@@ -375,14 -374,27 +375,27 @@@ static void check_preempt_curr_idle(str
        resched_curr(rq);
  }
  
- static struct task_struct *
- pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+ static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+ {
+ }
+ static void set_next_task_idle(struct rq *rq, struct task_struct *next)
  {
-       put_prev_task(rq, prev);
        update_idle_core(rq);
        schedstat_inc(rq->sched_goidle);
+ }
+ static struct task_struct *
+ pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+ {
+       struct task_struct *next = rq->idle;
+       if (prev)
+               put_prev_task(rq, prev);
+       set_next_task_idle(rq, next);
  
-       return rq->idle;
+       return next;
  }
  
  /*
@@@ -398,10 -410,6 +411,6 @@@ dequeue_task_idle(struct rq *rq, struc
        raw_spin_lock_irq(&rq->lock);
  }
  
- static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
- {
- }
  /*
   * scheduler tick hitting a task of our scheduling class.
   *
@@@ -414,10 -422,6 +423,6 @@@ static void task_tick_idle(struct rq *r
  {
  }
  
- static void set_curr_task_idle(struct rq *rq)
- {
- }
  static void switched_to_idle(struct rq *rq, struct task_struct *p)
  {
        BUG();
@@@ -452,13 -456,13 +457,13 @@@ const struct sched_class idle_sched_cla
  
        .pick_next_task         = pick_next_task_idle,
        .put_prev_task          = put_prev_task_idle,
+       .set_next_task          = set_next_task_idle,
  
  #ifdef CONFIG_SMP
        .select_task_rq         = select_task_rq_idle,
        .set_cpus_allowed       = set_cpus_allowed_common,
  #endif
  
-       .set_curr_task          = set_curr_task_idle,
        .task_tick              = task_tick_idle,
  
        .get_rr_interval        = get_rr_interval_idle,
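
The idle-class hunks above split the old pick_next_task_idle() into put_prev_task_idle()/set_next_task_idle() and allow the pick path to run with prev == NULL. A toy user-space model of that calling convention; the names and printf()s below are hypothetical stand-ins for the real bookkeeping (update_idle_core(), schedstat_inc()), not kernel code:

#include <stdio.h>
#include <stddef.h>

struct task { const char *name; };

static struct task idle_task = { "idle" };

static void put_prev(struct task *prev)
{
        if (prev)
                printf("put_prev_task(%s)\n", prev->name);
}

static void set_next_idle(struct task *next)
{
        printf("set_next_task_idle(%s)\n", next->name);  /* idle-core/schedstat work in the kernel */
}

static struct task *pick_next_idle(struct task *prev)
{
        struct task *next = &idle_task;

        put_prev(prev);          /* tolerated even when prev == NULL */
        set_next_idle(next);
        return next;
}

int main(void)
{
        struct task cur = { "worker" };

        pick_next_idle(&cur);    /* ordinary schedule() path */
        pick_next_idle(NULL);    /* new path: the caller already put the prev task */
        return 0;
}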
diff --combined kernel/sched/psi.c
index 6e52b67b420e7a3312f463f8d3bb6baad6576041,4b14a3208fbec3a92f25720f3597ed1321404a50..517e3719027e619e5c7b565d1de9294dfffb5a3c
@@@ -1051,7 -1051,7 +1051,7 @@@ struct psi_trigger *psi_trigger_create(
  
        if (!rcu_access_pointer(group->poll_kworker)) {
                struct sched_param param = {
 -                      .sched_priority = MAX_RT_PRIO - 1,
 +                      .sched_priority = 1,
                };
                struct kthread_worker *kworker;
  
                        mutex_unlock(&group->trigger_lock);
                        return ERR_CAST(kworker);
                }
 -              sched_setscheduler(kworker->task, SCHED_FIFO, &param);
 +              sched_setscheduler_nocheck(kworker->task, SCHED_FIFO, &param);
                kthread_init_delayed_work(&group->poll_work,
                                psi_poll_work);
                rcu_assign_pointer(group->poll_kworker, kworker);
@@@ -1131,15 -1131,7 +1131,15 @@@ static void psi_trigger_destroy(struct 
         * deadlock while waiting for psi_poll_work to acquire trigger_lock
         */
        if (kworker_to_destroy) {
 +              /*
 +               * After the RCU grace period has expired, the worker
 +               * can no longer be found through group->poll_kworker.
 +               * But it might already have been scheduled before
 +               * that - deschedule it cleanly before destroying it.
 +               */
                kthread_cancel_delayed_work_sync(&group->poll_work);
 +              atomic_set(&group->poll_scheduled, 0);
 +
                kthread_destroy_worker(kworker_to_destroy);
        }
        kfree(t);
@@@ -1198,7 -1190,7 +1198,7 @@@ static ssize_t psi_write(struct file *f
        if (static_branch_likely(&psi_disabled))
                return -EOPNOTSUPP;
  
-       buf_size = min(nbytes, (sizeof(buf) - 1));
+       buf_size = min(nbytes, sizeof(buf));
        if (copy_from_user(buf, user_buf, buf_size))
                return -EFAULT;
  
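
The psi_write() hunk above drops the "- 1" from the size clamp; the kernel function NUL-terminates buf[buf_size - 1] right after the copy, so reserving a spare byte only shortened maximal-length writes. A user-space analogue of that pattern (clamp_copy() is a made-up helper for illustration, not a kernel API):

#include <stdio.h>
#include <string.h>

/* Illustrative analogue of the clamp-then-terminate pattern in psi_write(). */
static size_t clamp_copy(char *dst, size_t dst_sz, const char *src, size_t nbytes)
{
        size_t n = nbytes < dst_sz ? nbytes : dst_sz;  /* min(nbytes, sizeof(buf)) */

        memcpy(dst, src, n);
        dst[n - 1] = '\0';  /* terminate in place, as the kernel code does after the copy */
        return n;
}

int main(void)
{
        char buf[32];
        const char *input = "some 150000 1000000";

        clamp_copy(buf, sizeof(buf), input, strlen(input) + 1);
        printf("%s\n", buf);
        return 0;
}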
diff --combined kernel/trace/ftrace.c
index f9821a3374e9dd4b81ed0f7cfe8c436d7a58b4c5,a800e867c1a3f0cd7e999384f9d18aa788d53b26..356b848c697aa75b43a908de85c35a3658b8981e
@@@ -2814,7 -2814,7 +2814,7 @@@ int ftrace_shutdown(struct ftrace_ops *
                 * synchronize_rcu_tasks() will wait for those tasks to
                 * execute and either schedule voluntarily or enter user space.
                 */
-               if (IS_ENABLED(CONFIG_PREEMPT))
+               if (IS_ENABLED(CONFIG_PREEMPTION))
                        synchronize_rcu_tasks();
  
   free_ops:
@@@ -3095,14 -3095,6 +3095,14 @@@ t_probe_next(struct seq_file *m, loff_
                hnd = &iter->probe_entry->hlist;
  
        hash = iter->probe->ops.func_hash->filter_hash;
 +
 +      /*
 +       * A probe being registered may temporarily have an empty hash
 +       * and it's at the end of the func_probes list.
 +       */
 +      if (!hash || hash == EMPTY_HASH)
 +              return NULL;
 +
        size = 1 << hash->size_bits;
  
   retry:
@@@ -4328,21 -4320,12 +4328,21 @@@ register_ftrace_function_probe(char *gl
  
        mutex_unlock(&ftrace_lock);
  
 +      /*
 +       * Note, there's a small window here that the func_hash->filter_hash
 +       * may be NULL or empty. Need to be careful when reading the loop.
 +       */
        mutex_lock(&probe->ops.func_hash->regex_lock);
  
        orig_hash = &probe->ops.func_hash->filter_hash;
        old_hash = *orig_hash;
        hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, old_hash);
  
 +      if (!hash) {
 +              ret = -ENOMEM;
 +              goto out;
 +      }
 +
        ret = ftrace_match_records(hash, glob, strlen(glob));
  
        /* Nothing found? */
diff --combined kernel/trace/trace_events.c
index 648930823b571083c1a95000937962e565162eb2,5a189fb8ec23368215690d9ecce9b1fe20ff9f72..b89cdfe20bc1626b1c4632c6bfcace5eedbfe1ef
@@@ -255,12 -255,12 +255,12 @@@ void *trace_event_buffer_reserve(struc
        local_save_flags(fbuffer->flags);
        fbuffer->pc = preempt_count();
        /*
-        * If CONFIG_PREEMPT is enabled, then the tracepoint itself disables
+        * If CONFIG_PREEMPTION is enabled, then the tracepoint itself disables
         * preemption (adding one to the preempt_count). Since we are
         * interested in the preempt_count at the time the tracepoint was
         * hit, we need to subtract one to offset the increment.
         */
-       if (IS_ENABLED(CONFIG_PREEMPT))
+       if (IS_ENABLED(CONFIG_PREEMPTION))
                fbuffer->pc--;
        fbuffer->trace_file = trace_file;
  
@@@ -787,7 -787,7 +787,7 @@@ static int __ftrace_set_clr_event(struc
        return ret;
  }
  
 -static int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set)
 +int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set)
  {
        char *event = NULL, *sub = NULL, *match;
        int ret;
diff --combined mm/page_alloc.c
index 9c9194959271cfc0d9214bf60bb09b96c5b1a96a,0d54cd2c43a47f55cd8821105ea153a4a8e8d0cc..6991ccec9c322ffb843110bb69cf2326d64b266c
@@@ -2238,12 -2238,27 +2238,12 @@@ static int move_freepages(struct zone *
        unsigned int order;
        int pages_moved = 0;
  
 -#ifndef CONFIG_HOLES_IN_ZONE
 -      /*
 -       * page_zone is not safe to call in this context when
 -       * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant
 -       * anyway as we check zone boundaries in move_freepages_block().
 -       * Remove at a later date when no bug reports exist related to
 -       * grouping pages by mobility
 -       */
 -      VM_BUG_ON(pfn_valid(page_to_pfn(start_page)) &&
 -                pfn_valid(page_to_pfn(end_page)) &&
 -                page_zone(start_page) != page_zone(end_page));
 -#endif
        for (page = start_page; page <= end_page;) {
                if (!pfn_valid_within(page_to_pfn(page))) {
                        page++;
                        continue;
                }
  
 -              /* Make sure we are not inadvertently changing nodes */
 -              VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
 -
                if (!PageBuddy(page)) {
                        /*
                         * We assume that pages that could be isolated for
                        continue;
                }
  
 +              /* Make sure we are not inadvertently changing nodes */
 +              VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
 +              VM_BUG_ON_PAGE(page_zone(page) != zone, page);
 +
                order = page_order(page);
                move_to_free_area(page, &zone->free_area[order], migratetype);
                page += 1 << order;
@@@ -3511,7 -3522,7 +3511,7 @@@ bool zone_watermark_ok_safe(struct zon
  static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
  {
        return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <=
-                               RECLAIM_DISTANCE;
+                               node_reclaim_distance;
  }
  #else /* CONFIG_NUMA */
  static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)