Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

author Linus Torvalds <torvalds@linux-foundation.org>
Tue, 17 Sep 2019 00:25:49 +0000 (17:25 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Tue, 17 Sep 2019 00:25:49 +0000 (17:25 -0700)
diff --combined MAINTAINERS

index cbe625343277ea0277e470e413c9cdbf79cabc04,3a5ef62c9dd11a70d6ecc5da353b46bf11770ebd..49f75d1b7b51a95d1177f6c207b9ecb0e3a1b8dc
--- 1/MAINTAINERS
--- 2/MAINTAINERS
+++ b/MAINTAINERS
@@@ -183,7 -183,7 +183,7 @@@ M: Realtek linux nic maintainers <nic_s
   M:    Heiner Kallweit <hkallweit1@gmail.com>
   L:    netdev@vger.kernel.org
   S:    Maintained
- -F:    drivers/net/ethernet/realtek/r8169.c
+ +F:    drivers/net/ethernet/realtek/r8169*
   
   8250/16?50 (AND CLONE UARTS) SERIAL DRIVER
   M:    Greg Kroah-Hartman <gregkh@linuxfoundation.org>
@@@ -517,6 -517,14 +517,6 @@@ W:        http://ez.analog.com/community/linux
   S:    Supported
   F:    drivers/video/backlight/adp8860_bl.c
   
- -ADS1015 HARDWARE MONITOR DRIVER
- -M:    Dirk Eibach <eibach@gdsys.de>
- -L:    linux-hwmon@vger.kernel.org
- -S:    Maintained
- -F:    Documentation/hwmon/ads1015.rst
- -F:    drivers/hwmon/ads1015.c
- -F:    include/linux/platform_data/ads1015.h
- -
   ADT746X FAN DRIVER
   M:    Colin Leroy <colin@colino.net>
   S:    Maintained
@@@ -675,7 -683,7 +675,7 @@@ S: Maintaine
   F:    drivers/crypto/sunxi-ss/
   
   ALLWINNER VPU DRIVER
- -M:    Maxime Ripard <maxime.ripard@bootlin.com>
+ +M:    Maxime Ripard <mripard@kernel.org>
   M:    Paul Kocialkowski <paul.kocialkowski@bootlin.com>
   L:    linux-media@vger.kernel.org
   S:    Maintained
@@@ -1342,7 -1350,8 +1342,7 @@@ M:      Will Deacon <will@kernel.org
   R:    Robin Murphy <robin.murphy@arm.com>
   L:    linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
   S:    Maintained
- -F:    drivers/iommu/arm-smmu.c
- -F:    drivers/iommu/arm-smmu-v3.c
+ +F:    drivers/iommu/arm-smmu*
   F:    drivers/iommu/io-pgtable-arm.c
   F:    drivers/iommu/io-pgtable-arm-v7s.c
   
@@@ -1399,7 -1408,7 +1399,7 @@@ S:      Maintaine
   F:    drivers/clk/sunxi/
   
   ARM/Allwinner sunXi SoC support
- -M:    Maxime Ripard <maxime.ripard@bootlin.com>
+ +M:    Maxime Ripard <mripard@kernel.org>
   M:    Chen-Yu Tsai <wens@csie.org>
   L:    linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
   S:    Maintained
@@@ -1617,21 -1626,6 +1617,21 @@@ F:    drivers/clocksource/timer-atlas7.
   N:    [^a-z]sirf
   X:    drivers/gnss
   
+ +ARM/CZ.NIC TURRIS MOX SUPPORT
+ +M:    Marek Behun <marek.behun@nic.cz>
+ +W:    http://mox.turris.cz
+ +S:    Maintained
+ +F:    Documentation/ABI/testing/debugfs-moxtet
+ +F:    Documentation/ABI/testing/sysfs-bus-moxtet-devices
+ +F:    Documentation/ABI/testing/sysfs-firmware-turris-mox-rwtm
+ +F:    Documentation/devicetree/bindings/bus/moxtet.txt
+ +F:    Documentation/devicetree/bindings/firmware/cznic,turris-mox-rwtm.txt
+ +F:    Documentation/devicetree/bindings/gpio/gpio-moxtet.txt
+ +F:    include/linux/moxtet.h
+ +F:    drivers/bus/moxtet.c
+ +F:    drivers/firmware/turris-mox-rwtm.c
+ +F:    drivers/gpio/gpio-moxtet.c
+ +
   ARM/EBSA110 MACHINE SUPPORT
   M:    Russell King <linux@armlinux.org.uk>
   L:    linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
@@@ -1755,11 -1749,20 +1755,11 @@@ L:   linux-arm-kernel@lists.infradead.or
   S:    Maintained
   F:    arch/arm/mach-pxa/colibri-pxa270-income.c
   
- -ARM/INTEL IOP13XX ARM ARCHITECTURE
- -M:    Lennert Buytenhek <kernel@wantstofly.org>
- -L:    linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
- -S:    Maintained
- -
   ARM/INTEL IOP32X ARM ARCHITECTURE
   M:    Lennert Buytenhek <kernel@wantstofly.org>
   L:    linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
   S:    Maintained
   
- -ARM/INTEL IOP33X ARM ARCHITECTURE
- -L:    linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
- -S:    Orphan
- -
   ARM/INTEL IQ81342EX MACHINE SUPPORT
   M:    Lennert Buytenhek <kernel@wantstofly.org>
   L:    linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
@@@ -1918,6 -1921,12 +1918,6 @@@ S:     Maintaine
   F:    drivers/phy/mediatek/
   F:    Documentation/devicetree/bindings/phy/phy-mtk-*
   
- -ARM/MICREL KS8695 ARCHITECTURE
- -M:    Greg Ungerer <gerg@uclinux.org>
- -L:    linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
- -F:    arch/arm/mach-ks8695/
- -S:    Odd Fixes
- -
   ARM/Microchip (AT91) SoC support
   M:    Nicolas Ferre <nicolas.ferre@microchip.com>
   M:    Alexandre Belloni <alexandre.belloni@bootlin.com>
@@@ -1959,7 -1968,6 +1959,7 @@@ F:      Documentation/devicetree/bindings/i2
   F:    arch/arm/mach-nomadik/
   F:    arch/arm/mach-u300/
   F:    arch/arm/mach-ux500/
+ +F:    drivers/soc/ux500/
   F:    arch/arm/boot/dts/ste-*
   F:    drivers/clk/clk-nomadik.c
   F:    drivers/clk/clk-u300.c
@@@ -2003,6 -2011,22 +2003,6 @@@ F:     drivers/*/*npcm
   F:    Documentation/devicetree/bindings/*/*npcm*
   F:    Documentation/devicetree/bindings/*/*/*npcm*
   
- -ARM/NUVOTON W90X900 ARM ARCHITECTURE
- -M:    Wan ZongShun <mcuos.com@gmail.com>
- -L:    linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
- -W:    http://www.mcuos.com
- -S:    Maintained
- -F:    arch/arm/mach-w90x900/
- -F:    drivers/input/keyboard/w90p910_keypad.c
- -F:    drivers/input/touchscreen/w90p910_ts.c
- -F:    drivers/watchdog/nuc900_wdt.c
- -F:    drivers/net/ethernet/nuvoton/w90p910_ether.c
- -F:    drivers/mtd/nand/raw/nuc900_nand.c
- -F:    drivers/rtc/rtc-nuc900.c
- -F:    drivers/spi/spi-nuc900.c
- -F:    drivers/usb/host/ehci-w90x900.c
- -F:    drivers/video/fbdev/nuc900fb.c
- -
   ARM/OPENMOKO NEO FREERUNNER (GTA02) MACHINE SUPPORT
   L:    openmoko-kernel@lists.openmoko.org (subscribers-only)
   W:    http://wiki.openmoko.org/wiki/Neo_FreeRunner
@@@ -2131,12 -2155,10 +2131,12 @@@ F:   Documentation/devicetree/bindings/ar
   
   ARM/RENESAS ARM64 ARCHITECTURE
   M:    Simon Horman <horms@verge.net.au>
+ +M:    Geert Uytterhoeven <geert+renesas@glider.be>
   M:    Magnus Damm <magnus.damm@gmail.com>
   L:    linux-renesas-soc@vger.kernel.org
   Q:    http://patchwork.kernel.org/project/linux-renesas-soc/list/
   T:    git git://git.kernel.org/pub/scm/linux/kernel/git/horms/renesas.git next
+ +T:    git git://git.kernel.org/pub/scm/linux/kernel/git/geert/renesas-devel.git next
   S:    Supported
   F:    arch/arm64/boot/dts/renesas/
   F:    Documentation/devicetree/bindings/arm/renesas.yaml
@@@ -2195,9 -2217,8 +2195,9 @@@ F:      drivers/*/*s3c24
   F:    drivers/*/*/*s3c24*
   F:    drivers/*/*s3c64xx*
   F:    drivers/*/*s5pv210*
- -F:    drivers/memory/samsung/*
- -F:    drivers/soc/samsung/*
+ +F:    drivers/memory/samsung/
+ +F:    drivers/soc/samsung/
+ +F:    include/linux/soc/samsung/
   F:    Documentation/arm/samsung/
   F:    Documentation/devicetree/bindings/arm/samsung/
   F:    Documentation/devicetree/bindings/sram/samsung-sram.txt
@@@ -2248,12 -2269,10 +2248,12 @@@ F:   drivers/media/platform/s5p-mfc
   
   ARM/SHMOBILE ARM ARCHITECTURE
   M:    Simon Horman <horms@verge.net.au>
+ +M:    Geert Uytterhoeven <geert+renesas@glider.be>
   M:    Magnus Damm <magnus.damm@gmail.com>
   L:    linux-renesas-soc@vger.kernel.org
   Q:    http://patchwork.kernel.org/project/linux-renesas-soc/list/
   T:    git git://git.kernel.org/pub/scm/linux/kernel/git/horms/renesas.git next
+ +T:    git git://git.kernel.org/pub/scm/linux/kernel/git/geert/renesas-devel.git next
   S:    Supported
   F:    arch/arm/boot/dts/emev2*
   F:    arch/arm/boot/dts/gr-peach*
@@@ -3554,7 -3573,7 +3554,7 @@@ F:      Documentation/filesystems/caching/ca
   F:    fs/cachefiles/
   
   CADENCE MIPI-CSI2 BRIDGES
- -M:    Maxime Ripard <maxime.ripard@bootlin.com>
+ +M:    Maxime Ripard <mripard@kernel.org>
   L:    linux-media@vger.kernel.org
   S:    Maintained
   F:    Documentation/devicetree/bindings/media/cdns,*.txt
@@@ -4267,14 -4286,6 +4267,14 @@@ S:    Supporte
   F:    drivers/cpuidle/cpuidle-exynos.c
   F:    arch/arm/mach-exynos/pm.c
   
+ +CPUIDLE DRIVER - ARM PSCI
+ +M:    Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
+ +M:    Sudeep Holla <sudeep.holla@arm.com>
+ +L:    linux-pm@vger.kernel.org
+ +L:    linux-arm-kernel@lists.infradead.org
+ +S:    Supported
+ +F:    drivers/cpuidle/cpuidle-psci.c
+ +
   CPU IDLE TIME MANAGEMENT FRAMEWORK
   M:    "Rafael J. Wysocki" <rjw@rjwysocki.net>
   M:    Daniel Lezcano <daniel.lezcano@linaro.org>
@@@ -5280,7 -5291,7 +5280,7 @@@ F:      include/linux/vga
   
   DRM DRIVERS AND MISC GPU PATCHES
   M:    Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
- -M:    Maxime Ripard <maxime.ripard@bootlin.com>
+ +M:    Maxime Ripard <mripard@kernel.org>
   M:    Sean Paul <sean@poorly.run>
   W:    https://01.org/linuxgraphics/gfx-docs/maintainer-tools/drm-misc.html
   S:    Maintained
@@@ -5293,7 -5304,7 +5293,7 @@@ F:      include/uapi/drm/drm
   F:    include/linux/vga*
   
   DRM DRIVERS FOR ALLWINNER A10
- -M:    Maxime Ripard  <maxime.ripard@bootlin.com>
+ +M:    Maxime Ripard <mripard@kernel.org>
   L:    dri-devel@lists.freedesktop.org
   S:    Supported
   F:    drivers/gpu/drm/sun4i/
@@@ -5746,11 -5757,6 +5746,11 @@@ S:    Supporte
   F:    drivers/edac/aspeed_edac.c
   F:    Documentation/devicetree/bindings/edac/aspeed-sdram-edac.txt
   
+ +EDAC-BLUEFIELD
+ +M:    Shravan Kumar Ramani <sramani@mellanox.com>
+ +S:    Supported
+ +F:    drivers/edac/bluefield_edac.c
+ +
   EDAC-CALXEDA
   M:    Robert Richter <rric@kernel.org>
   L:    linux-edac@vger.kernel.org
@@@ -5775,11 -5781,10 +5775,11 @@@ F:   drivers/edac/thunderx_edac
   EDAC-CORE
   M:    Borislav Petkov <bp@alien8.de>
   M:    Mauro Carvalho Chehab <mchehab@kernel.org>
+ +M:    Tony Luck <tony.luck@intel.com>
   R:    James Morse <james.morse@arm.com>
+ +R:    Robert Richter <rrichter@marvell.com>
   L:    linux-edac@vger.kernel.org
- -T:    git git://git.kernel.org/pub/scm/linux/kernel/git/bp/bp.git for-next
- -T:    git git://git.kernel.org/pub/scm/linux/kernel/git/mchehab/linux-edac.git linux_next
+ +T:    git git://git.kernel.org/pub/scm/linux/kernel/git/ras/ras.git edac-for-next
   S:    Supported
   F:    Documentation/admin-guide/ras.rst
   F:    Documentation/driver-api/edac.rst
@@@ -6056,7 -6061,7 +6056,7 @@@ M:      Florian Fainelli <f.fainelli@gmail.c
   M:    Heiner Kallweit <hkallweit1@gmail.com>
   L:    netdev@vger.kernel.org
   S:    Maintained
- -F:    Documentation/ABI/testing/sysfs-bus-mdio
+ +F:    Documentation/ABI/testing/sysfs-class-net-phydev
   F:    Documentation/devicetree/bindings/net/ethernet-phy.yaml
   F:    Documentation/devicetree/bindings/net/mdio*
   F:    Documentation/networking/phy.rst
@@@ -6317,16 -6322,24 +6317,16 @@@ F:   Documentation/devicetree/bindings/co
   F:    drivers/counter/ftm-quaddec.c
   
   FLOPPY DRIVER
- -S:    Orphan
+ +M:    Denis Efremov <efremov@linux.com>
+ +S:    Odd Fixes
   L:    linux-block@vger.kernel.org
   F:    drivers/block/floppy.c
   
- -FMC SUBSYSTEM
- -M:    Alessandro Rubini <rubini@gnudd.com>
- -W:    http://www.ohwr.org/projects/fmc-bus
- -S:    Supported
- -F:    drivers/fmc/
- -F:    include/linux/fmc*.h
- -F:    include/linux/ipmi-fru.h
- -K:    fmc_d.*register
- -
   FPGA MANAGER FRAMEWORK
   M:    Moritz Fischer <mdf@kernel.org>
   L:    linux-fpga@vger.kernel.org
   S:    Maintained
- -T:    git git://git.kernel.org/pub/scm/linux/kernel/git/atull/linux-fpga.git
+ +T:    git git://git.kernel.org/pub/scm/linux/kernel/git/mdf/linux-fpga.git
   Q:    http://patchwork.kernel.org/project/linux-fpga/list/
   F:    Documentation/fpga/
   F:    Documentation/driver-api/fpga/
@@@ -6359,7 -6372,7 +6359,7 @@@ FRAMEBUFFER LAYE
   M:    Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
   L:    dri-devel@lists.freedesktop.org
   L:    linux-fbdev@vger.kernel.org
- -T:    git git://github.com/bzolnier/linux.git
+ +T:    git git://anongit.freedesktop.org/drm/drm-misc
   Q:    http://patchwork.kernel.org/project/linux-fbdev/list/
   S:    Maintained
   F:    Documentation/fb/
@@@ -6421,17 -6434,8 +6421,17 @@@ M:    Frank Li <Frank.li@nxp.com
   L:    linux-arm-kernel@lists.infradead.org
   S:    Maintained
   F:    drivers/perf/fsl_imx8_ddr_perf.c
+ +F:    Documentation/admin-guide/perf/imx-ddr.rst
   F:    Documentation/devicetree/bindings/perf/fsl-imx-ddr.txt
   
+ +FREESCALE IMX I2C DRIVER
+ +M:    Oleksij Rempel <o.rempel@pengutronix.de>
+ +R:    Pengutronix Kernel Team <kernel@pengutronix.de>
+ +L:    linux-i2c@vger.kernel.org
+ +S:    Maintained
+ +F:    drivers/i2c/busses/i2c-imx.c
+ +F:    Documentation/devicetree/bindings/i2c/i2c-imx.txt
+ +
   FREESCALE IMX LPI2C DRIVER
   M:    Dong Aisheng <aisheng.dong@nxp.com>
   L:    linux-i2c@vger.kernel.org
@@@ -6715,13 -6719,6 +6715,13 @@@ W:    https://linuxtv.or
   S:    Maintained
   F:    drivers/media/radio/radio-gemtek*
   
+ +GENERIC ARCHITECTURE TOPOLOGY
+ +M:    Sudeep Holla <sudeep.holla@arm.com>
+ +L:    linux-kernel@vger.kernel.org
+ +S:    Maintained
+ +F:    drivers/base/arch_topology.c
+ +F:    include/linux/arch_topology.h
+ +
   GENERIC GPIO I2C DRIVER
   M:    Wolfram Sang <wsa+renesas@sang-engineering.com>
   S:    Supported
@@@ -6825,6 -6822,13 +6825,6 @@@ F:     Documentation/filesystems/gfs2*.tx
   F:    fs/gfs2/
   F:    include/uapi/linux/gfs2_ondisk.h
   
- -GIGASET ISDN DRIVERS
- -M:    Paul Bolle <pebolle@tiscali.nl>
- -L:    gigaset307x-common@lists.sourceforge.net
- -W:    http://gigaset307x.sourceforge.net/
- -S:    Odd Fixes
- -F:    drivers/staging/isdn/gigaset/
- -
   GNSS SUBSYSTEM
   M:    Johan Hovold <johan@kernel.org>
   T:    git git://git.kernel.org/pub/scm/linux/kernel/git/johan/gnss.git
@@@ -7450,7 -7454,7 +7450,7 @@@ F:      drivers/net/hyperv
   F:    drivers/scsi/storvsc_drv.c
   F:    drivers/uio/uio_hv_generic.c
   F:    drivers/video/fbdev/hyperv_fb.c
- -F:    drivers/iommu/hyperv_iommu.c
+ +F:    drivers/iommu/hyperv-iommu.c
   F:    net/vmw_vsock/hyperv_transport.c
   F:    include/clocksource/hyperv_timer.h
   F:    include/linux/hyperv.h
@@@ -7503,7 -7507,7 +7503,7 @@@ I2C MV64XXX MARVELL AND ALLWINNER DRIVE
   M:    Gregory CLEMENT <gregory.clement@bootlin.com>
   L:    linux-i2c@vger.kernel.org
   S:    Maintained
- -F:    Documentation/devicetree/bindings/i2c/i2c-mv64xxx.txt
+ +F:    Documentation/devicetree/bindings/i2c/marvell,mv64xxx-i2c.yaml
   F:    drivers/i2c/busses/i2c-mv64xxx.c
   
   I2C OVER PARALLEL PORT
@@@ -8040,7 -8044,6 +8040,7 @@@ S:      Maintaine
   F:    drivers/video/fbdev/i810/
   
   INTEL ASoC DRIVERS
+ +M:    Cezary Rojewski <cezary.rojewski@intel.com>
   M:    Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
   M:    Liam Girdwood <liam.r.girdwood@linux.intel.com>
   M:    Jie Yang <yang.jie@linux.intel.com>
@@@ -8062,13 -8065,6 +8062,13 @@@ T:    git git://git.code.sf.net/p/intel-sa
   S:    Supported
   F:    drivers/scsi/isci/
   
+ +INTEL CPU family model numbers
+ +M:    Tony Luck <tony.luck@intel.com>
+ +M:    x86@kernel.org
+ +L:    linux-kernel@vger.kernel.org
+ +S:    Supported
+ +F:    arch/x86/include/asm/intel-family.h
+ +
   INTEL DRM DRIVERS (excluding Poulsbo, Moorestown and derivative chipsets)
   M:    Jani Nikula <jani.nikula@linux.intel.com>
   M:    Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
@@@ -8375,6 -8371,12 +8375,6 @@@ F:     Documentation/x86/intel_txt.rs
   F:    include/linux/tboot.h
   F:    arch/x86/kernel/tboot.c
   
- -INTEL-MID GPIO DRIVER
- -M:    David Cohen <david.a.cohen@linux.intel.com>
- -L:    linux-gpio@vger.kernel.org
- -S:    Maintained
- -F:    drivers/gpio/gpio-intel-mid.c
- -
   INTERCONNECT API
   M:    Georgi Djakov <georgi.djakov@linaro.org>
   L:    linux-pm@vger.kernel.org
@@@ -8399,6 -8401,12 +8399,6 @@@ L:     linux-mips@vger.kernel.or
   S:    Maintained
   F:    drivers/net/ethernet/sgi/ioc3-eth.c
   
- -IOC3 SERIAL DRIVER
- -M:    Pat Gefre <pfg@sgi.com>
- -L:    linux-serial@vger.kernel.org
- -S:    Maintained
- -F:    drivers/tty/serial/ioc3_serial.c
- -
   IOMAP FILESYSTEM LIBRARY
   M:    Christoph Hellwig <hch@infradead.org>
   M:    Darrick J. Wong <darrick.wong@oracle.com>
@@@ -8408,6 -8416,7 +8408,6 @@@ L:      linux-xfs@vger.kernel.or
   L:    linux-fsdevel@vger.kernel.org
   T:    git git://git.kernel.org/pub/scm/fs/xfs/xfs-linux.git
   S:    Supported
- -F:    fs/iomap.c
   F:    fs/iomap/
   F:    include/linux/iomap.h
   
@@@ -8432,6 -8441,11 +8432,6 @@@ S:     Maintaine
   F:    fs/io_uring.c
   F:    include/uapi/linux/io_uring.h
   
- -IP MASQUERADING
- -M:    Juanjo Ciarlante <jjciarla@raiz.uncu.edu.ar>
- -S:    Maintained
- -F:    net/ipv4/netfilter/ipt_MASQUERADE.c
- -
   IPMI SUBSYSTEM
   M:    Corey Minyard <minyard@acm.org>
   L:    openipmi-developer@lists.sourceforge.net (moderated for non-subscribers)
@@@ -8805,6 -8819,14 +8805,6 @@@ F:     virt/kvm/
   F:    tools/kvm/
   F:    tools/testing/selftests/kvm/
   
- -KERNEL VIRTUAL MACHINE FOR AMD-V (KVM/amd)
- -M:    Joerg Roedel <joro@8bytes.org>
- -L:    kvm@vger.kernel.org
- -W:    http://www.linux-kvm.org/
- -S:    Maintained
- -F:    arch/x86/include/asm/svm.h
- -F:    arch/x86/kvm/svm.c
- -
   KERNEL VIRTUAL MACHINE FOR ARM/ARM64 (KVM/arm, KVM/arm64)
   M:    Marc Zyngier <maz@kernel.org>
   R:    James Morse <james.morse@arm.com>
@@@ -8847,7 -8869,7 +8847,7 @@@ M:      Christian Borntraeger <borntraeger@d
   M:    Janosch Frank <frankja@linux.ibm.com>
   R:    David Hildenbrand <david@redhat.com>
   R:    Cornelia Huck <cohuck@redhat.com>
- -L:    linux-s390@vger.kernel.org
+ +L:    kvm@vger.kernel.org
   W:    http://www.ibm.com/developerworks/linux/linux390/
   T:    git git://git.kernel.org/pub/scm/linux/kernel/git/kvms390/linux.git
   S:    Supported
@@@ -8862,11 -8884,6 +8862,11 @@@ F:    tools/testing/selftests/kvm/*/s390x
   KERNEL VIRTUAL MACHINE FOR X86 (KVM/x86)
   M:    Paolo Bonzini <pbonzini@redhat.com>
   M:    Radim Krčmář <rkrcmar@redhat.com>
+ +R:    Sean Christopherson <sean.j.christopherson@intel.com>
+ +R:    Vitaly Kuznetsov <vkuznets@redhat.com>
+ +R:    Wanpeng Li <wanpengli@tencent.com>
+ +R:    Jim Mattson <jmattson@google.com>
+ +R:    Joerg Roedel <joro@8bytes.org>
   L:    kvm@vger.kernel.org
   W:    http://www.linux-kvm.org
   T:    git git://git.kernel.org/pub/scm/virt/kvm/kvm.git
@@@ -8874,12 -8891,8 +8874,12 @@@ S:    Supporte
   F:    arch/x86/kvm/
   F:    arch/x86/kvm/*/
   F:    arch/x86/include/uapi/asm/kvm*
+ +F:    arch/x86/include/uapi/asm/vmx.h
+ +F:    arch/x86/include/uapi/asm/svm.h
   F:    arch/x86/include/asm/kvm*
   F:    arch/x86/include/asm/pvclock-abi.h
+ +F:    arch/x86/include/asm/svm.h
+ +F:    arch/x86/include/asm/vmx.h
   F:    arch/x86/kernel/kvm.c
   F:    arch/x86/kernel/kvmclock.c
   
@@@ -8911,7 -8924,7 +8911,7 @@@ F:      security/keys/encrypted-keys
   
   KEYS-TRUSTED
   M:    James Bottomley <jejb@linux.ibm.com>
- -M:      Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
+ +M:    Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
   M:    Mimi Zohar <zohar@linux.ibm.com>
   L:    linux-integrity@vger.kernel.org
   L:    keyrings@vger.kernel.org
@@@ -9207,18 -9220,6 +9207,18 @@@ F:    include/linux/nd.
   F:    include/linux/libnvdimm.h
   F:    include/uapi/linux/ndctl.h
   
+ +LICENSES and SPDX stuff
+ +M:    Thomas Gleixner <tglx@linutronix.de>
+ +M:    Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+ +L:    linux-spdx@vger.kernel.org
+ +S:    Maintained
+ +T:    git git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/spdx.git
+ +F:    COPYING
+ +F:    Documentation/process/license-rules.rst
+ +F:    LICENSES/
+ +F:    scripts/spdxcheck-test.sh
+ +F:    scripts/spdxcheck.py
+ +
   LIGHTNVM PLATFORM SUPPORT
   M:    Matias Bjorling <mb@lightnvm.io>
   W:    http://github/OpenChannelSSD
@@@ -9325,7 -9326,7 +9325,7 @@@ F:      drivers/misc/lkdtm/
   
   LINUX KERNEL MEMORY CONSISTENCY MODEL (LKMM)
   M:    Alan Stern <stern@rowland.harvard.edu>
- -M:    Andrea Parri <andrea.parri@amarulasolutions.com>
+ +M:    Andrea Parri <parri.andrea@gmail.com>
   M:    Will Deacon <will@kernel.org>
   M:    Peter Zijlstra <peterz@infradead.org>
   M:    Boqun Feng <boqun.feng@gmail.com>
@@@ -9333,7 -9334,7 +9333,7 @@@ M:      Nicholas Piggin <npiggin@gmail.com
   M:    David Howells <dhowells@redhat.com>
   M:    Jade Alglave <j.alglave@ucl.ac.uk>
   M:    Luc Maranget <luc.maranget@inria.fr>
- -M:    "Paul E. McKenney" <paulmck@linux.ibm.com>
+ +M:    "Paul E. McKenney" <paulmck@kernel.org>
   R:    Akira Yokosawa <akiyks@gmail.com>
   R:    Daniel Lustig <dlustig@nvidia.com>
   L:    linux-kernel@vger.kernel.org
@@@ -10016,8 -10017,8 +10016,8 @@@ L:   linux-media@vger.kernel.or
   L:    linux-renesas-soc@vger.kernel.org
   T:    git git://linuxtv.org/media_tree.git
   S:    Supported
- -F:    Documentation/devicetree/bindings/media/renesas,rcar-csi2.txt
- -F:    Documentation/devicetree/bindings/media/rcar_vin.txt
+ +F:    Documentation/devicetree/bindings/media/renesas,csi2.txt
+ +F:    Documentation/devicetree/bindings/media/renesas,vin.txt
   F:    drivers/media/platform/rcar-vin/
   
   MEDIA DRIVERS FOR RENESAS - VSP1
@@@ -10362,7 -10363,7 +10362,7 @@@ F:   drivers/platform/x86/mlx-platform.
   
   MEMBARRIER SUPPORT
   M:    Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
- -M:    "Paul E. McKenney" <paulmck@linux.ibm.com>
+ +M:    "Paul E. McKenney" <paulmck@kernel.org>
   L:    linux-kernel@vger.kernel.org
   S:    Supported
   F:    kernel/sched/membarrier.c
@@@ -10614,6 -10615,12 +10614,6 @@@ M:  Nicolas Ferre <nicolas.ferre@microch
   S:    Supported
   F:    drivers/power/reset/at91-sama5d2_shdwc.c
   
- -MICROCHIP SAMA5D2-COMPATIBLE PIOBU GPIO
- -M:    Andrei Stefanescu <andrei.stefanescu@microchip.com>
- -L:    linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
- -L:    linux-gpio@vger.kernel.org
- -F:    drivers/gpio/gpio-sama5d2-piobu.c
- -
   MICROCHIP SPI DRIVER
   M:    Nicolas Ferre <nicolas.ferre@microchip.com>
   S:    Supported
@@@ -10626,6 -10633,13 +10626,6 @@@ S:  Supporte
   F:    drivers/misc/atmel-ssc.c
   F:    include/linux/atmel-ssc.h
   
- -MICROCHIP TIMER COUNTER (TC) AND CLOCKSOURCE DRIVERS
- -M:    Nicolas Ferre <nicolas.ferre@microchip.com>
- -L:    linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
- -S:    Supported
- -F:    drivers/misc/atmel_tclib.c
- -F:    drivers/clocksource/tcb_clksrc.c
- -
   MICROCHIP USBA UDC DRIVER
   M:    Cristian Birsan <cristian.birsan@microchip.com>
   L:    linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
@@@ -11058,7 -11072,7 +11058,7 @@@ NET_FAILOVER MODUL
   M:    Sridhar Samudrala <sridhar.samudrala@intel.com>
   L:    netdev@vger.kernel.org
   S:    Supported
- -F:    driver/net/net_failover.c
+ +F:    drivers/net/net_failover.c
   F:    include/net/net_failover.h
   F:    Documentation/networking/net_failover.rst
   
@@@ -11130,7 -11144,6 +11130,7 @@@ L:   netdev@vger.kernel.or
   S:    Maintained
   W:    https://fedorahosted.org/dropwatch/
   F:    net/core/drop_monitor.c
+ +F:    include/uapi/linux/net_dropmon.h
   
   NETWORKING DRIVERS
   M:    "David S. Miller" <davem@davemloft.net>
@@@ -11269,7 -11282,6 +11269,7 @@@ M:   Aviad Yehezkel <aviadye@mellanox.com
   M:    Dave Watson <davejwatson@fb.com>
   M:    John Fastabend <john.fastabend@gmail.com>
   M:    Daniel Borkmann <daniel@iogearbox.net>
+ +M:    Jakub Kicinski <jakub.kicinski@netronome.com>
   L:    netdev@vger.kernel.org
   S:    Maintained
   F:    net/tls/*
@@@ -12578,6 -12590,7 +12578,7 @@@ PERFORMANCE EVENTS SUBSYSTE
   M:    Peter Zijlstra <peterz@infradead.org>
   M:    Ingo Molnar <mingo@redhat.com>
   M:    Arnaldo Carvalho de Melo <acme@kernel.org>
+ R:    Mark Rutland <mark.rutland@arm.com>
   R:    Alexander Shishkin <alexander.shishkin@linux.intel.com>
   R:    Jiri Olsa <jolsa@redhat.com>
   R:    Namhyung Kim <namhyung@kernel.org>
@@@ -12667,7 -12680,6 +12668,7 @@@ L:   linux-arm-kernel@lists.infradead.or
   L:    linux-gpio@vger.kernel.org
   S:    Supported
   F:    drivers/pinctrl/pinctrl-at91*
+ +F:    drivers/gpio/gpio-sama5d2-piobu.c
   
   PIN CONTROLLER - FREESCALE
   M:    Dong Aisheng <aisheng.dong@nxp.com>
@@@ -13465,7 -13477,7 +13466,7 @@@ S:   Orpha
   F:    drivers/net/wireless/ray*
   
   RCUTORTURE TEST FRAMEWORK
- -M:    "Paul E. McKenney" <paulmck@linux.ibm.com>
+ +M:    "Paul E. McKenney" <paulmck@kernel.org>
   M:    Josh Triplett <josh@joshtriplett.org>
   R:    Steven Rostedt <rostedt@goodmis.org>
   R:    Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
@@@ -13512,7 -13524,7 +13513,7 @@@ F:   arch/x86/include/asm/resctrl_sched.
   F:    Documentation/x86/resctrl*
   
   READ-COPY UPDATE (RCU)
- -M:    "Paul E. McKenney" <paulmck@linux.ibm.com>
+ +M:    "Paul E. McKenney" <paulmck@kernel.org>
   M:    Josh Triplett <josh@joshtriplett.org>
   R:    Steven Rostedt <rostedt@goodmis.org>
   R:    Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
@@@ -13670,7 -13682,7 +13671,7 @@@ F:   include/linux/reset-controller.
   RESTARTABLE SEQUENCES SUPPORT
   M:    Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
   M:    Peter Zijlstra <peterz@infradead.org>
- -M:    "Paul E. McKenney" <paulmck@linux.ibm.com>
+ +M:    "Paul E. McKenney" <paulmck@kernel.org>
   M:    Boqun Feng <boqun.feng@gmail.com>
   L:    linux-kernel@vger.kernel.org
   S:    Supported
@@@ -14005,12 -14017,6 +14006,12 @@@ F: drivers/media/common/saa7146
   F:    drivers/media/pci/saa7146/
   F:    include/media/drv-intf/saa7146*
   
+ +SAFESETID SECURITY MODULE
+ +M:     Micah Morton <mortonm@chromium.org>
+ +S:     Supported
+ +F:     security/safesetid/
+ +F:     Documentation/admin-guide/LSM/SafeSetID.rst
+ +
   SAMSUNG AUDIO (ASoC) DRIVERS
   M:    Krzysztof Kozlowski <krzk@kernel.org>
   M:    Sangbeom Kim <sbkim73@samsung.com>
@@@ -14101,8 -14107,6 +14102,8 @@@ M:   Kamil Konieczny <k.konieczny@partner
   L:    linux-crypto@vger.kernel.org
   L:    linux-samsung-soc@vger.kernel.org
   S:    Maintained
+ +F:    Documentation/devicetree/bindings/crypto/samsung-slimsss.txt
+ +F:    Documentation/devicetree/bindings/crypto/samsung-sss.txt
   F:    drivers/crypto/s5p-sss.c
   
   SAMSUNG S5P/EXYNOS4 SOC SERIES CAMERA SUBSYSTEM DRIVERS
@@@ -14123,8 -14127,6 +14124,8 @@@ T:   git git://git.kernel.org/pub/scm/lin
   F:    drivers/clk/samsung/
   F:    include/dt-bindings/clock/exynos*.h
   F:    Documentation/devicetree/bindings/clock/exynos*.txt
+ +F:    Documentation/devicetree/bindings/clock/samsung,s3c*
+ +F:    Documentation/devicetree/bindings/clock/samsung,s5p*
   
   SAMSUNG SPI DRIVERS
   M:    Kukjin Kim <kgene@kernel.org>
@@@ -14175,6 -14177,12 +14176,12 @@@ F: drivers/watchdog/sc1200wdt.
   SCHEDULER
   M:    Ingo Molnar <mingo@redhat.com>
   M:    Peter Zijlstra <peterz@infradead.org>
+ M:    Juri Lelli <juri.lelli@redhat.com> (SCHED_DEADLINE)
+ M:    Vincent Guittot <vincent.guittot@linaro.org> (SCHED_NORMAL)
+ R:    Dietmar Eggemann <dietmar.eggemann@arm.com> (SCHED_NORMAL)
+ R:    Steven Rostedt <rostedt@goodmis.org> (SCHED_FIFO/SCHED_RR)
+ R:    Ben Segall <bsegall@google.com> (CONFIG_CFS_BANDWIDTH)
+ R:    Mel Gorman <mgorman@suse.de> (CONFIG_NUMA_BALANCING)
   L:    linux-kernel@vger.kernel.org
   T:    git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched/core
   S:    Maintained
@@@ -14455,7 -14463,6 +14462,7 @@@ F:   drivers/net/phy/phylink.
   F:    drivers/net/phy/sfp*
   F:    include/linux/phylink.h
   F:    include/linux/sfp.h
+ +K:    phylink
   
   SGI GRU DRIVER
   M:    Dimitri Sivanich <sivanich@sgi.com>
@@@ -14710,7 -14717,7 +14717,7 @@@ F:   mm/sl?b
   
   SLEEPABLE READ-COPY UPDATE (SRCU)
   M:    Lai Jiangshan <jiangshanlai@gmail.com>
- -M:    "Paul E. McKenney" <paulmck@linux.ibm.com>
+ +M:    "Paul E. McKenney" <paulmck@kernel.org>
   M:    Josh Triplett <josh@joshtriplett.org>
   R:    Steven Rostedt <rostedt@goodmis.org>
   R:    Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
@@@ -14861,9 -14868,9 +14868,9 @@@ F:   include/linux/arm_sdei.
   F:    include/uapi/linux/arm_sdei.h
   
   SOFTWARE RAID (Multiple Disks) SUPPORT
- -M:    Shaohua Li <shli@kernel.org>
+ +M:    Song Liu <song@kernel.org>
   L:    linux-raid@vger.kernel.org
- -T:    git git://git.kernel.org/pub/scm/linux/kernel/git/shli/md.git
+ +T:    git git://git.kernel.org/pub/scm/linux/kernel/git/song/md.git
   S:    Supported
   F:    drivers/md/Makefile
   F:    drivers/md/Kconfig
@@@ -15545,7 -15552,6 +15552,7 @@@ F:   drivers/clk/clk-sc[mp]i.
   F:    drivers/cpufreq/sc[mp]i-cpufreq.c
   F:    drivers/firmware/arm_scpi.c
   F:    drivers/firmware/arm_scmi/
+ +F:    drivers/reset/reset-scmi.c
   F:    include/linux/sc[mp]i_protocol.h
   
   SYSTEM RESET/SHUTDOWN DRIVERS
@@@ -15854,7 -15860,6 +15861,7 @@@ F:   drivers/firmware/ti_sci
   F:    include/linux/soc/ti/ti_sci_protocol.h
   F:    Documentation/devicetree/bindings/soc/ti/sci-pm-domain.txt
   F:    drivers/soc/ti/ti_sci_pm_domains.c
+ +F:    include/dt-bindings/soc/ti,sci_pm_domain.h
   F:    Documentation/devicetree/bindings/reset/ti,sci-reset.txt
   F:    Documentation/devicetree/bindings/clock/ti,sci-clk.txt
   F:    drivers/clk/keystone/sci-clk.c
@@@ -16080,7 -16085,7 +16087,7 @@@ S:   Maintaine
   F:    drivers/net/ethernet/ti/netcp*
   
   TI PCM3060 ASoC CODEC DRIVER
- -M:    Kirill Marinushkin <kmarinushkin@birdec.tech>
+ +M:    Kirill Marinushkin <kmarinushkin@birdec.com>
   L:    alsa-devel@alsa-project.org (moderated for non-subscribers)
   S:    Maintained
   F:    Documentation/devicetree/bindings/sound/pcm3060.txt
@@@ -16209,7 -16214,7 +16216,7 @@@ F:   drivers/platform/x86/topstar-laptop.
   
   TORTURE-TEST MODULES
   M:    Davidlohr Bueso <dave@stgolabs.net>
- -M:    "Paul E. McKenney" <paulmck@linux.ibm.com>
+ +M:    "Paul E. McKenney" <paulmck@kernel.org>
   M:    Josh Triplett <josh@joshtriplett.org>
   L:    linux-kernel@vger.kernel.org
   S:    Supported
@@@ -17235,7 -17240,6 +17242,7 @@@ F:   Documentation/power/regulator
   F:    drivers/regulator/
   F:    include/dt-bindings/regulator/
   F:    include/linux/regulator/
+ +K:    regulator_get_optional
   
   VRF
   M:    David Ahern <dsa@cumulusnetworks.com>
@@@ -17557,6 -17561,7 +17564,6 @@@ M:   Jakub Kicinski <jakub.kicinski@netro
   M:    Jesper Dangaard Brouer <hawk@kernel.org>
   M:    John Fastabend <john.fastabend@gmail.com>
   L:    netdev@vger.kernel.org
- -L:    xdp-newbies@vger.kernel.org
   L:    bpf@vger.kernel.org
   S:    Supported
   F:    net/core/xdp.c
@@@ -17672,7 -17677,8 +17679,7 @@@ F:   include/uapi/linux/dqblk_xfs.
   F:    include/uapi/linux/fsmap.h
   
   XILINX AXI ETHERNET DRIVER
- -M:    Anirudha Sarangi <anirudh@xilinx.com>
- -M:    John Linn <John.Linn@xilinx.com>
+ +M:    Radhey Shyam Pandey <radhey.shyam.pandey@xilinx.com>
   S:    Maintained
   F:    drivers/net/ethernet/xilinx/xilinx_axienet*
   
diff --combined arch/Kconfig

index 71d9ae0c0ea16ea8990e1a81841d5bb31b77a07d,c7efbc018f4fa800ca08c752ea630c2238eeccae..6baedab10dcaa14130b15907313200a6f04b38b4
--- 1/arch/Kconfig
--- 2/arch/Kconfig
+++ b/arch/Kconfig
@@@ -18,9 -18,6 +18,9 @@@ config KEXEC_COR
         select CRASH_CORE
         bool
   
+ +config KEXEC_ELF
+ +      bool
+ +
   config HAVE_IMA_KEXEC
         bool
   
@@@ -106,7 -103,7 +106,7 @@@ config STATIC_KEYS_SELFTES
   config OPTPROBES
         def_bool y
         depends on KPROBES && HAVE_OPTPROBES
-       select TASKS_RCU if PREEMPT
+       select TASKS_RCU if PREEMPTION
   
   config KPROBES_ON_FTRACE
         def_bool y
@@@ -928,20 -925,6 +928,20 @@@ config LOCK_EVENT_COUNT
           the chance of application behavior change because of timing
           differences. The counts are reported via debugfs.
   
+ +# Select if the architecture has support for applying RELR relocations.
+ +config ARCH_HAS_RELR
+ +      bool
+ +
+ +config RELR
+ +      bool "Use RELR relocation packing"
+ +      depends on ARCH_HAS_RELR && TOOLS_SUPPORT_RELR
+ +      default y
+ +      help
+ +        Store the kernel's dynamic relocations in the RELR relocation packing
+ +        format. Requires a compatible linker (LLD supports this feature), as
+ +        well as compatible NM and OBJCOPY utilities (llvm-nm and llvm-objcopy
+ +        are compatible).
+ +
   source "kernel/gcov/Kconfig"
   
   source "scripts/gcc-plugins/Kconfig"
diff --combined arch/ia64/Kconfig

index 13d49c232556ce9e3bbdf1862fc3ca388d1b2e6a,997baba02b70e7e427511c324e4f8cb9dae5fef9..9711cf73092948678423b9d474030eecd24d4032
--- 1/arch/ia64/Kconfig
--- 2/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@@ -10,14 -10,12 +10,14 @@@ config IA6
         bool
         select ARCH_MIGHT_HAVE_PC_PARPORT
         select ARCH_MIGHT_HAVE_PC_SERIO
- -      select ACPI if (!IA64_HP_SIM)
- -      select ARCH_SUPPORTS_ACPI if (!IA64_HP_SIM)
+ +      select ACPI
+ +      select ACPI_NUMA if NUMA
+ +      select ARCH_SUPPORTS_ACPI
         select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI
         select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI
- -      select FORCE_PCI if (!IA64_HP_SIM)
+ +      select FORCE_PCI
         select PCI_DOMAINS if PCI
+ +      select PCI_MSI
         select PCI_SYSCALL if PCI
         select HAVE_UNSTABLE_SCHED_CLOCK
         select HAVE_EXIT_THREAD
@@@ -32,8 -30,8 +32,8 @@@
         select HAVE_ARCH_TRACEHOOK
         select HAVE_MEMBLOCK_NODE_MAP
         select HAVE_VIRT_CPU_ACCOUNTING
- -      select ARCH_HAS_DMA_COHERENT_TO_PFN if SWIOTLB
- -      select ARCH_HAS_SYNC_DMA_FOR_CPU if SWIOTLB
+ +      select ARCH_HAS_DMA_COHERENT_TO_PFN
+ +      select ARCH_HAS_SYNC_DMA_FOR_CPU
         select VIRT_TO_BUS
         select GENERIC_IRQ_PROBE
         select GENERIC_PENDING_IRQ if SMP
@@@ -47,7 -45,6 +47,7 @@@
         select ARCH_THREAD_STACK_ALLOCATOR
         select ARCH_CLOCKSOURCE_DATA
         select GENERIC_TIME_VSYSCALL
+ +      select SWIOTLB
         select SYSCTL_ARCH_UNALIGN_NO_WARN
         select HAVE_MOD_ARCH_SPECIFIC
         select MODULES_USE_ELF_RELA
@@@ -55,7 -52,6 +55,7 @@@
         select HAVE_ARCH_AUDITSYSCALL
         select NEED_DMA_MAP_STATE
         select NEED_SG_DMA_LENGTH
+ +      select NUMA if !FLATMEM
         default y
         help
           The Itanium Processor Family is Intel's 64-bit successor to
@@@ -70,6 -66,7 +70,6 @@@ config 64BI
   
   config ZONE_DMA32
         def_bool y
- -      depends on !IA64_SGI_SN2
   
   config QUICKLIST
         bool
@@@ -123,6 -120,87 +123,6 @@@ config AUDIT_ARC
         bool
         default y
   
- -choice
- -      prompt "System type"
- -      default IA64_GENERIC
- -
- -config IA64_GENERIC
- -      bool "generic"
- -      select NUMA
- -      select ACPI_NUMA
- -      select SWIOTLB
- -      select PCI_MSI
- -      help
- -        This selects the system type of your hardware.  A "generic" kernel
- -        will run on any supported IA-64 system.  However, if you configure
- -        a kernel for your specific system, it will be faster and smaller.
- -
- -        generic               For any supported IA-64 system
- -        DIG-compliant         For DIG ("Developer's Interface Guide") compliant systems
- -        DIG+Intel+IOMMU       For DIG systems with Intel IOMMU
- -        HP-zx1/sx1000         For HP systems
- -        HP-zx1/sx1000+swiotlb For HP systems with (broken) DMA-constrained devices.
- -        SGI-SN2               For SGI Altix systems
- -        SGI-UV                For SGI UV systems
- -        Ski-simulator         For the HP simulator <http://www.hpl.hp.com/research/linux/ski/>
- -
- -        If you don't know what to do, choose "generic".
- -
- -config IA64_DIG
- -      bool "DIG-compliant"
- -      select SWIOTLB
- -
- -config IA64_DIG_VTD
- -      bool "DIG+Intel+IOMMU"
- -      select INTEL_IOMMU
- -      select PCI_MSI
- -
- -config IA64_HP_ZX1
- -      bool "HP-zx1/sx1000"
- -      help
- -        Build a kernel that runs on HP zx1 and sx1000 systems.  This adds
- -        support for the HP I/O MMU.
- -
- -config IA64_HP_ZX1_SWIOTLB
- -      bool "HP-zx1/sx1000 with software I/O TLB"
- -      select SWIOTLB
- -      help
- -        Build a kernel that runs on HP zx1 and sx1000 systems even when they
- -        have broken PCI devices which cannot DMA to full 32 bits.  Apart
- -        from support for the HP I/O MMU, this includes support for the software
- -        I/O TLB, which allows supporting the broken devices at the expense of
- -        wasting some kernel memory (about 2MB by default).
- -
- -config IA64_SGI_SN2
- -      bool "SGI-SN2"
- -      select NUMA
- -      select ACPI_NUMA
- -      help
- -        Selecting this option will optimize the kernel for use on sn2 based
- -        systems, but the resulting kernel binary will not run on other
- -        types of ia64 systems.  If you have an SGI Altix system, it's safe
- -        to select this option.  If in doubt, select ia64 generic support
- -        instead.
- -
- -config IA64_SGI_UV
- -      bool "SGI-UV"
- -      select NUMA
- -      select ACPI_NUMA
- -      select SWIOTLB
- -      help
- -        Selecting this option will optimize the kernel for use on UV based
- -        systems, but the resulting kernel binary will not run on other
- -        types of ia64 systems.  If you have an SGI UV system, it's safe
- -        to select this option.  If in doubt, select ia64 generic support
- -        instead.
- -
- -config IA64_HP_SIM
- -      bool "Ski-simulator"
- -      select SWIOTLB
- -      depends on !PM
- -
- -endchoice
- -
   choice
         prompt "Processor type"
         default ITANIUM
@@@ -174,7 -252,14 +174,7 @@@ config IA64_PAGE_SIZE_64K
   
   endchoice
   
- -if IA64_HP_SIM
- -config HZ
- -      default 32
- -endif
- -
- -if !IA64_HP_SIM
   source "kernel/Kconfig.hz"
- -endif
   
   config IA64_BRL_EMU
         bool
@@@ -187,26 -272,17 +187,26 @@@ config IA64_L1_CACHE_SHIF
         default "7" if MCKINLEY
         default "6" if ITANIUM
   
+ +config IA64_SGI_UV
+ +      bool "SGI-UV support"
+ +      help
+ +        Selecting this option will add specific support for running on SGI
+ +        UV based systems.  If you have an SGI UV system or are building a
+ +        distro kernel, select this option.
+ +
+ +config IA64_HP_SBA_IOMMU
+ +      bool "HP SBA IOMMU support"
+ +      default y
+ +      help
+ +        Say Y here to add support for the SBA IOMMU found on HP zx1 and
+ +        sx1000 systems.  If you're unsure, answer Y.
+ +
   config IA64_CYCLONE
         bool "Cyclone (EXA) Time Source support"
         help
           Say Y here to enable support for IBM EXA Cyclone time source.
           If you're unsure, answer N.
   
- -config IOSAPIC
- -      bool
- -      depends on !IA64_HP_SIM
- -      default y
- -
   config FORCE_MAX_ZONEORDER
         int "MAX_ORDER (11 - 17)"  if !HUGETLB_PAGE
         range 11 17  if !HUGETLB_PAGE
@@@ -305,12 -381,15 +305,13 @@@ config ARCH_SPARSEMEM_ENABL
         select SPARSEMEM_VMEMMAP_ENABLE
   
   config ARCH_DISCONTIGMEM_DEFAULT
- -      def_bool y if (IA64_SGI_SN2 || IA64_GENERIC || IA64_HP_ZX1 || IA64_HP_ZX1_SWIOTLB)
+ +      def_bool y
         depends on ARCH_DISCONTIGMEM_ENABLE
   
   config NUMA
         bool "NUMA support"
- -      depends on !IA64_HP_SIM && !FLATMEM
- -      default y if IA64_SGI_SN2
- -      select ACPI_NUMA if ACPI
+ +      depends on !FLATMEM
+       select SMP
         help
           Say Y to compile the kernel to support NUMA (Non-Uniform Memory
           Access).  This option is for configuring high-end multiprocessor
@@@ -331,7 -410,7 +332,7 @@@ config NODES_SHIF
   config VIRTUAL_MEM_MAP
         bool "Virtual mem map"
         depends on !SPARSEMEM
- -      default y if !IA64_HP_SIM
+ +      default y
         help
           Say Y to compile the kernel with support for a virtual mem map.
           This code also only takes effect if a memory hole of greater than
@@@ -394,6 -473,9 +395,6 @@@ config IA64_MC_ERR_INJEC
   
           If you're unsure, do not select this option.
   
- -config SGI_SN
- -      def_bool y if (IA64_SGI_SN2 || IA64_GENERIC)
- -
   config IA64_ESI
         bool "ESI (Extensible SAL Interface) support"
         help
@@@ -412,9 -494,11 +413,9 @@@ config IA64_HP_AML_NF
           the "force" module parameter, e.g., with the "aml_nfw.force"
           kernel command line option.
   
- -source "drivers/sn/Kconfig"
- -
   config KEXEC
         bool "kexec system call"
- -      depends on !IA64_HP_SIM && (!SMP || HOTPLUG_CPU)
+ +      depends on !SMP || HOTPLUG_CPU
         select KEXEC_CORE
         help
           kexec is a system call that implements the ability to shutdown your
@@@ -432,7 -516,7 +433,7 @@@
   
   config CRASH_DUMP
           bool "kernel crash dumps"
- -        depends on IA64_MCA_RECOVERY && !IA64_HP_SIM && (!SMP || HOTPLUG_CPU)
+ +        depends on IA64_MCA_RECOVERY && (!SMP || HOTPLUG_CPU)
           help
             Generate crash dump after being started by kexec.
   
@@@ -454,6 -538,8 +455,6 @@@ endi
   
   endmenu
   
- -source "arch/ia64/hp/sim/Kconfig"
- -
   config MSPEC
         tristate "Memory special operations driver"
         depends on IA64
diff --combined arch/x86/entry/entry_64.S

index be9ca198c581aea7ed29f4417aae9c1c1b835473,9701464341e49b3af4a51c9505388510babb7dff..af077ded196966256792af01507427fc800cf32e
--- 1/arch/x86/entry/entry_64.S
--- 2/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@@ -519,7 -519,7 +519,7 @@@ ENTRY(interrupt_entry
         testb   $3, CS-ORIG_RAX+8(%rsp)
         jz      1f
         SWAPGS
- -
+ +      FENCE_SWAPGS_USER_ENTRY
         /*
          * Switch to the thread stack. The IRET frame and orig_ax are
          * on the stack, as well as the return address. RDI..R12 are
@@@ -549,10 -549,8 +549,10 @@@
         UNWIND_HINT_FUNC
   
         movq    (%rdi), %rdi
+ +      jmp     2f
   1:
- -
+ +      FENCE_SWAPGS_KERNEL_ENTRY
+ +2:
         PUSH_AND_CLEAR_REGS save_ret=1
         ENCODE_FRAME_POINTER 8
   
@@@ -664,7 -662,7 +664,7 @@@ GLOBAL(swapgs_restore_regs_and_return_t
   
   /* Returning to kernel space */
   retint_kernel:
- #ifdef CONFIG_PREEMPT
+ #ifdef CONFIG_PREEMPTION
         /* Interrupts are off */
         /* Check if we need preemption */
         btl     $9, EFLAGS(%rsp)                /* were interrupts off? */
@@@ -1115,7 -1113,7 +1115,7 @@@ ENTRY(xen_do_hypervisor_callback)               /* d
         call    xen_evtchn_do_upcall
         LEAVE_IRQ_STACK
   
- #ifndef CONFIG_PREEMPT
+ #ifndef CONFIG_PREEMPTION
         call    xen_maybe_preempt_hcall
   #endif
         jmp     error_exit
@@@ -1240,13 -1238,6 +1240,13 @@@ ENTRY(paranoid_entry
          */
         SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14
   
+ +      /*
+ +       * The above SAVE_AND_SWITCH_TO_KERNEL_CR3 macro doesn't do an
+ +       * unconditional CR3 write, even in the PTI case.  So do an lfence
+ +       * to prevent GS speculation, regardless of whether PTI is enabled.
+ +       */
+ +      FENCE_SWAPGS_KERNEL_ENTRY
+ +
         ret
   END(paranoid_entry)
   
@@@ -1297,7 -1288,6 +1297,7 @@@ ENTRY(error_entry
          * from user mode due to an IRET fault.
          */
         SWAPGS
+ +      FENCE_SWAPGS_USER_ENTRY
         /* We have user CR3.  Change to kernel CR3. */
         SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
   
@@@ -1311,8 -1301,6 +1311,8 @@@
         pushq   %r12
         ret
   
+ +.Lerror_entry_done_lfence:
+ +      FENCE_SWAPGS_KERNEL_ENTRY
   .Lerror_entry_done:
         ret
   
@@@ -1330,7 -1318,7 +1330,7 @@@
         cmpq    %rax, RIP+8(%rsp)
         je      .Lbstep_iret
         cmpq    $.Lgs_change, RIP+8(%rsp)
- -      jne     .Lerror_entry_done
+ +      jne     .Lerror_entry_done_lfence
   
         /*
          * hack: .Lgs_change can fail with user gsbase.  If this happens, fix up
@@@ -1338,7 -1326,6 +1338,7 @@@
          * .Lgs_change's error handler with kernel gsbase.
          */
         SWAPGS
+ +      FENCE_SWAPGS_USER_ENTRY
         SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
         jmp .Lerror_entry_done
   
@@@ -1353,7 -1340,6 +1353,7 @@@
          * gsbase and CR3.  Switch to kernel gsbase and CR3:
          */
         SWAPGS
+ +      FENCE_SWAPGS_USER_ENTRY
         SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
   
         /*
@@@ -1445,7 -1431,6 +1445,7 @@@ ENTRY(nmi
   
         swapgs
         cld
+ +      FENCE_SWAPGS_USER_ENTRY
         SWITCH_TO_KERNEL_CR3 scratch_reg=%rdx
         movq    %rsp, %rdx
         movq    PER_CPU_VAR(cpu_current_top_of_stack), %rsp
diff --combined arch/x86/kernel/cpu/amd.c

index 68c363c341bf2a3794ec26f29df4dd28281d4dc5,ceeb8afc7cf3a43d998ce581849729d235847497..7d6e0efcc2db3cf909c2fd868311ac496b9a1070
--- 1/arch/x86/kernel/cpu/amd.c
--- 2/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@@ -8,6 -8,7 +8,7 @@@
   #include <linux/sched.h>
   #include <linux/sched/clock.h>
   #include <linux/random.h>
+ #include <linux/topology.h>
   #include <asm/processor.h>
   #include <asm/apic.h>
   #include <asm/cacheinfo.h>
@@@ -804,64 -805,6 +805,64 @@@ static void init_amd_ln(struct cpuinfo_
         msr_set_bit(MSR_AMD64_DE_CFG, 31);
   }
   
+ +static bool rdrand_force;
+ +
+ +/*
+ + * Early "rdrand=" command line handler.  The only accepted value is
+ + * "force", which sets rdrand_force; a missing argument or any other
+ + * value is rejected with -EINVAL.
+ + */
+ +static int __init rdrand_cmdline(char *str)
+ +{
+ +      if (!str || strcmp(str, "force"))
+ +              return -EINVAL;
+ +
+ +      rdrand_force = true;
+ +      return 0;
+ +}
+ +early_param("rdrand", rdrand_cmdline);
+ +
+ +/*
+ + * Hide RDRAND from CPUID on @c unless "rdrand=force" was given.
+ + *
+ + * Does nothing when CONFIG_PM_SLEEP is disabled, when CPUID leaf 1 does
+ + * not report bit 30 of ECX, or when rdrand_force is set.  Otherwise bit 62
+ + * of MSR_AMD64_CPUID_FN_1 is cleared and CPUID is read back: if bit 30 is
+ + * still set only a message is logged, else X86_FEATURE_RDRAND is cleared
+ + * on @c as well.
+ + */
+ +static void clear_rdrand_cpuid_bit(struct cpuinfo_x86 *c)
+ +{
+ +      /*
+ +       * Saving of the MSR used to hide the RDRAND support during
+ +       * suspend/resume is done by arch/x86/power/cpu.c, which is
+ +       * dependent on CONFIG_PM_SLEEP.
+ +       */
+ +      if (!IS_ENABLED(CONFIG_PM_SLEEP))
+ +              return;
+ +
+ +      /*
+ +       * The nordrand option can clear X86_FEATURE_RDRAND, so check for
+ +       * RDRAND support using the CPUID function directly.
+ +       */
+ +      if (!(cpuid_ecx(1) & BIT(30)) || rdrand_force)
+ +              return;
+ +
+ +      /* Request the hiding; the readback below checks that it took effect. */
+ +      msr_clear_bit(MSR_AMD64_CPUID_FN_1, 62);
+ +
+ +      /*
+ +       * Verify that the CPUID change has occurred in case the kernel is
+ +       * running virtualized and the hypervisor doesn't support the MSR.
+ +       */
+ +      if (cpuid_ecx(1) & BIT(30)) {
+ +              pr_info_once("BIOS may not properly restore RDRAND after suspend, but hypervisor does not support hiding RDRAND via CPUID.\n");
+ +              return;
+ +      }
+ +
+ +      clear_cpu_cap(c, X86_FEATURE_RDRAND);
+ +      pr_info_once("BIOS may not properly restore RDRAND after suspend, hiding RDRAND via CPUID. Use rdrand=force to reenable.\n");
+ +}
+ +
+ +/*
+ + * Init hook dispatched from init_amd() for case 0x16.  Its only work is
+ + * the RDRAND suspend/resume workaround below.
+ + */
+ +static void init_amd_jg(struct cpuinfo_x86 *c)
+ +{
+ +      /*
+ +       * Some BIOS implementations do not restore proper RDRAND support
+ +       * across suspend and resume. Check on whether to hide the RDRAND
+ +       * instruction support via CPUID.
+ +       */
+ +      clear_rdrand_cpuid_bit(c);
+ +}
+ +
   static void init_amd_bd(struct cpuinfo_x86 *c)
   {
         u64 value;
@@@ -876,19 -819,16 +877,23 @@@
                         wrmsrl_safe(MSR_F15H_IC_CFG, value);
                 }
         }
+ +
+ +      /*
+ +       * Some BIOS implementations do not restore proper RDRAND support
+ +       * across suspend and resume. Check on whether to hide the RDRAND
+ +       * instruction support via CPUID.
+ +       */
+ +      clear_rdrand_cpuid_bit(c);
   }
   
   static void init_amd_zn(struct cpuinfo_x86 *c)
   {
         set_cpu_cap(c, X86_FEATURE_ZEN);
   
+ #ifdef CONFIG_NUMA
+       node_reclaim_distance = 32;
+ #endif
+ 
         /*
          * Fix erratum 1076: CPB feature bit not being set in CPUID.
          * Always set it, except when running under a hypervisor.
@@@ -925,7 -865,6 +930,7 @@@ static void init_amd(struct cpuinfo_x8
         case 0x10: init_amd_gh(c); break;
         case 0x12: init_amd_ln(c); break;
         case 0x15: init_amd_bd(c); break;
+ +      case 0x16: init_amd_jg(c); break;
         case 0x17: init_amd_zn(c); break;
         }
   
diff --combined arch/x86/kernel/kvm.c

index 4ab377c9fffede8af8c93b620bdb9d90803fd353,3d07f84c4846943f122d0345e540565e7954f207..4cc967178bf952ca32b01059505b588c3439db1d
--- 1/arch/x86/kernel/kvm.c
--- 2/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@@ -308,10 -308,13 +308,10 @@@ static notrace void kvm_guest_apic_eoi_
   
   static void kvm_guest_cpu_init(void)
   {
- -      if (!kvm_para_available())
- -              return;
- -
         if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) {
                 u64 pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason));
   
- #ifdef CONFIG_PREEMPT
+ #ifdef CONFIG_PREEMPTION
                 pa |= KVM_ASYNC_PF_SEND_ALWAYS;
   #endif
                 pa |= KVM_ASYNC_PF_ENABLED;
@@@ -622,6 -625,9 +622,6 @@@ static void __init kvm_guest_init(void
   {
         int i;
   
- -      if (!kvm_para_available())
- -              return;
- -
         paravirt_ops_setup();
         register_reboot_notifier(&kvm_pv_reboot_nb);
         for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++)
@@@ -842,6 -848,8 +842,6 @@@ asm
    */
   void __init kvm_spinlock_init(void)
   {
- -      if (!kvm_para_available())
- -              return;
         /* Does host kernel support KVM_FEATURE_PV_UNHALT? */
         if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT))
                 return;
diff --combined include/linux/rcupdate.h

index 80d6056f58556590af7adb5978daf40692cca87e,c4f76a310443f306e930a2da55e4604b021e6263..75a2eded7aa2ce6973622ecfd5a2a00772f07270
--- 1/include/linux/rcupdate.h
--- 2/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@@ -221,7 -221,6 +221,7 @@@ int debug_lockdep_rcu_enabled(void)
   int rcu_read_lock_held(void);
   int rcu_read_lock_bh_held(void);
   int rcu_read_lock_sched_held(void);
+ +int rcu_read_lock_any_held(void);
   
   #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
   
@@@ -242,12 -241,6 +242,12 @@@ static inline int rcu_read_lock_sched_h
   {
         return !preemptible();
   }
+ +
+ +/*
+ + * Variant for builds without CONFIG_DEBUG_LOCK_ALLOC (see the enclosing
+ + * #else): returns nonzero whenever preemptible() is false, the same test
+ + * rcu_read_lock_sched_held() uses in this branch.
+ + */
+ +static inline int rcu_read_lock_any_held(void)
+ +{
+ +      return !preemptible();
+ +}
+ +
   #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
   
   #ifdef CONFIG_PROVE_RCU
@@@ -483,7 -476,7 +483,7 @@@ do {                                                                             
    * The no-tracing version of rcu_dereference_raw() must not call
    * rcu_read_lock_held().
    */
- -#define rcu_dereference_raw_notrace(p) __rcu_dereference_check((p), 1, __rcu)
+ +#define rcu_dereference_raw_check(p) __rcu_dereference_check((p), 1, __rcu)
   
   /**
    * rcu_dereference_protected() - fetch RCU pointer when updates prevented
@@@ -585,7 -578,7 +585,7 @@@
    *
    * In non-preemptible RCU implementations (TREE_RCU and TINY_RCU),
    * it is illegal to block while in an RCU read-side critical section.
-  * In preemptible RCU implementations (PREEMPT_RCU) in CONFIG_PREEMPT
+  * In preemptible RCU implementations (PREEMPT_RCU) in CONFIG_PREEMPTION
    * kernel builds, RCU read-side critical sections may be preempted,
    * but explicit blocking is illegal.  Finally, in preemptible RCU
    * implementations in real-time (with -rt patchset) kernel builds, RCU
diff --combined include/linux/topology.h

index 2a19d196af2886899a552dda7c68edaf1015dc66,579522ec446c3aa76a9c5f0d5c2f90543b9d407d..eb2fe6edd73c80ad16ddad96fd9c10777b72d051
--- 1/include/linux/topology.h
--- 2/include/linux/topology.h
+++ b/include/linux/topology.h
@@@ -27,7 -27,6 +27,7 @@@
   #ifndef _LINUX_TOPOLOGY_H
   #define _LINUX_TOPOLOGY_H
   
+ +#include <linux/arch_topology.h>
   #include <linux/cpumask.h>
   #include <linux/bitops.h>
   #include <linux/mmzone.h>
@@@ -60,6 -59,20 +60,20 @@@ int arch_update_cpu_topology(void)
    */
   #define RECLAIM_DISTANCE 30
   #endif
+ 
+ /*
+  * The following tunable allows platforms to override the default node
+  * reclaim distance (RECLAIM_DISTANCE) if remote memory accesses are
+  * sufficiently fast that the default value actually hurts
+  * performance.
+  *
+  * AMD EPYC machines use this because even though the 2-hop distance
+  * is 32 (3.2x slower than a local memory access) performance actually
+  * *improves* if allowed to reclaim memory and load balance tasks
+  * between NUMA nodes 2-hops apart.
+  */
+ extern int __read_mostly node_reclaim_distance;
+ 
   #ifndef PENALTY_FOR_NODE_WITH_CPUS
   #define PENALTY_FOR_NODE_WITH_CPUS    (1)
   #endif
diff --combined init/Kconfig

index d96127ebc44e08526f0be1586098d2ecd52e7104,ac285cfa78b6ca22254bcb6868e9abb3eacd7bdb..ec1021fd33712afdc98b5ff454518320d858d8f5
--- 1/init/Kconfig
--- 2/init/Kconfig
+++ b/init/Kconfig
@@@ -30,9 -30,6 +30,9 @@@ config CC_CAN_LIN
   config CC_HAS_ASM_GOTO
         def_bool $(success,$(srctree)/scripts/gcc-goto.sh $(CC))
   
+ +config TOOLS_SUPPORT_RELR
+ +      def_bool $(success,env "CC=$(CC)" "LD=$(LD)" "NM=$(NM)" "OBJCOPY=$(OBJCOPY)" $(srctree)/scripts/tools-support-relr.sh)
+ +
   config CC_HAS_WARN_MAYBE_UNINITIALIZED
         def_bool $(cc-option,-Wmaybe-uninitialized)
         help
@@@ -931,6 -928,28 +931,28 @@@ config RT_GROUP_SCHE
   
   endif #CGROUP_SCHED
   
+ config UCLAMP_TASK_GROUP
+       bool "Utilization clamping per group of tasks"
+       depends on CGROUP_SCHED
+       depends on UCLAMP_TASK
+       default n
+       help
+         This feature enables the scheduler to track the clamped utilization
+         of each CPU based on RUNNABLE tasks currently scheduled on that CPU.
+ 
+         When this option is enabled, the user can specify a min and max
+         CPU bandwidth which is allowed for each single task in a group.
+         The max bandwidth allows to clamp the maximum frequency a task
+         can use, while the min bandwidth allows to define a minimum
+         frequency a task will always use.
+ 
+         When task group based utilization clamping is enabled, an eventually
+         specified task-specific clamp value is constrained by the cgroup
+         specified clamp value. Both minimum and maximum task clamping cannot
+         be bigger than the corresponding clamping defined at task group level.
+ 
+         If in doubt, say N.
+ 
   config CGROUP_PIDS
         bool "PIDs controller"
         help
diff --combined kernel/cgroup/cgroup.c

index 8be1da1ebd9a4f3d4ee3f6038a85e18e8d5fa685,4b5bc452176ca83f807fb5055ecff7ea15b01c64..a7ce73a2c40198e8cdd53df154db0d16af89daaa
--- 1/kernel/cgroup/cgroup.c
--- 2/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@@ -1891,7 -1891,7 +1891,7 @@@ static int cgroup_reconfigure(struct fs
    */
   static bool use_task_css_set_links __read_mostly;
   
- static void cgroup_enable_task_cg_lists(void)
+ void cgroup_enable_task_cg_lists(void)
   {
         struct task_struct *p, *g;
   
@@@ -5255,16 -5255,8 +5255,16 @@@ static struct cgroup *cgroup_create(str
          * if the parent has to be frozen, the child has too.
          */
         cgrp->freezer.e_freeze = parent->freezer.e_freeze;
- -      if (cgrp->freezer.e_freeze)
+ +      if (cgrp->freezer.e_freeze) {
+ +              /*
+ +               * Set the CGRP_FREEZE flag, so when a process will be
+ +               * attached to the child cgroup, it will become frozen.
+ +               * At this point the new cgroup is unpopulated, so we can
+ +               * consider it frozen immediately.
+ +               */
+ +              set_bit(CGRP_FREEZE, &cgrp->flags);
                 set_bit(CGRP_FROZEN, &cgrp->flags);
+ +      }
   
         spin_lock_irq(&css_set_lock);
         for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
diff --combined kernel/events/core.c

index 2aad959e6def727accc954163a3d8f532e3fd83b,c1f52a749db25cd89f5ccc7cf5e6d0237677adae..1c414b8866b454aed555aafdf34e823256f0c8ba
--- 1/kernel/events/core.c
--- 2/kernel/events/core.c
+++ b/kernel/events/core.c
@@@ -1887,89 -1887,6 +1887,89 @@@ list_del_event(struct perf_event *event
         ctx->generation++;
   }
   
+ +/*
+ + * Ask @event's PMU whether @aux_event can act as its AUX output target.
+ + * Returns 0 when @aux_event fails has_aux() or the PMU provides no
+ + * ->aux_output_match() callback; otherwise returns the callback's result.
+ + */
+ +static int
+ +perf_aux_output_match(struct perf_event *event, struct perf_event *aux_event)
+ +{
+ +      if (has_aux(aux_event) && event->pmu->aux_output_match)
+ +              return event->pmu->aux_output_match(aux_event);
+ +
+ +      return 0;
+ +}
+ +
+ +static void put_event(struct perf_event *event);
+ +static void event_sched_out(struct perf_event *event,
+ +                          struct perf_cpu_context *cpuctx,
+ +                          struct perf_event_context *ctx);
+ +
+ +/*
+ + * Tear down the aux_event links that involve @event.
+ + *
+ + * If @event itself points at an AUX event, that link is cleared and the
+ + * reference it held is dropped.  Otherwise @event is treated as a possible
+ + * AUX event: every sibling of event->group_leader whose aux_event is @event
+ + * has its link cleared, the reference on @event taken for that link is
+ + * dropped, and the sibling is scheduled out and moved to ERROR state.
+ + */
+ +static void perf_put_aux_event(struct perf_event *event)
+ +{
+ +      struct perf_event_context *ctx = event->ctx;
+ +      struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+ +      struct perf_event *iter;
+ +
+ +      /*
+ +       * If event uses aux_event tear down the link
+ +       */
+ +      if (event->aux_event) {
+ +              iter = event->aux_event;
+ +              event->aux_event = NULL;
+ +              put_event(iter);
+ +              return;
+ +      }
+ +
+ +      /*
+ +       * If the event is an aux_event, tear down all links to
+ +       * it from other events.
+ +       */
+ +      for_each_sibling_event(iter, event->group_leader) {
+ +              if (iter->aux_event != event)
+ +                      continue;
+ +
+ +              iter->aux_event = NULL;
+ +              put_event(event);
+ +
+ +              /*
+ +               * If it's ACTIVE, schedule it out and put it into ERROR
+ +               * state so that we don't try to schedule it again. Note
+ +               * that perf_event_enable() will clear the ERROR status.
+ +               *
+ +               * The state change must target the sibling (iter) that
+ +               * just lost its AUX event, not the AUX event itself.
+ +               */
+ +              event_sched_out(iter, cpuctx, ctx);
+ +              perf_event_set_state(iter, PERF_EVENT_STATE_ERROR);
+ +      }
+ +}
+ +
+ +/*
+ + * Try to link @event to @group_leader as its AUX event.
+ + *
+ + * Returns 1 after taking a reference on @group_leader and storing it in
+ + * event->aux_event.  Returns 0, leaving @event untouched, when there is no
+ + * group leader, when perf_aux_output_match() rejects the pair, or when the
+ + * leader's refcount has already dropped to zero.
+ + */
+ +static int perf_get_aux_event(struct perf_event *event,
+ +                            struct perf_event *group_leader)
+ +{
+ +      /*
+ +       * Our group leader must be an aux event if we want to be
+ +       * an aux_output. This way, the aux event will precede its
+ +       * aux_output events in the group, and therefore will always
+ +       * schedule first.
+ +       */
+ +      if (!group_leader)
+ +              return 0;
+ +
+ +      if (!perf_aux_output_match(event, group_leader))
+ +              return 0;
+ +
+ +      if (!atomic_long_inc_not_zero(&group_leader->refcount))
+ +              return 0;
+ +
+ +      /*
+ +       * Link aux_outputs to their aux event; this is undone in
+ +       * perf_group_detach() by perf_put_aux_event(). When the
+ +       * group is torn down, the aux_output events lose their
+ +       * link to the aux_event and can't schedule any more.
+ +       */
+ +      event->aux_event = group_leader;
+ +
+ +      return 1;
+ +}
+ +
   static void perf_group_detach(struct perf_event *event)
   {
         struct perf_event *sibling, *tmp;
@@@ -1985,8 -1902,6 +1985,8 @@@
   
         event->attach_state &= ~PERF_ATTACH_GROUP;
   
+ +      perf_put_aux_event(event);
+ +
         /*
          * If this is a sibling, remove it from its group.
          */
@@@ -4174,10 -4089,8 +4174,8 @@@ alloc_perf_context(struct pmu *pmu, str
                 return NULL;
   
         __perf_event_init_context(ctx);
-       if (task) {
-               ctx->task = task;
-               get_task_struct(task);
-       }
+       if (task)
+               ctx->task = get_task_struct(task);
         ctx->pmu = pmu;
   
         return ctx;
@@@ -10440,8 -10353,7 +10438,7 @@@ perf_event_alloc(struct perf_event_att
                  * and we cannot use the ctx information because we need the
                  * pmu before we get a ctx.
                  */
-               get_task_struct(task);
-               event->hw.target = task;
+               event->hw.target = get_task_struct(task);
         }
   
         event->clock = &local_clock;
@@@ -10511,12 -10423,6 +10508,12 @@@
                 goto err_ns;
         }
   
+ +      if (event->attr.aux_output &&
+ +          !(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT)) {
+ +              err = -EOPNOTSUPP;
+ +              goto err_pmu;
+ +      }
+ +
         err = exclusive_event_init(event);
         if (err)
                 goto err_pmu;
@@@ -11173,8 -11079,6 +11170,8 @@@ SYSCALL_DEFINE5(perf_event_open
                 }
         }
   
+ +      if (event->attr.aux_output && !perf_get_aux_event(event, group_leader))
+ +              goto err_locked;
   
         /*
          * Must be under the same ctx::mutex as perf_install_in_context(),
diff --combined kernel/kprobes.c

index ebe8315a756a2593f0e9bab3f37efe7885a347ed,8bc5f1ffd68e4351ac472dae4fcdca2862f88bcd..1b66ccbb744a6a991dbaa8670f0bf26ff826837a
--- 1/kernel/kprobes.c
--- 2/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@@ -470,7 -470,6 +470,7 @@@ static DECLARE_DELAYED_WORK(optimizing_
    */
   static void do_optimize_kprobes(void)
   {
+ +      lockdep_assert_held(&text_mutex);
         /*
          * The optimization/unoptimization refers online_cpus via
          * stop_machine() and cpu-hotplug modifies online_cpus.
@@@ -488,7 -487,9 +488,7 @@@
             list_empty(&optimizing_list))
                 return;
   
- -      mutex_lock(&text_mutex);
         arch_optimize_kprobes(&optimizing_list);
- -      mutex_unlock(&text_mutex);
   }
   
   /*
@@@ -499,7 -500,6 +499,7 @@@ static void do_unoptimize_kprobes(void
   {
         struct optimized_kprobe *op, *tmp;
   
+ +      lockdep_assert_held(&text_mutex);
         /* See comment in do_optimize_kprobes() */
         lockdep_assert_cpus_held();
   
@@@ -507,6 -507,7 +507,6 @@@
         if (list_empty(&unoptimizing_list))
                 return;
   
- -      mutex_lock(&text_mutex);
         arch_unoptimize_kprobes(&unoptimizing_list, &freeing_list);
         /* Loop free_list for disarming */
         list_for_each_entry_safe(op, tmp, &freeing_list, list) {
@@@ -523,6 -524,7 +523,6 @@@
                 } else
                         list_del_init(&op->list);
         }
- -      mutex_unlock(&text_mutex);
   }
   
   /* Reclaim all kprobes on the free_list */
@@@ -554,7 -556,6 +554,7 @@@ static void kprobe_optimizer(struct wor
   {
         mutex_lock(&kprobe_mutex);
         cpus_read_lock();
+ +      mutex_lock(&text_mutex);
         /* Lock modules while optimizing kprobes */
         mutex_lock(&module_mutex);
   
@@@ -582,7 -583,6 +582,7 @@@
         do_free_cleaned_kprobes();
   
         mutex_unlock(&module_mutex);
+ +      mutex_unlock(&text_mutex);
         cpus_read_unlock();
         mutex_unlock(&kprobe_mutex);
   
@@@ -1514,8 -1514,7 +1514,8 @@@ static int check_kprobe_address_safe(st
         /* Ensure it is not in reserved area nor out of text */
         if (!kernel_text_address((unsigned long) p->addr) ||
             within_kprobe_blacklist((unsigned long) p->addr) ||
- -          jump_label_text_reserved(p->addr, p->addr)) {
+ +          jump_label_text_reserved(p->addr, p->addr) ||
+ +          find_bug((unsigned long)p->addr)) {
                 ret = -EINVAL;
                 goto out;
         }
@@@ -1907,7 -1906,7 +1907,7 @@@ int register_kretprobe(struct kretprob
   
         /* Pre-allocate memory for max kretprobe instances */
         if (rp->maxactive <= 0) {
- #ifdef CONFIG_PREEMPT
+ #ifdef CONFIG_PREEMPTION
                 rp->maxactive = max_t(unsigned int, 10, 2*num_possible_cpus());
   #else
                 rp->maxactive = num_possible_cpus();
diff --combined kernel/rcu/tree.c

index 71395e91b876809bdfabdcf9408a76bb5172b57c,6a37afd5436c32a6635fee042c5562f10ed24baf..81105141b6a823689254b5a9033cc7b62e330213
--- 1/kernel/rcu/tree.c
--- 2/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@@ -56,7 -56,6 +56,7 @@@
   #include <linux/smpboot.h>
   #include <linux/jiffies.h>
   #include <linux/sched/isolation.h>
+ +#include <linux/sched/clock.h>
   #include "../time/tick-internal.h"
   
   #include "tree.h"
@@@ -211,9 -210,9 +211,9 @@@ static long rcu_get_n_cbs_cpu(int cpu
   {
         struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
   
- -      if (rcu_segcblist_is_enabled(&rdp->cblist)) /* Online normal CPU? */
+ +      if (rcu_segcblist_is_enabled(&rdp->cblist))
                 return rcu_segcblist_n_cbs(&rdp->cblist);
- -      return rcu_get_n_cbs_nocb_cpu(rdp); /* Works for offline, too. */
+ +      return 0;
   }
   
   void rcu_softirq_qs(void)
@@@ -417,12 -416,6 +417,12 @@@ module_param(qlowmark, long, 0444)
   static ulong jiffies_till_first_fqs = ULONG_MAX;
   static ulong jiffies_till_next_fqs = ULONG_MAX;
   static bool rcu_kick_kthreads;
+ +static int rcu_divisor = 7;
+ +module_param(rcu_divisor, int, 0644);
+ +
+ +/* Force an exit from rcu_do_batch() after 3 milliseconds. */
+ +static long rcu_resched_ns = 3 * NSEC_PER_MSEC;
+ +module_param(rcu_resched_ns, long, 0644);
   
   /*
    * How long the grace period must be before we start recruiting
@@@ -1258,7 -1251,6 +1258,7 @@@ static bool rcu_accelerate_cbs(struct r
         unsigned long gp_seq_req;
         bool ret = false;
   
+ +      rcu_lockdep_assert_cblist_protected(rdp);
         raw_lockdep_assert_held_rcu_node(rnp);
   
         /* If no pending (not yet ready to invoke) callbacks, nothing to do. */
@@@ -1300,7 -1292,7 +1300,7 @@@ static void rcu_accelerate_cbs_unlocked
         unsigned long c;
         bool needwake;
   
- -      lockdep_assert_irqs_disabled();
+ +      rcu_lockdep_assert_cblist_protected(rdp);
         c = rcu_seq_snap(&rcu_state.gp_seq);
         if (!rdp->gpwrap && ULONG_CMP_GE(rdp->gp_seq_needed, c)) {
                 /* Old request still live, so mark recent callbacks. */
@@@ -1326,7 -1318,6 +1326,7 @@@
    */
   static bool rcu_advance_cbs(struct rcu_node *rnp, struct rcu_data *rdp)
   {
+ +      rcu_lockdep_assert_cblist_protected(rdp);
         raw_lockdep_assert_held_rcu_node(rnp);
   
         /* If no pending (not yet ready to invoke) callbacks, nothing to do. */
@@@ -1343,21 -1334,6 +1343,21 @@@
         return rcu_accelerate_cbs(rnp, rdp);
   }
   
+ +/*
+ + * Move and classify callbacks, but only if doing so won't require
+ + * that the RCU grace-period kthread be awakened.
+ + *
+ + * Returns without doing anything when rcu_seq_state() of the rcu_node's
+ + * ->gp_seq is zero (presumably "no grace period in progress" -- confirm
+ + * against rcu_seq_state()), or when the rcu_node lock cannot be taken
+ + * by a trylock, so this function never spins waiting for that lock.
+ + * Otherwise it calls rcu_advance_cbs() under the lock; a true return
+ + * from that call is not expected here and trips WARN_ON_ONCE().
+ + */
+ +static void __maybe_unused rcu_advance_cbs_nowake(struct rcu_node *rnp,
+ +                                                struct rcu_data *rdp)
+ +{
+ +      rcu_lockdep_assert_cblist_protected(rdp);
+ +      if (!rcu_seq_state(rcu_seq_current(&rnp->gp_seq)) ||
+ +          !raw_spin_trylock_rcu_node(rnp))
+ +              return;
+ +      WARN_ON_ONCE(rcu_advance_cbs(rnp, rdp));
+ +      raw_spin_unlock_rcu_node(rnp);
+ +}
+ +
   /*
    * Update CPU-local rcu_data state to record the beginnings and ends of
    * grace periods.  The caller must hold the ->lock of the leaf rcu_node
@@@ -1366,10 -1342,8 +1366,10 @@@
    */
   static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp)
   {
- -      bool ret;
+ +      bool ret = false;
         bool need_gp;
+ +      const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
+ +                             rcu_segcblist_is_offloaded(&rdp->cblist);
   
         raw_lockdep_assert_held_rcu_node(rnp);
   
@@@ -1379,12 -1353,10 +1379,12 @@@
         /* Handle the ends of any preceding grace periods first. */
         if (rcu_seq_completed_gp(rdp->gp_seq, rnp->gp_seq) ||
             unlikely(READ_ONCE(rdp->gpwrap))) {
- -              ret = rcu_advance_cbs(rnp, rdp); /* Advance callbacks. */
+ +              if (!offloaded)
+ +                      ret = rcu_advance_cbs(rnp, rdp); /* Advance CBs. */
                 trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuend"));
         } else {
- -              ret = rcu_accelerate_cbs(rnp, rdp); /* Recent callbacks. */
+ +              if (!offloaded)
+ +                      ret = rcu_accelerate_cbs(rnp, rdp); /* Recent CBs. */
         }
   
         /* Now handle the beginnings of any new-to-this-CPU grace periods. */
@@@ -1685,7 -1657,6 +1685,7 @@@ static void rcu_gp_cleanup(void
         unsigned long gp_duration;
         bool needgp = false;
         unsigned long new_gp_seq;
+ +      bool offloaded;
         struct rcu_data *rdp;
         struct rcu_node *rnp = rcu_get_root();
         struct swait_queue_head *sq;
@@@ -1751,9 -1722,7 +1751,9 @@@
                 needgp = true;
         }
         /* Advance CBs to reduce false positives below. */
- -      if (!rcu_accelerate_cbs(rnp, rdp) && needgp) {
+ +      offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
+ +                  rcu_segcblist_is_offloaded(&rdp->cblist);
+ +      if ((offloaded || !rcu_accelerate_cbs(rnp, rdp)) && needgp) {
                 WRITE_ONCE(rcu_state.gp_flags, RCU_GP_FLAG_INIT);
                 rcu_state.gp_req_activity = jiffies;
                 trace_rcu_grace_period(rcu_state.name,
@@@ -1912,7 -1881,7 +1912,7 @@@ rcu_report_unblock_qs_rnp(struct rcu_no
         struct rcu_node *rnp_p;
   
         raw_lockdep_assert_held_rcu_node(rnp);
-       if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT)) ||
+       if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPTION)) ||
             WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)) ||
             rnp->qsmask != 0) {
                 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
@@@ -1947,9 -1916,7 +1947,9 @@@ rcu_report_qs_rdp(int cpu, struct rcu_d
   {
         unsigned long flags;
         unsigned long mask;
- -      bool needwake;
+ +      bool needwake = false;
+ +      const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
+ +                             rcu_segcblist_is_offloaded(&rdp->cblist);
         struct rcu_node *rnp;
   
         rnp = rdp->mynode;
@@@ -1976,8 -1943,7 +1976,8 @@@
                  * This GP can't end until cpu checks in, so all of our
                  * callbacks can be processed during the next GP.
                  */
- -              needwake = rcu_accelerate_cbs(rnp, rdp);
+ +              if (!offloaded)
+ +                      needwake = rcu_accelerate_cbs(rnp, rdp);
   
                 rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);
                 /* ^^^ Released rnp->lock */
@@@ -2111,12 -2077,9 +2111,12 @@@ int rcutree_dead_cpu(unsigned int cpu
   static void rcu_do_batch(struct rcu_data *rdp)
   {
         unsigned long flags;
+ +      const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
+ +                             rcu_segcblist_is_offloaded(&rdp->cblist);
         struct rcu_head *rhp;
         struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl);
         long bl, count;
+ +      long pending, tlimit = 0;
   
         /* If no callbacks are ready, just return. */
         if (!rcu_segcblist_ready_cbs(&rdp->cblist)) {
@@@ -2136,19 -2099,13 +2136,19 @@@
          * callback counts, as rcu_barrier() needs to be conservative.
          */
         local_irq_save(flags);
+ +      rcu_nocb_lock(rdp);
         WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
- -      bl = rdp->blimit;
+ +      pending = rcu_segcblist_n_cbs(&rdp->cblist);
+ +      bl = max(rdp->blimit, pending >> rcu_divisor);
+ +      if (unlikely(bl > 100))
+ +              tlimit = local_clock() + rcu_resched_ns;
         trace_rcu_batch_start(rcu_state.name,
                               rcu_segcblist_n_lazy_cbs(&rdp->cblist),
                               rcu_segcblist_n_cbs(&rdp->cblist), bl);
         rcu_segcblist_extract_done_cbs(&rdp->cblist, &rcl);
- -      local_irq_restore(flags);
+ +      if (offloaded)
+ +              rdp->qlen_last_fqs_check = rcu_segcblist_n_cbs(&rdp->cblist);
+ +      rcu_nocb_unlock_irqrestore(rdp, flags);
   
         /* Invoke callbacks. */
         rhp = rcu_cblist_dequeue(&rcl);
@@@ -2160,29 -2117,13 +2160,29 @@@
                  * Stop only if limit reached and CPU has something to do.
                  * Note: The rcl structure counts down from zero.
                  */
- -              if (-rcl.len >= bl &&
+ +              if (-rcl.len >= bl && !offloaded &&
                     (need_resched() ||
                      (!is_idle_task(current) && !rcu_is_callbacks_kthread())))
                         break;
+ +              if (unlikely(tlimit)) {
+ +                      /* only call local_clock() every 32 callbacks */
+ +                      if (likely((-rcl.len & 31) || local_clock() < tlimit))
+ +                              continue;
+ +                      /* Exceeded the time limit, so leave. */
+ +                      break;
+ +              }
+ +              if (offloaded) {
+ +                      WARN_ON_ONCE(in_serving_softirq());
+ +                      local_bh_enable();
+ +                      lockdep_assert_irqs_enabled();
+ +                      cond_resched_tasks_rcu_qs();
+ +                      lockdep_assert_irqs_enabled();
+ +                      local_bh_disable();
+ +              }
         }
   
         local_irq_save(flags);
+ +      rcu_nocb_lock(rdp);
         count = -rcl.len;
         trace_rcu_batch_end(rcu_state.name, count, !!rcl.head, need_resched(),
                             is_idle_task(current), rcu_is_callbacks_kthread());
@@@ -2208,14 -2149,12 +2208,14 @@@
          * The following usually indicates a double call_rcu().  To track
          * this down, try building with CONFIG_DEBUG_OBJECTS_RCU_HEAD=y.
          */
- -      WARN_ON_ONCE(rcu_segcblist_empty(&rdp->cblist) != (count == 0));
+ +      WARN_ON_ONCE(count == 0 && !rcu_segcblist_empty(&rdp->cblist));
+ +      WARN_ON_ONCE(!IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
+ +                   count != 0 && rcu_segcblist_empty(&rdp->cblist));
   
- -      local_irq_restore(flags);
+ +      rcu_nocb_unlock_irqrestore(rdp, flags);
   
         /* Re-invoke RCU core processing if there are callbacks remaining. */
- -      if (rcu_segcblist_ready_cbs(&rdp->cblist))
+ +      if (!offloaded && rcu_segcblist_ready_cbs(&rdp->cblist))
                 invoke_rcu_core();
   }
   
@@@ -2266,7 -2205,7 +2266,7 @@@ static void force_qs_rnp(int (*f)(struc
                 mask = 0;
                 raw_spin_lock_irqsave_rcu_node(rnp, flags);
                 if (rnp->qsmask == 0) {
-                       if (!IS_ENABLED(CONFIG_PREEMPT) ||
+                       if (!IS_ENABLED(CONFIG_PREEMPTION) ||
                             rcu_preempt_blocked_readers_cgp(rnp)) {
                                 /*
                                  * No point in scanning bits because they
@@@ -2341,8 -2280,6 +2341,8 @@@ static __latent_entropy void rcu_core(v
         unsigned long flags;
         struct rcu_data *rdp = raw_cpu_ptr(&rcu_data);
         struct rcu_node *rnp = rdp->mynode;
+ +      const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
+ +                             rcu_segcblist_is_offloaded(&rdp->cblist);
   
         if (cpu_is_offline(smp_processor_id()))
                 return;
@@@ -2362,7 -2299,7 +2362,7 @@@
   
         /* No grace period and unregistered callbacks? */
         if (!rcu_gp_in_progress() &&
- -          rcu_segcblist_is_enabled(&rdp->cblist)) {
+ +          rcu_segcblist_is_enabled(&rdp->cblist) && !offloaded) {
                 local_irq_save(flags);
                 if (!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL))
                         rcu_accelerate_cbs_unlocked(rnp, rdp);
@@@ -2372,7 -2309,7 +2372,7 @@@
         rcu_check_gp_start_stall(rnp, rdp, rcu_jiffies_till_stall_check());
   
         /* If there are callbacks ready, invoke them. */
- -      if (rcu_segcblist_ready_cbs(&rdp->cblist) &&
+ +      if (!offloaded && rcu_segcblist_ready_cbs(&rdp->cblist) &&
             likely(READ_ONCE(rcu_scheduler_fully_active)))
                 rcu_do_batch(rdp);
   
@@@ -2552,11 -2489,10 +2552,11 @@@ static void rcu_leak_callback(struct rc
    * is expected to specify a CPU.
    */
   static void
- -__call_rcu(struct rcu_head *head, rcu_callback_t func, int cpu, bool lazy)
+ +__call_rcu(struct rcu_head *head, rcu_callback_t func, bool lazy)
   {
         unsigned long flags;
         struct rcu_data *rdp;
+ +      bool was_alldone;
   
         /* Misaligned rcu_head! */
         WARN_ON_ONCE((unsigned long)head & (sizeof(void *) - 1));
@@@ -2578,18 -2514,28 +2578,18 @@@
         rdp = this_cpu_ptr(&rcu_data);
   
         /* Add the callback to our list. */
- -      if (unlikely(!rcu_segcblist_is_enabled(&rdp->cblist)) || cpu != -1) {
- -              int offline;
- -
- -              if (cpu != -1)
- -                      rdp = per_cpu_ptr(&rcu_data, cpu);
- -              if (likely(rdp->mynode)) {
- -                      /* Post-boot, so this should be for a no-CBs CPU. */
- -                      offline = !__call_rcu_nocb(rdp, head, lazy, flags);
- -                      WARN_ON_ONCE(offline);
- -                      /* Offline CPU, _call_rcu() illegal, leak callback.  */
- -                      local_irq_restore(flags);
- -                      return;
- -              }
- -              /*
- -               * Very early boot, before rcu_init().  Initialize if needed
- -               * and then drop through to queue the callback.
- -               */
- -              WARN_ON_ONCE(cpu != -1);
+ +      if (unlikely(!rcu_segcblist_is_enabled(&rdp->cblist))) {
+ +              // This can trigger due to call_rcu() from offline CPU:
+ +              WARN_ON_ONCE(rcu_scheduler_active != RCU_SCHEDULER_INACTIVE);
                 WARN_ON_ONCE(!rcu_is_watching());
+ +              // Very early boot, before rcu_init().  Initialize if needed
+ +              // and then drop through to queue the callback.
                 if (rcu_segcblist_empty(&rdp->cblist))
                         rcu_segcblist_init(&rdp->cblist);
         }
+ +      if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags))
+ +              return; // Enqueued onto ->nocb_bypass, so just leave.
+ +      /* If we get here, rcu_nocb_try_bypass() acquired ->nocb_lock. */
         rcu_segcblist_enqueue(&rdp->cblist, head, lazy);
         if (__is_kfree_rcu_offset((unsigned long)func))
                 trace_rcu_kfree_callback(rcu_state.name, head,
@@@ -2602,13 -2548,8 +2602,13 @@@
                                    rcu_segcblist_n_cbs(&rdp->cblist));
   
         /* Go handle any RCU core processing required. */
- -      __call_rcu_core(rdp, head, flags);
- -      local_irq_restore(flags);
+ +      if (IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
+ +          unlikely(rcu_segcblist_is_offloaded(&rdp->cblist))) {
+ +              __call_rcu_nocb_wake(rdp, was_alldone, flags); /* unlocks */
+ +      } else {
+ +              __call_rcu_core(rdp, head, flags);
+ +              local_irq_restore(flags);
+ +      }
   }
   
   /**
@@@ -2648,7 -2589,7 +2648,7 @@@
    */
   void call_rcu(struct rcu_head *head, rcu_callback_t func)
   {
- -      __call_rcu(head, func, -1, 0);
+ +      __call_rcu(head, func, 0);
   }
   EXPORT_SYMBOL_GPL(call_rcu);
   
@@@ -2661,7 -2602,7 +2661,7 @@@
    */
   void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
   {
- -      __call_rcu(head, func, -1, 1);
+ +      __call_rcu(head, func, 1);
   }
   EXPORT_SYMBOL_GPL(kfree_call_rcu);
   
@@@ -2681,7 -2622,7 +2681,7 @@@ static int rcu_blocking_is_gp(void
   {
         int ret;
   
-       if (IS_ENABLED(CONFIG_PREEMPT))
+       if (IS_ENABLED(CONFIG_PREEMPTION))
                 return rcu_scheduler_active == RCU_SCHEDULER_INACTIVE;
         might_sleep();  /* Check for RCU read-side critical section. */
         preempt_disable();
@@@ -2794,10 -2735,6 +2794,10 @@@ static int rcu_pending(void
         /* Check for CPU stalls, if enabled. */
         check_cpu_stall(rdp);
   
+ +      /* Does this CPU need a deferred NOCB wakeup? */
+ +      if (rcu_nocb_need_deferred_wakeup(rdp))
+ +              return 1;
+ +
         /* Is this CPU a NO_HZ_FULL CPU that should ignore RCU? */
         if (rcu_nohz_full_cpu())
                 return 0;
@@@ -2813,8 -2750,6 +2813,8 @@@
         /* Has RCU gone idle with this CPU needing another grace period? */
         if (!rcu_gp_in_progress() &&
             rcu_segcblist_is_enabled(&rdp->cblist) &&
+ +          (!IS_ENABLED(CONFIG_RCU_NOCB_CPU) ||
+ +           !rcu_segcblist_is_offloaded(&rdp->cblist)) &&
             !rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL))
                 return 1;
   
@@@ -2823,6 -2758,10 +2823,6 @@@
             unlikely(READ_ONCE(rdp->gpwrap))) /* outside lock */
                 return 1;
   
- -      /* Does this CPU need a deferred NOCB wakeup? */
- -      if (rcu_nocb_need_deferred_wakeup(rdp))
- -              return 1;
- -
         /* nothing to do */
         return 0;
   }
@@@ -2862,8 -2801,6 +2862,8 @@@ static void rcu_barrier_func(void *unus
         rcu_barrier_trace(TPS("IRQ"), -1, rcu_state.barrier_sequence);
         rdp->barrier_head.func = rcu_barrier_callback;
         debug_rcu_head_queue(&rdp->barrier_head);
+ +      rcu_nocb_lock(rdp);
+ +      WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies));
         if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head, 0)) {
                 atomic_inc(&rcu_state.barrier_cpu_count);
         } else {
@@@ -2871,7 -2808,6 +2871,7 @@@
                 rcu_barrier_trace(TPS("IRQNQ"), -1,
                                    rcu_state.barrier_sequence);
         }
+ +      rcu_nocb_unlock(rdp);
   }
   
   /**
@@@ -2922,11 -2858,22 +2922,11 @@@ void rcu_barrier(void
          * corresponding CPU's preceding callbacks have been invoked.
          */
         for_each_possible_cpu(cpu) {
- -              if (!cpu_online(cpu) && !rcu_is_nocb_cpu(cpu))
- -                      continue;
                 rdp = per_cpu_ptr(&rcu_data, cpu);
- -              if (rcu_is_nocb_cpu(cpu)) {
- -                      if (!rcu_nocb_cpu_needs_barrier(cpu)) {
- -                              rcu_barrier_trace(TPS("OfflineNoCB"), cpu,
- -                                                 rcu_state.barrier_sequence);
- -                      } else {
- -                              rcu_barrier_trace(TPS("OnlineNoCB"), cpu,
- -                                                 rcu_state.barrier_sequence);
- -                              smp_mb__before_atomic();
- -                              atomic_inc(&rcu_state.barrier_cpu_count);
- -                              __call_rcu(&rdp->barrier_head,
- -                                         rcu_barrier_callback, cpu, 0);
- -                      }
- -              } else if (rcu_segcblist_n_cbs(&rdp->cblist)) {
+ +              if (!cpu_online(cpu) &&
+ +                  !rcu_segcblist_is_offloaded(&rdp->cblist))
+ +                      continue;
+ +              if (rcu_segcblist_n_cbs(&rdp->cblist)) {
                         rcu_barrier_trace(TPS("OnlineQ"), cpu,
                                            rcu_state.barrier_sequence);
                         smp_call_function_single(cpu, rcu_barrier_func, NULL, 1);
@@@ -3011,8 -2958,7 +3011,8 @@@ rcu_boot_init_percpu_data(int cpu
    * Initializes a CPU's per-CPU RCU data.  Note that only one online or
    * offline event can be happening at a given time.  Note also that we can
    * accept some slop in the rsp->gp_seq access due to the fact that this
- - * CPU cannot possibly have any RCU callbacks in flight yet.
+ + * CPU cannot possibly have any non-offloaded RCU callbacks in flight yet.
+ + * And any offloaded callbacks are being numbered elsewhere.
    */
   int rcutree_prepare_cpu(unsigned int cpu)
   {
@@@ -3026,7 -2972,7 +3026,7 @@@
         rdp->n_force_qs_snap = rcu_state.n_force_qs;
         rdp->blimit = blimit;
         if (rcu_segcblist_empty(&rdp->cblist) && /* No early-boot CBs? */
- -          !init_nocb_callback_list(rdp))
+ +          !rcu_segcblist_is_offloaded(&rdp->cblist))
                 rcu_segcblist_init(&rdp->cblist);  /* Re-enable callbacks. */
         rdp->dynticks_nesting = 1;      /* CPU not up, no tearing. */
         rcu_dynticks_eqs_online();
@@@ -3205,38 -3151,29 +3205,38 @@@ void rcutree_migrate_callbacks(int cpu
   {
         unsigned long flags;
         struct rcu_data *my_rdp;
+ +      struct rcu_node *my_rnp;
         struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
- -      struct rcu_node *rnp_root = rcu_get_root();
         bool needwake;
   
- -      if (rcu_is_nocb_cpu(cpu) || rcu_segcblist_empty(&rdp->cblist))
+ +      if (rcu_segcblist_is_offloaded(&rdp->cblist) ||
+ +          rcu_segcblist_empty(&rdp->cblist))
                 return;  /* No callbacks to migrate. */
   
         local_irq_save(flags);
         my_rdp = this_cpu_ptr(&rcu_data);
- -      if (rcu_nocb_adopt_orphan_cbs(my_rdp, rdp, flags)) {
- -              local_irq_restore(flags);
- -              return;
- -      }
- -      raw_spin_lock_rcu_node(rnp_root); /* irqs already disabled. */
+ +      my_rnp = my_rdp->mynode;
+ +      rcu_nocb_lock(my_rdp); /* irqs already disabled. */
+ +      WARN_ON_ONCE(!rcu_nocb_flush_bypass(my_rdp, NULL, jiffies));
+ +      raw_spin_lock_rcu_node(my_rnp); /* irqs already disabled. */
         /* Leverage recent GPs and set GP for new callbacks. */
- -      needwake = rcu_advance_cbs(rnp_root, rdp) ||
- -                 rcu_advance_cbs(rnp_root, my_rdp);
+ +      needwake = rcu_advance_cbs(my_rnp, rdp) ||
+ +                 rcu_advance_cbs(my_rnp, my_rdp);
         rcu_segcblist_merge(&my_rdp->cblist, &rdp->cblist);
+ +      needwake = needwake || rcu_advance_cbs(my_rnp, my_rdp);
+ +      rcu_segcblist_disable(&rdp->cblist);
         WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) !=
                      !rcu_segcblist_n_cbs(&my_rdp->cblist));
- -      raw_spin_unlock_irqrestore_rcu_node(rnp_root, flags);
+ +      if (rcu_segcblist_is_offloaded(&my_rdp->cblist)) {
+ +              raw_spin_unlock_rcu_node(my_rnp); /* irqs remain disabled. */
+ +              __call_rcu_nocb_wake(my_rdp, true, flags);
+ +      } else {
+ +              rcu_nocb_unlock(my_rdp); /* irqs remain disabled. */
+ +              raw_spin_unlock_irqrestore_rcu_node(my_rnp, flags);
+ +      }
         if (needwake)
                 rcu_gp_kthread_wake();
+ +      lockdep_assert_irqs_enabled();
         WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != 0 ||
                   !rcu_segcblist_empty(&rdp->cblist),
                   "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, 1stCB=%p\n",
@@@ -3297,13 -3234,13 +3297,13 @@@ static int __init rcu_spawn_gp_kthread(
         t = kthread_create(rcu_gp_kthread, NULL, "%s", rcu_state.name);
         if (WARN_ONCE(IS_ERR(t), "%s: Could not start grace-period kthread, OOM is now expected behavior\n", __func__))
                 return 0;
-       rnp = rcu_get_root();
-       raw_spin_lock_irqsave_rcu_node(rnp, flags);
-       rcu_state.gp_kthread = t;
         if (kthread_prio) {
                 sp.sched_priority = kthread_prio;
                 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
         }
+       rnp = rcu_get_root();
+       raw_spin_lock_irqsave_rcu_node(rnp, flags);
+       rcu_state.gp_kthread = t;
         raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
         wake_up_process(t);
         rcu_spawn_nocb_kthreads();
diff --combined kernel/rcu/tree_stall.h

index 841ab43f3e60d4b26df3a63477fe65814947a826,9b92bf18b737e2dbf2702d8be91b924e60082bb1..c0b8c458d8a6ad267151f6cbffc791c217aeefdf
--- 1/kernel/rcu/tree_stall.h
--- 2/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@@ -163,7 -163,7 +163,7 @@@ static void rcu_iw_handler(struct irq_w
   //
   // Printing RCU CPU stall warnings
   
- #ifdef CONFIG_PREEMPT
+ #ifdef CONFIG_PREEMPTION
   
   /*
    * Dump detailed information for all tasks blocking the current RCU
@@@ -215,7 -215,7 +215,7 @@@ static int rcu_print_task_stall(struct 
         return ndetected;
   }
   
- #else /* #ifdef CONFIG_PREEMPT */
+ #else /* #ifdef CONFIG_PREEMPTION */
   
   /*
    * Because preemptible RCU does not exist, we never have to check for
@@@ -233,7 -233,7 +233,7 @@@ static int rcu_print_task_stall(struct 
   {
         return 0;
   }
- #endif /* #else #ifdef CONFIG_PREEMPT */
+ #endif /* #else #ifdef CONFIG_PREEMPTION */
   
   /*
    * Dump stacks of all tasks running on stalled CPUs.  First try using
@@@ -527,8 -527,6 +527,8 @@@ static void check_cpu_stall(struct rcu_
   
                 /* We haven't checked in, so go dump stack. */
                 print_cpu_stall();
+ +              if (rcu_cpu_stall_ftrace_dump)
+ +                      rcu_ftrace_dump(DUMP_ALL);
   
         } else if (rcu_gp_in_progress() &&
                    ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY) &&
@@@ -536,8 -534,6 +536,8 @@@
   
                 /* They had a few time units to dump stack, so complain. */
                 print_other_cpu_stall(gs2);
+ +              if (rcu_cpu_stall_ftrace_dump)
+ +                      rcu_ftrace_dump(DUMP_ALL);
         }
   }
   
@@@ -589,11 -585,6 +589,11 @@@ void show_rcu_gp_kthreads(void
                                 cpu, (long)rdp->gp_seq_needed);
                 }
         }
+ +      for_each_possible_cpu(cpu) {
+ +              rdp = per_cpu_ptr(&rcu_data, cpu);
+ +              if (rcu_segcblist_is_offloaded(&rdp->cblist))
+ +                      show_rcu_nocb_state(rdp);
+ +      }
         /* sched_show_task(rcu_state.gp_kthread); */
   }
   EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads);
diff --combined kernel/sched/core.c

index 7fa8e74ad2ab4003457d266df57373f41f0e0d2a,87b84a726db448c76edd1fd46a387e392de9255c..06961b997ed6d8c13ced5558520f75b07c85aedc
--- 1/kernel/sched/core.c
--- 2/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@@ -773,6 -773,18 +773,18 @@@ static void set_load_weight(struct task
   }
   
   #ifdef CONFIG_UCLAMP_TASK
+ /*
+  * Serializes updates of utilization clamp values
+  *
+  * The (slow-path) user-space triggers utilization clamp value updates which
+  * can require updates on (fast-path) scheduler's data structures used to
+  * support enqueue/dequeue operations.
+  * While the per-CPU rq lock protects fast-path update operations, user-space
+  * requests are serialized using a mutex to reduce the risk of conflicting
+  * updates or API abuses.
+  */
+ static DEFINE_MUTEX(uclamp_mutex);
+ 
   /* Max allowed minimum utilization */
   unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE;
   
@@@ -798,7 -810,7 +810,7 @@@ static inline unsigned int uclamp_bucke
         return UCLAMP_BUCKET_DELTA * uclamp_bucket_id(clamp_value);
   }
   
- static inline unsigned int uclamp_none(int clamp_id)
+ static inline enum uclamp_id uclamp_none(enum uclamp_id clamp_id)
   {
         if (clamp_id == UCLAMP_MIN)
                 return 0;
@@@ -814,7 -826,7 +826,7 @@@ static inline void uclamp_se_set(struc
   }
   
   static inline unsigned int
- uclamp_idle_value(struct rq *rq, unsigned int clamp_id,
+ uclamp_idle_value(struct rq *rq, enum uclamp_id clamp_id,
                   unsigned int clamp_value)
   {
         /*
@@@ -830,7 -842,7 +842,7 @@@
         return uclamp_none(UCLAMP_MIN);
   }
   
- static inline void uclamp_idle_reset(struct rq *rq, unsigned int clamp_id,
+ static inline void uclamp_idle_reset(struct rq *rq, enum uclamp_id clamp_id,
                                      unsigned int clamp_value)
   {
         /* Reset max-clamp retention only on idle exit */
@@@ -841,8 -853,8 +853,8 @@@
   }
   
   static inline
- unsigned int uclamp_rq_max_value(struct rq *rq, unsigned int clamp_id,
-                                unsigned int clamp_value)
+ enum uclamp_id uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id,
+                                  unsigned int clamp_value)
   {
         struct uclamp_bucket *bucket = rq->uclamp[clamp_id].bucket;
         int bucket_id = UCLAMP_BUCKETS - 1;
@@@ -861,16 -873,42 +873,42 @@@
         return uclamp_idle_value(rq, clamp_id, clamp_value);
   }
   
+ static inline struct uclamp_se
+ uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id)
+ {
+       struct uclamp_se uc_req = p->uclamp_req[clamp_id];
+ #ifdef CONFIG_UCLAMP_TASK_GROUP
+       struct uclamp_se uc_max;
+ 
+       /*
+        * Tasks in autogroups or root task group will be
+        * restricted by system defaults.
+        * Their own request is therefore returned unchanged from here.
+        * For a task in any other group, the group's clamp is returned
+        * instead of the task's request when the request exceeds the
+        * group's value or was not user-defined.
+        */
+       if (task_group_is_autogroup(task_group(p)))
+               return uc_req;
+       if (task_group(p) == &root_task_group)
+               return uc_req;
+ 
+       uc_max = task_group(p)->uclamp[clamp_id];
+       if (uc_req.value > uc_max.value || !uc_req.user_defined)
+               return uc_max;
+ #endif
+ 
+       return uc_req; /* without task-group support: the request as-is */
+ }
+ 
   /*
    * The effective clamp bucket index of a task depends on, by increasing
    * priority:
    * - the task specific clamp value, when explicitly requested from userspace
+  * - the task group effective clamp value, for tasks not either in the root
+  *   group or in an autogroup
    * - the system default clamp value, defined by the sysadmin
    */
   static inline struct uclamp_se
- uclamp_eff_get(struct task_struct *p, unsigned int clamp_id)
+ uclamp_eff_get(struct task_struct *p, enum uclamp_id clamp_id)
   {
-       struct uclamp_se uc_req = p->uclamp_req[clamp_id];
+       struct uclamp_se uc_req = uclamp_tg_restrict(p, clamp_id);
         struct uclamp_se uc_max = uclamp_default[clamp_id];
   
         /* System default restrictions always apply */
@@@ -880,7 -918,7 +918,7 @@@
         return uc_req;
   }
   
- unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id)
+ enum uclamp_id uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id)
   {
         struct uclamp_se uc_eff;
   
@@@ -904,7 -942,7 +942,7 @@@
    * for each bucket when all its RUNNABLE tasks require the same clamp.
    */
   static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
-                                   unsigned int clamp_id)
+                                   enum uclamp_id clamp_id)
   {
         struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
         struct uclamp_se *uc_se = &p->uclamp[clamp_id];
@@@ -942,7 -980,7 +980,7 @@@
    * enforce the expected state and warn.
    */
   static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
-                                   unsigned int clamp_id)
+                                   enum uclamp_id clamp_id)
   {
         struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
         struct uclamp_se *uc_se = &p->uclamp[clamp_id];
@@@ -981,7 -1019,7 +1019,7 @@@
   
   static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
   {
-       unsigned int clamp_id;
+       enum uclamp_id clamp_id;
   
         if (unlikely(!p->sched_class->uclamp_enabled))
                 return;
@@@ -996,7 -1034,7 +1034,7 @@@
   
   static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
   {
-       unsigned int clamp_id;
+       enum uclamp_id clamp_id;
   
         if (unlikely(!p->sched_class->uclamp_enabled))
                 return;
@@@ -1005,15 -1043,82 +1043,82 @@@
                 uclamp_rq_dec_id(rq, p, clamp_id);
   }
   
+ /*
+  * uclamp_update_active - re-account one clamp index of a task on its rq.
+  * @p: task whose clamp value has changed.
+  * @clamp_id: clamp index to refresh.
+  *
+  * Only a task whose clamp is currently marked ->active is touched: under
+  * task_rq_lock() its accounting is dropped with uclamp_rq_dec_id() and
+  * immediately re-taken with uclamp_rq_inc_id(). A task that is not ->active
+  * is left alone; it picks up the new value the next time it is enqueued.
+  */
+ static inline void
+ uclamp_update_active(struct task_struct *p, enum uclamp_id clamp_id)
+ {
+       struct rq_flags rf;
+       struct rq *rq;
+ 
+       /*
+        * Lock the task and the rq where the task is (or was) queued.
+        *
+        * We might lock the (previous) rq of a !RUNNABLE task, but that's the
+        * price to pay to safely serialize util_{min,max} updates with
+        * enqueues, dequeues and migration operations.
+        * This is the same locking schema used by __set_cpus_allowed_ptr().
+        */
+       rq = task_rq_lock(p, &rf);
+ 
+       /*
+        * Setting the clamp bucket is serialized by task_rq_lock().
+        * If the task is not yet RUNNABLE and its task_struct is not
+        * affecting a valid clamp bucket, the next time it's enqueued,
+        * it will already see the updated clamp bucket value.
+        */
+       if (p->uclamp[clamp_id].active) {
+               uclamp_rq_dec_id(rq, p, clamp_id);
+               uclamp_rq_inc_id(rq, p, clamp_id);
+       }
+ 
+       task_rq_unlock(rq, p, &rf);
+ }
+ 
+ /*
+  * Walk every task attached to @css and, for each clamp index whose bit is
+  * set in @clamps, refresh that task through uclamp_update_active().
+  */
+ static inline void
+ uclamp_update_active_tasks(struct cgroup_subsys_state *css,
+                          unsigned int clamps)
+ {
+       struct css_task_iter iter;
+       struct task_struct *task;
+       enum uclamp_id id;
+ 
+       css_task_iter_start(css, 0, &iter);
+       for (task = css_task_iter_next(&iter); task;
+            task = css_task_iter_next(&iter)) {
+               for_each_clamp_id(id) {
+                       /* Skip clamp indexes that were not flagged as changed */
+                       if (!(clamps & (0x1 << id)))
+                               continue;
+                       uclamp_update_active(task, id);
+               }
+       }
+       css_task_iter_end(&iter);
+ }
+ 
+ #ifdef CONFIG_UCLAMP_TASK_GROUP
+ static void cpu_util_update_eff(struct cgroup_subsys_state *css);
+ 
+ /*
+  * Copy the current sysctl min/max clamp values into the root task group's
+  * requested clamps, then recompute effective clamps starting at the root.
+  */
+ static void uclamp_update_root_tg(void)
+ {
+       struct task_group *root = &root_task_group;
+ 
+       uclamp_se_set(&root->uclamp_req[UCLAMP_MIN],
+                     sysctl_sched_uclamp_util_min, false);
+       uclamp_se_set(&root->uclamp_req[UCLAMP_MAX],
+                     sysctl_sched_uclamp_util_max, false);
+ 
+       /* The hierarchy walk is done inside an RCU read-side section */
+       rcu_read_lock();
+       cpu_util_update_eff(&root->css);
+       rcu_read_unlock();
+ }
+ #else
+ /* Without task-group clamping there is no root group state to refresh */
+ static void uclamp_update_root_tg(void) { }
+ #endif
+ 
   int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
                                 void __user *buffer, size_t *lenp,
                                 loff_t *ppos)
   {
+       bool update_root_tg = false;
         int old_min, old_max;
-       static DEFINE_MUTEX(mutex);
         int result;
   
-       mutex_lock(&mutex);
+       mutex_lock(&uclamp_mutex);
         old_min = sysctl_sched_uclamp_util_min;
         old_max = sysctl_sched_uclamp_util_max;
   
@@@ -1032,23 -1137,30 +1137,30 @@@
         if (old_min != sysctl_sched_uclamp_util_min) {
                 uclamp_se_set(&uclamp_default[UCLAMP_MIN],
                               sysctl_sched_uclamp_util_min, false);
+               update_root_tg = true;
         }
         if (old_max != sysctl_sched_uclamp_util_max) {
                 uclamp_se_set(&uclamp_default[UCLAMP_MAX],
                               sysctl_sched_uclamp_util_max, false);
+               update_root_tg = true;
         }
   
+       if (update_root_tg)
+               uclamp_update_root_tg();
+ 
         /*
-        * Updating all the RUNNABLE task is expensive, keep it simple and do
-        * just a lazy update at each next enqueue time.
+        * We update all RUNNABLE tasks only when task groups are in use.
+        * Otherwise, keep it simple and do just a lazy update at each next
+        * task enqueue time.
          */
+ 
         goto done;
   
   undo:
         sysctl_sched_uclamp_util_min = old_min;
         sysctl_sched_uclamp_util_max = old_max;
   done:
-       mutex_unlock(&mutex);
+       mutex_unlock(&uclamp_mutex);
   
         return result;
   }
@@@ -1075,7 -1187,7 +1187,7 @@@ static int uclamp_validate(struct task_
   static void __setscheduler_uclamp(struct task_struct *p,
                                   const struct sched_attr *attr)
   {
-       unsigned int clamp_id;
+       enum uclamp_id clamp_id;
   
         /*
          * On scheduling class change, reset to default clamps for tasks
@@@ -1112,7 -1224,7 +1224,7 @@@
   
   static void uclamp_fork(struct task_struct *p)
   {
-       unsigned int clamp_id;
+       enum uclamp_id clamp_id;
   
         for_each_clamp_id(clamp_id)
                 p->uclamp[clamp_id].active = false;
@@@ -1134,9 -1246,11 +1246,11 @@@
   static void __init init_uclamp(void)
   {
         struct uclamp_se uc_max = {};
-       unsigned int clamp_id;
+       enum uclamp_id clamp_id;
         int cpu;
   
+       mutex_init(&uclamp_mutex);
+ 
         for_each_possible_cpu(cpu) {
                 memset(&cpu_rq(cpu)->uclamp, 0, sizeof(struct uclamp_rq));
                 cpu_rq(cpu)->uclamp_flags = 0;
@@@ -1149,8 -1263,13 +1263,13 @@@
   
         /* System defaults allow max clamp values for both indexes */
         uclamp_se_set(&uc_max, uclamp_none(UCLAMP_MAX), false);
-       for_each_clamp_id(clamp_id)
+       for_each_clamp_id(clamp_id) {
                 uclamp_default[clamp_id] = uc_max;
+ #ifdef CONFIG_UCLAMP_TASK_GROUP
+               root_task_group.uclamp_req[clamp_id] = uc_max;
+               root_task_group.uclamp[clamp_id] = uc_max;
+ #endif
+       }
   }
   
   #else /* CONFIG_UCLAMP_TASK */
@@@ -1494,7 -1613,7 +1613,7 @@@ void do_set_cpus_allowed(struct task_st
         if (queued)
                 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
         if (running)
-               set_curr_task(rq, p);
+               set_next_task(rq, p);
   }
   
   /*
@@@ -3214,12 -3333,8 +3333,8 @@@ static __always_inline struct rq 
   context_switch(struct rq *rq, struct task_struct *prev,
                struct task_struct *next, struct rq_flags *rf)
   {
-       struct mm_struct *mm, *oldmm;
- 
         prepare_task_switch(rq, prev, next);
   
-       mm = next->mm;
-       oldmm = prev->active_mm;
         /*
          * For paravirt, this is coupled with an exit in switch_to to
          * combine the page table reload and the switch backend into
@@@ -3228,22 -3343,37 +3343,37 @@@
         arch_start_context_switch(prev);
   
         /*
-        * If mm is non-NULL, we pass through switch_mm(). If mm is
-        * NULL, we will pass through mmdrop() in finish_task_switch().
-        * Both of these contain the full memory barrier required by
-        * membarrier after storing to rq->curr, before returning to
-        * user-space.
+        * kernel -> kernel   lazy + transfer active
+        *   user -> kernel   lazy + mmgrab() active
+        *
+        * kernel ->   user   switch + mmdrop() active
+        *   user ->   user   switch
          */
-       if (!mm) {
-               next->active_mm = oldmm;
-               mmgrab(oldmm);
-               enter_lazy_tlb(oldmm, next);
-       } else
-               switch_mm_irqs_off(oldmm, mm, next);
+       if (!next->mm) {                                // to kernel
+               enter_lazy_tlb(prev->active_mm, next);
+ 
+               next->active_mm = prev->active_mm;
+               if (prev->mm)                           // from user
+                       mmgrab(prev->active_mm);
+               else
+                       prev->active_mm = NULL;
+       } else {                                        // to user
+               /*
+                * sys_membarrier() requires an smp_mb() between setting
+                * rq->curr and returning to userspace.
+                *
+                * The below provides this either through switch_mm(), or in
+                * case 'prev->active_mm == next->mm' through
+                * finish_task_switch()'s mmdrop().
+                */
+ 
+               switch_mm_irqs_off(prev->active_mm, next->mm, next);
   
-       if (!prev->mm) {
-               prev->active_mm = NULL;
-               rq->prev_mm = oldmm;
+               if (!prev->mm) {                        // from kernel
+                       /* will mmdrop() in finish_task_switch(). */
+                       rq->prev_mm = prev->active_mm;
+                       prev->active_mm = NULL;
+               }
         }
   
         rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
@@@ -3622,7 -3752,7 +3752,7 @@@ static inline void sched_tick_start(in
   static inline void sched_tick_stop(int cpu) { }
   #endif
   
- #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
+ #if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \
                                 defined(CONFIG_TRACE_PREEMPT_TOGGLE))
   /*
    * If the value passed in is equal to the current preempt count
@@@ -3780,7 -3910,7 +3910,7 @@@ pick_next_task(struct rq *rq, struct ta
   
                 p = fair_sched_class.pick_next_task(rq, prev, rf);
                 if (unlikely(p == RETRY_TASK))
-                       goto again;
+                       goto restart;
   
                 /* Assumes fair_sched_class->next == idle_sched_class */
                 if (unlikely(!p))
@@@ -3789,14 -3919,19 +3919,19 @@@
                 return p;
         }
   
- again:
+ restart:
+       /*
+        * Ensure that we put DL/RT tasks before the pick loop, such that they
+        * can PULL higher prio tasks when we lower the RQ 'priority'.
+        */
+       prev->sched_class->put_prev_task(rq, prev, rf);
+       if (!rq->nr_running)
+               newidle_balance(rq, rf);
+ 
         for_each_class(class) {
-               p = class->pick_next_task(rq, prev, rf);
-               if (p) {
-                       if (unlikely(p == RETRY_TASK))
-                               goto again;
+               p = class->pick_next_task(rq, NULL, NULL);
+               if (p)
                         return p;
-               }
         }
   
         /* The idle class should always have a runnable task: */
@@@ -3823,7 -3958,7 +3958,7 @@@
    *      task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
    *      called on the nearest possible occasion:
    *
-  *       - If the kernel is preemptible (CONFIG_PREEMPT=y):
+  *       - If the kernel is preemptible (CONFIG_PREEMPTION=y):
    *
    *         - in syscall or exception context, at the next outmost
    *           preempt_enable(). (this might be as soon as the wake_up()'s
@@@ -3832,7 -3967,7 +3967,7 @@@
    *         - in IRQ context, return from interrupt-handler to
    *           preemptible context
    *
-  *       - If the kernel is not preemptible (CONFIG_PREEMPT is not set)
+  *       - If the kernel is not preemptible (CONFIG_PREEMPTION is not set)
    *         then at the next:
    *
    *          - cond_resched() call
@@@ -3945,7 -4080,7 +4080,7 @@@ void __noreturn do_task_dead(void
   
   static inline void sched_submit_work(struct task_struct *tsk)
   {
- -      if (!tsk->state || tsk_is_pi_blocked(tsk))
+ +      if (!tsk->state)
                 return;
   
         /*
@@@ -3961,9 -4096,6 +4096,9 @@@
                 preempt_enable_no_resched();
         }
   
+ +      if (tsk_is_pi_blocked(tsk))
+ +              return;
+ +
         /*
          * If we are going to sleep and we have plugged IO queued,
          * make sure to submit it to avoid deadlocks.
@@@ -4077,7 -4209,7 +4212,7 @@@ static void __sched notrace preempt_sch
         } while (need_resched());
   }
   
- #ifdef CONFIG_PREEMPT
+ #ifdef CONFIG_PREEMPTION
   /*
    * this is the entry point to schedule() from in-kernel preemption
    * off of preempt_enable. Kernel preemptions off return from interrupt
@@@ -4149,7 -4281,7 +4284,7 @@@ asmlinkage __visible void __sched notra
   }
   EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
   
- #endif /* CONFIG_PREEMPT */
+ #endif /* CONFIG_PREEMPTION */
   
   /*
    * this is the entry point to schedule() from kernel preemption
@@@ -4317,7 -4449,7 +4452,7 @@@ void rt_mutex_setprio(struct task_struc
         if (queued)
                 enqueue_task(rq, p, queue_flag);
         if (running)
-               set_curr_task(rq, p);
+               set_next_task(rq, p);
   
         check_class_changed(rq, p, prev_class, oldprio);
   out_unlock:
@@@ -4384,7 -4516,7 +4519,7 @@@ void set_user_nice(struct task_struct *
                         resched_curr(rq);
         }
         if (running)
-               set_curr_task(rq, p);
+               set_next_task(rq, p);
   out_unlock:
         task_rq_unlock(rq, p, &rf);
   }
@@@ -4701,6 -4833,9 +4836,9 @@@ recheck
                         return retval;
         }
   
+       if (pi)
+               cpuset_read_lock();
+ 
         /*
          * Make sure no PI-waiters arrive (or leave) while we are
          * changing the priority of the task:
@@@ -4715,8 -4850,8 +4853,8 @@@
          * Changing the policy of the stop threads its a very bad idea:
          */
         if (p == rq->stop) {
-               task_rq_unlock(rq, p, &rf);
-               return -EINVAL;
+               retval = -EINVAL;
+               goto unlock;
         }
   
         /*
@@@ -4734,8 -4869,8 +4872,8 @@@
                         goto change;
   
                 p->sched_reset_on_fork = reset_on_fork;
-               task_rq_unlock(rq, p, &rf);
-               return 0;
+               retval = 0;
+               goto unlock;
         }
   change:
   
@@@ -4748,8 -4883,8 +4886,8 @@@
                 if (rt_bandwidth_enabled() && rt_policy(policy) &&
                                 task_group(p)->rt_bandwidth.rt_runtime == 0 &&
                                 !task_group_is_autogroup(task_group(p))) {
-                       task_rq_unlock(rq, p, &rf);
-                       return -EPERM;
+                       retval = -EPERM;
+                       goto unlock;
                 }
   #endif
   #ifdef CONFIG_SMP
@@@ -4764,8 -4899,8 +4902,8 @@@
                          */
                         if (!cpumask_subset(span, p->cpus_ptr) ||
                             rq->rd->dl_bw.bw == 0) {
-                               task_rq_unlock(rq, p, &rf);
-                               return -EPERM;
+                               retval = -EPERM;
+                               goto unlock;
                         }
                 }
   #endif
@@@ -4775,6 -4910,8 +4913,8 @@@
         if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
                 policy = oldpolicy = -1;
                 task_rq_unlock(rq, p, &rf);
+               if (pi)
+                       cpuset_read_unlock();
                 goto recheck;
         }
   
@@@ -4784,8 -4921,8 +4924,8 @@@
          * is available.
          */
         if ((dl_policy(policy) || dl_task(p)) && sched_dl_overflow(p, policy, attr)) {
-               task_rq_unlock(rq, p, &rf);
-               return -EBUSY;
+               retval = -EBUSY;
+               goto unlock;
         }
   
         p->sched_reset_on_fork = reset_on_fork;
@@@ -4827,7 -4964,7 +4967,7 @@@
                 enqueue_task(rq, p, queue_flags);
         }
         if (running)
-               set_curr_task(rq, p);
+               set_next_task(rq, p);
   
         check_class_changed(rq, p, prev_class, oldprio);
   
@@@ -4835,14 -4972,22 +4975,22 @@@
         preempt_disable();
         task_rq_unlock(rq, p, &rf);
   
-       if (pi)
+       if (pi) {
+               cpuset_read_unlock();
                 rt_mutex_adjust_pi(p);
+       }
   
         /* Run balance callbacks after we've adjusted the PI chain: */
         balance_callback(rq);
         preempt_enable();
   
         return 0;
+ 
+ unlock:
+       task_rq_unlock(rq, p, &rf);
+       if (pi)
+               cpuset_read_unlock();
+       return retval;
   }
   
   static int _sched_setscheduler(struct task_struct *p, int policy,
@@@ -4926,10 -5071,15 +5074,15 @@@ do_sched_setscheduler(pid_t pid, int po
         rcu_read_lock();
         retval = -ESRCH;
         p = find_process_by_pid(pid);
-       if (p != NULL)
-               retval = sched_setscheduler(p, policy, &lparam);
+       if (likely(p))
+               get_task_struct(p);
         rcu_read_unlock();
   
+       if (likely(p)) {
+               retval = sched_setscheduler(p, policy, &lparam);
+               put_task_struct(p);
+       }
+ 
         return retval;
   }
   
@@@ -5146,40 -5296,37 +5299,40 @@@ out_unlock
         return retval;
   }
   
- -static int sched_read_attr(struct sched_attr __user *uattr,
- -                         struct sched_attr *attr,
- -                         unsigned int usize)
+ +/*
+ + * Copy the kernel size attribute structure (which might be larger
+ + * than what user-space knows about) to user-space.
+ + *
+ + * Note that all cases are valid: user-space buffer can be larger or
+ + * smaller than the kernel-space buffer. The usual case is that both
+ + * have the same size.
+ + */
+ +static int
+ +sched_attr_copy_to_user(struct sched_attr __user *uattr,
+ +                      struct sched_attr *kattr,
+ +                      unsigned int usize)
   {
- -      int ret;
+ +      unsigned int ksize = sizeof(*kattr);
   
         if (!access_ok(uattr, usize))
                 return -EFAULT;
   
         /*
- -       * If we're handed a smaller struct than we know of,
- -       * ensure all the unknown bits are 0 - i.e. old
- -       * user-space does not get uncomplete information.
+ +       * sched_getattr() ABI forwards and backwards compatibility:
+ +       *
+ +       * If usize == ksize then we just copy everything to user-space and all is good.
+ +       *
+ +       * If usize < ksize then we only copy as much as user-space has space for,
+ +       * this keeps ABI compatibility as well. We skip the rest.
+ +       *
+ +       * If usize > ksize then user-space is using a newer version of the ABI,
+ +       * which part the kernel doesn't know about. Just ignore it - tooling can
+ +       * detect the kernel's knowledge of attributes from the attr->size value
+ +       * which is set to ksize in this case.
          */
- -      if (usize < sizeof(*attr)) {
- -              unsigned char *addr;
- -              unsigned char *end;
- -
- -              addr = (void *)attr + usize;
- -              end  = (void *)attr + sizeof(*attr);
+ +      kattr->size = min(usize, ksize);
   
- -              for (; addr < end; addr++) {
- -                      if (*addr)
- -                              return -EFBIG;
- -              }
- -
- -              attr->size = usize;
- -      }
- -
- -      ret = copy_to_user(uattr, attr, attr->size);
- -      if (ret)
+ +      if (copy_to_user(uattr, kattr, kattr->size))
                 return -EFAULT;
   
         return 0;
@@@ -5189,18 -5336,20 +5342,18 @@@
    * sys_sched_getattr - similar to sched_getparam, but with sched_attr
    * @pid: the pid in question.
    * @uattr: structure containing the extended parameters.
- - * @size: sizeof(attr) for fwd/bwd comp.
+ + * @usize: sizeof(attr) that user-space knows about, for forwards and backwards compatibility.
    * @flags: for future extension.
    */
   SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
- -              unsigned int, size, unsigned int, flags)
+ +              unsigned int, usize, unsigned int, flags)
   {
- -      struct sched_attr attr = {
- -              .size = sizeof(struct sched_attr),
- -      };
+ +      struct sched_attr kattr = { };
         struct task_struct *p;
         int retval;
   
- -      if (!uattr || pid < 0 || size > PAGE_SIZE ||
- -          size < SCHED_ATTR_SIZE_VER0 || flags)
+ +      if (!uattr || pid < 0 || usize > PAGE_SIZE ||
+ +          usize < SCHED_ATTR_SIZE_VER0 || flags)
                 return -EINVAL;
   
         rcu_read_lock();
@@@ -5213,24 -5362,25 +5366,24 @@@
         if (retval)
                 goto out_unlock;
   
- -      attr.sched_policy = p->policy;
+ +      kattr.sched_policy = p->policy;
         if (p->sched_reset_on_fork)
- -              attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
+ +              kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
         if (task_has_dl_policy(p))
- -              __getparam_dl(p, &attr);
+ +              __getparam_dl(p, &kattr);
         else if (task_has_rt_policy(p))
- -              attr.sched_priority = p->rt_priority;
+ +              kattr.sched_priority = p->rt_priority;
         else
- -              attr.sched_nice = task_nice(p);
+ +              kattr.sched_nice = task_nice(p);
   
   #ifdef CONFIG_UCLAMP_TASK
- -      attr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
- -      attr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
+ +      kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
+ +      kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
   #endif
   
         rcu_read_unlock();
   
- -      retval = sched_read_attr(uattr, &attr, size);
- -      return retval;
+ +      return sched_attr_copy_to_user(uattr, &kattr, usize);
   
   out_unlock:
         rcu_read_unlock();
@@@ -5460,7 -5610,7 +5613,7 @@@ SYSCALL_DEFINE0(sched_yield
         return 0;
   }
   
- #ifndef CONFIG_PREEMPT
+ #ifndef CONFIG_PREEMPTION
   int __sched _cond_resched(void)
   {
         if (should_resched(0)) {
@@@ -5477,7 -5627,7 +5630,7 @@@ EXPORT_SYMBOL(_cond_resched)
    * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
    * call schedule, and on return reacquire the lock.
    *
-  * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
+  * This works OK both with and without CONFIG_PREEMPTION. We do strange low-level
    * operations here to prevent schedule() from being called twice (once via
    * spin_unlock(), once by hand).
    */
@@@ -6016,7 -6166,7 +6169,7 @@@ void sched_setnuma(struct task_struct *
         if (queued)
                 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
         if (running)
-               set_curr_task(rq, p);
+               set_next_task(rq, p);
         task_rq_unlock(rq, p, &rf);
   }
   #endif /* CONFIG_NUMA_BALANCING */
@@@ -6056,21 -6206,22 +6209,22 @@@ static void calc_load_migrate(struct r
                 atomic_long_add(delta, &calc_load_tasks);
   }
   
- static void put_prev_task_fake(struct rq *rq, struct task_struct *prev)
+ static struct task_struct *__pick_migrate_task(struct rq *rq)
   {
- }
+       const struct sched_class *class;
+       struct task_struct *next;
   
- static const struct sched_class fake_sched_class = {
-       .put_prev_task = put_prev_task_fake,
- };
+       for_each_class(class) {
+               next = class->pick_next_task(rq, NULL, NULL);
+               if (next) {
+                       next->sched_class->put_prev_task(rq, next, NULL);
+                       return next;
+               }
+       }
   
- static struct task_struct fake_task = {
-       /*
-        * Avoid pull_{rt,dl}_task()
-        */
-       .prio = MAX_PRIO + 1,
-       .sched_class = &fake_sched_class,
- };
+       /* The idle class should always have a runnable task */
+       BUG();
+ }
   
   /*
    * Migrate all tasks from the rq, sleeping tasks will be migrated by
@@@ -6113,12 -6264,7 +6267,7 @@@ static void migrate_tasks(struct rq *de
                 if (rq->nr_running == 1)
                         break;
   
-               /*
-                * pick_next_task() assumes pinned rq->lock:
-                */
-               next = pick_next_task(rq, &fake_task, rf);
-               BUG_ON(!next);
-               put_prev_task(rq, next);
+               next = __pick_migrate_task(rq);
   
                 /*
                  * Rules for changing task_struct::cpus_mask are holding
@@@ -6415,19 -6561,19 +6564,19 @@@ DECLARE_PER_CPU(cpumask_var_t, select_i
   
   void __init sched_init(void)
   {
-       unsigned long alloc_size = 0, ptr;
+       unsigned long ptr = 0;
         int i;
   
         wait_bit_init();
   
   #ifdef CONFIG_FAIR_GROUP_SCHED
-       alloc_size += 2 * nr_cpu_ids * sizeof(void **);
+       ptr += 2 * nr_cpu_ids * sizeof(void **);
   #endif
   #ifdef CONFIG_RT_GROUP_SCHED
-       alloc_size += 2 * nr_cpu_ids * sizeof(void **);
+       ptr += 2 * nr_cpu_ids * sizeof(void **);
   #endif
-       if (alloc_size) {
-               ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
+       if (ptr) {
+               ptr = (unsigned long)kzalloc(ptr, GFP_NOWAIT);
   
   #ifdef CONFIG_FAIR_GROUP_SCHED
                 root_task_group.se = (struct sched_entity **)ptr;
@@@ -6746,7 -6892,7 +6895,7 @@@ struct task_struct *curr_task(int cpu
   
   #ifdef CONFIG_IA64
   /**
-  * set_curr_task - set the current task for a given CPU.
+  * ia64_set_curr_task - set the current task for a given CPU.
    * @cpu: the processor in question.
    * @p: the task pointer to set.
    *
@@@ -6771,6 -6917,20 +6920,20 @@@ void ia64_set_curr_task(int cpu, struc
   /* task_group_lock serializes the addition/removal of task groups */
   static DEFINE_SPINLOCK(task_group_lock);
   
+ /*
+  * Initialise the utilization clamps of a freshly created task group: every
+  * requested clamp starts at its uclamp_none() value and every effective
+  * clamp is copied from @parent. Compiles to nothing when
+  * CONFIG_UCLAMP_TASK_GROUP is not set.
+  */
+ static inline void alloc_uclamp_sched_group(struct task_group *tg,
+                                           struct task_group *parent)
+ {
+ #ifdef CONFIG_UCLAMP_TASK_GROUP
+       enum uclamp_id id;
+ 
+       for_each_clamp_id(id) {
+               struct uclamp_se *req = &tg->uclamp_req[id];
+ 
+               uclamp_se_set(req, uclamp_none(id), false);
+               /* Effective value is inherited as-is from the parent group */
+               tg->uclamp[id] = parent->uclamp[id];
+       }
+ #endif
+ }
+ 
   static void sched_free_group(struct task_group *tg)
   {
         free_fair_sched_group(tg);
@@@ -6794,6 -6954,8 +6957,8 @@@ struct task_group *sched_create_group(s
         if (!alloc_rt_sched_group(tg, parent))
                 goto err;
   
+       alloc_uclamp_sched_group(tg, parent);
+ 
         return tg;
   
   err:
@@@ -6897,7 -7059,7 +7062,7 @@@ void sched_move_task(struct task_struc
         if (queued)
                 enqueue_task(rq, tsk, queue_flags);
         if (running)
-               set_curr_task(rq, tsk);
+               set_next_task(rq, tsk);
   
         task_rq_unlock(rq, tsk, &rf);
   }
@@@ -6980,10 -7142,6 +7145,6 @@@ static int cpu_cgroup_can_attach(struc
   #ifdef CONFIG_RT_GROUP_SCHED
                 if (!sched_rt_can_attach(css_tg(css), task))
                         return -EINVAL;
- #else
-               /* We don't support RT-tasks being in separate groups */
-               if (task->sched_class != &fair_sched_class)
-                       return -EINVAL;
   #endif
                 /*
                  * Serialize against wake_up_new_task() such that if its
@@@ -7014,6 -7172,178 +7175,178 @@@ static void cpu_cgroup_attach(struct cg
                 sched_move_task(task);
   }
   
+ #ifdef CONFIG_UCLAMP_TASK_GROUP
+ /*
+  * cpu_util_update_eff - recompute effective clamps for @css and its subtree.
+  * @css: top of the subtree to walk, in pre-order.
+  *
+  * For each visited group the effective clamp is its requested clamp capped
+  * by the parent's effective clamp, with the min clamp additionally capped by
+  * the max clamp. Groups whose effective values change have their tasks
+  * refreshed through uclamp_update_active_tasks().
+  *
+  * Both callers visible in this file invoke it inside rcu_read_lock() with
+  * uclamp_mutex held.
+  */
+ static void cpu_util_update_eff(struct cgroup_subsys_state *css)
+ {
+       struct cgroup_subsys_state *top_css = css;
+       struct uclamp_se *uc_parent = NULL;
+       struct uclamp_se *uc_se = NULL;
+       unsigned int eff[UCLAMP_CNT];
+       enum uclamp_id clamp_id;
+       unsigned int clamps;
+ 
+       css_for_each_descendant_pre(css, top_css) {
+               /* A group without a parent has nothing to be capped by */
+               uc_parent = css_tg(css)->parent
+                       ? css_tg(css)->parent->uclamp : NULL;
+ 
+               for_each_clamp_id(clamp_id) {
+                       /* Assume effective clamps matches requested clamps */
+                       eff[clamp_id] = css_tg(css)->uclamp_req[clamp_id].value;
+                       /* Cap effective clamps with parent's effective clamps */
+                       if (uc_parent &&
+                           eff[clamp_id] > uc_parent[clamp_id].value) {
+                               eff[clamp_id] = uc_parent[clamp_id].value;
+                       }
+               }
+               /* Ensure protection is always capped by limit */
+               eff[UCLAMP_MIN] = min(eff[UCLAMP_MIN], eff[UCLAMP_MAX]);
+ 
+               /* Propagate most restrictive effective clamps */
+               clamps = 0x0;   /* bitmask of clamp indexes that changed */
+               uc_se = css_tg(css)->uclamp;
+               for_each_clamp_id(clamp_id) {
+                       if (eff[clamp_id] == uc_se[clamp_id].value)
+                               continue;
+                       uc_se[clamp_id].value = eff[clamp_id];
+                       uc_se[clamp_id].bucket_id = uclamp_bucket_id(eff[clamp_id]);
+                       clamps |= (0x1 << clamp_id);
+               }
+               if (!clamps) {
+                       /*
+                        * Nothing changed for this group. Moving the cursor
+                        * to the rightmost descendant presumably makes the
+                        * iterator resume after this group's subtree
+                        * (NOTE(review): confirm against the iterator docs).
+                        */
+                       css = css_rightmost_descendant(css);
+                       continue;
+               }
+ 
+               /* Immediately update descendants RUNNABLE tasks */
+               uclamp_update_active_tasks(css, clamps);
+       }
+ }
+ 
+ /*
+  * POW10(N) evaluates to the integer 10^N by pasting N onto "1e" to form the
+  * floating literal "1eN" and casting it to unsigned int.
+  *
+  * Two macro levels are required because the ## operator pastes its operands
+  * without macro-expanding them: going through POW10() first lets an argument
+  * that is itself a macro (e.g. UCLAMP_PERCENT_SHIFT) be replaced by its
+  * value before _POW10() pastes it.
+  */
+ #define _POW10(exp) ((unsigned int)1e##exp)
+ #define POW10(exp) _POW10(exp)
+ 
+ /*
+  * Result of parsing a clamp value written to a cgroup file, as produced by
+  * capacity_from_percent().
+  *
+  * UCLAMP_PERCENT_SHIFT is the number of decimal digits kept for a
+  * percentage; UCLAMP_PERCENT_SCALE is the fixed-point value of 100%.
+  */
+ struct uclamp_request {
+ #define UCLAMP_PERCENT_SHIFT  2
+ #define UCLAMP_PERCENT_SCALE  (100 * POW10(UCLAMP_PERCENT_SHIFT))
+       s64 percent;    /* percentage scaled by POW10(UCLAMP_PERCENT_SHIFT) */
+       u64 util;       /* same value rescaled to SCHED_CAPACITY_SCALE */
+       int ret;        /* 0 on success, negative error code otherwise */
+ };
+ 
+ /*
+  * capacity_from_percent - parse a clamp value written by user-space.
+  * @buf: NUL-terminated input; surrounding whitespace is removed with strim().
+  *
+  * Accepts the literal "max" or a percentage in the range [0, 100], parsed by
+  * cgroup_parse_float() with UCLAMP_PERCENT_SHIFT decimal digits.
+  *
+  * Returns a uclamp_request. On success ->ret is 0, ->percent holds the
+  * fixed-point percentage and ->util the value rescaled to
+  * SCHED_CAPACITY_SCALE ("max" yields UCLAMP_PERCENT_SCALE and
+  * SCHED_CAPACITY_SCALE). On failure ->ret is the cgroup_parse_float() error,
+  * or -ERANGE when the percentage is negative or above 100%.
+  */
+ static inline struct uclamp_request
+ capacity_from_percent(char *buf)
+ {
+       struct uclamp_request req = {
+               .percent = UCLAMP_PERCENT_SCALE,
+               .util = SCHED_CAPACITY_SCALE,
+               .ret = 0,
+       };
+ 
+       buf = strim(buf);
+       if (strcmp(buf, "max")) {
+               req.ret = cgroup_parse_float(buf, UCLAMP_PERCENT_SHIFT,
+                                            &req.percent);
+               if (req.ret)
+                       return req;
+               /*
+                * ->percent is signed: without the lower bound a negative
+                * input would pass the check and be shifted into ->util.
+                */
+               if (req.percent < 0 || req.percent > UCLAMP_PERCENT_SCALE) {
+                       req.ret = -ERANGE;
+                       return req;
+               }
+ 
+               req.util = req.percent << SCHED_CAPACITY_SHIFT;
+               req.util = DIV_ROUND_CLOSEST_ULL(req.util, UCLAMP_PERCENT_SCALE);
+       }
+ 
+       return req;
+ }
+ 
+ /*
+  * Common write handler behind the uclamp.min and uclamp.max cgroup files.
+  *
+  * Parses @buf with capacity_from_percent(), records the request for
+  * @clamp_id in the task group that owns @of, then recomputes effective
+  * clamps from that group downwards. Returns @nbytes on success or the parse
+  * error code.
+  */
+ static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, char *buf,
+                               size_t nbytes, loff_t off,
+                               enum uclamp_id clamp_id)
+ {
+       struct uclamp_request parsed = capacity_from_percent(buf);
+       struct cgroup_subsys_state *css;
+       struct task_group *tg;
+ 
+       if (parsed.ret)
+               return parsed.ret;
+ 
+       mutex_lock(&uclamp_mutex);
+       rcu_read_lock();
+ 
+       css = of_css(of);
+       tg = css_tg(css);
+ 
+       /* Only rewrite the requested clamp when the value really differs */
+       if (tg->uclamp_req[clamp_id].value != parsed.util)
+               uclamp_se_set(&tg->uclamp_req[clamp_id], parsed.util, false);
+ 
+       /*
+        * The conversion to util rounds and cannot be undone, so remember
+        * the exact percentage that was asked for.
+        */
+       tg->uclamp_pct[clamp_id] = parsed.percent;
+ 
+       /* Effective clamps must follow the most restrictive value */
+       cpu_util_update_eff(css);
+ 
+       rcu_read_unlock();
+       mutex_unlock(&uclamp_mutex);
+ 
+       return nbytes;
+ }
+ 
+ /* Write handler for the "uclamp.min" file: cpu_uclamp_write() for UCLAMP_MIN */
+ static ssize_t cpu_uclamp_min_write(struct kernfs_open_file *of,
+                                   char *buf, size_t nbytes,
+                                   loff_t off)
+ {
+       return cpu_uclamp_write(of, buf, nbytes, off, UCLAMP_MIN);
+ }
+ 
+ /* Write handler for the "uclamp.max" file: cpu_uclamp_write() for UCLAMP_MAX */
+ static ssize_t cpu_uclamp_max_write(struct kernfs_open_file *of,
+                                   char *buf, size_t nbytes,
+                                   loff_t off)
+ {
+       return cpu_uclamp_write(of, buf, nbytes, off, UCLAMP_MAX);
+ }
+ 
+ /*
+  * Emit the requested clamp @clamp_id of the task group behind @sf.
+  *
+  * Prints "max" when the requested value equals SCHED_CAPACITY_SCALE,
+  * otherwise the stored percentage with UCLAMP_PERCENT_SHIFT decimals.
+  */
+ static inline void cpu_uclamp_print(struct seq_file *sf,
+                                   enum uclamp_id clamp_id)
+ {
+       struct task_group *tg;
+       u64 requested;
+       u64 whole;
+       u32 frac;
+ 
+       rcu_read_lock();
+       tg = css_tg(seq_css(sf));
+       requested = tg->uclamp_req[clamp_id].value;
+       rcu_read_unlock();
+ 
+       if (requested != SCHED_CAPACITY_SCALE) {
+               /* Split the fixed-point percentage into integer and fraction */
+               whole = tg->uclamp_pct[clamp_id];
+               whole = div_u64_rem(whole, POW10(UCLAMP_PERCENT_SHIFT), &frac);
+               seq_printf(sf, "%llu.%0*u\n", whole, UCLAMP_PERCENT_SHIFT, frac);
+               return;
+       }
+ 
+       seq_puts(sf, "max\n");
+ }
+ 
+ /* seq_show handler for "uclamp.min": prints the UCLAMP_MIN request, returns 0 */
+ static int cpu_uclamp_min_show(struct seq_file *sf, void *v)
+ {
+       cpu_uclamp_print(sf, UCLAMP_MIN);
+       return 0;
+ }
+ 
+ /* seq_show handler for "uclamp.max": prints the UCLAMP_MAX request, returns 0 */
+ static int cpu_uclamp_max_show(struct seq_file *sf, void *v)
+ {
+       cpu_uclamp_print(sf, UCLAMP_MAX);
+       return 0;
+ }
+ #endif /* CONFIG_UCLAMP_TASK_GROUP */
+ 
   #ifdef CONFIG_FAIR_GROUP_SCHED
   static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
                                 struct cftype *cftype, u64 shareval)
@@@ -7358,6 -7688,20 +7691,20 @@@ static struct cftype cpu_legacy_files[
                 .read_u64 = cpu_rt_period_read_uint,
                 .write_u64 = cpu_rt_period_write_uint,
         },
+ #endif
+ #ifdef CONFIG_UCLAMP_TASK_GROUP
+       {
+               .name = "uclamp.min",
+               .flags = CFTYPE_NOT_ON_ROOT,
+               .seq_show = cpu_uclamp_min_show,
+               .write = cpu_uclamp_min_write,
+       },
+       {
+               .name = "uclamp.max",
+               .flags = CFTYPE_NOT_ON_ROOT,
+               .seq_show = cpu_uclamp_max_show,
+               .write = cpu_uclamp_max_write,
+       },
   #endif
         { }     /* Terminate */
   };
@@@ -7525,6 -7869,20 +7872,20 @@@ static struct cftype cpu_files[] = 
                 .seq_show = cpu_max_show,
                 .write = cpu_max_write,
         },
+ #endif
+ #ifdef CONFIG_UCLAMP_TASK_GROUP
+       {
+               .name = "uclamp.min",
+               .flags = CFTYPE_NOT_ON_ROOT,
+               .seq_show = cpu_uclamp_min_show,
+               .write = cpu_uclamp_min_write,
+       },
+       {
+               .name = "uclamp.max",
+               .flags = CFTYPE_NOT_ON_ROOT,
+               .seq_show = cpu_uclamp_max_show,
+               .write = cpu_uclamp_max_write,
+       },
   #endif
         { }     /* terminate */
   };
diff --combined kernel/sched/cpufreq_schedutil.c

index 867b4bb6d4beb541d1d9eb087711d1e52a446416,e127d89d5974499bfbcba34e6d0a456c0d9ac6d3..fdce9cfaca05b802c87a232d77903d6731809716
--- 1/kernel/sched/cpufreq_schedutil.c
--- 2/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@@ -40,7 -40,6 +40,7 @@@ struct sugov_policy 
         struct task_struct      *thread;
         bool                    work_in_progress;
   
+ +      bool                    limits_changed;
         bool                    need_freq_update;
   };
   
@@@ -90,11 -89,8 +90,11 @@@ static bool sugov_should_update_freq(st
             !cpufreq_this_cpu_can_update(sg_policy->policy))
                 return false;
   
- -      if (unlikely(sg_policy->need_freq_update))
+ +      if (unlikely(sg_policy->limits_changed)) {
+ +              sg_policy->limits_changed = false;
+ +              sg_policy->need_freq_update = true;
                 return true;
+ +      }
   
         delta_ns = time - sg_policy->last_freq_update_time;
   
@@@ -263,9 -259,9 +263,9 @@@ unsigned long schedutil_cpu_util(int cp
          * irq metric. Because IRQ/steal time is hidden from the task clock we
          * need to scale the task numbers:
          *
-        *              1 - irq
-        *   U' = irq + ------- * U
-        *                max
+        *              max - irq
+        *   U' = irq + --------- * U
+        *                 max
          */
         util = scale_irq_capacity(util, irq, max);
         util += irq;
@@@ -441,7 -437,7 +441,7 @@@ static inline bool sugov_cpu_is_busy(st
   static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu, struct sugov_policy *sg_policy)
   {
         if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_dl)
- -              sg_policy->need_freq_update = true;
+ +              sg_policy->limits_changed = true;
   }
   
   static void sugov_update_single(struct update_util_data *hook, u64 time,
@@@ -461,8 -457,7 +461,8 @@@
         if (!sugov_should_update_freq(sg_policy, time))
                 return;
   
- -      busy = sugov_cpu_is_busy(sg_cpu);
+ +      /* Limits may have changed, don't skip frequency update */
+ +      busy = !sg_policy->need_freq_update && sugov_cpu_is_busy(sg_cpu);
   
         util = sugov_get_util(sg_cpu);
         max = sg_cpu->max;
@@@ -836,7 -831,6 +836,7 @@@ static int sugov_start(struct cpufreq_p
         sg_policy->last_freq_update_time        = 0;
         sg_policy->next_freq                    = 0;
         sg_policy->work_in_progress             = false;
+ +      sg_policy->limits_changed               = false;
         sg_policy->need_freq_update             = false;
         sg_policy->cached_raw_freq              = 0;
   
@@@ -885,7 -879,7 +885,7 @@@ static void sugov_limits(struct cpufreq
                 mutex_unlock(&sg_policy->work_lock);
         }
   
- -      sg_policy->need_freq_update = true;
+ +      sg_policy->limits_changed = true;
   }
   
   struct cpufreq_governor schedutil_gov = {
diff --combined kernel/sched/deadline.c

index 46122edd8552c9abd7acb3cf665332d91746ed7d,0b9cbfb2b1d4fcfd339c20c6e8ecb29d78444032..39dc9f74f2898f13b56837f8073f49043275a5d2
--- 1/kernel/sched/deadline.c
--- 2/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@@ -529,6 -529,7 +529,7 @@@ static struct rq *find_lock_later_rq(st
   static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p)
   {
         struct rq *later_rq = NULL;
+       struct dl_bw *dl_b;
   
         later_rq = find_lock_later_rq(p, rq);
         if (!later_rq) {
@@@ -557,6 -558,38 +558,38 @@@
                 double_lock_balance(rq, later_rq);
         }
   
+       if (p->dl.dl_non_contending || p->dl.dl_throttled) {
+               /*
+                * Inactive timer is armed (or callback is running, but
+                * waiting for us to release rq locks). In any case, when it
+                * will fire (or continue), it will see running_bw of this
+                * task migrated to later_rq (and correctly handle it).
+                */
+               sub_running_bw(&p->dl, &rq->dl);
+               sub_rq_bw(&p->dl, &rq->dl);
+ 
+               add_rq_bw(&p->dl, &later_rq->dl);
+               add_running_bw(&p->dl, &later_rq->dl);
+       } else {
+               sub_rq_bw(&p->dl, &rq->dl);
+               add_rq_bw(&p->dl, &later_rq->dl);
+       }
+ 
+       /*
+        * And we finally need to fixup root_domain(s) bandwidth accounting,
+        * since p is still hanging out in the old (now moved to default) root
+        * domain.
+        */
+       dl_b = &rq->rd->dl_bw;
+       raw_spin_lock(&dl_b->lock);
+       __dl_sub(dl_b, p->dl.dl_bw, cpumask_weight(rq->rd->span));
+       raw_spin_unlock(&dl_b->lock);
+ 
+       dl_b = &later_rq->rd->dl_bw;
+       raw_spin_lock(&dl_b->lock);
+       __dl_add(dl_b, p->dl.dl_bw, cpumask_weight(later_rq->rd->span));
+       raw_spin_unlock(&dl_b->lock);
+ 
         set_task_cpu(p, later_rq->cpu);
         double_unlock_balance(later_rq, rq);
   
@@@ -1694,12 -1727,20 +1727,20 @@@ static void start_hrtick_dl(struct rq *
   }
   #endif
   
- static inline void set_next_task(struct rq *rq, struct task_struct *p)
+ static void set_next_task_dl(struct rq *rq, struct task_struct *p)
   {
         p->se.exec_start = rq_clock_task(rq);
   
         /* You can't push away the running task */
         dequeue_pushable_dl_task(rq, p);
+ 
+       if (hrtick_enabled(rq))
+               start_hrtick_dl(rq, p);
+ 
+       if (rq->curr->sched_class != &dl_sched_class)
+               update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0);
+ 
+       deadline_queue_push_tasks(rq);
   }
   
   static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
@@@ -1720,64 -1761,42 +1761,42 @@@ pick_next_task_dl(struct rq *rq, struc
         struct task_struct *p;
         struct dl_rq *dl_rq;
   
-       dl_rq = &rq->dl;
- 
-       if (need_pull_dl_task(rq, prev)) {
-               /*
-                * This is OK, because current is on_cpu, which avoids it being
-                * picked for load-balance and preemption/IRQs are still
-                * disabled avoiding further scheduler activity on it and we're
-                * being very careful to re-start the picking loop.
-                */
-               rq_unpin_lock(rq, rf);
-               pull_dl_task(rq);
-               rq_repin_lock(rq, rf);
-               /*
-                * pull_dl_task() can drop (and re-acquire) rq->lock; this
-                * means a stop task can slip in, in which case we need to
-                * re-start task selection.
-                */
-               if (rq->stop && task_on_rq_queued(rq->stop))
-                       return RETRY_TASK;
-       }
+       WARN_ON_ONCE(prev || rf);
   
-       /*
-        * When prev is DL, we may throttle it in put_prev_task().
-        * So, we update time before we check for dl_nr_running.
-        */
-       if (prev->sched_class == &dl_sched_class)
-               update_curr_dl(rq);
+       dl_rq = &rq->dl;
   
         if (unlikely(!dl_rq->dl_nr_running))
                 return NULL;
   
-       put_prev_task(rq, prev);
- 
         dl_se = pick_next_dl_entity(rq, dl_rq);
         BUG_ON(!dl_se);
   
         p = dl_task_of(dl_se);
   
-       set_next_task(rq, p);
- 
-       if (hrtick_enabled(rq))
-               start_hrtick_dl(rq, p);
- 
-       deadline_queue_push_tasks(rq);
- 
-       if (rq->curr->sched_class != &dl_sched_class)
-               update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0);
+       set_next_task_dl(rq, p);
   
         return p;
   }
   
- static void put_prev_task_dl(struct rq *rq, struct task_struct *p)
+ static void put_prev_task_dl(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
   {
         update_curr_dl(rq);
   
         update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 1);
         if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1)
                 enqueue_pushable_dl_task(rq, p);
+ 
+       if (rf && !on_dl_rq(&p->dl) && need_pull_dl_task(rq, p)) {
+               /*
+                * This is OK, because current is on_cpu, which avoids it being
+                * picked for load-balance and preemption/IRQs are still
+                * disabled avoiding further scheduler activity on it and we've
+                * not yet started the picking loop.
+                */
+               rq_unpin_lock(rq, rf);
+               pull_dl_task(rq);
+               rq_repin_lock(rq, rf);
+       }
   }
   
   /*
@@@ -1811,11 -1830,6 +1830,6 @@@ static void task_fork_dl(struct task_st
          */
   }
   
- static void set_curr_task_dl(struct rq *rq)
- {
-       set_next_task(rq, rq->curr);
- }
- 
   #ifdef CONFIG_SMP
   
   /* Only try algorithms three times */
@@@ -2088,13 -2102,17 +2102,13 @@@ retry
         }
   
         deactivate_task(rq, next_task, 0);
- -      sub_running_bw(&next_task->dl, &rq->dl);
- -      sub_rq_bw(&next_task->dl, &rq->dl);
         set_task_cpu(next_task, later_rq->cpu);
- -      add_rq_bw(&next_task->dl, &later_rq->dl);
   
         /*
          * Update the later_rq clock here, because the clock is used
          * by the cpufreq_update_util() inside __add_running_bw().
          */
         update_rq_clock(later_rq);
- -      add_running_bw(&next_task->dl, &later_rq->dl);
         activate_task(later_rq, next_task, ENQUEUE_NOCLOCK);
         ret = 1;
   
@@@ -2182,7 -2200,11 +2196,7 @@@ static void pull_dl_task(struct rq *thi
                         resched = true;
   
                         deactivate_task(src_rq, p, 0);
- -                      sub_running_bw(&p->dl, &src_rq->dl);
- -                      sub_rq_bw(&p->dl, &src_rq->dl);
                         set_task_cpu(p, this_cpu);
- -                      add_rq_bw(&p->dl, &this_rq->dl);
- -                      add_running_bw(&p->dl, &this_rq->dl);
                         activate_task(this_rq, p, 0);
                         dmin = p->dl.deadline;
   
@@@ -2275,6 -2297,36 +2289,36 @@@ void __init init_sched_dl_class(void
                                         GFP_KERNEL, cpu_to_node(i));
   }
   
+ /*
+  * Account the bandwidth of deadline task @p in the root domain of the
+  * runqueue it currently belongs to.
+  *
+  * The task's rq is locked via task_rq_lock() for the whole operation.
+  * Tasks for which dl_task() is false are left alone. Otherwise
+  * p->dl.dl_bw is added to rq->rd->dl_bw with __dl_add(), under that
+  * dl_bw's own lock.
+  */
+ void dl_add_task_root_domain(struct task_struct *p)
+ {
+       struct rq_flags rf;
+       struct rq *rq;
+       struct dl_bw *dl_b;
+ 
+       rq = task_rq_lock(p, &rf);
+       /* Nothing to account for tasks that are not deadline tasks */
+       if (!dl_task(p))
+               goto unlock;
+ 
+       dl_b = &rq->rd->dl_bw;
+       raw_spin_lock(&dl_b->lock);
+ 
+       /* Third argument: number of CPUs set in the root domain's span */
+       __dl_add(dl_b, p->dl.dl_bw, cpumask_weight(rq->rd->span));
+ 
+       raw_spin_unlock(&dl_b->lock);
+ 
+ unlock:
+       task_rq_unlock(rq, p, &rf);
+ }
+ 
+ /*
+  * Reset the deadline bandwidth accounted in root domain @rd: total_bw is
+  * set to 0 while holding rd->dl_bw.lock with interrupt state saved and
+  * restored around the update.
+  */
+ void dl_clear_root_domain(struct root_domain *rd)
+ {
+       unsigned long flags;
+ 
+       raw_spin_lock_irqsave(&rd->dl_bw.lock, flags);
+       rd->dl_bw.total_bw = 0;
+       raw_spin_unlock_irqrestore(&rd->dl_bw.lock, flags);
+ }
+ 
   #endif /* CONFIG_SMP */
   
   static void switched_from_dl(struct rq *rq, struct task_struct *p)
@@@ -2395,6 -2447,7 +2439,7 @@@ const struct sched_class dl_sched_clas
   
         .pick_next_task         = pick_next_task_dl,
         .put_prev_task          = put_prev_task_dl,
+       .set_next_task          = set_next_task_dl,
   
   #ifdef CONFIG_SMP
         .select_task_rq         = select_task_rq_dl,
@@@ -2405,7 -2458,6 +2450,6 @@@
         .task_woken             = task_woken_dl,
   #endif
   
-       .set_curr_task          = set_curr_task_dl,
         .task_tick              = task_tick_dl,
         .task_fork              = task_fork_dl,
   
diff --combined kernel/sched/fair.c

index 500f5db0de0ba86a331586d4189e3b299cb6148e,1f0a5e1a90faf36d18277401df7d321c1dbfdb26..d4bbf68c31611fcd6fa3da456ef435021cefae53
--- 1/kernel/sched/fair.c
--- 2/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@@ -96,12 -96,12 +96,12 @@@ int __weak arch_asym_cpu_priority(int c
   }
   
   /*
-  * The margin used when comparing utilization with CPU capacity:
-  * util * margin < capacity * 1024
+  * The margin used when comparing utilization with CPU capacity.
    *
    * (default: ~20%)
    */
- static unsigned int capacity_margin                   = 1280;
+ #define fits_capacity(cap, max)       ((cap) * 1280 < (max) * 1024)
+ 
   #endif
   
   #ifdef CONFIG_CFS_BANDWIDTH
@@@ -1188,47 -1188,6 +1188,6 @@@ static unsigned int task_scan_max(struc
         return max(smin, smax);
   }
   
- void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
- {
-       int mm_users = 0;
-       struct mm_struct *mm = p->mm;
- 
-       if (mm) {
-               mm_users = atomic_read(&mm->mm_users);
-               if (mm_users == 1) {
-                       mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
-                       mm->numa_scan_seq = 0;
-               }
-       }
-       p->node_stamp                   = 0;
-       p->numa_scan_seq                = mm ? mm->numa_scan_seq : 0;
-       p->numa_scan_period             = sysctl_numa_balancing_scan_delay;
-       p->numa_work.next               = &p->numa_work;
-       p->numa_faults                  = NULL;
-       RCU_INIT_POINTER(p->numa_group, NULL);
-       p->last_task_numa_placement     = 0;
-       p->last_sum_exec_runtime        = 0;
- 
-       /* New address space, reset the preferred nid */
-       if (!(clone_flags & CLONE_VM)) {
-               p->numa_preferred_nid = NUMA_NO_NODE;
-               return;
-       }
- 
-       /*
-        * New thread, keep existing numa_preferred_nid which should be copied
-        * already by arch_dup_task_struct but stagger when scans start.
-        */
-       if (mm) {
-               unsigned int delay;
- 
-               delay = min_t(unsigned int, task_scan_max(current),
-                       current->numa_scan_period * mm_users * NSEC_PER_MSEC);
-               delay += 2 * TICK_NSEC;
-               p->node_stamp = delay;
-       }
- }
- 
   static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
   {
         rq->nr_numa_running += (p->numa_preferred_nid != NUMA_NO_NODE);
@@@ -2523,7 -2482,7 +2482,7 @@@ static void reset_ptenuma_scan(struct t
    * The expensive part of numa migration is done from task_work context.
    * Triggered from task_tick_numa().
    */
- void task_numa_work(struct callback_head *work)
+ static void task_numa_work(struct callback_head *work)
   {
         unsigned long migrate, next_scan, now = jiffies;
         struct task_struct *p = current;
@@@ -2536,7 -2495,7 +2495,7 @@@
   
         SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work));
   
-       work->next = work; /* protect against double add */
+       work->next = work;
         /*
          * Who cares about NUMA placement when they're dying.
          *
@@@ -2665,6 -2624,50 +2624,50 @@@ out
         }
   }
   
+ /*
+  * Initialise the NUMA balancing state of task @p.
+  *
+  * @clone_flags: clone flags of the new task; only CLONE_VM is examined.
+  * @p:           task whose NUMA scan fields are reset.
+  *
+  * Besides resetting the per-task fields, this registers task_numa_work()
+  * as the callback of p->numa_work. Without CLONE_VM the preferred node is
+  * reset to NUMA_NO_NODE; with CLONE_VM and a non-NULL mm, p->node_stamp is
+  * set to a computed delay.
+  */
+ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
+ {
+       int mm_users = 0;
+       struct mm_struct *mm = p->mm;
+ 
+       if (mm) {
+               mm_users = atomic_read(&mm->mm_users);
+               /* Sole user of the mm: (re)start its scan bookkeeping */
+               if (mm_users == 1) {
+                       mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
+                       mm->numa_scan_seq = 0;
+               }
+       }
+       p->node_stamp                   = 0;
+       p->numa_scan_seq                = mm ? mm->numa_scan_seq : 0;
+       p->numa_scan_period             = sysctl_numa_balancing_scan_delay;
+       /* Protect against double add, see task_tick_numa and task_numa_work */
+       p->numa_work.next               = &p->numa_work;
+       p->numa_faults                  = NULL;
+       RCU_INIT_POINTER(p->numa_group, NULL);
+       p->last_task_numa_placement     = 0;
+       p->last_sum_exec_runtime        = 0;
+ 
+       /* Set the work callback once here rather than on every tick */
+       init_task_work(&p->numa_work, task_numa_work);
+ 
+       /* New address space, reset the preferred nid */
+       if (!(clone_flags & CLONE_VM)) {
+               p->numa_preferred_nid = NUMA_NO_NODE;
+               return;
+       }
+ 
+       /*
+        * New thread, keep existing numa_preferred_nid which should be copied
+        * already by arch_dup_task_struct but stagger when scans start.
+        */
+       if (mm) {
+               unsigned int delay;
+ 
+               /*
+                * Delay is the smaller of task_scan_max(current) and
+                * current's scan period scaled by mm_users and
+                * NSEC_PER_MSEC, plus two ticks.
+                */
+               delay = min_t(unsigned int, task_scan_max(current),
+                       current->numa_scan_period * mm_users * NSEC_PER_MSEC);
+               delay += 2 * TICK_NSEC;
+               p->node_stamp = delay;
+       }
+ }
+ 
   /*
    * Drive the periodic memory faults..
    */
@@@ -2693,10 -2696,8 +2696,8 @@@ static void task_tick_numa(struct rq *r
                         curr->numa_scan_period = task_scan_start(curr);
                 curr->node_stamp += period;
   
-               if (!time_before(jiffies, curr->mm->numa_next_scan)) {
-                       init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
+               if (!time_before(jiffies, curr->mm->numa_next_scan))
                         task_work_add(curr, work, true);
-               }
         }
   }
   
@@@ -3689,8 -3690,6 +3690,6 @@@ static inline unsigned long cfs_rq_load
         return cfs_rq->avg.load_avg;
   }
   
- static int idle_balance(struct rq *this_rq, struct rq_flags *rf);
- 
   static inline unsigned long task_util(struct task_struct *p)
   {
         return READ_ONCE(p->se.avg.util_avg);
@@@ -3807,7 -3806,7 +3806,7 @@@ util_est_dequeue(struct cfs_rq *cfs_rq
   
   static inline int task_fits_capacity(struct task_struct *p, long capacity)
   {
-       return capacity * 1024 > task_util_est(p) * capacity_margin;
+       return fits_capacity(task_util_est(p), capacity);
   }
   
   static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
@@@ -4370,8 -4369,6 +4369,6 @@@ void __refill_cfs_bandwidth_runtime(str
   
         now = sched_clock_cpu(smp_processor_id());
         cfs_b->runtime = cfs_b->quota;
-       cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
-       cfs_b->expires_seq++;
   }
   
   static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
@@@ -4393,8 -4390,7 +4390,7 @@@ static int assign_cfs_rq_runtime(struc
   {
         struct task_group *tg = cfs_rq->tg;
         struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
-       u64 amount = 0, min_amount, expires;
-       int expires_seq;
+       u64 amount = 0, min_amount;
   
         /* note: this is a positive sum as runtime_remaining <= 0 */
         min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
@@@ -4411,67 -4407,21 +4407,23 @@@
                         cfs_b->idle = 0;
                 }
         }
-       expires_seq = cfs_b->expires_seq;
-       expires = cfs_b->runtime_expires;
         raw_spin_unlock(&cfs_b->lock);
   
         cfs_rq->runtime_remaining += amount;
-       /*
-        * we may have advanced our local expiration to account for allowed
-        * spread between our sched_clock and the one on which runtime was
-        * issued.
-        */
-       if (cfs_rq->expires_seq != expires_seq) {
-               cfs_rq->expires_seq = expires_seq;
-               cfs_rq->runtime_expires = expires;
-       }
   
         return cfs_rq->runtime_remaining > 0;
   }
   
- /*
-  * Note: This depends on the synchronization provided by sched_clock and the
-  * fact that rq->clock snapshots this value.
-  */
- static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
- {
-       struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
- 
-       /* if the deadline is ahead of our clock, nothing to do */
-       if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0))
-               return;
- 
-       if (cfs_rq->runtime_remaining < 0)
-               return;
- 
-       /*
-        * If the local deadline has passed we have to consider the
-        * possibility that our sched_clock is 'fast' and the global deadline
-        * has not truly expired.
-        *
-        * Fortunately we can check determine whether this the case by checking
-        * whether the global deadline(cfs_b->expires_seq) has advanced.
-        */
-       if (cfs_rq->expires_seq == cfs_b->expires_seq) {
-               /* extend local deadline, drift is bounded above by 2 ticks */
-               cfs_rq->runtime_expires += TICK_NSEC;
-       } else {
-               /* global deadline is ahead, expiration has passed */
-               cfs_rq->runtime_remaining = 0;
-       }
- }
- 
   static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
   {
         /* dock delta_exec before expiring quota (as it could span periods) */
         cfs_rq->runtime_remaining -= delta_exec;
-       expire_cfs_rq_runtime(cfs_rq);
   
         if (likely(cfs_rq->runtime_remaining > 0))
                 return;
   
+ +      if (cfs_rq->throttled)
+ +              return;
         /*
          * if we're unable to extend our runtime we resched so that the active
          * hierarchy can be throttled
@@@ -4556,7 -4506,7 +4508,7 @@@ static void throttle_cfs_rq(struct cfs_
         struct rq *rq = rq_of(cfs_rq);
         struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
         struct sched_entity *se;
-       long task_delta, dequeue = 1;
+       long task_delta, idle_task_delta, dequeue = 1;
         bool empty;
   
         se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
@@@ -4567,6 -4517,7 +4519,7 @@@
         rcu_read_unlock();
   
         task_delta = cfs_rq->h_nr_running;
+       idle_task_delta = cfs_rq->idle_h_nr_running;
         for_each_sched_entity(se) {
                 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
                 /* throttled entity or throttle-on-deactivate */
@@@ -4576,6 -4527,7 +4529,7 @@@
                 if (dequeue)
                         dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
                 qcfs_rq->h_nr_running -= task_delta;
+               qcfs_rq->idle_h_nr_running -= idle_task_delta;
   
                 if (qcfs_rq->load.weight)
                         dequeue = 0;
@@@ -4615,7 -4567,7 +4569,7 @@@ void unthrottle_cfs_rq(struct cfs_rq *c
         struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
         struct sched_entity *se;
         int enqueue = 1;
-       long task_delta;
+       long task_delta, idle_task_delta;
   
         se = cfs_rq->tg->se[cpu_of(rq)];
   
@@@ -4635,6 -4587,7 +4589,7 @@@
                 return;
   
         task_delta = cfs_rq->h_nr_running;
+       idle_task_delta = cfs_rq->idle_h_nr_running;
         for_each_sched_entity(se) {
                 if (se->on_rq)
                         enqueue = 0;
@@@ -4643,6 -4596,7 +4598,7 @@@
                 if (enqueue)
                         enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
                 cfs_rq->h_nr_running += task_delta;
+               cfs_rq->idle_h_nr_running += idle_task_delta;
   
                 if (cfs_rq_throttled(cfs_rq))
                         break;
@@@ -4658,8 -4612,7 +4614,7 @@@
                 resched_curr(rq);
   }
   
- static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
-               u64 remaining, u64 expires)
+ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, u64 remaining)
   {
         struct cfs_rq *cfs_rq;
         u64 runtime;
@@@ -4675,16 -4628,12 +4630,15 @@@
                 if (!cfs_rq_throttled(cfs_rq))
                         goto next;
   
+ +              /* By the above check, this should never be true */
+ +              SCHED_WARN_ON(cfs_rq->runtime_remaining > 0);
+ +
                 runtime = -cfs_rq->runtime_remaining + 1;
                 if (runtime > remaining)
                         runtime = remaining;
                 remaining -= runtime;
   
                 cfs_rq->runtime_remaining += runtime;
-               cfs_rq->runtime_expires = expires;
   
                 /* we check whether we're throttled above */
                 if (cfs_rq->runtime_remaining > 0)
@@@ -4709,7 -4658,7 +4663,7 @@@ next
    */
   static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, unsigned long flags)
   {
-       u64 runtime, runtime_expires;
+       u64 runtime;
         int throttled;
   
         /* no need to continue the timer with no bandwidth constraint */
@@@ -4737,8 -4686,6 +4691,6 @@@
         /* account preceding periods in which throttling occurred */
         cfs_b->nr_throttled += overrun;
   
-       runtime_expires = cfs_b->runtime_expires;
- 
         /*
          * This check is repeated as we are holding onto the new bandwidth while
          * we unthrottle. This can potentially race with an unthrottled group
@@@ -4751,8 -4698,7 +4703,7 @@@
                 cfs_b->distribute_running = 1;
                 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
                 /* we can't nest cfs_b->lock while distributing bandwidth */
-               runtime = distribute_cfs_runtime(cfs_b, runtime,
-                                                runtime_expires);
+               runtime = distribute_cfs_runtime(cfs_b, runtime);
                 raw_spin_lock_irqsave(&cfs_b->lock, flags);
   
                 cfs_b->distribute_running = 0;
@@@ -4834,8 -4780,7 +4785,7 @@@ static void __return_cfs_rq_runtime(str
                 return;
   
         raw_spin_lock(&cfs_b->lock);
-       if (cfs_b->quota != RUNTIME_INF &&
-           cfs_rq->runtime_expires == cfs_b->runtime_expires) {
+       if (cfs_b->quota != RUNTIME_INF) {
                 cfs_b->runtime += slack_runtime;
   
                 /* we are under rq->lock, defer unthrottling using a timer */
@@@ -4868,7 -4813,6 +4818,6 @@@ static void do_sched_cfs_slack_timer(st
   {
         u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
         unsigned long flags;
-       u64 expires;
   
         /* confirm we're still not at a refresh boundary */
         raw_spin_lock_irqsave(&cfs_b->lock, flags);
@@@ -4886,7 -4830,6 +4835,6 @@@
         if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
                 runtime = cfs_b->runtime;
   
-       expires = cfs_b->runtime_expires;
         if (runtime)
                 cfs_b->distribute_running = 1;
   
@@@ -4895,11 -4838,10 +4843,10 @@@
         if (!runtime)
                 return;
   
-       runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
+       runtime = distribute_cfs_runtime(cfs_b, runtime);
   
         raw_spin_lock_irqsave(&cfs_b->lock, flags);
-       if (expires == cfs_b->runtime_expires)
-               lsub_positive(&cfs_b->runtime, runtime);
+       lsub_positive(&cfs_b->runtime, runtime);
         cfs_b->distribute_running = 0;
         raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
   }
@@@ -5056,8 -4998,6 +5003,6 @@@ void start_cfs_bandwidth(struct cfs_ban
   
         cfs_b->period_active = 1;
         overrun = hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
-       cfs_b->runtime_expires += (overrun + 1) * ktime_to_ns(cfs_b->period);
-       cfs_b->expires_seq++;
         hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
   }
   
@@@ -5235,7 -5175,7 +5180,7 @@@ static inline unsigned long cpu_util(in
   
   static inline bool cpu_overutilized(int cpu)
   {
-       return (capacity_of(cpu) * 1024) < (cpu_util(cpu) * capacity_margin);
+       return !fits_capacity(cpu_util(cpu), capacity_of(cpu));
   }
   
   static inline void update_overutilized_status(struct rq *rq)
@@@ -5259,6 -5199,7 +5204,7 @@@ enqueue_task_fair(struct rq *rq, struc
   {
         struct cfs_rq *cfs_rq;
         struct sched_entity *se = &p->se;
+       int idle_h_nr_running = task_has_idle_policy(p);
   
         /*
          * The code below (indirectly) updates schedutil which looks at
@@@ -5291,6 -5232,7 +5237,7 @@@
                 if (cfs_rq_throttled(cfs_rq))
                         break;
                 cfs_rq->h_nr_running++;
+               cfs_rq->idle_h_nr_running += idle_h_nr_running;
   
                 flags = ENQUEUE_WAKEUP;
         }
@@@ -5298,6 -5240,7 +5245,7 @@@
         for_each_sched_entity(se) {
                 cfs_rq = cfs_rq_of(se);
                 cfs_rq->h_nr_running++;
+               cfs_rq->idle_h_nr_running += idle_h_nr_running;
   
                 if (cfs_rq_throttled(cfs_rq))
                         break;
@@@ -5359,6 -5302,7 +5307,7 @@@ static void dequeue_task_fair(struct r
         struct cfs_rq *cfs_rq;
         struct sched_entity *se = &p->se;
         int task_sleep = flags & DEQUEUE_SLEEP;
+       int idle_h_nr_running = task_has_idle_policy(p);
   
         for_each_sched_entity(se) {
                 cfs_rq = cfs_rq_of(se);
@@@ -5373,6 -5317,7 +5322,7 @@@
                 if (cfs_rq_throttled(cfs_rq))
                         break;
                 cfs_rq->h_nr_running--;
+               cfs_rq->idle_h_nr_running -= idle_h_nr_running;
   
                 /* Don't dequeue parent if it has other entities besides us */
                 if (cfs_rq->load.weight) {
@@@ -5392,6 -5337,7 +5342,7 @@@
         for_each_sched_entity(se) {
                 cfs_rq = cfs_rq_of(se);
                 cfs_rq->h_nr_running--;
+               cfs_rq->idle_h_nr_running -= idle_h_nr_running;
   
                 if (cfs_rq_throttled(cfs_rq))
                         break;
@@@ -5425,6 -5371,15 +5376,15 @@@ static struct 
   
   #endif /* CONFIG_NO_HZ_COMMON */
   
+ /*
+  * CPU only has SCHED_IDLE tasks enqueued.
+  *
+  * Returns non-zero when the runqueue of @cpu has at least one runnable
+  * task and rq->nr_running equals rq->cfs.idle_h_nr_running; returns 0
+  * otherwise, including when nothing is runnable at all. The result is
+  * wrapped in unlikely() as a branch hint.
+  */
+ static int sched_idle_cpu(int cpu)
+ {
+       struct rq *rq = cpu_rq(cpu);
+ 
+       return unlikely(rq->nr_running == rq->cfs.idle_h_nr_running &&
+                       rq->nr_running);
+ }
+ 
   static unsigned long cpu_runnable_load(struct rq *rq)
   {
         return cfs_rq_runnable_load_avg(&rq->cfs);
@@@ -5747,7 -5702,7 +5707,7 @@@ find_idlest_group_cpu(struct sched_grou
         unsigned int min_exit_latency = UINT_MAX;
         u64 latest_idle_timestamp = 0;
         int least_loaded_cpu = this_cpu;
-       int shallowest_idle_cpu = -1;
+       int shallowest_idle_cpu = -1, si_cpu = -1;
         int i;
   
         /* Check if we have any choice: */
@@@ -5778,7 -5733,12 +5738,12 @@@
                                 latest_idle_timestamp = rq->idle_stamp;
                                 shallowest_idle_cpu = i;
                         }
-               } else if (shallowest_idle_cpu == -1) {
+               } else if (shallowest_idle_cpu == -1 && si_cpu == -1) {
+                       if (sched_idle_cpu(i)) {
+                               si_cpu = i;
+                               continue;
+                       }
+ 
                         load = cpu_runnable_load(cpu_rq(i));
                         if (load < min_load) {
                                 min_load = load;
@@@ -5787,7 -5747,11 +5752,11 @@@
                 }
         }
   
-       return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
+       if (shallowest_idle_cpu != -1)
+               return shallowest_idle_cpu;
+       if (si_cpu != -1)
+               return si_cpu;
+       return least_loaded_cpu;
   }
   
   static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p,
@@@ -5940,7 -5904,7 +5909,7 @@@ static int select_idle_core(struct task
    */
   static int select_idle_smt(struct task_struct *p, int target)
   {
-       int cpu;
+       int cpu, si_cpu = -1;
   
         if (!static_branch_likely(&sched_smt_present))
                 return -1;
@@@ -5950,9 -5914,11 +5919,11 @@@
                         continue;
                 if (available_idle_cpu(cpu))
                         return cpu;
+               if (si_cpu == -1 && sched_idle_cpu(cpu))
+                       si_cpu = cpu;
         }
   
-       return -1;
+       return si_cpu;
   }
   
   #else /* CONFIG_SCHED_SMT */
@@@ -5980,8 -5946,8 +5951,8 @@@ static int select_idle_cpu(struct task_
         u64 avg_cost, avg_idle;
         u64 time, cost;
         s64 delta;
-       int cpu, nr = INT_MAX;
         int this = smp_processor_id();
+       int cpu, nr = INT_MAX, si_cpu = -1;
   
         this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
         if (!this_sd)
@@@ -6009,11 -5975,13 +5980,13 @@@
   
         for_each_cpu_wrap(cpu, sched_domain_span(sd), target) {
                 if (!--nr)
-                       return -1;
+                       return si_cpu;
                 if (!cpumask_test_cpu(cpu, p->cpus_ptr))
                         continue;
                 if (available_idle_cpu(cpu))
                         break;
+               if (si_cpu == -1 && sched_idle_cpu(cpu))
+                       si_cpu = cpu;
         }
   
         time = cpu_clock(this) - time;
@@@ -6032,13 -6000,14 +6005,14 @@@ static int select_idle_sibling(struct t
         struct sched_domain *sd;
         int i, recent_used_cpu;
   
-       if (available_idle_cpu(target))
+       if (available_idle_cpu(target) || sched_idle_cpu(target))
                 return target;
   
         /*
          * If the previous CPU is cache affine and idle, don't be stupid:
          */
-       if (prev != target && cpus_share_cache(prev, target) && available_idle_cpu(prev))
+       if (prev != target && cpus_share_cache(prev, target) &&
+           (available_idle_cpu(prev) || sched_idle_cpu(prev)))
                 return prev;
   
         /* Check a recently used CPU as a potential idle candidate: */
@@@ -6046,7 -6015,7 +6020,7 @@@
         if (recent_used_cpu != prev &&
             recent_used_cpu != target &&
             cpus_share_cache(recent_used_cpu, target) &&
-           available_idle_cpu(recent_used_cpu) &&
+           (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
             cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr)) {
                 /*
                  * Replace recent_used_cpu with prev as it is a potential
@@@ -6282,69 -6251,55 +6256,55 @@@ static unsigned long cpu_util_next(int 
   }
   
   /*
-  * compute_energy(): Estimates the energy that would be consumed if @p was
+  * compute_energy(): Estimates the energy that @pd would consume if @p was
    * migrated to @dst_cpu. compute_energy() predicts what will be the utilization
-  * landscape of the * CPUs after the task migration, and uses the Energy Model
+  * landscape of @pd's CPUs after the task migration, and uses the Energy Model
    * to compute what would be the energy if we decided to actually migrate that
    * task.
    */
   static long
   compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
   {
-       unsigned int max_util, util_cfs, cpu_util, cpu_cap;
-       unsigned long sum_util, energy = 0;
-       struct task_struct *tsk;
+       struct cpumask *pd_mask = perf_domain_span(pd);
+       unsigned long cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask));
+       unsigned long max_util = 0, sum_util = 0;
         int cpu;
   
-       for (; pd; pd = pd->next) {
-               struct cpumask *pd_mask = perf_domain_span(pd);
+       /*
+        * The capacity state of CPUs of the current rd can be driven by CPUs
+        * of another rd if they belong to the same pd. So, account for the
+        * utilization of these CPUs too by masking pd with cpu_online_mask
+        * instead of the rd span.
+        *
+        * If an entire pd is outside of the current rd, it will not appear in
+        * its pd list and will not be accounted by compute_energy().
+        */
+       for_each_cpu_and(cpu, pd_mask, cpu_online_mask) {
+               unsigned long cpu_util, util_cfs = cpu_util_next(cpu, p, dst_cpu);
+               struct task_struct *tsk = cpu == dst_cpu ? p : NULL;
   
                 /*
-                * The energy model mandates all the CPUs of a performance
-                * domain have the same capacity.
+                * Busy time computation: utilization clamping is not
+                * required since the ratio (sum_util / cpu_capacity)
+                * is already enough to scale the EM reported power
+                * consumption at the (eventually clamped) cpu_capacity.
                  */
-               cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask));
-               max_util = sum_util = 0;
+               sum_util += schedutil_cpu_util(cpu, util_cfs, cpu_cap,
+                                              ENERGY_UTIL, NULL);
   
                 /*
-                * The capacity state of CPUs of the current rd can be driven by
-                * CPUs of another rd if they belong to the same performance
-                * domain. So, account for the utilization of these CPUs too
-                * by masking pd with cpu_online_mask instead of the rd span.
-                *
-                * If an entire performance domain is outside of the current rd,
-                * it will not appear in its pd list and will not be accounted
-                * by compute_energy().
+                * Performance domain frequency: utilization clamping
+                * must be considered since it affects the selection
+                * of the performance domain frequency.
+                * NOTE: in case RT tasks are running, by default the
+                * FREQUENCY_UTIL's utilization can be max OPP.
                  */
-               for_each_cpu_and(cpu, pd_mask, cpu_online_mask) {
-                       util_cfs = cpu_util_next(cpu, p, dst_cpu);
- 
-                       /*
-                        * Busy time computation: utilization clamping is not
-                        * required since the ratio (sum_util / cpu_capacity)
-                        * is already enough to scale the EM reported power
-                        * consumption at the (eventually clamped) cpu_capacity.
-                        */
-                       sum_util += schedutil_cpu_util(cpu, util_cfs, cpu_cap,
-                                                      ENERGY_UTIL, NULL);
- 
-                       /*
-                        * Performance domain frequency: utilization clamping
-                        * must be considered since it affects the selection
-                        * of the performance domain frequency.
-                        * NOTE: in case RT tasks are running, by default the
-                        * FREQUENCY_UTIL's utilization can be max OPP.
-                        */
-                       tsk = cpu == dst_cpu ? p : NULL;
-                       cpu_util = schedutil_cpu_util(cpu, util_cfs, cpu_cap,
-                                                     FREQUENCY_UTIL, tsk);
-                       max_util = max(max_util, cpu_util);
-               }
- 
-               energy += em_pd_energy(pd->em_pd, max_util, sum_util);
+               cpu_util = schedutil_cpu_util(cpu, util_cfs, cpu_cap,
+                                             FREQUENCY_UTIL, tsk);
+               max_util = max(max_util, cpu_util);
         }
   
-       return energy;
+       return em_pd_energy(pd->em_pd, max_util, sum_util);
   }
   
   /*
@@@ -6386,21 -6341,19 +6346,19 @@@
    * other use-cases too. So, until someone finds a better way to solve this,
    * let's keep things simple by re-using the existing slow path.
    */
- 
   static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
   {
-       unsigned long prev_energy = ULONG_MAX, best_energy = ULONG_MAX;
+       unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX;
         struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
+       unsigned long cpu_cap, util, base_energy = 0;
         int cpu, best_energy_cpu = prev_cpu;
-       struct perf_domain *head, *pd;
-       unsigned long cpu_cap, util;
         struct sched_domain *sd;
+       struct perf_domain *pd;
   
         rcu_read_lock();
         pd = rcu_dereference(rd->pd);
         if (!pd || READ_ONCE(rd->overutilized))
                 goto fail;
-       head = pd;
   
         /*
          * Energy-aware wake-up happens on the lowest sched_domain starting
@@@ -6417,9 -6370,14 +6375,14 @@@
                 goto unlock;
   
         for (; pd; pd = pd->next) {
-               unsigned long cur_energy, spare_cap, max_spare_cap = 0;
+               unsigned long cur_delta, spare_cap, max_spare_cap = 0;
+               unsigned long base_energy_pd;
                 int max_spare_cap_cpu = -1;
   
+               /* Compute the 'base' energy of the pd, without @p */
+               base_energy_pd = compute_energy(p, -1, pd);
+               base_energy += base_energy_pd;
+ 
                 for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) {
                         if (!cpumask_test_cpu(cpu, p->cpus_ptr))
                                 continue;
@@@ -6427,14 -6385,14 +6390,14 @@@
                         /* Skip CPUs that will be overutilized. */
                         util = cpu_util_next(cpu, p, cpu);
                         cpu_cap = capacity_of(cpu);
-                       if (cpu_cap * 1024 < util * capacity_margin)
+                       if (!fits_capacity(util, cpu_cap))
                                 continue;
   
                         /* Always use prev_cpu as a candidate. */
                         if (cpu == prev_cpu) {
-                               prev_energy = compute_energy(p, prev_cpu, head);
-                               best_energy = min(best_energy, prev_energy);
-                               continue;
+                               prev_delta = compute_energy(p, prev_cpu, pd);
+                               prev_delta -= base_energy_pd;
+                               best_delta = min(best_delta, prev_delta);
                         }
   
                         /*
@@@ -6450,9 -6408,10 +6413,10 @@@
   
                 /* Evaluate the energy impact of using this CPU. */
                 if (max_spare_cap_cpu >= 0) {
-                       cur_energy = compute_energy(p, max_spare_cap_cpu, head);
-                       if (cur_energy < best_energy) {
-                               best_energy = cur_energy;
+                       cur_delta = compute_energy(p, max_spare_cap_cpu, pd);
+                       cur_delta -= base_energy_pd;
+                       if (cur_delta < best_delta) {
+                               best_delta = cur_delta;
                                 best_energy_cpu = max_spare_cap_cpu;
                         }
                 }
@@@ -6464,10 -6423,10 +6428,10 @@@ unlock
          * Pick the best CPU if prev_cpu cannot be used, or if it saves at
          * least 6% of the energy used by prev_cpu.
          */
-       if (prev_energy == ULONG_MAX)
+       if (prev_delta == ULONG_MAX)
                 return best_energy_cpu;
   
-       if ((prev_energy - best_energy) > (prev_energy >> 4))
+       if ((prev_delta - best_delta) > ((prev_delta + base_energy) >> 4))
                 return best_energy_cpu;
   
         return prev_cpu;
@@@ -6801,7 -6760,7 +6765,7 @@@ again
                 goto idle;
   
   #ifdef CONFIG_FAIR_GROUP_SCHED
-       if (prev->sched_class != &fair_sched_class)
+       if (!prev || prev->sched_class != &fair_sched_class)
                 goto simple;
   
         /*
@@@ -6878,8 -6837,8 +6842,8 @@@
         goto done;
   simple:
   #endif
- 
-       put_prev_task(rq, prev);
+       if (prev)
+               put_prev_task(rq, prev);
   
         do {
                 se = pick_next_entity(cfs_rq, NULL);
@@@ -6907,11 -6866,13 +6871,13 @@@ done: __maybe_unused
         return p;
   
   idle:
-       update_misfit_status(NULL, rq);
-       new_tasks = idle_balance(rq, rf);
+       if (!rf)
+               return NULL;
+ 
+       new_tasks = newidle_balance(rq, rf);
   
         /*
-        * Because idle_balance() releases (and re-acquires) rq->lock, it is
+        * Because newidle_balance() releases (and re-acquires) rq->lock, it is
          * possible for any higher priority task to appear. In that case we
          * must re-start the pick_next_entity() loop.
          */
@@@ -6933,7 -6894,7 +6899,7 @@@
   /*
    * Account for a descheduled task:
    */
- static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
+ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
   {
         struct sched_entity *se = &prev->se;
         struct cfs_rq *cfs_rq;
@@@ -7435,7 -7396,7 +7401,7 @@@ static int detach_tasks(struct lb_env *
                 detached++;
                 env->imbalance -= load;
   
- #ifdef CONFIG_PREEMPT
+ #ifdef CONFIG_PREEMPTION
                 /*
                  * NEWIDLE balancing is a source of latency, so preemptible
                  * kernels will stop after the first task is detached to minimize
@@@ -7982,8 -7943,7 +7948,7 @@@ group_is_overloaded(struct lb_env *env
   static inline bool
   group_smaller_min_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
   {
-       return sg->sgc->min_capacity * capacity_margin <
-                                               ref->sgc->min_capacity * 1024;
+       return fits_capacity(sg->sgc->min_capacity, ref->sgc->min_capacity);
   }
   
   /*
@@@ -7993,8 -7953,7 +7958,7 @@@
   static inline bool
   group_smaller_max_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
   {
-       return sg->sgc->max_capacity * capacity_margin <
-                                               ref->sgc->max_capacity * 1024;
+       return fits_capacity(sg->sgc->max_capacity, ref->sgc->max_capacity);
   }
   
   static inline enum
@@@ -9052,9 -9011,10 +9016,10 @@@ more_balance
   out_balanced:
         /*
          * We reach balance although we may have faced some affinity
-        * constraints. Clear the imbalance flag if it was set.
+        * constraints. Clear the imbalance flag only if other tasks got
+        * a chance to move and fix the imbalance.
          */
-       if (sd_parent) {
+       if (sd_parent && !(env.flags & LBF_ALL_PINNED)) {
                 int *group_imbalance = &sd_parent->groups->sgc->imbalance;
   
                 if (*group_imbalance)
@@@ -9075,10 -9035,10 +9040,10 @@@ out_one_pinned
         ld_moved = 0;
   
         /*
-        * idle_balance() disregards balance intervals, so we could repeatedly
-        * reach this code, which would lead to balance_interval skyrocketting
-        * in a short amount of time. Skip the balance_interval increase logic
-        * to avoid that.
+        * newidle_balance() disregards balance intervals, so we could
+        * repeatedly reach this code, which would lead to balance_interval
+        * skyrocketing in a short amount of time. Skip the balance_interval
+        * increase logic to avoid that.
          */
         if (env.idle == CPU_NEWLY_IDLE)
                 goto out;
@@@ -9788,7 -9748,7 +9753,7 @@@ static inline void nohz_newidle_balance
    * idle_balance is called by schedule() if this_cpu is about to become
    * idle. Attempts to pull tasks from other CPUs.
    */
- static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
+ int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
   {
         unsigned long next_balance = jiffies + HZ;
         int this_cpu = this_rq->cpu;
@@@ -9796,6 -9756,7 +9761,7 @@@
         int pulled_task = 0;
         u64 curr_cost = 0;
   
+       update_misfit_status(NULL, this_rq);
         /*
          * We must set idle_stamp _before_ calling idle_balance(), such that we
          * measure the duration of idle_balance() as idle time.
@@@ -10180,9 -10141,19 +10146,19 @@@ static void switched_to_fair(struct rq 
    * This routine is mostly called to set cfs_rq->curr field when a task
    * migrates between groups/classes.
    */
- static void set_curr_task_fair(struct rq *rq)
+ static void set_next_task_fair(struct rq *rq, struct task_struct *p)
   {
-       struct sched_entity *se = &rq->curr->se;
+       struct sched_entity *se = &p->se;
+ 
+ #ifdef CONFIG_SMP
+       if (task_on_rq_queued(p)) {
+               /*
+                * Move the next running task to the front of the list, so our
+                * cfs_tasks list becomes MRU one.
+                */
+               list_move(&se->group_node, &rq->cfs_tasks);
+       }
+ #endif
   
         for_each_sched_entity(se) {
                 struct cfs_rq *cfs_rq = cfs_rq_of(se);
@@@ -10300,18 -10271,18 +10276,18 @@@ err
   void online_fair_sched_group(struct task_group *tg)
   {
         struct sched_entity *se;
+       struct rq_flags rf;
         struct rq *rq;
         int i;
   
         for_each_possible_cpu(i) {
                 rq = cpu_rq(i);
                 se = tg->se[i];
- 
-               raw_spin_lock_irq(&rq->lock);
+               rq_lock_irq(rq, &rf);
                 update_rq_clock(rq);
                 attach_entity_cfs_rq(se);
                 sync_throttle(tg, i);
-               raw_spin_unlock_irq(&rq->lock);
+               rq_unlock_irq(rq, &rf);
         }
   }
   
@@@ -10453,7 -10424,9 +10429,9 @@@ const struct sched_class fair_sched_cla
         .check_preempt_curr     = check_preempt_wakeup,
   
         .pick_next_task         = pick_next_task_fair,
+ 
         .put_prev_task          = put_prev_task_fair,
+       .set_next_task          = set_next_task_fair,
   
   #ifdef CONFIG_SMP
         .select_task_rq         = select_task_rq_fair,
@@@ -10466,7 -10439,6 +10444,6 @@@
         .set_cpus_allowed       = set_cpus_allowed_common,
   #endif
   
-       .set_curr_task          = set_curr_task_fair,
         .task_tick              = task_tick_fair,
         .task_fork              = task_fork_fair,
   
diff --combined kernel/sched/idle.c

index e4bc4aa739b830c5236cf84445b6278aa3c0470b,7c54550dda6a6b09ecdff57b17a32232039fb3b7..8bfeb6395bddb9f5c3ccc1267475b55ae7c3d086
--- 1/kernel/sched/idle.c
--- 2/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@@ -241,14 -241,13 +241,14 @@@ static void do_idle(void
                 check_pgt_cache();
                 rmb();
   
+ +              local_irq_disable();
+ +
                 if (cpu_is_offline(cpu)) {
- -                      tick_nohz_idle_stop_tick_protected();
+ +                      tick_nohz_idle_stop_tick();
                         cpuhp_report_idle_dead();
                         arch_cpu_idle_dead();
                 }
   
- -              local_irq_disable();
                 arch_cpu_idle_enter();
   
                 /*
@@@ -375,14 -374,27 +375,27 @@@ static void check_preempt_curr_idle(str
         resched_curr(rq);
   }
   
- static struct task_struct *
- pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+ static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+ {
+ }
+ 
+ static void set_next_task_idle(struct rq *rq, struct task_struct *next)
   {
-       put_prev_task(rq, prev);
         update_idle_core(rq);
         schedstat_inc(rq->sched_goidle);
+ }
+ 
+ static struct task_struct *
+ pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+ {
+       struct task_struct *next = rq->idle;
+ 
+       if (prev)
+               put_prev_task(rq, prev);
+ 
+       set_next_task_idle(rq, next);
   
-       return rq->idle;
+       return next;
   }
   
   /*
@@@ -398,10 -410,6 +411,6 @@@ dequeue_task_idle(struct rq *rq, struc
         raw_spin_lock_irq(&rq->lock);
   }
   
- static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
- {
- }
- 
   /*
    * scheduler tick hitting a task of our scheduling class.
    *
@@@ -414,10 -422,6 +423,6 @@@ static void task_tick_idle(struct rq *r
   {
   }
   
- static void set_curr_task_idle(struct rq *rq)
- {
- }
- 
   static void switched_to_idle(struct rq *rq, struct task_struct *p)
   {
         BUG();
@@@ -452,13 -456,13 +457,13 @@@ const struct sched_class idle_sched_cla
   
         .pick_next_task         = pick_next_task_idle,
         .put_prev_task          = put_prev_task_idle,
+       .set_next_task          = set_next_task_idle,
   
   #ifdef CONFIG_SMP
         .select_task_rq         = select_task_rq_idle,
         .set_cpus_allowed       = set_cpus_allowed_common,
   #endif
   
-       .set_curr_task          = set_curr_task_idle,
         .task_tick              = task_tick_idle,
   
         .get_rr_interval        = get_rr_interval_idle,
diff --combined kernel/sched/psi.c

index 6e52b67b420e7a3312f463f8d3bb6baad6576041,4b14a3208fbec3a92f25720f3597ed1321404a50..517e3719027e619e5c7b565d1de9294dfffb5a3c
--- 1/kernel/sched/psi.c
--- 2/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@@ -1051,7 -1051,7 +1051,7 @@@ struct psi_trigger *psi_trigger_create(
   
         if (!rcu_access_pointer(group->poll_kworker)) {
                 struct sched_param param = {
- -                      .sched_priority = MAX_RT_PRIO - 1,
+ +                      .sched_priority = 1,
                 };
                 struct kthread_worker *kworker;
   
@@@ -1061,7 -1061,7 +1061,7 @@@
                         mutex_unlock(&group->trigger_lock);
                         return ERR_CAST(kworker);
                 }
- -              sched_setscheduler(kworker->task, SCHED_FIFO, &param);
+ +              sched_setscheduler_nocheck(kworker->task, SCHED_FIFO, &param);
                 kthread_init_delayed_work(&group->poll_work,
                                 psi_poll_work);
                 rcu_assign_pointer(group->poll_kworker, kworker);
@@@ -1131,15 -1131,7 +1131,15 @@@ static void psi_trigger_destroy(struct 
          * deadlock while waiting for psi_poll_work to acquire trigger_lock
          */
         if (kworker_to_destroy) {
+ +              /*
+ +               * After the RCU grace period has expired, the worker
+ +               * can no longer be found through group->poll_kworker.
+ +               * But it might have been already scheduled before
+ +               * that - deschedule it cleanly before destroying it.
+ +               */
                 kthread_cancel_delayed_work_sync(&group->poll_work);
+ +              atomic_set(&group->poll_scheduled, 0);
+ +
                 kthread_destroy_worker(kworker_to_destroy);
         }
         kfree(t);
@@@ -1198,7 -1190,7 +1198,7 @@@ static ssize_t psi_write(struct file *f
         if (static_branch_likely(&psi_disabled))
                 return -EOPNOTSUPP;
   
-       buf_size = min(nbytes, (sizeof(buf) - 1));
+       buf_size = min(nbytes, sizeof(buf));
         if (copy_from_user(buf, user_buf, buf_size))
                 return -EFAULT;
   
diff --combined kernel/trace/ftrace.c

index f9821a3374e9dd4b81ed0f7cfe8c436d7a58b4c5,a800e867c1a3f0cd7e999384f9d18aa788d53b26..356b848c697aa75b43a908de85c35a3658b8981e
--- 1/kernel/trace/ftrace.c
--- 2/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@@ -2814,7 -2814,7 +2814,7 @@@ int ftrace_shutdown(struct ftrace_ops *
                  * synchronize_rcu_tasks() will wait for those tasks to
                  * execute and either schedule voluntarily or enter user space.
                  */
-               if (IS_ENABLED(CONFIG_PREEMPT))
+               if (IS_ENABLED(CONFIG_PREEMPTION))
                         synchronize_rcu_tasks();
   
    free_ops:
@@@ -3095,14 -3095,6 +3095,14 @@@ t_probe_next(struct seq_file *m, loff_
                 hnd = &iter->probe_entry->hlist;
   
         hash = iter->probe->ops.func_hash->filter_hash;
+ +
+ +      /*
+ +       * A probe being registered may temporarily have an empty hash
+ +       * and it's at the end of the func_probes list.
+ +       */
+ +      if (!hash || hash == EMPTY_HASH)
+ +              return NULL;
+ +
         size = 1 << hash->size_bits;
   
    retry:
@@@ -4328,21 -4320,12 +4328,21 @@@ register_ftrace_function_probe(char *gl
   
         mutex_unlock(&ftrace_lock);
   
+ +      /*
+ +       * Note, there's a small window here that the func_hash->filter_hash
+ +       * may be NULL or empty. Need to be careful when reading the loop.
+ +       */
         mutex_lock(&probe->ops.func_hash->regex_lock);
   
         orig_hash = &probe->ops.func_hash->filter_hash;
         old_hash = *orig_hash;
         hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, old_hash);
   
+ +      if (!hash) {
+ +              ret = -ENOMEM;
+ +              goto out;
+ +      }
+ +
         ret = ftrace_match_records(hash, glob, strlen(glob));
   
         /* Nothing found? */
diff --combined kernel/trace/trace_events.c

index 648930823b571083c1a95000937962e565162eb2,5a189fb8ec23368215690d9ecce9b1fe20ff9f72..b89cdfe20bc1626b1c4632c6bfcace5eedbfe1ef
--- 1/kernel/trace/trace_events.c
--- 2/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@@ -255,12 -255,12 +255,12 @@@ void *trace_event_buffer_reserve(struc
         local_save_flags(fbuffer->flags);
         fbuffer->pc = preempt_count();
         /*
-        * If CONFIG_PREEMPT is enabled, then the tracepoint itself disables
+        * If CONFIG_PREEMPTION is enabled, then the tracepoint itself disables
          * preemption (adding one to the preempt_count). Since we are
          * interested in the preempt_count at the time the tracepoint was
          * hit, we need to subtract one to offset the increment.
          */
-       if (IS_ENABLED(CONFIG_PREEMPT))
+       if (IS_ENABLED(CONFIG_PREEMPTION))
                 fbuffer->pc--;
         fbuffer->trace_file = trace_file;
   
@@@ -787,7 -787,7 +787,7 @@@ static int __ftrace_set_clr_event(struc
         return ret;
   }
   
- -static int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set)
+ +int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set)
   {
         char *event = NULL, *sub = NULL, *match;
         int ret;
diff --combined mm/page_alloc.c

index 9c9194959271cfc0d9214bf60bb09b96c5b1a96a,0d54cd2c43a47f55cd8821105ea153a4a8e8d0cc..6991ccec9c322ffb843110bb69cf2326d64b266c
--- 1/mm/page_alloc.c
--- 2/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@@ -2238,12 -2238,27 +2238,12 @@@ static int move_freepages(struct zone *
         unsigned int order;
         int pages_moved = 0;
   
- -#ifndef CONFIG_HOLES_IN_ZONE
- -      /*
- -       * page_zone is not safe to call in this context when
- -       * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant
- -       * anyway as we check zone boundaries in move_freepages_block().
- -       * Remove at a later date when no bug reports exist related to
- -       * grouping pages by mobility
- -       */
- -      VM_BUG_ON(pfn_valid(page_to_pfn(start_page)) &&
- -                pfn_valid(page_to_pfn(end_page)) &&
- -                page_zone(start_page) != page_zone(end_page));
- -#endif
         for (page = start_page; page <= end_page;) {
                 if (!pfn_valid_within(page_to_pfn(page))) {
                         page++;
                         continue;
                 }
   
- -              /* Make sure we are not inadvertently changing nodes */
- -              VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
- -
                 if (!PageBuddy(page)) {
                         /*
                          * We assume that pages that could be isolated for
@@@ -2258,10 -2273,6 +2258,10 @@@
                         continue;
                 }
   
+ +              /* Make sure we are not inadvertently changing nodes */
+ +              VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
+ +              VM_BUG_ON_PAGE(page_zone(page) != zone, page);
+ +
                 order = page_order(page);
                 move_to_free_area(page, &zone->free_area[order], migratetype);
                 page += 1 << order;
@@@ -3511,7 -3522,7 +3511,7 @@@ bool zone_watermark_ok_safe(struct zon
   static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
   {
         return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <=
-                               RECLAIM_DISTANCE;
+                               node_reclaim_distance;
   }
   #else /* CONFIG_NUMA */
   static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
author	Linus Torvalds <torvalds@linux-foundation.org>
	Tue, 17 Sep 2019 00:25:49 +0000 (17:25 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Tue, 17 Sep 2019 00:25:49 +0000 (17:25 -0700)
		1	2
MAINTAINERS	patch \|	diff1 \|	diff2 \|	blob \| history
arch/Kconfig	patch \|	diff1 \|	diff2 \|	blob \| history
arch/ia64/Kconfig	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/entry/entry_64.S	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kernel/cpu/amd.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kernel/kvm.c	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/rcupdate.h	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/topology.h	patch \|	diff1 \|	diff2 \|	blob \| history
init/Kconfig	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/cgroup/cgroup.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/events/core.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/kprobes.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/rcu/tree.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/rcu/tree_stall.h	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sched/core.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sched/cpufreq_schedutil.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sched/deadline.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sched/fair.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sched/idle.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sched/psi.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/trace/ftrace.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/trace/trace_events.c	patch \|	diff1 \|	diff2 \|	blob \| history
mm/page_alloc.c	patch \|	diff1 \|	diff2 \|	blob \| history