Merge branch 'for-4.15' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup

author Linus Torvalds <torvalds@linux-foundation.org>

Wed, 15 Nov 2017 22:29:44 +0000 (14:29 -0800)

committer Linus Torvalds <torvalds@linux-foundation.org>

Wed, 15 Nov 2017 22:29:44 +0000 (14:29 -0800)
author Linus Torvalds <torvalds@linux-foundation.org>
Wed, 15 Nov 2017 22:29:44 +0000 (14:29 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
Wed, 15 Nov 2017 22:29:44 +0000 (14:29 -0800)
diff --combined MAINTAINERS

index 16e1e6dc89f253338e8307fc9ff296acbb98b8d1,ed35cd61ad8332493f57d1d2ddaec79291e41502..a74d6a7388641e588db3310dcabd7256637e257d
--- 1/MAINTAINERS
--- 2/MAINTAINERS
+++ b/MAINTAINERS
@@@ -527,6 -527,11 +527,6 @@@ W:        http://ez.analog.com/community/linux
   S:    Supported
   F:    drivers/input/misc/adxl34x.c
   
- -AEDSP16 DRIVER
- -M:    Riccardo Facchetti <fizban@tin.it>
- -S:    Maintained
- -F:    sound/oss/aedsp16.c
- -
   AF9013 MEDIA DRIVER
   M:    Antti Palosaari <crope@iki.fi>
   L:    linux-media@vger.kernel.org
@@@ -695,9 -700,9 +695,9 @@@ F: include/linux/altera_uart.
   F:    include/linux/altera_jtaguart.h
   
   AMAZON ETHERNET DRIVERS
- -M:    Netanel Belgazal <netanel@annapurnalabs.com>
- -R:    Saeed Bishara <saeed@annapurnalabs.com>
- -R:    Zorik Machulsky <zorik@annapurnalabs.com>
+ +M:    Netanel Belgazal <netanel@amazon.com>
+ +R:    Saeed Bishara <saeedb@amazon.com>
+ +R:    Zorik Machulsky <zorik@amazon.com>
   L:    netdev@vger.kernel.org
   S:    Supported
   F:    Documentation/networking/ena.txt
@@@ -868,7 -873,7 +868,7 @@@ F: drivers/android
   F:    drivers/staging/android/
   
   ANDROID GOLDFISH RTC DRIVER
- -M:    Miodrag Dinic <miodrag.dinic@imgtec.com>
+ +M:    Miodrag Dinic <miodrag.dinic@mips.com>
   S:    Supported
   F:    Documentation/devicetree/bindings/rtc/google,goldfish-rtc.txt
   F:    drivers/rtc/rtc-goldfish.c
@@@ -2024,7 -2029,6 +2024,7 @@@ M:      Masahiro Yamada <yamada.masahiro@soc
   L:    linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
   T:    git git://git.kernel.org/pub/scm/linux/kernel/git/masahiroy/linux-uniphier.git
   S:    Maintained
+ +F:    Documentation/devicetree/bindings/gpio/gpio-uniphier.txt
   F:    arch/arm/boot/dts/uniphier*
   F:    arch/arm/include/asm/hardware/cache-uniphier.h
   F:    arch/arm/mach-uniphier/
@@@ -2032,7 -2036,6 +2032,7 @@@ F:      arch/arm/mm/cache-uniphier.
   F:    arch/arm64/boot/dts/socionext/
   F:    drivers/bus/uniphier-system-bus.c
   F:    drivers/clk/uniphier/
+ +F:    drivers/gpio/gpio-uniphier.c
   F:    drivers/i2c/busses/i2c-uniphier*
   F:    drivers/irqchip/irq-uniphier-aidet.c
   F:    drivers/pinctrl/uniphier/
@@@ -2244,7 -2247,7 +2244,7 @@@ F:      include/linux/dmaengine.
   F:    include/linux/async_tx.h
   
   AT24 EEPROM DRIVER
- -M:    Wolfram Sang <wsa@the-dreams.de>
+ +M:    Bartosz Golaszewski <brgl@bgdev.pl>
   L:    linux-i2c@vger.kernel.org
   S:    Maintained
   F:    drivers/misc/eeprom/at24.c
@@@ -2559,12 -2562,10 +2559,12 @@@ S:   Maintaine
   F:    drivers/net/hamradio/baycom*
   
   BCACHE (BLOCK LAYER CACHE)
+ +M:    Michael Lyle <mlyle@lyle.org>
   M:    Kent Overstreet <kent.overstreet@gmail.com>
   L:    linux-bcache@vger.kernel.org
   W:    http://bcache.evilpiepirate.org
- -S:    Orphan
+ +C:    irc://irc.oftc.net/bcache
+ +S:    Maintained
   F:    drivers/md/bcache/
   
   BDISP ST MEDIA DRIVER
@@@ -2712,7 -2713,6 +2712,7 @@@ L:      linux-kernel@vger.kernel.or
   S:    Supported
   F:    arch/x86/net/bpf_jit*
   F:    Documentation/networking/filter.txt
+ +F:    Documentation/bpf/
   F:    include/linux/bpf*
   F:    include/linux/filter.h
   F:    include/uapi/linux/bpf*
@@@ -2725,7 -2725,7 +2725,7 @@@ F:      net/core/filter.
   F:    net/sched/act_bpf.c
   F:    net/sched/cls_bpf.c
   F:    samples/bpf/
- -F:    tools/net/bpf*
+ +F:    tools/bpf/
   F:    tools/testing/selftests/bpf/
   
   BROADCOM B44 10/100 ETHERNET DRIVER
@@@ -2896,15 -2896,7 +2896,15 @@@ S:    Supporte
   F:    drivers/gpio/gpio-brcmstb.c
   F:    Documentation/devicetree/bindings/gpio/brcm,brcmstb-gpio.txt
   
+ +BROADCOM BRCMSTB USB2 and USB3 PHY DRIVER
+ +M:    Al Cooper <alcooperx@gmail.com>
+ +L:    linux-kernel@vger.kernel.org
+ +L:    bcm-kernel-feedback-list@broadcom.com
+ +S:    Maintained
+ +F:    drivers/phy/broadcom/phy-brcm-usb*
+ +
   BROADCOM GENET ETHERNET DRIVER
+ +M:    Doug Berger <opendmb@gmail.com>
   M:    Florian Fainelli <f.fainelli@gmail.com>
   L:    netdev@vger.kernel.org
   S:    Supported
@@@ -3090,6 -3082,7 +3090,6 @@@ F:      arch/c6x
   
   CA8210 IEEE-802.15.4 RADIO DRIVER
   M:    Harry Morris <h.morris@cascoda.com>
- -M:    linuxdev@cascoda.com
   L:    linux-wpan@vger.kernel.org
   W:    https://github.com/Cascoda/ca8210-linux.git
   S:    Maintained
@@@ -3336,22 -3329,17 +3336,22 @@@ S:   Maintaine
   F:    drivers/auxdisplay/cfag12864bfb.c
   F:    include/linux/cfag12864b.h
   
- -CFG80211 and NL80211
+ +802.11 (including CFG80211/NL80211)
   M:    Johannes Berg <johannes@sipsolutions.net>
   L:    linux-wireless@vger.kernel.org
   W:    http://wireless.kernel.org/
   T:    git git://git.kernel.org/pub/scm/linux/kernel/git/jberg/mac80211.git
   T:    git git://git.kernel.org/pub/scm/linux/kernel/git/jberg/mac80211-next.git
   S:    Maintained
+ +F:    net/wireless/
   F:    include/uapi/linux/nl80211.h
+ +F:    include/linux/ieee80211.h
+ +F:    include/net/wext.h
   F:    include/net/cfg80211.h
- -F:    net/wireless/*
- -X:    net/wireless/wext*
+ +F:    include/net/iw_handler.h
+ +F:    include/net/ieee80211_radiotap.h
+ +F:    Documentation/driver-api/80211/cfg80211.rst
+ +F:    Documentation/networking/regulatory.txt
   
   CHAR and MISC DRIVERS
   M:    Arnd Bergmann <arnd@arndb.de>
@@@ -3427,7 -3415,7 +3427,7 @@@ F:      drivers/scsi/snic
   CISCO VIC ETHERNET NIC DRIVER
   M:    Christian Benvenuti <benve@cisco.com>
   M:    Govindarajulu Varadarajan <_govind@gmx.com>
- -M:    Neel Patel <neepatel@cisco.com>
+ +M:    Parvi Kaustubhi <pkaustub@cisco.com>
   S:    Supported
   F:    drivers/net/ethernet/cisco/enic/
   
@@@ -3456,8 -3444,7 +3456,8 @@@ M:      Thomas Gleixner <tglx@linutronix.de
   L:    linux-kernel@vger.kernel.org
   T:    git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git timers/core
   S:    Supported
- -F:    drivers/clocksource
+ +F:    drivers/clocksource/
+ +F:    Documentation/devicetree/bindings/timer/
   
   CMPC ACPI DRIVER
   M:    Thadeu Lima de Souza Cascardo <cascardo@holoscopio.com>
@@@ -3478,7 -3465,7 +3478,7 @@@ COCCINELLE/Semantic Patches (SmPL
   M:    Julia Lawall <Julia.Lawall@lip6.fr>
   M:    Gilles Muller <Gilles.Muller@lip6.fr>
   M:    Nicolas Palix <nicolas.palix@imag.fr>
- -M:    Michal Marek <mmarek@suse.com>
+ +M:    Michal Marek <michal.lkml@markovi.net>
   L:    cocci@systeme.lip6.fr (moderated for non-subscribers)
   T:    git git://git.kernel.org/pub/scm/linux/kernel/git/mmarek/kbuild.git misc
   W:    http://coccinelle.lip6.fr/
@@@ -3592,7 -3579,7 +3592,7 @@@ T:      git git://git.kernel.org/pub/scm/lin
   S:    Maintained
   F:    Documentation/cgroup-v1/cpusets.txt
   F:    include/linux/cpuset.h
- F:    kernel/cpuset.c
+ F:    kernel/cgroup/cpuset.c
   
   CONTROL GROUP - MEMORY RESOURCE CONTROLLER (MEMCG)
   M:    Johannes Weiner <hannes@cmpxchg.org>
@@@ -3649,8 -3636,6 +3649,8 @@@ F:      drivers/cpufreq/arm_big_little_dt.
   
   CPU POWER MONITORING SUBSYSTEM
   M:    Thomas Renninger <trenn@suse.com>
+ +M:    Shuah Khan <shuahkh@osg.samsung.com>
+ +M:    Shuah Khan <shuah@kernel.org>
   L:    linux-pm@vger.kernel.org
   S:    Maintained
   F:    tools/power/cpupower/
@@@ -4106,8 -4091,6 +4106,8 @@@ T:      git git://git.kernel.org/pub/scm/lin
   T:    quilt http://people.redhat.com/agk/patches/linux/editing/
   S:    Maintained
   F:    Documentation/device-mapper/
+ +F:    drivers/md/Makefile
+ +F:    drivers/md/Kconfig
   F:    drivers/md/dm*
   F:    drivers/md/persistent-data/
   F:    include/linux/device-mapper.h
@@@ -4251,7 -4234,7 +4251,7 @@@ S:      Maintaine
   F:    drivers/dma/
   F:    include/linux/dmaengine.h
   F:    Documentation/devicetree/bindings/dma/
- -F:    Documentation/dmaengine/
+ +F:    Documentation/driver-api/dmaengine/
   T:    git git://git.infradead.org/users/vkoul/slave-dma.git
   
   DMA MAPPING HELPERS
@@@ -4923,19 -4906,13 +4923,19 @@@ L:   linux-edac@vger.kernel.or
   S:    Maintained
   F:    drivers/edac/highbank*
   
- -EDAC-CAVIUM
+ +EDAC-CAVIUM OCTEON
   M:    Ralf Baechle <ralf@linux-mips.org>
   M:    David Daney <david.daney@cavium.com>
   L:    linux-edac@vger.kernel.org
   L:    linux-mips@linux-mips.org
   S:    Supported
   F:    drivers/edac/octeon_edac*
+ +
+ +EDAC-CAVIUM THUNDERX
+ +M:    David Daney <david.daney@cavium.com>
+ +M:    Jan Glauber <jglauber@cavium.com>
+ +L:    linux-edac@vger.kernel.org
+ +S:    Supported
   F:    drivers/edac/thunderx_edac*
   
   EDAC-CORE
@@@ -5236,7 -5213,8 +5236,7 @@@ F:      fs/ext4
   
   Extended Verification Module (EVM)
   M:    Mimi Zohar <zohar@linux.vnet.ibm.com>
- -L:    linux-ima-devel@lists.sourceforge.net
- -L:    linux-security-module@vger.kernel.org
+ +L:    linux-integrity@vger.kernel.org
   S:    Supported
   F:    security/integrity/evm/
   
@@@ -5281,8 -5259,7 +5281,8 @@@ S:      Maintaine
   F:    drivers/iommu/exynos-iommu.c
   
   EZchip NPS platform support
- -M:    Noam Camus <noamc@ezchip.com>
+ +M:    Elad Kanfi <eladkan@mellanox.com>
+ +M:    Vineet Gupta <vgupta@synopsys.com>
   S:    Supported
   F:    arch/arc/plat-eznps
   F:    arch/arc/boot/dts/eznps.dts
@@@ -5368,7 -5345,9 +5368,7 @@@ M:      "J. Bruce Fields" <bfields@fieldses.
   L:    linux-fsdevel@vger.kernel.org
   S:    Maintained
   F:    include/linux/fcntl.h
- -F:    include/linux/fs.h
   F:    include/uapi/linux/fcntl.h
- -F:    include/uapi/linux/fs.h
   F:    fs/fcntl.c
   F:    fs/locks.c
   
@@@ -5377,8 -5356,6 +5377,8 @@@ M:      Alexander Viro <viro@zeniv.linux.org
   L:    linux-fsdevel@vger.kernel.org
   S:    Maintained
   F:    fs/*
+ +F:    include/linux/fs.h
+ +F:    include/uapi/linux/fs.h
   
   FINTEK F75375S HARDWARE MONITOR AND FAN CONTROLLER DRIVER
   M:    Riku Voipio <riku.voipio@iki.fi>
@@@ -5491,7 -5468,7 +5491,7 @@@ F:      include/uapi/linux/fb.
   
   FREESCALE CAAM (Cryptographic Acceleration and Assurance Module) DRIVER
   M:    Horia Geantă <horia.geanta@nxp.com>
- -M:    Dan Douglass <dan.douglass@nxp.com>
+ +M:    Aymen Sghaier <aymen.sghaier@nxp.com>
   L:    linux-crypto@vger.kernel.org
   S:    Maintained
   F:    drivers/crypto/caam/
@@@ -5671,7 -5648,6 +5671,7 @@@ T:      git git://git.kernel.org/pub/scm/lin
   S:    Supported
   F:    fs/crypto/
   F:    include/linux/fscrypt*.h
+ +F:    Documentation/filesystems/fscrypt.rst
   
   FUJITSU FR-V (FRV) PORT
   S:    Orphan
@@@ -6265,13 -6241,6 +6265,13 @@@ S:    Maintaine
   F:    drivers/net/ethernet/hisilicon/
   F:    Documentation/devicetree/bindings/net/hisilicon*.txt
   
+ +HISILICON PMU DRIVER
+ +M:    Shaokun Zhang <zhangshaokun@hisilicon.com>
+ +W:    http://www.hisilicon.com
+ +S:    Supported
+ +F:    drivers/perf/hisilicon
+ +F:    Documentation/perf/hisi-pmu.txt
+ +
   HISILICON ROCE DRIVER
   M:    Lijun Ou <oulijun@huawei.com>
   M:    Wei Hu(Xavier) <xavier.huwei@huawei.com>
@@@ -6701,7 -6670,7 +6701,7 @@@ F:      include/net/ieee802154_netdev.
   F:    Documentation/networking/ieee802154.txt
   
   IFE PROTOCOL
- -M:    Yotam Gigi <yotamg@mellanox.com>
+ +M:    Yotam Gigi <yotam.gi@gmail.com>
   M:    Jamal Hadi Salim <jhs@mojatatu.com>
   F:    net/ife
   F:    include/net/ife.h
@@@ -6763,13 -6732,13 +6763,13 @@@ S:   Maintaine
   F:    drivers/usb/atm/ueagle-atm.c
   
   IMGTEC ASCII LCD DRIVER
- -M:    Paul Burton <paul.burton@imgtec.com>
+ +M:    Paul Burton <paul.burton@mips.com>
   S:    Maintained
   F:    Documentation/devicetree/bindings/auxdisplay/img-ascii-lcd.txt
   F:    drivers/auxdisplay/img-ascii-lcd.c
   
   IMGTEC IR DECODER DRIVER
- -M:    James Hogan <james.hogan@imgtec.com>
+ +M:    James Hogan <jhogan@kernel.org>
   S:    Maintained
   F:    drivers/media/rc/img-ir/
   
@@@ -6871,7 -6840,9 +6871,7 @@@ L:      linux-crypto@vger.kernel.or
   INTEGRITY MEASUREMENT ARCHITECTURE (IMA)
   M:    Mimi Zohar <zohar@linux.vnet.ibm.com>
   M:    Dmitry Kasatkin <dmitry.kasatkin@gmail.com>
- -L:    linux-ima-devel@lists.sourceforge.net
- -L:    linux-ima-user@lists.sourceforge.net
- -L:    linux-security-module@vger.kernel.org
+ +L:    linux-integrity@vger.kernel.org
   T:    git git://git.kernel.org/pub/scm/linux/kernel/git/zohar/linux-integrity.git
   S:    Supported
   F:    security/integrity/ima/
@@@ -7461,8 -7432,10 +7461,8 @@@ F:     mm/kasan
   F:    scripts/Makefile.kasan
   
   KCONFIG
- -M:    "Yann E. MORIN" <yann.morin.1998@free.fr>
   L:    linux-kbuild@vger.kernel.org
- -T:    git git://gitorious.org/linux-kconfig/linux-kconfig
- -S:    Maintained
+ +S:    Orphan
   F:    Documentation/kbuild/kconfig-language.txt
   F:    scripts/kconfig/
   
@@@ -7491,7 -7464,7 +7491,7 @@@ F:      fs/autofs4
   
   KERNEL BUILD + files below scripts/ (unless maintained elsewhere)
   M:    Masahiro Yamada <yamada.masahiro@socionext.com>
- -M:    Michal Marek <mmarek@suse.com>
+ +M:    Michal Marek <michal.lkml@markovi.net>
   T:    git git://git.kernel.org/pub/scm/linux/kernel/git/masahiroy/linux-kbuild.git
   L:    linux-kbuild@vger.kernel.org
   S:    Maintained
@@@ -7589,7 -7562,7 +7589,7 @@@ F:      arch/arm64/include/asm/kvm
   F:    arch/arm64/kvm/
   
   KERNEL VIRTUAL MACHINE FOR MIPS (KVM/mips)
- -M:    James Hogan <james.hogan@imgtec.com>
+ +M:    James Hogan <jhogan@kernel.org>
   L:    linux-mips@linux-mips.org
   S:    Supported
   F:    arch/mips/include/uapi/asm/kvm*
@@@ -7597,7 -7570,7 +7597,7 @@@ F:      arch/mips/include/asm/kvm
   F:    arch/mips/kvm/
   
   KERNEL VIRTUAL MACHINE FOR POWERPC (KVM/powerpc)
- -M:    Alexander Graf <agraf@suse.com>
+ +M:    Paul Mackerras <paulus@ozlabs.org>
   L:    kvm-ppc@vger.kernel.org
   W:    http://www.linux-kvm.org/
   T:    git git://github.com/agraf/linux-2.6.git
@@@ -7652,7 -7625,8 +7652,7 @@@ F:      kernel/kexec
   
   KEYS-ENCRYPTED
   M:    Mimi Zohar <zohar@linux.vnet.ibm.com>
- -M:    David Safford <safford@us.ibm.com>
- -L:    linux-security-module@vger.kernel.org
+ +L:    linux-integrity@vger.kernel.org
   L:    keyrings@vger.kernel.org
   S:    Supported
   F:    Documentation/security/keys/trusted-encrypted.rst
@@@ -7660,8 -7634,9 +7660,8 @@@ F:      include/keys/encrypted-type.
   F:    security/keys/encrypted-keys/
   
   KEYS-TRUSTED
- -M:    David Safford <safford@us.ibm.com>
   M:    Mimi Zohar <zohar@linux.vnet.ibm.com>
- -L:    linux-security-module@vger.kernel.org
+ +L:    linux-integrity@vger.kernel.org
   L:    keyrings@vger.kernel.org
   S:    Supported
   F:    Documentation/security/keys/trusted-encrypted.rst
@@@ -7769,11 -7744,6 +7769,11 @@@ S:    Maintaine
   F:    Documentation/scsi/53c700.txt
   F:    drivers/scsi/53c700*
   
+ +LEAKING_ADDRESSES
+ +M:    Tobin C. Harding <me@tobin.cc>
+ +S:    Maintained
+ +F:    scripts/leaking_addresses.pl
+ +
   LED SUBSYSTEM
   M:    Richard Purdie <rpurdie@rpsys.net>
   M:    Jacek Anaszewski <jacek.anaszewski@gmail.com>
@@@ -8237,7 -8207,6 +8237,7 @@@ F:      Documentation/networking/mac80211-in
   F:    include/net/mac80211.h
   F:    net/mac80211/
   F:    drivers/net/wireless/mac80211_hwsim.[ch]
+ +F:    Documentation/networking/mac80211_hwsim/README
   
   MAILBOX API
   M:    Jassi Brar <jassisinghbrar@gmail.com>
@@@ -8295,12 -8264,6 +8295,12 @@@ L:    libertas-dev@lists.infradead.or
   S:    Orphan
   F:    drivers/net/wireless/marvell/libertas/
   
+ +MARVELL MACCHIATOBIN SUPPORT
+ +M:    Russell King <rmk@armlinux.org.uk>
+ +L:    linux-arm-kernel@lists.infradead.org
+ +S:    Maintained
+ +F:    arch/arm64/boot/dts/marvell/armada-8040-mcbin.dts
+ +
   MARVELL MV643XX ETHERNET DRIVER
   M:    Sebastian Hesselbarth <sebastian.hesselbarth@gmail.com>
   L:    netdev@vger.kernel.org
@@@ -8634,12 -8597,6 +8634,12 @@@ M:    Sean Wang <sean.wang@mediatek.com
   S:    Maintained
   F:    drivers/media/rc/mtk-cir.c
   
+ +MEDIATEK PMIC LED DRIVER
+ +M:    Sean Wang <sean.wang@mediatek.com>
+ +S:    Maintained
+ +F:    drivers/leds/leds-mt6323.c
+ +F:    Documentation/devicetree/bindings/leds/leds-mt6323.txt
+ +
   MEDIATEK ETHERNET DRIVER
   M:    Felix Fietkau <nbd@openwrt.org>
   M:    John Crispin <john@phrozen.org>
@@@ -8773,7 -8730,7 +8773,7 @@@ Q:      http://patchwork.ozlabs.org/project/
   F:    drivers/net/ethernet/mellanox/mlxsw/
   
   MELLANOX FIRMWARE FLASH LIBRARY (mlxfw)
- -M:    Yotam Gigi <yotamg@mellanox.com>
+ +M:    mlxsw@mellanox.com
   L:    netdev@vger.kernel.org
   S:    Supported
   W:    http://www.mellanox.com
@@@ -8922,7 -8879,7 +8922,7 @@@ F:      Documentation/devicetree/bindings/me
   T:    git git://linuxtv.org/media_tree.git
   
   METAG ARCHITECTURE
- -M:    James Hogan <james.hogan@imgtec.com>
+ +M:    James Hogan <jhogan@kernel.org>
   L:    linux-metag@vger.kernel.org
   T:    git git://git.kernel.org/pub/scm/linux/kernel/git/jhogan/metag.git
   S:    Odd Fixes
@@@ -9023,7 -8980,7 +9023,7 @@@ F:      Documentation/mips
   F:    arch/mips/
   
   MIPS BOSTON DEVELOPMENT BOARD
- -M:    Paul Burton <paul.burton@imgtec.com>
+ +M:    Paul Burton <paul.burton@mips.com>
   L:    linux-mips@linux-mips.org
   S:    Maintained
   F:    Documentation/devicetree/bindings/clock/img,boston-clock.txt
@@@ -9033,7 -8990,7 +9033,7 @@@ F:      drivers/clk/imgtec/clk-boston.
   F:    include/dt-bindings/clock/boston-clock.h
   
   MIPS GENERIC PLATFORM
- -M:    Paul Burton <paul.burton@imgtec.com>
+ +M:    Paul Burton <paul.burton@mips.com>
   L:    linux-mips@linux-mips.org
   S:    Supported
   F:    arch/mips/generic/
@@@ -9049,7 -9006,7 +9049,7 @@@ F:      drivers/*/*loongson1
   F:    drivers/*/*/*loongson1*
   
   MIPS RINT INSTRUCTION EMULATION
- -M:    Aleksandar Markovic <aleksandar.markovic@imgtec.com>
+ +M:    Aleksandar Markovic <aleksandar.markovic@mips.com>
   L:    linux-mips@linux-mips.org
   S:    Supported
   F:    arch/mips/math-emu/sp_rint.c
@@@ -9229,6 -9186,12 +9229,6 @@@ F:     include/linux/dt-bindings/mux
   F:    include/linux/mux/
   F:    drivers/mux/
   
- -MULTISOUND SOUND DRIVER
- -M:    Andrew Veliath <andrewtv@usa.net>
- -S:    Maintained
- -F:    Documentation/sound/oss/MultiSound
- -F:    sound/oss/msnd*
- -
   MULTITECH MULTIPORT CARD (ISICOM)
   S:    Orphan
   F:    drivers/tty/isicom.c
@@@ -9237,6 -9200,7 +9237,6 @@@ F:      include/linux/isicom.
   MUSB MULTIPOINT HIGH SPEED DUAL-ROLE CONTROLLER
   M:    Bin Liu <b-liu@ti.com>
   L:    linux-usb@vger.kernel.org
- -T:    git git://git.kernel.org/pub/scm/linux/kernel/git/balbi/usb.git
   S:    Maintained
   F:    drivers/usb/musb/
   
@@@ -9384,7 -9348,7 +9384,7 @@@ NETWORK BLOCK DEVICE (NBD
   M:    Josef Bacik <jbacik@fb.com>
   S:    Maintained
   L:    linux-block@vger.kernel.org
- -L:    nbd-general@lists.sourceforge.net
+ +L:    nbd@other.debian.org
   F:    Documentation/blockdev/nbd.txt
   F:    drivers/block/nbd.c
   F:    include/uapi/linux/nbd.h
@@@ -9432,7 -9396,6 +9432,7 @@@ M:      Florian Fainelli <f.fainelli@gmail.c
   S:    Maintained
   F:    net/dsa/
   F:    include/net/dsa.h
+ +F:    include/linux/dsa/
   F:    drivers/net/dsa/
   
   NETWORKING [GENERAL]
@@@ -9453,8 -9416,8 +9453,8 @@@ F:      include/uapi/linux/in.
   F:    include/uapi/linux/net.h
   F:    include/uapi/linux/netdevice.h
   F:    include/uapi/linux/net_namespace.h
- -F:    tools/net/
   F:    tools/testing/selftests/net/
+ +F:    lib/net_utils.c
   F:    lib/random32.c
   
   NETWORKING [IPSEC]
@@@ -10056,11 -10019,7 +10056,11 @@@ T: git git://github.com/openrisc/linux.
   L:    openrisc@lists.librecores.org
   W:    http://openrisc.io
   S:    Maintained
+ +F:    Documentation/devicetree/bindings/openrisc/
+ +F:    Documentation/openrisc/
   F:    arch/openrisc/
+ +F:    drivers/irqchip/irq-ompic.c
+ +F:    drivers/irqchip/irq-or1k-*
   
   OPENVSWITCH
   M:    Pravin Shelar <pshelar@nicira.com>
@@@ -10078,7 -10037,7 +10078,7 @@@ M:   Stephen Boyd <sboyd@codeaurora.org
   L:    linux-pm@vger.kernel.org
   S:    Maintained
   T:    git git://git.kernel.org/pub/scm/linux/kernel/git/vireshk/pm.git
- -F:    drivers/base/power/opp/
+ +F:    drivers/opp/
   F:    include/linux/pm_opp.h
   F:    Documentation/power/opp.txt
   F:    Documentation/devicetree/bindings/opp/
@@@ -10208,6 -10167,7 +10208,6 @@@ F:   Documentation/parport*.tx
   
   PARAVIRT_OPS INTERFACE
   M:    Juergen Gross <jgross@suse.com>
- -M:    Chris Wright <chrisw@sous-sol.org>
   M:    Alok Kataria <akataria@vmware.com>
   M:    Rusty Russell <rusty@rustcorp.com.au>
   L:    virtualization@lists.linux-foundation.org
@@@ -10365,6 -10325,7 +10365,6 @@@ F:   drivers/pci/host/vmd.
   
   PCI DRIVER FOR MICROSEMI SWITCHTEC
   M:    Kurt Schwemmer <kurt.schwemmer@microsemi.com>
- -M:    Stephen Bates <stephen.bates@microsemi.com>
   M:    Logan Gunthorpe <logang@deltatee.com>
   L:    linux-pci@vger.kernel.org
   S:    Maintained
@@@ -10429,7 -10390,6 +10429,7 @@@ F:   drivers/pci/dwc/*keystone
   
   PCI ENDPOINT SUBSYSTEM
   M:    Kishon Vijay Abraham I <kishon@ti.com>
+ +M:    Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
   L:    linux-pci@vger.kernel.org
   T:    git git://git.kernel.org/pub/scm/linux/kernel/git/kishon/pci-endpoint.git
   S:    Supported
@@@ -10481,15 -10441,6 +10481,15 @@@ F: include/linux/pci
   F:    arch/x86/pci/
   F:    arch/x86/kernel/quirks.c
   
+ +PCI NATIVE HOST BRIDGE AND ENDPOINT DRIVERS
+ +M:    Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
+ +L:    linux-pci@vger.kernel.org
+ +Q:    http://patchwork.ozlabs.org/project/linux-pci/list/
+ +T:    git git://git.kernel.org/pub/scm/linux/kernel/git/lpieralisi/pci.git/
+ +S:    Supported
+ +F:    drivers/pci/host/
+ +F:    drivers/pci/dwc/
+ +
   PCIE DRIVER FOR AXIS ARTPEC
   M:    Niklas Cassel <niklas.cassel@axis.com>
   M:    Jesper Nilsson <jesper.nilsson@axis.com>
@@@ -10509,6 -10460,7 +10509,6 @@@ F:   drivers/pci/host/pci-thunder-
   
   PCIE DRIVER FOR HISILICON
   M:    Zhou Wang <wangzhou1@hisilicon.com>
- -M:    Gabriele Paoloni <gabriele.paoloni@huawei.com>
   L:    linux-pci@vger.kernel.org
   S:    Maintained
   F:    Documentation/devicetree/bindings/pci/hisilicon-pcie.txt
@@@ -10595,8 -10547,6 +10595,8 @@@ M:   Peter Zijlstra <peterz@infradead.org
   M:    Ingo Molnar <mingo@redhat.com>
   M:    Arnaldo Carvalho de Melo <acme@kernel.org>
   R:    Alexander Shishkin <alexander.shishkin@linux.intel.com>
+ +R:    Jiri Olsa <jolsa@redhat.com>
+ +R:    Namhyung Kim <namhyung@kernel.org>
   L:    linux-kernel@vger.kernel.org
   T:    git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git perf/core
   S:    Supported
@@@ -10720,9 -10670,10 +10720,9 @@@ S:  Maintaine
   F:    drivers/pinctrl/spear/
   
   PISTACHIO SOC SUPPORT
- -M:    James Hartley <james.hartley@imgtec.com>
- -M:    Ionela Voinescu <ionela.voinescu@imgtec.com>
+ +M:    James Hartley <james.hartley@sondrel.com>
   L:    linux-mips@linux-mips.org
- -S:    Maintained
+ +S:    Odd Fixes
   F:    arch/mips/pistachio/
   F:    arch/mips/include/asm/mach-pistachio/
   F:    arch/mips/boot/dts/img/pistachio*
@@@ -10926,7 -10877,7 +10926,7 @@@ S:   Maintaine
   F:    drivers/block/ps3vram.c
   
   PSAMPLE PACKET SAMPLING SUPPORT:
- -M:    Yotam Gigi <yotamg@mellanox.com>
+ +M:    Yotam Gigi <yotam.gi@gmail.com>
   S:    Maintained
   F:    net/psample
   F:    include/net/psample.h
@@@ -11069,6 -11020,7 +11069,6 @@@ F:   drivers/mtd/nand/pxa3xx_nand.
   
   QAT DRIVER
   M:    Giovanni Cabiddu <giovanni.cabiddu@intel.com>
- -M:    Salvatore Benedetto <salvatore.benedetto@intel.com>
   L:    qat-linux@intel.com
   S:    Supported
   F:    drivers/crypto/qat/
@@@ -11528,7 -11480,6 +11528,7 @@@ T:   git git://git.kernel.org/pub/scm/lin
   T:    git git://git.kernel.org/pub/scm/linux/kernel/git/jberg/mac80211-next.git
   S:    Maintained
   F:    Documentation/rfkill.txt
+ +F:    Documentation/ABI/stable/sysfs-class-rfkill
   F:    net/rfkill/
   
   RHASHTABLE
@@@ -11550,16 -11501,6 +11550,16 @@@ S: Maintaine
   F:    drivers/mtd/nand/r852.c
   F:    drivers/mtd/nand/r852.h
   
+ +RISC-V ARCHITECTURE
+ +M:    Palmer Dabbelt <palmer@sifive.com>
+ +M:    Albert Ou <albert@sifive.com>
+ +L:    patches@groups.riscv.org
+ +T:    git https://github.com/riscv/riscv-linux
+ +S:    Supported
+ +F:    arch/riscv/
+ +K:    riscv
+ +N:    riscv
+ +
   ROCCAT DRIVERS
   M:    Stefan Achatz <erazor_de@users.sourceforge.net>
   W:    http://sourceforge.net/projects/roccat/
@@@ -11812,7 -11753,7 +11812,7 @@@ L:   linux-crypto@vger.kernel.or
   L:    linux-samsung-soc@vger.kernel.org
   S:    Maintained
   F:    drivers/crypto/exynos-rng.c
- -F:    Documentation/devicetree/bindings/rng/samsung,exynos-rng4.txt
+ +F:    Documentation/devicetree/bindings/crypto/samsung,exynos-rng4.txt
   
   SAMSUNG FRAMEBUFFER DRIVER
   M:    Jingoo Han <jingoohan1@gmail.com>
@@@ -12095,15 -12036,10 +12095,15 @@@ L:        linux-mmc@vger.kernel.or
   S:    Maintained
   F:    drivers/mmc/host/sdhci-spear.c
   
+ +SECURE DIGITAL HOST CONTROLLER INTERFACE (SDHCI) TI OMAP DRIVER
+ +M:    Kishon Vijay Abraham I <kishon@ti.com>
+ +L:    linux-mmc@vger.kernel.org
+ +S:    Maintained
+ +F:    drivers/mmc/host/sdhci-omap.c
+ +
   SECURE ENCRYPTING DEVICE (SED) OPAL DRIVER
   M:    Scott Bauer <scott.bauer@intel.com>
   M:    Jonathan Derrick <jonathan.derrick@intel.com>
- -M:    Rafael Antognolli <rafael.antognolli@intel.com>
   L:    linux-block@vger.kernel.org
   S:    Supported
   F:    block/sed*
@@@ -12504,10 -12440,7 +12504,10 @@@ M: Shaohua Li <shli@kernel.org
   L:    linux-raid@vger.kernel.org
   T:    git git://git.kernel.org/pub/scm/linux/kernel/git/shli/md.git
   S:    Supported
- -F:    drivers/md/
+ +F:    drivers/md/Makefile
+ +F:    drivers/md/Kconfig
+ +F:    drivers/md/md*
+ +F:    drivers/md/raid*
   F:    include/linux/raid/
   F:    include/uapi/linux/raid/
   
@@@ -12960,16 -12893,9 +12960,16 @@@ F: arch/arc/plat-axs10
   F:    arch/arc/boot/dts/ax*
   F:    Documentation/devicetree/bindings/arc/axs10*
   
+ +SYNOPSYS DESIGNWARE APB GPIO DRIVER
+ +M:    Hoan Tran <hotran@apm.com>
+ +L:    linux-gpio@vger.kernel.org
+ +S:    Maintained
+ +F:    drivers/gpio/gpio-dwapb.c
+ +F:    Documentation/devicetree/bindings/gpio/snps-dwapb-gpio.txt
+ +
   SYNOPSYS DESIGNWARE DMAC DRIVER
   M:    Viresh Kumar <vireshk@kernel.org>
- -M:    Andy Shevchenko <andriy.shevchenko@linux.intel.com>
+ +R:    Andy Shevchenko <andriy.shevchenko@linux.intel.com>
   S:    Maintained
   F:    include/linux/dma/dw.h
   F:    include/linux/platform_data/dma-dw.h
@@@ -12999,9 -12925,9 +12999,9 @@@ F:   drivers/mmc/host/dw_mmc
   SYNOPSYS HSDK RESET CONTROLLER DRIVER
   M:    Eugeniy Paltsev <Eugeniy.Paltsev@synopsys.com>
   S:    Supported
- -F:    drivers/reset/reset-hsdk-v1.c
- -F:    include/dt-bindings/reset/snps,hsdk-v1-reset.h
- -F:    Documentation/devicetree/bindings/reset/snps,hsdk-v1-reset.txt
+ +F:    drivers/reset/reset-hsdk.c
+ +F:    include/dt-bindings/reset/snps,hsdk-reset.h
+ +F:    Documentation/devicetree/bindings/reset/snps,hsdk-reset.txt
   
   SYSTEM CONFIGURATION (SYSCON)
   M:    Lee Jones <lee.jones@linaro.org>
@@@ -13352,15 -13278,6 +13352,15 @@@ M: Mika Westerberg <mika.westerberg@lin
   M:    Yehezkel Bernat <yehezkel.bernat@intel.com>
   S:    Maintained
   F:    drivers/thunderbolt/
+ +F:    include/linux/thunderbolt.h
+ +
+ +THUNDERBOLT NETWORK DRIVER
+ +M:    Michael Jamet <michael.jamet@intel.com>
+ +M:    Mika Westerberg <mika.westerberg@linux.intel.com>
+ +M:    Yehezkel Bernat <yehezkel.bernat@intel.com>
+ +L:    netdev@vger.kernel.org
+ +S:    Maintained
+ +F:    drivers/net/thunderbolt.c
   
   THUNDERX GPIO DRIVER
   M:    David Daney <david.daney@cavium.com>
@@@ -13669,14 -13586,23 +13669,14 @@@ F:        drivers/platform/x86/toshiba-wmi.
   
   TPM DEVICE DRIVER
   M:    Peter Huewe <peterhuewe@gmx.de>
- -M:    Marcel Selhorst <tpmdd@selhorst.net>
   M:    Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
   R:    Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
- -W:    http://tpmdd.sourceforge.net
- -L:    tpmdd-devel@lists.sourceforge.net (moderated for non-subscribers)
- -Q:    https://patchwork.kernel.org/project/tpmdd-devel/list/
+ +L:    linux-integrity@vger.kernel.org
+ +Q:    https://patchwork.kernel.org/project/linux-integrity/list/
   T:    git git://git.infradead.org/users/jjs/linux-tpmdd.git
   S:    Maintained
   F:    drivers/char/tpm/
   
- -TPM IBM_VTPM DEVICE DRIVER
- -M:    Ashley Lai <ashleydlai@gmail.com>
- -W:    http://tpmdd.sourceforge.net
- -L:    tpmdd-devel@lists.sourceforge.net (moderated for non-subscribers)
- -S:    Maintained
- -F:    drivers/char/tpm/tpm_ibmvtpm*
- -
   TRACING
   M:    Steven Rostedt <rostedt@goodmis.org>
   M:    Ingo Molnar <mingo@redhat.com>
@@@ -13817,7 -13743,7 +13817,7 @@@ UDRAW TABLE
   M:    Bastien Nocera <hadess@hadess.net>
   L:    linux-input@vger.kernel.org
   S:    Maintained
- -F:    drivers/hid/hid-udraw.c
+ +F:    drivers/hid/hid-udraw-ps3.c
   
   UFS FILESYSTEM
   M:    Evgeniy Dushistov <dushistov@mail.ru>
@@@ -14340,15 -14266,12 +14340,15 @@@ S:        Maintaine
   F:    include/linux/virtio_vsock.h
   F:    include/uapi/linux/virtio_vsock.h
   F:    include/uapi/linux/vsockmon.h
+ +F:    include/uapi/linux/vm_sockets_diag.h
+ +F:    net/vmw_vsock/diag.c
   F:    net/vmw_vsock/af_vsock_tap.c
   F:    net/vmw_vsock/virtio_transport_common.c
   F:    net/vmw_vsock/virtio_transport.c
   F:    drivers/net/vsockmon.c
   F:    drivers/vhost/vsock.c
   F:    drivers/vhost/vsock.h
+ +F:    tools/testing/vsock/
   
   VIRTIO CONSOLE DRIVER
   M:    Amit Shah <amit@kernel.org>
@@@ -14389,7 -14312,6 +14389,7 @@@ L:   virtualization@lists.linux-foundatio
   L:    kvm@vger.kernel.org
   S:    Supported
   F:    drivers/s390/virtio/
+ +F:    arch/s390/include/uapi/asm/virtio-ccw.h
   
   VIRTIO GPU DRIVER
   M:    David Airlie <airlied@linux.ie>
@@@ -14612,6 -14534,7 +14612,6 @@@ L:   wil6210@qca.qualcomm.co
   S:    Supported
   W:    http://wireless.kernel.org/en/users/Drivers/wil6210
   F:    drivers/net/wireless/ath/wil6210/
- -F:    include/uapi/linux/wil6210_uapi.h
   
   WIMAX STACK
   M:    Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com>
@@@ -14662,7 -14585,6 +14662,7 @@@ F:   Documentation/devicetree/bindings/ex
   F:    Documentation/devicetree/bindings/regulator/arizona-regulator.txt
   F:    Documentation/devicetree/bindings/mfd/arizona.txt
   F:    Documentation/devicetree/bindings/mfd/wm831x.txt
+ +F:    Documentation/devicetree/bindings/sound/wlf,arizona.txt
   F:    arch/arm/mach-s3c64xx/mach-crag6410*
   F:    drivers/clk/clk-wm83*.c
   F:    drivers/extcon/extcon-arizona.c
diff --combined include/linux/cgroup-defs.h

index 1dff0a478b45aace3903af82883cb3bf39194774,ada6df7b1f55b8ee530489dd8ba9e1739db1ec71..8b7fd8eeccee26c5694530a45f8f9332aaf681c7
--- 1/include/linux/cgroup-defs.h
--- 2/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@@ -1,4 -1,3 +1,4 @@@
+ +/* SPDX-License-Identifier: GPL-2.0 */
   /*
    * linux/cgroup-defs.h - basic definitions for cgroup
    *
@@@ -17,6 -16,7 +17,7 @@@
   #include <linux/refcount.h>
   #include <linux/percpu-refcount.h>
   #include <linux/percpu-rwsem.h>
+ #include <linux/u64_stats_sync.h>
   #include <linux/workqueue.h>
   #include <linux/bpf-cgroup.h>
   
@@@ -255,6 -255,57 +256,57 @@@ struct css_set 
         struct rcu_head rcu_head;
   };
   
+ /*
+  * cgroup basic resource usage statistics.  Accounting is done per-cpu in
+  * cgroup_cpu_stat which is then lazily propagated up the hierarchy on
+  * reads.
+  *
+  * When a stat gets updated, the cgroup_cpu_stat and its ancestors are
+  * linked into the updated tree.  On the following read, propagation only
+  * considers and consumes the updated tree.  This makes reading O(the
+  * number of descendants which have been active since last read) instead of
+  * O(the total number of descendants).
+  *
+  * This is important because there can be a lot of (draining) cgroups which
+  * aren't active and stat may be read frequently.  The combination can
+  * become very expensive.  By propagating selectively, increasing reading
+  * frequency decreases the cost of each read.
+  */
+ struct cgroup_cpu_stat {
+       /*
+        * ->sync protects all the current counters.  These are the only
+        * fields which get updated in the hot path.
+        */
+       struct u64_stats_sync sync;
+       struct task_cputime cputime;
+ 
+       /*
+        * Snapshots at the last reading.  These are used to calculate the
+        * deltas to propagate to the global counters.
+        */
+       struct task_cputime last_cputime;
+ 
+       /*
+        * Child cgroups with stat updates on this cpu since the last read
+        * are linked on the parent's ->updated_children through
+        * ->updated_next.
+        *
+        * In addition to being more compact, singly-linked list pointing
+        * to the cgroup makes it unnecessary for each per-cpu struct to
+        * point back to the associated cgroup.
+        *
+        * Protected by per-cpu cgroup_cpu_stat_lock.
+        */
+       struct cgroup *updated_children;        /* terminated by self cgroup */
+       struct cgroup *updated_next;            /* NULL iff not on the list */
+ };
+ 
+ struct cgroup_stat {
+       /* per-cpu statistics are collected into the following global counters */
+       struct task_cputime cputime;
+       struct prev_cputime prev_cputime;
+ };
+ 
   struct cgroup {
         /* self css with NULL ->ss, points back to this cgroup */
         struct cgroup_subsys_state self;
@@@ -354,6 -405,11 +406,11 @@@
          */
         struct cgroup *dom_cgrp;
   
+       /* cgroup basic resource statistics */
+       struct cgroup_cpu_stat __percpu *cpu_stat;
+       struct cgroup_stat pending_stat;        /* pending from children */
+       struct cgroup_stat stat;
+ 
         /*
          * list of pidlists, up to two for each namespace (one for procs, one
          * for tasks); created on demand.
@@@ -513,6 -569,8 +570,8 @@@ struct cgroup_subsys 
         void (*css_released)(struct cgroup_subsys_state *css);
         void (*css_free)(struct cgroup_subsys_state *css);
         void (*css_reset)(struct cgroup_subsys_state *css);
+       int (*css_extra_stat_show)(struct seq_file *seq,
+                                  struct cgroup_subsys_state *css);
   
         int (*can_attach)(struct cgroup_taskset *tset);
         void (*cancel_attach)(struct cgroup_taskset *tset);
diff --combined include/linux/cgroup.h

index dddbc29e20098e0a9f0377f64b0945c87691d315,03cad08b09d138bfbaa0bacd27683c0bea348557..473e0c0abb8621f732ae3a384acbac63eca8ff11
--- 1/include/linux/cgroup.h
--- 2/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@@ -1,4 -1,3 +1,4 @@@
+ +/* SPDX-License-Identifier: GPL-2.0 */
   #ifndef _LINUX_CGROUP_H
   #define _LINUX_CGROUP_H
   /*
@@@ -23,6 -22,7 +23,7 @@@
   #include <linux/nsproxy.h>
   #include <linux/user_namespace.h>
   #include <linux/refcount.h>
+ #include <linux/kernel_stat.h>
   
   #include <linux/cgroup-defs.h>
   
@@@ -689,6 -689,63 +690,63 @@@ static inline void cgroup_path_from_ker
         char *buf, size_t buflen) {}
   #endif /* !CONFIG_CGROUPS */
   
+ /*
+  * Basic resource stats.
+  */
+ #ifdef CONFIG_CGROUPS
+ 
+ #ifdef CONFIG_CGROUP_CPUACCT
+ void cpuacct_charge(struct task_struct *tsk, u64 cputime);
+ void cpuacct_account_field(struct task_struct *tsk, int index, u64 val);
+ #else
+ static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
+ static inline void cpuacct_account_field(struct task_struct *tsk, int index,
+                                        u64 val) {}
+ #endif
+ 
+ void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec);
+ void __cgroup_account_cputime_field(struct cgroup *cgrp,
+                                   enum cpu_usage_stat index, u64 delta_exec);
+ 
+ static inline void cgroup_account_cputime(struct task_struct *task,
+                                         u64 delta_exec)
+ {
+       struct cgroup *cgrp;
+ 
+       cpuacct_charge(task, delta_exec);
+ 
+       rcu_read_lock();
+       cgrp = task_dfl_cgroup(task);
+       if (cgroup_parent(cgrp))
+               __cgroup_account_cputime(cgrp, delta_exec);
+       rcu_read_unlock();
+ }
+ 
+ static inline void cgroup_account_cputime_field(struct task_struct *task,
+                                               enum cpu_usage_stat index,
+                                               u64 delta_exec)
+ {
+       struct cgroup *cgrp;
+ 
+       cpuacct_account_field(task, index, delta_exec);
+ 
+       rcu_read_lock();
+       cgrp = task_dfl_cgroup(task);
+       if (cgroup_parent(cgrp))
+               __cgroup_account_cputime_field(cgrp, index, delta_exec);
+       rcu_read_unlock();
+ }
+ 
+ #else /* CONFIG_CGROUPS */
+ 
+ static inline void cgroup_account_cputime(struct task_struct *task,
+                                         u64 delta_exec) {}
+ static inline void cgroup_account_cputime_field(struct task_struct *task,
+                                               enum cpu_usage_stat index,
+                                               u64 delta_exec) {}
+ 
+ #endif        /* CONFIG_CGROUPS */
+ 
   /*
    * sock->sk_cgrp_data handling.  For more info, see sock_cgroup_data
    * definition in cgroup-defs.h.
diff --combined include/linux/sched/cputime.h

index d0677f6739f636e6ed6152e5875528f6539b1334,9251044335c57379522a13824b573d4e09fcf176..53f883f5a2fd1d29d1e2131b89260304a0be8df0
--- 1/include/linux/sched/cputime.h
--- 2/include/linux/sched/cputime.h
+++ b/include/linux/sched/cputime.h
@@@ -1,4 -1,3 +1,4 @@@
+ +/* SPDX-License-Identifier: GPL-2.0 */
   #ifndef _LINUX_SCHED_CPUTIME_H
   #define _LINUX_SCHED_CPUTIME_H
   
@@@ -54,7 -53,8 +54,8 @@@ static inline void task_cputime_scaled(
   
   extern void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st);
   extern void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st);
- 
+ extern void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
+                          u64 *ut, u64 *st);
   
   /*
    * Thread group CPU time accounting.
diff --combined kernel/cgroup/Makefile

index ae448f7632cc64753e0e0e4d9c4a0e65d41912a5,0acee616e06cb304053b54831741128a92f28924..2be89a003185bb4cb3613a539496008c129643d9
--- 1/kernel/cgroup/Makefile
--- 2/kernel/cgroup/Makefile
+++ b/kernel/cgroup/Makefile
@@@ -1,5 -1,4 +1,5 @@@
- obj-y := cgroup.o namespace.o cgroup-v1.o
+ +# SPDX-License-Identifier: GPL-2.0
+ obj-y := cgroup.o stat.o namespace.o cgroup-v1.o
   
   obj-$(CONFIG_CGROUP_FREEZER) += freezer.o
   obj-$(CONFIG_CGROUP_PIDS) += pids.o
diff --combined kernel/cgroup/cgroup-internal.h

index bf54ade001be4ada1d9ebd45d3a12930828d94e1,4dc317090920414479bd3241be6757ef6dcd8d38..b928b27050c62fee81fa791dd2ed3e0d282d59f5
--- 1/kernel/cgroup/cgroup-internal.h
--- 2/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@@ -1,4 -1,3 +1,4 @@@
+ +/* SPDX-License-Identifier: GPL-2.0 */
   #ifndef __CGROUP_INTERNAL_H
   #define __CGROUP_INTERNAL_H
   
@@@ -200,6 -199,15 +200,15 @@@ int cgroup_show_path(struct seq_file *s
   
   int cgroup_task_count(const struct cgroup *cgrp);
   
+ /*
+  * stat.c
+  */
+ void cgroup_stat_flush(struct cgroup *cgrp);
+ int cgroup_stat_init(struct cgroup *cgrp);
+ void cgroup_stat_exit(struct cgroup *cgrp);
+ void cgroup_stat_show_cputime(struct seq_file *seq);
+ void cgroup_stat_boot(void);
+ 
   /*
    * namespace.c
    */
diff --combined kernel/cgroup/cgroup.c

index 00f5b358aeac5af94424c22b0e6a006874b10ce0,69e65d28fe98ed9f8fa057010ba809cf5d811294..0b1ffe147f240c39726d79505c4e02e8fe40cd47
--- 1/kernel/cgroup/cgroup.c
--- 2/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@@ -142,12 -142,14 +142,14 @@@ static struct static_key_true *cgroup_s
   };
   #undef SUBSYS
   
+ static DEFINE_PER_CPU(struct cgroup_cpu_stat, cgrp_dfl_root_cpu_stat);
+ 
   /*
    * The default hierarchy, reserved for the subsystems that are otherwise
    * unattached - it never has more than a single cgroup, and all tasks are
    * part of that cgroup.
    */
- struct cgroup_root cgrp_dfl_root;
+ struct cgroup_root cgrp_dfl_root = { .cgrp.cpu_stat = &cgrp_dfl_root_cpu_stat };
   EXPORT_SYMBOL_GPL(cgrp_dfl_root);
   
   /*
@@@ -461,6 -463,28 +463,28 @@@ static struct cgroup_subsys_state *cgro
                 return &cgrp->self;
   }
   
+ /**
+  * cgroup_tryget_css - try to get a cgroup's css for the specified subsystem
+  * @cgrp: the cgroup of interest
+  * @ss: the subsystem of interest
+  *
+  * Find and get @cgrp's css associated with @ss.  If the css doesn't exist
+  * or is offline, %NULL is returned.
+  */
+ static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp,
+                                                    struct cgroup_subsys *ss)
+ {
+       struct cgroup_subsys_state *css;
+ 
+       rcu_read_lock();
+       css = cgroup_css(cgrp, ss);
+       if (!css || !css_tryget_online(css))
+               css = NULL;
+       rcu_read_unlock();
+ 
+       return css;
+ }
+ 
   /**
    * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
    * @cgrp: the cgroup of interest
@@@ -647,6 -671,14 +671,14 @@@ struct css_set init_css_set = 
         .cgrp_links             = LIST_HEAD_INIT(init_css_set.cgrp_links),
         .mg_preload_node        = LIST_HEAD_INIT(init_css_set.mg_preload_node),
         .mg_node                = LIST_HEAD_INIT(init_css_set.mg_node),
+ 
+       /*
+        * The following field is re-initialized when this cset gets linked
+        * in cgroup_init().  However, let's initialize the field
+        * statically too so that the default cgroup can be accessed safely
+        * early during boot.
+        */
+       .dfl_cgrp               = &cgrp_dfl_root.cgrp,
   };
   
   static int css_set_count      = 1;    /* 1 for init_css_set */
@@@ -1896,9 -1928,6 +1928,9 @@@ int cgroup_setup_root(struct cgroup_roo
         if (ret)
                 goto destroy_root;
   
+ +      ret = cgroup_bpf_inherit(root_cgrp);
+ +      WARN_ON_ONCE(ret);
+ +
         trace_cgroup_setup_root(root);
   
         /*
@@@ -2314,14 -2343,6 +2346,14 @@@ out_release_tset
                 list_del_init(&cset->mg_node);
         }
         spin_unlock_irq(&css_set_lock);
+ +
+ +      /*
+ +       * Re-initialize the cgroup_taskset structure in case it is reused
+ +       * again in another cgroup_migrate_add_task()/cgroup_migrate_execute()
+ +       * iteration.
+ +       */
+ +      tset->nr_tasks = 0;
+ +      tset->csets    = &tset->src_csets;
         return ret;
   }
   
@@@ -3315,6 -3336,37 +3347,37 @@@ static int cgroup_stat_show(struct seq_
         return 0;
   }
   
+ static int __maybe_unused cgroup_extra_stat_show(struct seq_file *seq,
+                                                struct cgroup *cgrp, int ssid)
+ {
+       struct cgroup_subsys *ss = cgroup_subsys[ssid];
+       struct cgroup_subsys_state *css;
+       int ret;
+ 
+       if (!ss->css_extra_stat_show)
+               return 0;
+ 
+       css = cgroup_tryget_css(cgrp, ss);
+       if (!css)
+               return 0;
+ 
+       ret = ss->css_extra_stat_show(seq, css);
+       css_put(css);
+       return ret;
+ }
+ 
+ static int cpu_stat_show(struct seq_file *seq, void *v)
+ {
+       struct cgroup __maybe_unused *cgrp = seq_css(seq)->cgroup;
+       int ret = 0;
+ 
+       cgroup_stat_show_cputime(seq);
+ #ifdef CONFIG_CGROUP_SCHED
+       ret = cgroup_extra_stat_show(seq, cgrp, cpu_cgrp_id);
+ #endif
+       return ret;
+ }
+ 
   static int cgroup_file_open(struct kernfs_open_file *of)
   {
         struct cftype *cft = of->kn->priv;
@@@ -4422,6 -4474,11 +4485,11 @@@ static struct cftype cgroup_base_files[
                 .name = "cgroup.stat",
                 .seq_show = cgroup_stat_show,
         },
+       {
+               .name = "cpu.stat",
+               .flags = CFTYPE_NOT_ON_ROOT,
+               .seq_show = cpu_stat_show,
+       },
         { }     /* terminate */
   };
   
@@@ -4482,6 -4539,8 +4550,8 @@@ static void css_free_work_fn(struct wor
                          */
                         cgroup_put(cgroup_parent(cgrp));
                         kernfs_put(cgrp->kn);
+                       if (cgroup_on_dfl(cgrp))
+                               cgroup_stat_exit(cgrp);
                         kfree(cgrp);
                 } else {
                         /*
@@@ -4526,6 -4585,9 +4596,9 @@@ static void css_release_work_fn(struct 
                 /* cgroup release path */
                 trace_cgroup_release(cgrp);
   
+               if (cgroup_on_dfl(cgrp))
+                       cgroup_stat_flush(cgrp);
+ 
                 for (tcgrp = cgroup_parent(cgrp); tcgrp;
                      tcgrp = cgroup_parent(tcgrp))
                         tcgrp->nr_dying_descendants--;
@@@ -4709,6 -4771,12 +4782,12 @@@ static struct cgroup *cgroup_create(str
         if (ret)
                 goto out_free_cgrp;
   
+       if (cgroup_on_dfl(parent)) {
+               ret = cgroup_stat_init(cgrp);
+               if (ret)
+                       goto out_cancel_ref;
+       }
+ 
         /*
          * Temporarily set the pointer to NULL, so idr_find() won't return
          * a half-baked cgroup.
@@@ -4716,7 -4784,7 +4795,7 @@@
         cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_KERNEL);
         if (cgrp->id < 0) {
                 ret = -ENOMEM;
-               goto out_cancel_ref;
+               goto out_stat_exit;
         }
   
         init_cgroup_housekeeping(cgrp);
@@@ -4724,9 -4792,6 +4803,9 @@@
         cgrp->self.parent = &parent->self;
         cgrp->root = root;
         cgrp->level = level;
+ +      ret = cgroup_bpf_inherit(cgrp);
+ +      if (ret)
+ +              goto out_idr_free;
   
         for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
                 cgrp->ancestor_ids[tcgrp->level] = tcgrp->id;
@@@ -4761,12 -4826,16 +4840,15 @@@
         if (!cgroup_on_dfl(cgrp))
                 cgrp->subtree_control = cgroup_control(cgrp);
   
- -      if (parent)
- -              cgroup_bpf_inherit(cgrp, parent);
- -
         cgroup_propagate_control(cgrp);
   
         return cgrp;
   
+ +out_idr_free:
+ +      cgroup_idr_remove(&root->cgroup_idr, cgrp->id);
+ out_stat_exit:
+       if (cgroup_on_dfl(parent))
+               cgroup_stat_exit(cgrp);
   out_cancel_ref:
         percpu_ref_exit(&cgrp->self.refcnt);
   out_free_cgrp:
@@@ -5161,6 -5230,8 +5243,8 @@@ int __init cgroup_init(void
         BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
         BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files));
   
+       cgroup_stat_boot();
+ 
         /*
          * The latency of the synchronize_sched() is too high for cgroups,
          * avoid it at the cost of forcing all readers into the slow path.
@@@ -5749,34 -5820,84 +5833,103 @@@ void cgroup_sk_free(struct sock_cgroup_
   #endif        /* CONFIG_SOCK_CGROUP_DATA */
   
   #ifdef CONFIG_CGROUP_BPF
- -int cgroup_bpf_update(struct cgroup *cgrp, struct bpf_prog *prog,
- -                    enum bpf_attach_type type, bool overridable)
+ +int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
+ +                    enum bpf_attach_type type, u32 flags)
+ +{
+ +      int ret;
+ +
+ +      mutex_lock(&cgroup_mutex);
+ +      ret = __cgroup_bpf_attach(cgrp, prog, type, flags);
+ +      mutex_unlock(&cgroup_mutex);
+ +      return ret;
+ +}
+ +int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
+ +                    enum bpf_attach_type type, u32 flags)
+ +{
+ +      int ret;
+ +
+ +      mutex_lock(&cgroup_mutex);
+ +      ret = __cgroup_bpf_detach(cgrp, prog, type, flags);
+ +      mutex_unlock(&cgroup_mutex);
+ +      return ret;
+ +}
+ +int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
+ +                   union bpf_attr __user *uattr)
   {
- -      struct cgroup *parent = cgroup_parent(cgrp);
         int ret;
   
         mutex_lock(&cgroup_mutex);
- -      ret = __cgroup_bpf_update(cgrp, parent, prog, type, overridable);
+ +      ret = __cgroup_bpf_query(cgrp, attr, uattr);
         mutex_unlock(&cgroup_mutex);
         return ret;
   }
   #endif /* CONFIG_CGROUP_BPF */
+ 
+ #ifdef CONFIG_SYSFS
+ static ssize_t show_delegatable_files(struct cftype *files, char *buf,
+                                     ssize_t size, const char *prefix)
+ {
+       struct cftype *cft;
+       ssize_t ret = 0;
+ 
+       for (cft = files; cft && cft->name[0] != '\0'; cft++) {
+               if (!(cft->flags & CFTYPE_NS_DELEGATABLE))
+                       continue;
+ 
+               if (prefix)
+                       ret += snprintf(buf + ret, size - ret, "%s.", prefix);
+ 
+               ret += snprintf(buf + ret, size - ret, "%s\n", cft->name);
+ 
+               if (unlikely(ret >= size)) {
+                       WARN_ON(1);
+                       break;
+               }
+       }
+ 
+       return ret;
+ }
+ 
+ static ssize_t delegate_show(struct kobject *kobj, struct kobj_attribute *attr,
+                             char *buf)
+ {
+       struct cgroup_subsys *ss;
+       int ssid;
+       ssize_t ret = 0;
+ 
+       ret = show_delegatable_files(cgroup_base_files, buf, PAGE_SIZE - ret,
+                                    NULL);
+ 
+       for_each_subsys(ss, ssid)
+               ret += show_delegatable_files(ss->dfl_cftypes, buf + ret,
+                                             PAGE_SIZE - ret,
+                                             cgroup_subsys_name[ssid]);
+ 
+       return ret;
+ }
+ static struct kobj_attribute cgroup_delegate_attr = __ATTR_RO(delegate);
+ 
+ static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr,
+                            char *buf)
+ {
+       return snprintf(buf, PAGE_SIZE, "nsdelegate\n");
+ }
+ static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features);
+ 
+ static struct attribute *cgroup_sysfs_attrs[] = {
+       &cgroup_delegate_attr.attr,
+       &cgroup_features_attr.attr,
+       NULL,
+ };
+ 
+ static const struct attribute_group cgroup_sysfs_attr_group = {
+       .attrs = cgroup_sysfs_attrs,
+       .name = "cgroup",
+ };
+ 
+ static int __init cgroup_sysfs_init(void)
+ {
+       return sysfs_create_group(kernel_kobj, &cgroup_sysfs_attr_group);
+ }
+ subsys_initcall(cgroup_sysfs_init);
+ #endif /* CONFIG_SYSFS */
diff --combined kernel/sched/core.c

index 5b82a00735325ba75ca9a93095953a28cc2b7257,0b3eec389552e33de6c4ef9e046d5d5191711fac..a092f350f3a21ad31c359c704eff65990613853f
--- 1/kernel/sched/core.c
--- 2/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@@ -26,7 -26,6 +26,7 @@@
   #include <linux/profile.h>
   #include <linux/security.h>
   #include <linux/syscalls.h>
+ +#include <linux/sched/isolation.h>
   
   #include <asm/switch_to.h>
   #include <asm/tlb.h>
@@@ -43,21 -42,18 +43,21 @@@
   
   DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
   
+ +#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL)
   /*
    * Debugging: various feature bits
+ + *
+ + * If SCHED_DEBUG is disabled, each compilation unit has its own copy of
+ + * sysctl_sched_features, defined in sched.h, to allow constants propagation
+ + * at compile time and compiler optimization based on features default.
    */
- -
   #define SCHED_FEAT(name, enabled)     \
         (1UL << __SCHED_FEAT_##name) * enabled |
- -
   const_debug unsigned int sysctl_sched_features =
   #include "features.h"
         0;
- -
   #undef SCHED_FEAT
+ +#endif
   
   /*
    * Number of tasks to iterate in a single balance run.
@@@ -87,6 -83,9 +87,6 @@@ __read_mostly int scheduler_running
    */
   int sysctl_sched_rt_runtime = 950000;
   
- -/* CPUs with isolated domains */
- -cpumask_var_t cpu_isolated_map;
- -
   /*
    * __task_rq_lock - lock the rq @p resides on.
    */
@@@ -506,7 -505,8 +506,7 @@@ void resched_cpu(int cpu
         struct rq *rq = cpu_rq(cpu);
         unsigned long flags;
   
- -      if (!raw_spin_trylock_irqsave(&rq->lock, flags))
- -              return;
+ +      raw_spin_lock_irqsave(&rq->lock, flags);
         resched_curr(rq);
         raw_spin_unlock_irqrestore(&rq->lock, flags);
   }
@@@ -526,7 -526,7 +526,7 @@@ int get_nohz_timer_target(void
         int i, cpu = smp_processor_id();
         struct sched_domain *sd;
   
- -      if (!idle_cpu(cpu) && is_housekeeping_cpu(cpu))
+ +      if (!idle_cpu(cpu) && housekeeping_cpu(cpu, HK_FLAG_TIMER))
                 return cpu;
   
         rcu_read_lock();
@@@ -535,15 -535,15 +535,15 @@@
                         if (cpu == i)
                                 continue;
   
- -                      if (!idle_cpu(i) && is_housekeeping_cpu(i)) {
+ +                      if (!idle_cpu(i) && housekeeping_cpu(i, HK_FLAG_TIMER)) {
                                 cpu = i;
                                 goto unlock;
                         }
                 }
         }
   
- -      if (!is_housekeeping_cpu(cpu))
- -              cpu = housekeeping_any_cpu();
+ +      if (!housekeeping_cpu(cpu, HK_FLAG_TIMER))
+ +              cpu = housekeeping_any_cpu(HK_FLAG_TIMER);
   unlock:
         rcu_read_unlock();
         return cpu;
@@@ -733,7 -733,7 +733,7 @@@ int tg_nop(struct task_group *tg, void 
   }
   #endif
   
- -static void set_load_weight(struct task_struct *p)
+ +static void set_load_weight(struct task_struct *p, bool update_load)
   {
         int prio = p->static_prio - MAX_RT_PRIO;
         struct load_weight *load = &p->se.load;
@@@ -747,16 -747,8 +747,16 @@@
                 return;
         }
   
- -      load->weight = scale_load(sched_prio_to_weight[prio]);
- -      load->inv_weight = sched_prio_to_wmult[prio];
+ +      /*
+ +       * SCHED_OTHER tasks have to update their load when changing their
+ +       * weight
+ +       */
+ +      if (update_load && p->sched_class == &fair_sched_class) {
+ +              reweight_task(p, prio);
+ +      } else {
+ +              load->weight = scale_load(sched_prio_to_weight[prio]);
+ +              load->inv_weight = sched_prio_to_wmult[prio];
+ +      }
   }
   
   static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
@@@ -2366,7 -2358,7 +2366,7 @@@ int sched_fork(unsigned long clone_flag
                         p->static_prio = NICE_TO_PRIO(0);
   
                 p->prio = p->normal_prio = __normal_prio(p);
- -              set_load_weight(p);
+ +              set_load_weight(p, false);
   
                 /*
                  * We don't need the reset flag anymore after the fork. It has
@@@ -3813,7 -3805,7 +3813,7 @@@ void set_user_nice(struct task_struct *
                 put_prev_task(rq, p);
   
         p->static_prio = NICE_TO_PRIO(nice);
- -      set_load_weight(p);
+ +      set_load_weight(p, true);
         old_prio = p->prio;
         p->prio = effective_prio(p);
         delta = p->prio - old_prio;
@@@ -3970,7 -3962,7 +3970,7 @@@ static void __setscheduler_params(struc
          */
         p->rt_priority = attr->sched_priority;
         p->normal_prio = normal_prio(p);
- -      set_load_weight(p);
+ +      set_load_weight(p, true);
   }
   
   /* Actually do priority change: must hold pi & rq lock. */
@@@ -4850,7 -4842,6 +4850,7 @@@ int __sched _cond_resched(void
                 preempt_schedule_common();
                 return 1;
         }
+ +      rcu_all_qs();
         return 0;
   }
   EXPORT_SYMBOL(_cond_resched);
@@@ -5174,29 -5165,6 +5174,29 @@@ void sched_show_task(struct task_struc
         show_stack(p, NULL);
         put_task_stack(p);
   }
+ +EXPORT_SYMBOL_GPL(sched_show_task);
+ +
+ +static inline bool
+ +state_filter_match(unsigned long state_filter, struct task_struct *p)
+ +{
+ +      /* no filter, everything matches */
+ +      if (!state_filter)
+ +              return true;
+ +
+ +      /* filter, but doesn't match */
+ +      if (!(p->state & state_filter))
+ +              return false;
+ +
+ +      /*
+ +       * When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows
+ +       * TASK_KILLABLE).
+ +       */
+ +      if (state_filter == TASK_UNINTERRUPTIBLE && p->state == TASK_IDLE)
+ +              return false;
+ +
+ +      return true;
+ +}
+ +
   
   void show_state_filter(unsigned long state_filter)
   {
@@@ -5220,7 -5188,7 +5220,7 @@@
                  */
                 touch_nmi_watchdog();
                 touch_all_softlockup_watchdogs();
- -              if (!state_filter || (p->state & state_filter))
+ +              if (state_filter_match(state_filter, p))
                         sched_show_task(p);
         }
   
@@@ -5736,6 -5704,10 +5736,6 @@@ static inline void sched_init_smt(void
   
   void __init sched_init_smp(void)
   {
- -      cpumask_var_t non_isolated_cpus;
- -
- -      alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
- -
         sched_init_numa();
   
         /*
@@@ -5745,12 -5717,16 +5745,12 @@@
          */
         mutex_lock(&sched_domains_mutex);
         sched_init_domains(cpu_active_mask);
- -      cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
- -      if (cpumask_empty(non_isolated_cpus))
- -              cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
         mutex_unlock(&sched_domains_mutex);
   
         /* Move init over to a non-isolated CPU */
- -      if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
+ +      if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0)
                 BUG();
         sched_init_granularity();
- -      free_cpumask_var(non_isolated_cpus);
   
         init_sched_rt_class();
         init_sched_dl_class();
@@@ -5935,7 -5911,7 +5935,7 @@@ void __init sched_init(void
                 atomic_set(&rq->nr_iowait, 0);
         }
   
- -      set_load_weight(&init_task);
+ +      set_load_weight(&init_task, false);
   
         /*
          * The boot idle thread does lazy MMU switching as well:
@@@ -5954,6 -5930,9 +5954,6 @@@
         calc_load_update = jiffies + LOAD_FREQ;
   
   #ifdef CONFIG_SMP
- -      /* May be allocated at isolcpus cmdline parse time */
- -      if (cpu_isolated_map == NULL)
- -              zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
         idle_thread_set_boot_cpu();
         set_cpu_rq_start_time(smp_processor_id());
   #endif
@@@ -6620,7 -6599,7 +6620,7 @@@ static int __cfs_schedulable(struct tas
         return ret;
   }
   
- static int cpu_stats_show(struct seq_file *sf, void *v)
+ static int cpu_cfs_stat_show(struct seq_file *sf, void *v)
   {
         struct task_group *tg = css_tg(seq_css(sf));
         struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
@@@ -6660,7 -6639,7 +6660,7 @@@ static u64 cpu_rt_period_read_uint(stru
   }
   #endif /* CONFIG_RT_GROUP_SCHED */
   
- static struct cftype cpu_files[] = {
+ static struct cftype cpu_legacy_files[] = {
   #ifdef CONFIG_FAIR_GROUP_SCHED
         {
                 .name = "shares",
@@@ -6681,7 -6660,7 +6681,7 @@@
         },
         {
                 .name = "stat",
-               .seq_show = cpu_stats_show,
+               .seq_show = cpu_cfs_stat_show,
         },
   #endif
   #ifdef CONFIG_RT_GROUP_SCHED
@@@ -6699,16 -6678,182 +6699,182 @@@
         { }     /* Terminate */
   };
   
+ static int cpu_extra_stat_show(struct seq_file *sf,
+                              struct cgroup_subsys_state *css)
+ {
+ #ifdef CONFIG_CFS_BANDWIDTH
+       {
+               struct task_group *tg = css_tg(css);
+               struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
+               u64 throttled_usec;
+ 
+               throttled_usec = cfs_b->throttled_time;
+               do_div(throttled_usec, NSEC_PER_USEC);
+ 
+               seq_printf(sf, "nr_periods %d\n"
+                          "nr_throttled %d\n"
+                          "throttled_usec %llu\n",
+                          cfs_b->nr_periods, cfs_b->nr_throttled,
+                          throttled_usec);
+       }
+ #endif
+       return 0;
+ }
+ 
+ #ifdef CONFIG_FAIR_GROUP_SCHED
+ static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css,
+                              struct cftype *cft)
+ {
+       struct task_group *tg = css_tg(css);
+       u64 weight = scale_load_down(tg->shares);
+ 
+       return DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 1024);
+ }
+ 
+ static int cpu_weight_write_u64(struct cgroup_subsys_state *css,
+                               struct cftype *cft, u64 weight)
+ {
+       /*
+        * cgroup weight knobs should use the common MIN, DFL and MAX
+        * values which are 1, 100 and 10000 respectively.  While it loses
+        * a bit of range on both ends, it maps pretty well onto the shares
+        * value used by scheduler and the round-trip conversions preserve
+        * the original value over the entire range.
+        */
+       if (weight < CGROUP_WEIGHT_MIN || weight > CGROUP_WEIGHT_MAX)
+               return -ERANGE;
+ 
+       weight = DIV_ROUND_CLOSEST_ULL(weight * 1024, CGROUP_WEIGHT_DFL);
+ 
+       return sched_group_set_shares(css_tg(css), scale_load(weight));
+ }
+ 
+ static s64 cpu_weight_nice_read_s64(struct cgroup_subsys_state *css,
+                                   struct cftype *cft)
+ {
+       unsigned long weight = scale_load_down(css_tg(css)->shares);
+       int last_delta = INT_MAX;
+       int prio, delta;
+ 
+       /* find the closest nice value to the current weight */
+       for (prio = 0; prio < ARRAY_SIZE(sched_prio_to_weight); prio++) {
+               delta = abs(sched_prio_to_weight[prio] - weight);
+               if (delta >= last_delta)
+                       break;
+               last_delta = delta;
+       }
+ 
+       return PRIO_TO_NICE(prio - 1 + MAX_RT_PRIO);
+ }
+ 
+ static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css,
+                                    struct cftype *cft, s64 nice)
+ {
+       unsigned long weight;
+ 
+       if (nice < MIN_NICE || nice > MAX_NICE)
+               return -ERANGE;
+ 
+       weight = sched_prio_to_weight[NICE_TO_PRIO(nice) - MAX_RT_PRIO];
+       return sched_group_set_shares(css_tg(css), scale_load(weight));
+ }
+ #endif
+ 
+ static void __maybe_unused cpu_period_quota_print(struct seq_file *sf,
+                                                 long period, long quota)
+ {
+       if (quota < 0)
+               seq_puts(sf, "max");
+       else
+               seq_printf(sf, "%ld", quota);
+ 
+       seq_printf(sf, " %ld\n", period);
+ }
+ 
+ /*
+  * Parse "$QUOTA [$PERIOD]" from @buf, where $QUOTA is a number or "max".
+  * The period and a numeric quota are multiplied by NSEC_PER_USEC; "max"
+  * yields RUNTIME_INF.  Returns 0 on success, -EINVAL if the quota token
+  * is missing or is neither a number nor "max".  $PERIOD is optional: the
+  * caller should put the current value in *@periodp before calling.
+  */
+ static int __maybe_unused cpu_period_quota_parse(char *buf,
+                                                u64 *periodp, u64 *quotap)
+ {
+       char tok[21];   /* U64_MAX; must match the %20s width below */
+ 
+       /*
+        * The field width bounds the copy into tok[]; a bare "%s" lets an
+        * overlong first word overrun the buffer.
+        */
+       if (sscanf(buf, "%20s %llu", tok, periodp) < 1)
+               return -EINVAL;
+ 
+       *periodp *= NSEC_PER_USEC;
+ 
+       if (sscanf(tok, "%llu", quotap))
+               *quotap *= NSEC_PER_USEC;
+       else if (!strcmp(tok, "max"))
+               *quotap = RUNTIME_INF;
+       else
+               return -EINVAL;
+ 
+       return 0;
+ }
+ 
+ #ifdef CONFIG_CFS_BANDWIDTH
+ static int cpu_max_show(struct seq_file *sf, void *v)
+ {
+       struct task_group *tg = css_tg(seq_css(sf));
+ 
+       cpu_period_quota_print(sf, tg_get_cfs_period(tg), tg_get_cfs_quota(tg));
+       return 0;
+ }
+ 
+ static ssize_t cpu_max_write(struct kernfs_open_file *of,
+                            char *buf, size_t nbytes, loff_t off)
+ {
+       struct task_group *tg = css_tg(of_css(of));
+       u64 period = tg_get_cfs_period(tg);
+       u64 quota;
+       int ret;
+ 
+       ret = cpu_period_quota_parse(buf, &period, &quota);
+       if (!ret)
+               ret = tg_set_cfs_bandwidth(tg, period, quota);
+       return ret ?: nbytes;
+ }
+ #endif
+ 
+ static struct cftype cpu_files[] = {
+ #ifdef CONFIG_FAIR_GROUP_SCHED
+       {
+               .name = "weight",
+               .flags = CFTYPE_NOT_ON_ROOT,
+               .read_u64 = cpu_weight_read_u64,
+               .write_u64 = cpu_weight_write_u64,
+       },
+       {
+               .name = "weight.nice",
+               .flags = CFTYPE_NOT_ON_ROOT,
+               .read_s64 = cpu_weight_nice_read_s64,
+               .write_s64 = cpu_weight_nice_write_s64,
+       },
+ #endif
+ #ifdef CONFIG_CFS_BANDWIDTH
+       {
+               .name = "max",
+               .flags = CFTYPE_NOT_ON_ROOT,
+               .seq_show = cpu_max_show,
+               .write = cpu_max_write,
+       },
+ #endif
+       { }     /* terminate */
+ };
+ 
   struct cgroup_subsys cpu_cgrp_subsys = {
         .css_alloc      = cpu_cgroup_css_alloc,
         .css_online     = cpu_cgroup_css_online,
         .css_released   = cpu_cgroup_css_released,
         .css_free       = cpu_cgroup_css_free,
+       .css_extra_stat_show = cpu_extra_stat_show,
         .fork           = cpu_cgroup_fork,
         .can_attach     = cpu_cgroup_can_attach,
         .attach         = cpu_cgroup_attach,
-       .legacy_cftypes = cpu_files,
+       .legacy_cftypes = cpu_legacy_files,
+       .dfl_cftypes    = cpu_files,
         .early_init     = true,
+       .threaded       = true,
   };
   
   #endif        /* CONFIG_CGROUP_SCHED */
diff --combined kernel/sched/cputime.c

index 9be8b68a66da0cf5a334f2cd38d915b31a2a4b65,5498f20d24750962bd63ad82cfca94e11974db85..bac6ac9a4ec7068e11e5b35fdea9a3f6a43fd490
--- 1/kernel/sched/cputime.c
--- 2/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@@ -109,7 -109,7 +109,7 @@@ static inline void task_group_account_f
          */
         __this_cpu_add(kernel_cpustat.cpustat[index], tmp);
   
-       cpuacct_account_field(p, index, tmp);
+       cgroup_account_cputime_field(p, index, tmp);
   }
   
   /*
@@@ -259,7 -259,8 +259,7 @@@ static inline u64 account_other_time(u6
   {
         u64 accounted;
   
- -      /* Shall be converted to a lockdep-enabled lightweight check */
- -      WARN_ON_ONCE(!irqs_disabled());
+ +      lockdep_assert_irqs_disabled();
   
         accounted = steal_account_process_time(max);
   
@@@ -446,6 -447,13 +446,13 @@@ void vtime_account_irq_enter(struct tas
   EXPORT_SYMBOL_GPL(vtime_account_irq_enter);
   #endif /* __ARCH_HAS_VTIME_ACCOUNT */
   
+ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
+                   u64 *ut, u64 *st)
+ {
+       *ut = curr->utime;
+       *st = curr->stime;
+ }
+ 
   void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
   {
         *ut = p->utime;
@@@ -584,9 -592,8 +591,8 @@@ drop_precision
    *
    * Assuming that rtime_i+1 >= rtime_i.
    */
- static void cputime_adjust(struct task_cputime *curr,
-                          struct prev_cputime *prev,
-                          u64 *ut, u64 *st)
+ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
+                   u64 *ut, u64 *st)
   {
         u64 rtime, stime, utime;
         unsigned long flags;
diff --combined kernel/sched/deadline.c

index f349f7e98deca60b1b26ab10a0be36719866cd1e,abd913c1b99e6bd12c3a4d1544aa9e04e5e7bdbe..2473736c7616dd3810b5295a8a5fe4bc4c16b92c
--- 1/kernel/sched/deadline.c
--- 2/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@@ -1,4 -1,3 +1,4 @@@
+ +// SPDX-License-Identifier: GPL-2.0
   /*
    * Deadline Scheduling Class (SCHED_DEADLINE)
    *
@@@ -243,7 -242,7 +243,7 @@@ static void task_non_contending(struct 
                         if (p->state == TASK_DEAD)
                                 sub_rq_bw(p->dl.dl_bw, &rq->dl);
                         raw_spin_lock(&dl_b->lock);
- -                      __dl_clear(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
+ +                      __dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
                         __dl_clear_params(p);
                         raw_spin_unlock(&dl_b->lock);
                 }
@@@ -1144,7 -1143,7 +1144,7 @@@ static void update_curr_dl(struct rq *r
         account_group_exec_runtime(curr, delta_exec);
   
         curr->se.exec_start = rq_clock_task(rq);
-       cpuacct_charge(curr, delta_exec);
+       cgroup_account_cputime(curr, delta_exec);
   
         sched_rt_avg_update(rq, delta_exec);
   
@@@ -1210,7 -1209,7 +1210,7 @@@ static enum hrtimer_restart inactive_ta
                 }
   
                 raw_spin_lock(&dl_b->lock);
- -              __dl_clear(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
+ +              __dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
                 raw_spin_unlock(&dl_b->lock);
                 __dl_clear_params(p);
   
@@@ -1365,10 -1364,6 +1365,10 @@@ enqueue_dl_entity(struct sched_dl_entit
                 update_dl_entity(dl_se, pi_se);
         } else if (flags & ENQUEUE_REPLENISH) {
                 replenish_dl_entity(dl_se, pi_se);
+ +      } else if ((flags & ENQUEUE_RESTORE) &&
+ +                dl_time_before(dl_se->deadline,
+ +                               rq_clock(rq_of_dl_rq(dl_rq_of_se(dl_se))))) {
+ +              setup_new_dl_entity(dl_se);
         }
   
         __enqueue_dl_entity(dl_se);
@@@ -2171,7 -2166,7 +2171,7 @@@ static void set_cpus_allowed_dl(struct 
                  * until we complete the update.
                  */
                 raw_spin_lock(&src_dl_b->lock);
- -              __dl_clear(src_dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
+ +              __dl_sub(src_dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
                 raw_spin_unlock(&src_dl_b->lock);
         }
   
@@@ -2260,6 -2255,13 +2260,6 @@@ static void switched_to_dl(struct rq *r
   
                 return;
         }
- -      /*
- -       * If p is boosted we already updated its params in
- -       * rt_mutex_setprio()->enqueue_task(..., ENQUEUE_REPLENISH),
- -       * p's deadline being now already after rq_clock(rq).
- -       */
- -      if (dl_time_before(p->dl.deadline, rq_clock(rq)))
- -              setup_new_dl_entity(&p->dl);
   
         if (rq->curr != p) {
   #ifdef CONFIG_SMP
@@@ -2449,7 -2451,7 +2449,7 @@@ int sched_dl_overflow(struct task_struc
         if (dl_policy(policy) && !task_has_dl_policy(p) &&
             !__dl_overflow(dl_b, cpus, 0, new_bw)) {
                 if (hrtimer_active(&p->dl.inactive_timer))
- -                      __dl_clear(dl_b, p->dl.dl_bw, cpus);
+ +                      __dl_sub(dl_b, p->dl.dl_bw, cpus);
                 __dl_add(dl_b, new_bw, cpus);
                 err = 0;
         } else if (dl_policy(policy) && task_has_dl_policy(p) &&
@@@ -2461,7 -2463,7 +2461,7 @@@
                  * But this would require to set the task's "inactive
                  * timer" when the task is not inactive.
                  */
- -              __dl_clear(dl_b, p->dl.dl_bw, cpus);
+ +              __dl_sub(dl_b, p->dl.dl_bw, cpus);
                 __dl_add(dl_b, new_bw, cpus);
                 dl_change_utilization(p, new_bw);
                 err = 0;
diff --combined kernel/sched/fair.c

index 0989676c50e92df396e8a4ef6ccf3545a1e057fe,0ae69af95b8b88c8b0b693e2e6be50c193b4d5d7..4037e19bbca25939f0dd57b05f8fb25de8a90908
--- 1/kernel/sched/fair.c
--- 2/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@@ -1,4 -1,3 +1,4 @@@
+ +// SPDX-License-Identifier: GPL-2.0
   /*
    * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
    *
@@@ -33,7 -32,6 +33,7 @@@
   #include <linux/mempolicy.h>
   #include <linux/migrate.h>
   #include <linux/task_work.h>
+ +#include <linux/sched/isolation.h>
   
   #include <trace/events/sched.h>
   
@@@ -718,8 -716,13 +718,8 @@@ void init_entity_runnable_average(struc
   {
         struct sched_avg *sa = &se->avg;
   
- -      sa->last_update_time = 0;
- -      /*
- -       * sched_avg's period_contrib should be strictly less then 1024, so
- -       * we give it 1023 to make sure it is almost a period (1024us), and
- -       * will definitely be update (after enqueue).
- -       */
- -      sa->period_contrib = 1023;
+ +      memset(sa, 0, sizeof(*sa));
+ +
         /*
          * Tasks are intialized with full load to be seen as heavy tasks until
          * they get a chance to stabilize to their real load level.
@@@ -727,10 -730,13 +727,10 @@@
          * nothing has been attached to the task group yet.
          */
         if (entity_is_task(se))
- -              sa->load_avg = scale_load_down(se->load.weight);
- -      sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
- -      /*
- -       * At this point, util_avg won't be used in select_task_rq_fair anyway
- -       */
- -      sa->util_avg = 0;
- -      sa->util_sum = 0;
+ +              sa->runnable_load_avg = sa->load_avg = scale_load_down(se->load.weight);
+ +
+ +      se->runnable_weight = se->load.weight;
+ +
         /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
   }
   
@@@ -778,6 -784,7 +778,6 @@@ void post_init_entity_util_avg(struct s
                 } else {
                         sa->util_avg = cap;
                 }
- -              sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
         }
   
         if (entity_is_task(se)) {
@@@ -844,7 -851,7 +844,7 @@@ static void update_curr(struct cfs_rq *
                 struct task_struct *curtask = task_of(curr);
   
                 trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
-               cpuacct_charge(curtask, delta_exec);
+               cgroup_account_cputime(curtask, delta_exec);
                 account_group_exec_runtime(curtask, delta_exec);
         }
   
@@@ -2018,7 -2025,7 +2018,7 @@@ static u64 numa_get_avg_runtime(struct 
                 delta = runtime - p->last_sum_exec_runtime;
                 *period = now - p->last_task_numa_placement;
         } else {
- -              delta = p->se.avg.load_sum / p->se.load.weight;
+ +              delta = p->se.avg.load_sum;
                 *period = LOAD_AVG_MAX;
         }
   
@@@ -2685,226 -2692,18 +2685,226 @@@ account_entity_dequeue(struct cfs_rq *c
         cfs_rq->nr_running--;
   }
   
+ +/*
+ + * Signed add and clamp on underflow.
+ + *
+ + * Explicitly do a load-store to ensure the intermediate value never hits
+ + * memory. This allows lockless observations without ever seeing the negative
+ + * values.
+ + */
+ +#define add_positive(_ptr, _val) do {                           \
+ +      typeof(_ptr) ptr = (_ptr);                              \
+ +      typeof(_val) val = (_val);                              \
+ +      typeof(*ptr) res, var = READ_ONCE(*ptr);                \
+ +                                                              \
+ +      res = var + val;                                        \
+ +                                                              \
+ +      if (val < 0 && res > var)                               \
+ +              res = 0;                                        \
+ +                                                              \
+ +      WRITE_ONCE(*ptr, res);                                  \
+ +} while (0)
+ +
+ +/*
+ + * Unsigned subtract and clamp on underflow.
+ + *
+ + * Explicitly do a load-store to ensure the intermediate value never hits
+ + * memory. This allows lockless observations without ever seeing the negative
+ + * values.
+ + */
+ +#define sub_positive(_ptr, _val) do {                         \
+ +      typeof(_ptr) ptr = (_ptr);                              \
+ +      typeof(*ptr) val = (_val);                              \
+ +      typeof(*ptr) res, var = READ_ONCE(*ptr);                \
+ +      res = var - val;                                        \
+ +      if (res > var)                                          \
+ +              res = 0;                                        \
+ +      WRITE_ONCE(*ptr, res);                                  \
+ +} while (0)
+ +
+ +#ifdef CONFIG_SMP
+ +/*
+ + * XXX we want to get rid of these helpers and use the full load resolution.
+ + */
+ +static inline long se_weight(struct sched_entity *se)
+ +{
+ +      return scale_load_down(se->load.weight);
+ +}
+ +
+ +static inline long se_runnable(struct sched_entity *se)
+ +{
+ +      return scale_load_down(se->runnable_weight);
+ +}
+ +
+ +static inline void
+ +enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
+ +{
+ +      cfs_rq->runnable_weight += se->runnable_weight;
+ +
+ +      cfs_rq->avg.runnable_load_avg += se->avg.runnable_load_avg;
+ +      cfs_rq->avg.runnable_load_sum += se_runnable(se) * se->avg.runnable_load_sum;
+ +}
+ +
+ +static inline void
+ +dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
+ +{
+ +      cfs_rq->runnable_weight -= se->runnable_weight;
+ +
+ +      sub_positive(&cfs_rq->avg.runnable_load_avg, se->avg.runnable_load_avg);
+ +      sub_positive(&cfs_rq->avg.runnable_load_sum,
+ +                   se_runnable(se) * se->avg.runnable_load_sum);
+ +}
+ +
+ +static inline void
+ +enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
+ +{
+ +      cfs_rq->avg.load_avg += se->avg.load_avg;
+ +      cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum;
+ +}
+ +
+ +static inline void
+ +dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
+ +{
+ +      sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
+ +      sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum);
+ +}
+ +#else
+ +static inline void
+ +enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
+ +static inline void
+ +dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
+ +static inline void
+ +enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
+ +static inline void
+ +dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
+ +#endif
+ +
+ +static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
+ +                          unsigned long weight, unsigned long runnable)
+ +{
+ +      if (se->on_rq) {
+ +              /* commit outstanding execution time */
+ +              if (cfs_rq->curr == se)
+ +                      update_curr(cfs_rq);
+ +              account_entity_dequeue(cfs_rq, se);
+ +              dequeue_runnable_load_avg(cfs_rq, se);
+ +      }
+ +      dequeue_load_avg(cfs_rq, se);
+ +
+ +      se->runnable_weight = runnable;
+ +      update_load_set(&se->load, weight);
+ +
+ +#ifdef CONFIG_SMP
+ +      do {
+ +              u32 divider = LOAD_AVG_MAX - 1024 + se->avg.period_contrib;
+ +
+ +              se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider);
+ +              se->avg.runnable_load_avg =
+ +                      div_u64(se_runnable(se) * se->avg.runnable_load_sum, divider);
+ +      } while (0);
+ +#endif
+ +
+ +      enqueue_load_avg(cfs_rq, se);
+ +      if (se->on_rq) {
+ +              account_entity_enqueue(cfs_rq, se);
+ +              enqueue_runnable_load_avg(cfs_rq, se);
+ +      }
+ +}
+ +
+ +void reweight_task(struct task_struct *p, int prio)
+ +{
+ +      struct sched_entity *se = &p->se;
+ +      struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ +      struct load_weight *load = &se->load;
+ +      unsigned long weight = scale_load(sched_prio_to_weight[prio]);
+ +
+ +      reweight_entity(cfs_rq, se, weight, weight);
+ +      load->inv_weight = sched_prio_to_wmult[prio];
+ +}
+ +
   #ifdef CONFIG_FAIR_GROUP_SCHED
   # ifdef CONFIG_SMP
- -static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
+ +/*
+ + * All this does is approximate the hierarchical proportion which includes that
+ + * global sum we all love to hate.
+ + *
+ + * That is, the weight of a group entity, is the proportional share of the
+ + * group weight based on the group runqueue weights. That is:
+ + *
+ + *                     tg->weight * grq->load.weight
+ + *   ge->load.weight = -----------------------------               (1)
+ + *                      \Sum grq->load.weight
+ + *
+ + * Now, because that sum is prohibitively expensive to compute (been
+ + * there, done that) we approximate it with this average stuff. The average
+ + * moves slower and therefore the approximation is cheaper and more stable.
+ + *
+ + * So instead of the above, we substitute:
+ + *
+ + *   grq->load.weight -> grq->avg.load_avg                         (2)
+ + *
+ + * which yields the following:
+ + *
+ + *                     tg->weight * grq->avg.load_avg
+ + *   ge->load.weight = ------------------------------              (3)
+ + *                            tg->load_avg
+ + *
+ + * Where: tg->load_avg ~= \Sum grq->avg.load_avg
+ + *
+ + * That is shares_avg, and it is right (given the approximation (2)).
+ + *
+ + * The problem with it is that because the average is slow -- it was designed
+ + * to be exactly that of course -- this leads to transients in boundary
+ + * conditions. Specifically, the case where the group was idle and we start
+ + * a single task. It takes time for our CPU's grq->avg.load_avg to build up,
+ + * yielding bad latency etc..
+ + *
+ + * Now, in that special case (1) reduces to:
+ + *
+ + *                     tg->weight * grq->load.weight
+ + *   ge->load.weight = ----------------------------- = tg->weight   (4)
+ + *                        grq->load.weight
+ + *
+ + * That is, the sum collapses because all other CPUs are idle; the UP scenario.
+ + *
+ + * So what we do is modify our approximation (3) to approach (4) in the (near)
+ + * UP case, like:
+ + *
+ + *   ge->load.weight =
+ + *
+ + *              tg->weight * grq->load.weight
+ + *     ---------------------------------------------------         (5)
+ + *     tg->load_avg - grq->avg.load_avg + grq->load.weight
+ + *
+ + * But because grq->load.weight can drop to 0, resulting in a divide by zero,
+ + * we need to use grq->avg.load_avg as its lower bound, which then gives:
+ + *
+ + *
+ + *                     tg->weight * grq->load.weight
+ + *   ge->load.weight = -----------------------------             (6)
+ + *                            tg_load_avg'
+ + *
+ + * Where:
+ + *
+ + *   tg_load_avg' = tg->load_avg - grq->avg.load_avg +
+ + *                  max(grq->load.weight, grq->avg.load_avg)
+ + *
+ + * And that is shares_weight and is icky. In the (near) UP case it approaches
+ + * (4) while in the normal case it approaches (3). It consistently
+ + * overestimates the ge->load.weight and therefore:
+ + *
+ + *   \Sum ge->load.weight >= tg->weight
+ + *
+ + * hence icky!
+ + */
+ +static long calc_group_shares(struct cfs_rq *cfs_rq)
   {
- -      long tg_weight, load, shares;
+ +      long tg_weight, tg_shares, load, shares;
+ +      struct task_group *tg = cfs_rq->tg;
   
- -      /*
- -       * This really should be: cfs_rq->avg.load_avg, but instead we use
- -       * cfs_rq->load.weight, which is its upper bound. This helps ramp up
- -       * the shares for small weight interactive tasks.
- -       */
- -      load = scale_load_down(cfs_rq->load.weight);
+ +      tg_shares = READ_ONCE(tg->shares);
+ +
+ +      load = max(scale_load_down(cfs_rq->load.weight), cfs_rq->avg.load_avg);
   
         tg_weight = atomic_long_read(&tg->load_avg);
   
@@@ -2912,7 -2711,7 +2912,7 @@@
         tg_weight -= cfs_rq->tg_load_avg_contrib;
         tg_weight += load;
   
- -      shares = (tg->shares * load);
+ +      shares = (tg_shares * load);
         if (tg_weight)
                 shares /= tg_weight;
   
@@@ -2928,86 -2727,63 +2928,86 @@@
          * case no task is runnable on a CPU MIN_SHARES=2 should be returned
          * instead of 0.
          */
- -      if (shares < MIN_SHARES)
- -              shares = MIN_SHARES;
- -      if (shares > tg->shares)
- -              shares = tg->shares;
- -
- -      return shares;
+ +      return clamp_t(long, shares, MIN_SHARES, tg_shares);
   }
- -# else /* CONFIG_SMP */
- -static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
- -{
- -      return tg->shares;
- -}
- -# endif /* CONFIG_SMP */
   
- -static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
- -                          unsigned long weight)
+ +/*
+ + * This calculates the effective runnable weight for a group entity based on
+ + * the group entity weight calculated above.
+ + *
+ + * Because of the above approximation (2), our group entity weight is
+ + * a load_avg based ratio (3). This means that it includes blocked load and
+ + * does not represent the runnable weight.
+ + *
+ + * Approximate the group entity's runnable weight per ratio from the group
+ + * runqueue:
+ + *
+ + *                                         grq->avg.runnable_load_avg
+ + *   ge->runnable_weight = ge->load.weight * -------------------------- (7)
+ + *                                             grq->avg.load_avg
+ + *
+ + * However, analogous to above, since the avg numbers are slow, this leads to
+ + * transients in the from-idle case. Instead we use:
+ + *
+ + *   ge->runnable_weight = ge->load.weight *
+ + *
+ + *            max(grq->avg.runnable_load_avg, grq->runnable_weight)
+ + *            -----------------------------------------------------   (8)
+ + *                  max(grq->avg.load_avg, grq->load.weight)
+ + *
+ + * Where these max() serve both to use the 'instant' values to fix the slow
+ + * from-idle and avoid the /0 on to-idle, similar to (6).
+ + */
+ +static long calc_group_runnable(struct cfs_rq *cfs_rq, long shares)
   {
- -      if (se->on_rq) {
- -              /* commit outstanding execution time */
- -              if (cfs_rq->curr == se)
- -                      update_curr(cfs_rq);
- -              account_entity_dequeue(cfs_rq, se);
- -      }
+ +      long runnable, load_avg;
   
- -      update_load_set(&se->load, weight);
+ +      load_avg = max(cfs_rq->avg.load_avg,
+ +                     scale_load_down(cfs_rq->load.weight));
   
- -      if (se->on_rq)
- -              account_entity_enqueue(cfs_rq, se);
+ +      runnable = max(cfs_rq->avg.runnable_load_avg,
+ +                     scale_load_down(cfs_rq->runnable_weight));
+ +
+ +      runnable *= shares;
+ +      if (load_avg)
+ +              runnable /= load_avg;
+ +
+ +      return clamp_t(long, runnable, MIN_SHARES, shares);
   }
+ +# endif /* CONFIG_SMP */
   
   static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
   
- -static void update_cfs_shares(struct sched_entity *se)
+ +/*
+ + * Recomputes the group entity based on the current state of its group
+ + * runqueue.
+ + */
+ +static void update_cfs_group(struct sched_entity *se)
   {
- -      struct cfs_rq *cfs_rq = group_cfs_rq(se);
- -      struct task_group *tg;
- -      long shares;
+ +      struct cfs_rq *gcfs_rq = group_cfs_rq(se);
+ +      long shares, runnable;
   
- -      if (!cfs_rq)
+ +      if (!gcfs_rq)
                 return;
   
- -      if (throttled_hierarchy(cfs_rq))
+ +      if (throttled_hierarchy(gcfs_rq))
                 return;
   
- -      tg = cfs_rq->tg;
- -
   #ifndef CONFIG_SMP
- -      if (likely(se->load.weight == tg->shares))
+ +      runnable = shares = READ_ONCE(gcfs_rq->tg->shares);
+ +
+ +      if (likely(se->load.weight == shares))
                 return;
+ +#else
+ +      shares   = calc_group_shares(gcfs_rq);
+ +      runnable = calc_group_runnable(gcfs_rq, shares);
   #endif
- -      shares = calc_cfs_shares(cfs_rq, tg);
   
- -      reweight_entity(cfs_rq_of(se), se, shares);
+ +      reweight_entity(cfs_rq_of(se), se, shares, runnable);
   }
   
   #else /* CONFIG_FAIR_GROUP_SCHED */
- -static inline void update_cfs_shares(struct sched_entity *se)
+ +static inline void update_cfs_group(struct sched_entity *se)
   {
   }
   #endif /* CONFIG_FAIR_GROUP_SCHED */
@@@ -3116,7 -2892,7 +3116,7 @@@ static u32 __accumulate_pelt_segments(u
    */
   static __always_inline u32
   accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
- -             unsigned long weight, int running, struct cfs_rq *cfs_rq)
+ +             unsigned long load, unsigned long runnable, int running)
   {
         unsigned long scale_freq, scale_cpu;
         u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */
@@@ -3133,8 -2909,10 +3133,8 @@@
          */
         if (periods) {
                 sa->load_sum = decay_load(sa->load_sum, periods);
- -              if (cfs_rq) {
- -                      cfs_rq->runnable_load_sum =
- -                              decay_load(cfs_rq->runnable_load_sum, periods);
- -              }
+ +              sa->runnable_load_sum =
+ +                      decay_load(sa->runnable_load_sum, periods);
                 sa->util_sum = decay_load((u64)(sa->util_sum), periods);
   
                 /*
@@@ -3147,10 -2925,11 +3147,10 @@@
         sa->period_contrib = delta;
   
         contrib = cap_scale(contrib, scale_freq);
- -      if (weight) {
- -              sa->load_sum += weight * contrib;
- -              if (cfs_rq)
- -                      cfs_rq->runnable_load_sum += weight * contrib;
- -      }
+ +      if (load)
+ +              sa->load_sum += load * contrib;
+ +      if (runnable)
+ +              sa->runnable_load_sum += runnable * contrib;
         if (running)
                 sa->util_sum += contrib * scale_cpu;
   
@@@ -3186,8 -2965,8 +3186,8 @@@
    *            = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
    */
   static __always_inline int
- -___update_load_avg(u64 now, int cpu, struct sched_avg *sa,
- -                unsigned long weight, int running, struct cfs_rq *cfs_rq)
+ +___update_load_sum(u64 now, int cpu, struct sched_avg *sa,
+ +                unsigned long load, unsigned long runnable, int running)
   {
         u64 delta;
   
@@@ -3220,8 -2999,8 +3220,8 @@@
          * this happens during idle_balance() which calls
          * update_blocked_averages()
          */
- -      if (!weight)
- -              running = 0;
+ +      if (!load)
+ +              runnable = running = 0;
   
         /*
          * Now we know we crossed measurement unit boundaries. The *_avg
@@@ -3230,96 -3009,63 +3230,96 @@@
          * Step 1: accumulate *_sum since last_update_time. If we haven't
          * crossed period boundaries, finish.
          */
- -      if (!accumulate_sum(delta, cpu, sa, weight, running, cfs_rq))
+ +      if (!accumulate_sum(delta, cpu, sa, load, runnable, running))
                 return 0;
   
+ +      return 1;
+ +}
+ +
+ +static __always_inline void
+ +___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runnable)
+ +{
+ +      u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
+ +
         /*
          * Step 2: update *_avg.
          */
- -      sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX - 1024 + sa->period_contrib);
- -      if (cfs_rq) {
- -              cfs_rq->runnable_load_avg =
- -                      div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX - 1024 + sa->period_contrib);
- -      }
- -      sa->util_avg = sa->util_sum / (LOAD_AVG_MAX - 1024 + sa->period_contrib);
- -
- -      return 1;
+ +      sa->load_avg = div_u64(load * sa->load_sum, divider);
+ +      sa->runnable_load_avg = div_u64(runnable * sa->runnable_load_sum, divider);
+ +      sa->util_avg = sa->util_sum / divider;
   }
   
+ +/*
+ + * sched_entity:
+ + *
+ + *   task:
+ + *     se_runnable() == se_weight()
+ + *
+ + *   group: [ see update_cfs_group() ]
+ + *     se_weight()   = tg->weight * grq->load_avg / tg->load_avg
+ + *     se_runnable() = se_weight(se) * grq->runnable_load_avg / grq->load_avg
+ + *
+ + *   load_sum := runnable_sum
+ + *   load_avg = se_weight(se) * runnable_avg
+ + *
+ + *   runnable_load_sum := runnable_sum
+ + *   runnable_load_avg = se_runnable(se) * runnable_avg
+ + *
+ + * XXX collapse load_sum and runnable_load_sum
+ + *
+ + * cfs_rq:
+ + *
+ + *   load_sum = \Sum se_weight(se) * se->avg.load_sum
+ + *   load_avg = \Sum se->avg.load_avg
+ + *
+ + *   runnable_load_sum = \Sum se_runnable(se) * se->avg.runnable_load_sum
+ + *   runnable_load_avg = \Sum se->avg.runnable_load_avg
+ + */
+ +
   static int
   __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se)
   {
- -      return ___update_load_avg(now, cpu, &se->avg, 0, 0, NULL);
+ +      if (entity_is_task(se))
+ +              se->runnable_weight = se->load.weight;
+ +
+ +      if (___update_load_sum(now, cpu, &se->avg, 0, 0, 0)) {
+ +              ___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
+ +              return 1;
+ +      }
+ +
+ +      return 0;
   }
   
   static int
   __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se)
   {
- -      return ___update_load_avg(now, cpu, &se->avg,
- -                                se->on_rq * scale_load_down(se->load.weight),
- -                                cfs_rq->curr == se, NULL);
+ +      if (entity_is_task(se))
+ +              se->runnable_weight = se->load.weight;
+ +
+ +      if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq, !!se->on_rq,
+ +                              cfs_rq->curr == se)) {
+ +
+ +              ___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
+ +              return 1;
+ +      }
+ +
+ +      return 0;
   }
   
   static int
   __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq)
   {
- -      return ___update_load_avg(now, cpu, &cfs_rq->avg,
- -                      scale_load_down(cfs_rq->load.weight),
- -                      cfs_rq->curr != NULL, cfs_rq);
- -}
+ +      if (___update_load_sum(now, cpu, &cfs_rq->avg,
+ +                              scale_load_down(cfs_rq->load.weight),
+ +                              scale_load_down(cfs_rq->runnable_weight),
+ +                              cfs_rq->curr != NULL)) {
   
- -/*
- - * Signed add and clamp on underflow.
- - *
- - * Explicitly do a load-store to ensure the intermediate value never hits
- - * memory. This allows lockless observations without ever seeing the negative
- - * values.
- - */
- -#define add_positive(_ptr, _val) do {                           \
- -      typeof(_ptr) ptr = (_ptr);                              \
- -      typeof(_val) val = (_val);                              \
- -      typeof(*ptr) res, var = READ_ONCE(*ptr);                \
- -                                                              \
- -      res = var + val;                                        \
- -                                                              \
- -      if (val < 0 && res > var)                               \
- -              res = 0;                                        \
- -                                                              \
- -      WRITE_ONCE(*ptr, res);                                  \
- -} while (0)
+ +              ___update_load_avg(&cfs_rq->avg, 1, 1);
+ +              return 1;
+ +      }
+ +
+ +      return 0;
+ +}
   
   #ifdef CONFIG_FAIR_GROUP_SCHED
   /**
@@@ -3402,77 -3148,11 +3402,77 @@@ void set_task_rq_fair(struct sched_enti
         se->avg.last_update_time = n_last_update_time;
   }
   
- -/* Take into account change of utilization of a child task group */
+ +
+ +/*
+ + * When on migration a sched_entity joins/leaves the PELT hierarchy, we need to
+ + * propagate its contribution. The key to this propagation is the invariant
+ + * that for each group:
+ + *
+ + *   ge->avg == grq->avg                                              (1)
+ + *
+ + * _IFF_ we look at the pure running and runnable sums. Because they
+ + * represent the very same entity, just at different points in the hierarchy.
+ + *
+ + *
+ + * Per the above update_tg_cfs_util() is trivial (and still 'wrong') and
+ + * simply copies the running sum over.
+ + *
+ + * However, update_tg_cfs_runnable() is more complex. So we have:
+ + *
+ + *   ge->avg.load_avg = ge->load.weight * ge->avg.runnable_avg                (2)
+ + *
+ + * And since, like util, the runnable part should be directly transferable,
+ + * the following would _appear_ to be the straight forward approach:
+ + *
+ + *   grq->avg.load_avg = grq->load.weight * grq->avg.runnable_avg     (3)
+ + *
+ + * And per (1) we have:
+ + *
+ + *   ge->avg.runnable_avg == grq->avg.runnable_avg
+ + *
+ + * Which gives:
+ + *
+ + *                      ge->load.weight * grq->avg.load_avg
+ + *   ge->avg.load_avg = -----------------------------------           (4)
+ + *                               grq->load.weight
+ + *
+ + * Except that is wrong!
+ + *
+ + * Because while for entities historical weight is not important and we
+ + * really only care about our future and therefore can consider a pure
+ + * runnable sum, runqueues can NOT do this.
+ + *
+ + * We specifically want runqueues to have a load_avg that includes
+ + * historical weights. Those represent the blocked load, the load we expect
+ + * to (shortly) return to us. This only works by keeping the weights as
+ + * integral part of the sum. We therefore cannot decompose as per (3).
+ + *
+ + * OK, so what then?
+ + *
+ + *
+ + * Another way to look at things is:
+ + *
+ + *   grq->avg.load_avg = \Sum se->avg.load_avg
+ + *
+ + * Therefore, per (2):
+ + *
+ + *   grq->avg.load_avg = \Sum se->load.weight * se->avg.runnable_avg
+ + *
+ + * And the very thing we're propagating is a change in that sum (someone
+ + * joined/left). So we can easily know the runnable change, which would be, per
+ + * (2) the already tracked se->load_avg divided by the corresponding
+ + * se->weight.
+ + *
+ + * Basically (4) but in differential form:
+ + *
+ + *   d(runnable_avg) += se->avg.load_avg / se->load.weight
+ + *                                                               (5)
+ + *   ge->avg.load_avg += ge->load.weight * d(runnable_avg)
+ + */
+ +
   static inline void
- -update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se)
+ +update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
   {
- -      struct cfs_rq *gcfs_rq = group_cfs_rq(se);
         long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
   
         /* Nothing to update */
@@@ -3488,65 -3168,102 +3488,65 @@@
         cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;
   }
   
- -/* Take into account change of load of a child task group */
   static inline void
- -update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se)
+ +update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
   {
- -      struct cfs_rq *gcfs_rq = group_cfs_rq(se);
- -      long delta, load = gcfs_rq->avg.load_avg;
- -
- -      /*
- -       * If the load of group cfs_rq is null, the load of the
- -       * sched_entity will also be null so we can skip the formula
- -       */
- -      if (load) {
- -              long tg_load;
+ +      long runnable_sum = gcfs_rq->prop_runnable_sum;
+ +      long runnable_load_avg, load_avg;
+ +      s64 runnable_load_sum, load_sum;
   
- -              /* Get tg's load and ensure tg_load > 0 */
- -              tg_load = atomic_long_read(&gcfs_rq->tg->load_avg) + 1;
- -
- -              /* Ensure tg_load >= load and updated with current load*/
- -              tg_load -= gcfs_rq->tg_load_avg_contrib;
- -              tg_load += load;
+ +      if (!runnable_sum)
+ +              return;
   
- -              /*
- -               * We need to compute a correction term in the case that the
- -               * task group is consuming more CPU than a task of equal
- -               * weight. A task with a weight equals to tg->shares will have
- -               * a load less or equal to scale_load_down(tg->shares).
- -               * Similarly, the sched_entities that represent the task group
- -               * at parent level, can't have a load higher than
- -               * scale_load_down(tg->shares). And the Sum of sched_entities'
- -               * load must be <= scale_load_down(tg->shares).
- -               */
- -              if (tg_load > scale_load_down(gcfs_rq->tg->shares)) {
- -                      /* scale gcfs_rq's load into tg's shares*/
- -                      load *= scale_load_down(gcfs_rq->tg->shares);
- -                      load /= tg_load;
- -              }
- -      }
+ +      gcfs_rq->prop_runnable_sum = 0;
   
- -      delta = load - se->avg.load_avg;
+ +      load_sum = (s64)se_weight(se) * runnable_sum;
+ +      load_avg = div_s64(load_sum, LOAD_AVG_MAX);
   
- -      /* Nothing to update */
- -      if (!delta)
- -              return;
+ +      add_positive(&se->avg.load_sum, runnable_sum);
+ +      add_positive(&se->avg.load_avg, load_avg);
   
- -      /* Set new sched_entity's load */
- -      se->avg.load_avg = load;
- -      se->avg.load_sum = se->avg.load_avg * LOAD_AVG_MAX;
+ +      add_positive(&cfs_rq->avg.load_avg, load_avg);
+ +      add_positive(&cfs_rq->avg.load_sum, load_sum);
   
- -      /* Update parent cfs_rq load */
- -      add_positive(&cfs_rq->avg.load_avg, delta);
- -      cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * LOAD_AVG_MAX;
+ +      runnable_load_sum = (s64)se_runnable(se) * runnable_sum;
+ +      runnable_load_avg = div_s64(runnable_load_sum, LOAD_AVG_MAX);
+ +
+ +      add_positive(&se->avg.runnable_load_sum, runnable_sum);
+ +      add_positive(&se->avg.runnable_load_avg, runnable_load_avg);
   
- -      /*
- -       * If the sched_entity is already enqueued, we also have to update the
- -       * runnable load avg.
- -       */
         if (se->on_rq) {
- -              /* Update parent cfs_rq runnable_load_avg */
- -              add_positive(&cfs_rq->runnable_load_avg, delta);
- -              cfs_rq->runnable_load_sum = cfs_rq->runnable_load_avg * LOAD_AVG_MAX;
+ +              add_positive(&cfs_rq->avg.runnable_load_avg, runnable_load_avg);
+ +              add_positive(&cfs_rq->avg.runnable_load_sum, runnable_load_sum);
         }
   }
   
- -static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq)
- -{
- -      cfs_rq->propagate_avg = 1;
- -}
- -
- -static inline int test_and_clear_tg_cfs_propagate(struct sched_entity *se)
+ +static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum)
   {
- -      struct cfs_rq *cfs_rq = group_cfs_rq(se);
- -
- -      if (!cfs_rq->propagate_avg)
- -              return 0;
- -
- -      cfs_rq->propagate_avg = 0;
- -      return 1;
+ +      cfs_rq->propagate = 1;
+ +      cfs_rq->prop_runnable_sum += runnable_sum;
   }
   
   /* Update task and its cfs_rq load average */
   static inline int propagate_entity_load_avg(struct sched_entity *se)
   {
- -      struct cfs_rq *cfs_rq;
+ +      struct cfs_rq *cfs_rq, *gcfs_rq;
   
         if (entity_is_task(se))
                 return 0;
   
- -      if (!test_and_clear_tg_cfs_propagate(se))
+ +      gcfs_rq = group_cfs_rq(se);
+ +      if (!gcfs_rq->propagate)
                 return 0;
   
+ +      gcfs_rq->propagate = 0;
+ +
         cfs_rq = cfs_rq_of(se);
   
- -      set_tg_cfs_propagate(cfs_rq);
+ +      add_tg_cfs_propagate(cfs_rq, gcfs_rq->prop_runnable_sum);
   
- -      update_tg_cfs_util(cfs_rq, se);
- -      update_tg_cfs_load(cfs_rq, se);
+ +      update_tg_cfs_util(cfs_rq, se, gcfs_rq);
+ +      update_tg_cfs_runnable(cfs_rq, se, gcfs_rq);
   
         return 1;
   }
@@@ -3570,7 -3287,7 +3570,7 @@@ static inline bool skip_blocked_update(
          * If there is a pending propagation, we have to update the load and
          * the utilization of the sched_entity:
          */
- -      if (gcfs_rq->propagate_avg)
+ +      if (gcfs_rq->propagate)
                 return false;
   
         /*
@@@ -3590,10 -3307,27 +3590,10 @@@ static inline int propagate_entity_load
         return 0;
   }
   
- -static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {}
+ +static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) {}
   
   #endif /* CONFIG_FAIR_GROUP_SCHED */
   
- -/*
- - * Unsigned subtract and clamp on underflow.
- - *
- - * Explicitly do a load-store to ensure the intermediate value never hits
- - * memory. This allows lockless observations without ever seeing the negative
- - * values.
- - */
- -#define sub_positive(_ptr, _val) do {                         \
- -      typeof(_ptr) ptr = (_ptr);                              \
- -      typeof(*ptr) val = (_val);                              \
- -      typeof(*ptr) res, var = READ_ONCE(*ptr);                \
- -      res = var - val;                                        \
- -      if (res > var)                                          \
- -              res = 0;                                        \
- -      WRITE_ONCE(*ptr, res);                                  \
- -} while (0)
- -
   /**
    * update_cfs_rq_load_avg - update the cfs_rq's load/util averages
    * @now: current time, as per cfs_rq_clock_task()
@@@ -3613,45 -3347,65 +3613,45 @@@
   static inline int
   update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
   {
+ +      unsigned long removed_load = 0, removed_util = 0, removed_runnable_sum = 0;
         struct sched_avg *sa = &cfs_rq->avg;
- -      int decayed, removed_load = 0, removed_util = 0;
+ +      int decayed = 0;
   
- -      if (atomic_long_read(&cfs_rq->removed_load_avg)) {
- -              s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
+ +      if (cfs_rq->removed.nr) {
+ +              unsigned long r;
+ +              u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
+ +
+ +              raw_spin_lock(&cfs_rq->removed.lock);
+ +              swap(cfs_rq->removed.util_avg, removed_util);
+ +              swap(cfs_rq->removed.load_avg, removed_load);
+ +              swap(cfs_rq->removed.runnable_sum, removed_runnable_sum);
+ +              cfs_rq->removed.nr = 0;
+ +              raw_spin_unlock(&cfs_rq->removed.lock);
+ +
+ +              r = removed_load;
                 sub_positive(&sa->load_avg, r);
- -              sub_positive(&sa->load_sum, r * LOAD_AVG_MAX);
- -              removed_load = 1;
- -              set_tg_cfs_propagate(cfs_rq);
- -      }
+ +              sub_positive(&sa->load_sum, r * divider);
   
- -      if (atomic_long_read(&cfs_rq->removed_util_avg)) {
- -              long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
+ +              r = removed_util;
                 sub_positive(&sa->util_avg, r);
- -              sub_positive(&sa->util_sum, r * LOAD_AVG_MAX);
- -              removed_util = 1;
- -              set_tg_cfs_propagate(cfs_rq);
+ +              sub_positive(&sa->util_sum, r * divider);
+ +
+ +              add_tg_cfs_propagate(cfs_rq, -(long)removed_runnable_sum);
+ +
+ +              decayed = 1;
         }
   
- -      decayed = __update_load_avg_cfs_rq(now, cpu_of(rq_of(cfs_rq)), cfs_rq);
+ +      decayed |= __update_load_avg_cfs_rq(now, cpu_of(rq_of(cfs_rq)), cfs_rq);
   
   #ifndef CONFIG_64BIT
         smp_wmb();
         cfs_rq->load_last_update_time_copy = sa->last_update_time;
   #endif
   
- -      if (decayed || removed_util)
+ +      if (decayed)
                 cfs_rq_util_change(cfs_rq);
   
- -      return decayed || removed_load;
- -}
- -
- -/*
- - * Optional action to be done while updating the load average
- - */
- -#define UPDATE_TG     0x1
- -#define SKIP_AGE_LOAD 0x2
- -
- -/* Update task and its cfs_rq load average */
- -static inline void update_load_avg(struct sched_entity *se, int flags)
- -{
- -      struct cfs_rq *cfs_rq = cfs_rq_of(se);
- -      u64 now = cfs_rq_clock_task(cfs_rq);
- -      struct rq *rq = rq_of(cfs_rq);
- -      int cpu = cpu_of(rq);
- -      int decayed;
- -
- -      /*
- -       * Track task load average for carrying it to new CPU after migrated, and
- -       * track group sched_entity load average for task_h_load calc in migration
- -       */
- -      if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
- -              __update_load_avg_se(now, cpu, cfs_rq, se);
- -
- -      decayed  = update_cfs_rq_load_avg(now, cfs_rq);
- -      decayed |= propagate_entity_load_avg(se);
- -
- -      if (decayed && (flags & UPDATE_TG))
- -              update_tg_load_avg(cfs_rq, 0);
+ +      return decayed;
   }
   
   /**
@@@ -3664,39 -3418,12 +3664,39 @@@
    */
   static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
   {
+ +      u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
+ +
+ +      /*
+ +       * When we attach the @se to the @cfs_rq, we must align the decay
+ +       * window because without that, really weird and wonderful things can
+ +       * happen.
+ +       *
+ +       * XXX illustrate
+ +       */
         se->avg.last_update_time = cfs_rq->avg.last_update_time;
- -      cfs_rq->avg.load_avg += se->avg.load_avg;
- -      cfs_rq->avg.load_sum += se->avg.load_sum;
+ +      se->avg.period_contrib = cfs_rq->avg.period_contrib;
+ +
+ +      /*
+ +       * Hell(o) Nasty stuff.. we need to recompute _sum based on the new
+ +       * period_contrib. This isn't strictly correct, but since we're
+ +       * entirely outside of the PELT hierarchy, nobody cares if we truncate
+ +       * _sum a little.
+ +       */
+ +      se->avg.util_sum = se->avg.util_avg * divider;
+ +
+ +      se->avg.load_sum = divider;
+ +      if (se_weight(se)) {
+ +              se->avg.load_sum =
+ +                      div_u64(se->avg.load_avg * se->avg.load_sum, se_weight(se));
+ +      }
+ +
+ +      se->avg.runnable_load_sum = se->avg.load_sum;
+ +
+ +      enqueue_load_avg(cfs_rq, se);
         cfs_rq->avg.util_avg += se->avg.util_avg;
         cfs_rq->avg.util_sum += se->avg.util_sum;
- -      set_tg_cfs_propagate(cfs_rq);
+ +
+ +      add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);
   
         cfs_rq_util_change(cfs_rq);
   }
@@@ -3711,47 -3438,39 +3711,47 @@@
    */
   static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
   {
- -
- -      sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
- -      sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum);
+ +      dequeue_load_avg(cfs_rq, se);
         sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
         sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
- -      set_tg_cfs_propagate(cfs_rq);
+ +
+ +      add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
   
         cfs_rq_util_change(cfs_rq);
   }
   
- -/* Add the load generated by se into cfs_rq's load average */
- -static inline void
- -enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
+ +/*
+ + * Optional action to be done while updating the load average
+ + */
+ +#define UPDATE_TG     0x1
+ +#define SKIP_AGE_LOAD 0x2
+ +#define DO_ATTACH     0x4
+ +
+ +/* Update task and its cfs_rq load average */
+ +static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
   {
- -      struct sched_avg *sa = &se->avg;
+ +      u64 now = cfs_rq_clock_task(cfs_rq);
+ +      struct rq *rq = rq_of(cfs_rq);
+ +      int cpu = cpu_of(rq);
+ +      int decayed;
+ +
+ +      /*
+ +       * Track task load average for carrying it to the new CPU after migration,
+ +       * and track group sched_entity load average for task_h_load calc in migration
+ +       */
+ +      if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
+ +              __update_load_avg_se(now, cpu, cfs_rq, se);
   
- -      cfs_rq->runnable_load_avg += sa->load_avg;
- -      cfs_rq->runnable_load_sum += sa->load_sum;
+ +      decayed  = update_cfs_rq_load_avg(now, cfs_rq);
+ +      decayed |= propagate_entity_load_avg(se);
+ +
+ +      if (!se->avg.last_update_time && (flags & DO_ATTACH)) {
   
- -      if (!sa->last_update_time) {
                 attach_entity_load_avg(cfs_rq, se);
                 update_tg_load_avg(cfs_rq, 0);
- -      }
- -}
   
- -/* Remove the runnable load generated by se from cfs_rq's runnable load average */
- -static inline void
- -dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
- -{
- -      cfs_rq->runnable_load_avg =
- -              max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0);
- -      cfs_rq->runnable_load_sum =
- -              max_t(s64,  cfs_rq->runnable_load_sum - se->avg.load_sum, 0);
+ +      } else if (decayed && (flags & UPDATE_TG))
+ +              update_tg_load_avg(cfs_rq, 0);
   }
   
   #ifndef CONFIG_64BIT
@@@ -3795,7 -3514,6 +3795,7 @@@ void sync_entity_load_avg(struct sched_
   void remove_entity_load_avg(struct sched_entity *se)
   {
         struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ +      unsigned long flags;
   
         /*
          * tasks cannot exit without having gone through wake_up_new_task() ->
@@@ -3808,18 -3526,13 +3808,18 @@@
          */
   
         sync_entity_load_avg(se);
- -      atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
- -      atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
+ +
+ +      raw_spin_lock_irqsave(&cfs_rq->removed.lock, flags);
+ +      ++cfs_rq->removed.nr;
+ +      cfs_rq->removed.util_avg        += se->avg.util_avg;
+ +      cfs_rq->removed.load_avg        += se->avg.load_avg;
+ +      cfs_rq->removed.runnable_sum    += se->avg.load_sum; /* == runnable_sum */
+ +      raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags);
   }
   
   static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
   {
- -      return cfs_rq->runnable_load_avg;
+ +      return cfs_rq->avg.runnable_load_avg;
   }
   
   static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
@@@ -3839,13 -3552,16 +3839,13 @@@ update_cfs_rq_load_avg(u64 now, struct 
   
   #define UPDATE_TG     0x0
   #define SKIP_AGE_LOAD 0x0
+ +#define DO_ATTACH     0x0
   
- -static inline void update_load_avg(struct sched_entity *se, int not_used1)
+ +static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1)
   {
- -      cfs_rq_util_change(cfs_rq_of(se));
+ +      cfs_rq_util_change(cfs_rq);
   }
   
- -static inline void
- -enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
- -static inline void
- -dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
   static inline void remove_entity_load_avg(struct sched_entity *se) {}
   
   static inline void
@@@ -3990,9 -3706,9 +3990,9 @@@ enqueue_entity(struct cfs_rq *cfs_rq, s
          *     its group cfs_rq
          *   - Add its new weight to cfs_rq->load.weight
          */
- -      update_load_avg(se, UPDATE_TG);
- -      enqueue_entity_load_avg(cfs_rq, se);
- -      update_cfs_shares(se);
+ +      update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH);
+ +      update_cfs_group(se);
+ +      enqueue_runnable_load_avg(cfs_rq, se);
         account_entity_enqueue(cfs_rq, se);
   
         if (flags & ENQUEUE_WAKEUP)
@@@ -4074,8 -3790,8 +4074,8 @@@ dequeue_entity(struct cfs_rq *cfs_rq, s
          *   - For group entity, update its weight to reflect the new share
          *     of its group cfs_rq.
          */
- -      update_load_avg(se, UPDATE_TG);
- -      dequeue_entity_load_avg(cfs_rq, se);
+ +      update_load_avg(cfs_rq, se, UPDATE_TG);
+ +      dequeue_runnable_load_avg(cfs_rq, se);
   
         update_stats_dequeue(cfs_rq, se, flags);
   
@@@ -4098,7 -3814,7 +4098,7 @@@
         /* return excess runtime on last dequeue */
         return_cfs_rq_runtime(cfs_rq);
   
- -      update_cfs_shares(se);
+ +      update_cfs_group(se);
   
         /*
          * Now advance min_vruntime if @se was the entity holding it back,
@@@ -4162,7 -3878,7 +4162,7 @@@ set_next_entity(struct cfs_rq *cfs_rq, 
                  */
                 update_stats_wait_end(cfs_rq, se);
                 __dequeue_entity(cfs_rq, se);
- -              update_load_avg(se, UPDATE_TG);
+ +              update_load_avg(cfs_rq, se, UPDATE_TG);
         }
   
         update_stats_curr_start(cfs_rq, se);
@@@ -4264,7 -3980,7 +4264,7 @@@ static void put_prev_entity(struct cfs_
                 /* Put 'current' back into the tree. */
                 __enqueue_entity(cfs_rq, prev);
                 /* in !on_rq case, update occurred at dequeue */
- -              update_load_avg(prev, 0);
+ +              update_load_avg(cfs_rq, prev, 0);
         }
         cfs_rq->curr = NULL;
   }
@@@ -4280,8 -3996,8 +4280,8 @@@ entity_tick(struct cfs_rq *cfs_rq, stru
         /*
          * Ensure that runnable average is periodically updated.
          */
- -      update_load_avg(curr, UPDATE_TG);
- -      update_cfs_shares(curr);
+ +      update_load_avg(cfs_rq, curr, UPDATE_TG);
+ +      update_cfs_group(curr);
   
   #ifdef CONFIG_SCHED_HRTICK
         /*
@@@ -5198,8 -4914,8 +5198,8 @@@ enqueue_task_fair(struct rq *rq, struc
                 if (cfs_rq_throttled(cfs_rq))
                         break;
   
- -              update_load_avg(se, UPDATE_TG);
- -              update_cfs_shares(se);
+ +              update_load_avg(cfs_rq, se, UPDATE_TG);
+ +              update_cfs_group(se);
         }
   
         if (!se)
@@@ -5257,8 -4973,8 +5257,8 @@@ static void dequeue_task_fair(struct r
                 if (cfs_rq_throttled(cfs_rq))
                         break;
   
- -              update_load_avg(se, UPDATE_TG);
- -              update_cfs_shares(se);
+ +              update_load_avg(cfs_rq, se, UPDATE_TG);
+ +              update_cfs_group(se);
         }
   
         if (!se)
@@@ -5640,62 -5356,91 +5640,62 @@@ static int wake_wide(struct task_struc
         return 1;
   }
   
- -struct llc_stats {
- -      unsigned long   nr_running;
- -      unsigned long   load;
- -      unsigned long   capacity;
- -      int             has_capacity;
- -};
+ +/*
+ + * The purpose of wake_affine() is to quickly determine on which CPU we can run
+ + * soonest. For the purpose of speed we only consider the waking and previous
+ + * CPU.
+ + *
+ + * wake_affine_idle() - only considers 'now'; it checks if the waking CPU is (or
+ + *                    will be) idle.
+ + *
+ + * wake_affine_weight() - considers the weight to reflect the average
+ + *                      scheduling latency of the CPUs. This seems to work
+ + *                      for the overloaded case.
+ + */
   
- -static bool get_llc_stats(struct llc_stats *stats, int cpu)
+ +static bool
+ +wake_affine_idle(struct sched_domain *sd, struct task_struct *p,
+ +               int this_cpu, int prev_cpu, int sync)
   {
- -      struct sched_domain_shared *sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
- -
- -      if (!sds)
- -              return false;
+ +      if (idle_cpu(this_cpu))
+ +              return true;
   
- -      stats->nr_running       = READ_ONCE(sds->nr_running);
- -      stats->load             = READ_ONCE(sds->load);
- -      stats->capacity         = READ_ONCE(sds->capacity);
- -      stats->has_capacity     = stats->nr_running < per_cpu(sd_llc_size, cpu);
+ +      if (sync && cpu_rq(this_cpu)->nr_running == 1)
+ +              return true;
   
- -      return true;
+ +      return false;
   }
   
- -/*
- - * Can a task be moved from prev_cpu to this_cpu without causing a load
- - * imbalance that would trigger the load balancer?
- - *
- - * Since we're running on 'stale' values, we might in fact create an imbalance
- - * but recomputing these values is expensive, as that'd mean iteration 2 cache
- - * domains worth of CPUs.
- - */
   static bool
- -wake_affine_llc(struct sched_domain *sd, struct task_struct *p,
- -              int this_cpu, int prev_cpu, int sync)
+ +wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
+ +                 int this_cpu, int prev_cpu, int sync)
   {
- -      struct llc_stats prev_stats, this_stats;
         s64 this_eff_load, prev_eff_load;
         unsigned long task_load;
   
- -      if (!get_llc_stats(&prev_stats, prev_cpu) ||
- -          !get_llc_stats(&this_stats, this_cpu))
- -              return false;
+ +      this_eff_load = target_load(this_cpu, sd->wake_idx);
+ +      prev_eff_load = source_load(prev_cpu, sd->wake_idx);
   
- -      /*
- -       * If sync wakeup then subtract the (maximum possible)
- -       * effect of the currently running task from the load
- -       * of the current LLC.
- -       */
         if (sync) {
                 unsigned long current_load = task_h_load(current);
   
- -              /* in this case load hits 0 and this LLC is considered 'idle' */
- -              if (current_load > this_stats.load)
+ +              if (current_load > this_eff_load)
                         return true;
   
- -              this_stats.load -= current_load;
+ +              this_eff_load -= current_load;
         }
   
- -      /*
- -       * The has_capacity stuff is not SMT aware, but by trying to balance
- -       * the nr_running on both ends we try and fill the domain at equal
- -       * rates, thereby first consuming cores before siblings.
- -       */
- -
- -      /* if the old cache has capacity, stay there */
- -      if (prev_stats.has_capacity && prev_stats.nr_running < this_stats.nr_running+1)
- -              return false;
- -
- -      /* if this cache has capacity, come here */
- -      if (this_stats.has_capacity && this_stats.nr_running+1 < prev_stats.nr_running)
- -              return true;
- -
- -      /*
- -       * Check to see if we can move the load without causing too much
- -       * imbalance.
- -       */
         task_load = task_h_load(p);
   
- -      this_eff_load = 100;
- -      this_eff_load *= prev_stats.capacity;
+ +      this_eff_load += task_load;
+ +      if (sched_feat(WA_BIAS))
+ +              this_eff_load *= 100;
+ +      this_eff_load *= capacity_of(prev_cpu);
   
- -      prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
- -      prev_eff_load *= this_stats.capacity;
- -
- -      this_eff_load *= this_stats.load + task_load;
- -      prev_eff_load *= prev_stats.load - task_load;
+ +      prev_eff_load -= task_load;
+ +      if (sched_feat(WA_BIAS))
+ +              prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
+ +      prev_eff_load *= capacity_of(this_cpu);
   
         return this_eff_load <= prev_eff_load;
   }
@@@ -5704,13 -5449,22 +5704,13 @@@ static int wake_affine(struct sched_dom
                        int prev_cpu, int sync)
   {
         int this_cpu = smp_processor_id();
- -      bool affine;
+ +      bool affine = false;
   
- -      /*
- -       * Default to no affine wakeups; wake_affine() should not effect a task
- -       * placement the load-balancer feels inclined to undo. The conservative
- -       * option is therefore to not move tasks when they wake up.
- -       */
- -      affine = false;
+ +      if (sched_feat(WA_IDLE) && !affine)
+ +              affine = wake_affine_idle(sd, p, this_cpu, prev_cpu, sync);
   
- -      /*
- -       * If the wakeup is across cache domains, try to evaluate if movement
- -       * makes sense, otherwise rely on select_idle_siblings() to do
- -       * placement inside the cache domain.
- -       */
- -      if (!cpus_share_cache(prev_cpu, this_cpu))
- -              affine = wake_affine_llc(sd, p, this_cpu, prev_cpu, sync);
+ +      if (sched_feat(WA_WEIGHT) && !affine)
+ +              affine = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync);
   
         schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
         if (affine) {
@@@ -5732,8 -5486,6 +5732,8 @@@ static unsigned long capacity_spare_wak
   /*
    * find_idlest_group finds and returns the least busy CPU group within the
    * domain.
+ + *
+ + * Assumes p is allowed on at least one CPU in sd.
    */
   static struct sched_group *
   find_idlest_group(struct sched_domain *sd, struct task_struct *p,
@@@ -5741,9 -5493,8 +5741,9 @@@
   {
         struct sched_group *idlest = NULL, *group = sd->groups;
         struct sched_group *most_spare_sg = NULL;
- -      unsigned long min_runnable_load = ULONG_MAX, this_runnable_load = 0;
- -      unsigned long min_avg_load = ULONG_MAX, this_avg_load = 0;
+ +      unsigned long min_runnable_load = ULONG_MAX;
+ +      unsigned long this_runnable_load = ULONG_MAX;
+ +      unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX;
         unsigned long most_spare = 0, this_spare = 0;
         int load_idx = sd->forkexec_idx;
         int imbalance_scale = 100 + (sd->imbalance_pct-100)/2;
@@@ -5864,10 -5615,10 +5864,10 @@@ skip_spare
   }
   
   /*
- - * find_idlest_cpu - find the idlest cpu among the cpus in group.
+ + * find_idlest_group_cpu - find the idlest cpu among the cpus in group.
    */
   static int
- -find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
+ +find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
   {
         unsigned long load, min_load = ULONG_MAX;
         unsigned int min_exit_latency = UINT_MAX;
@@@ -5916,53 -5667,6 +5916,53 @@@
         return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
   }
   
+ +static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p,
+ +                                int cpu, int prev_cpu, int sd_flag)
+ +{
+ +      int new_cpu = cpu;
+ +
+ +      if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed))
+ +              return prev_cpu;
+ +
+ +      while (sd) {
+ +              struct sched_group *group;
+ +              struct sched_domain *tmp;
+ +              int weight;
+ +
+ +              if (!(sd->flags & sd_flag)) {
+ +                      sd = sd->child;
+ +                      continue;
+ +              }
+ +
+ +              group = find_idlest_group(sd, p, cpu, sd_flag);
+ +              if (!group) {
+ +                      sd = sd->child;
+ +                      continue;
+ +              }
+ +
+ +              new_cpu = find_idlest_group_cpu(group, p, cpu);
+ +              if (new_cpu == cpu) {
+ +                      /* Now try balancing at a lower domain level of cpu */
+ +                      sd = sd->child;
+ +                      continue;
+ +              }
+ +
+ +              /* Now try balancing at a lower domain level of new_cpu */
+ +              cpu = new_cpu;
+ +              weight = sd->span_weight;
+ +              sd = NULL;
+ +              for_each_domain(cpu, tmp) {
+ +                      if (weight <= tmp->span_weight)
+ +                              break;
+ +                      if (tmp->flags & sd_flag)
+ +                              sd = tmp;
+ +              }
+ +              /* while loop will break here if sd == NULL */
+ +      }
+ +
+ +      return new_cpu;
+ +}
+ +
   #ifdef CONFIG_SCHED_SMT
   
   static inline void set_idle_cores(int cpu, int val)
@@@ -6315,30 -6019,50 +6315,30 @@@ select_task_rq_fair(struct task_struct 
                         new_cpu = cpu;
         }
   
+ +      if (sd && !(sd_flag & SD_BALANCE_FORK)) {
+ +              /*
+ +               * We're going to need the task's util for capacity_spare_wake
+ +               * in find_idlest_group. Sync it up to prev_cpu's
+ +               * last_update_time.
+ +               */
+ +              sync_entity_load_avg(&p->se);
+ +      }
+ +
         if (!sd) {
- - pick_cpu:
+ +pick_cpu:
                 if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
                         new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
   
- -      } else while (sd) {
- -              struct sched_group *group;
- -              int weight;
- -
- -              if (!(sd->flags & sd_flag)) {
- -                      sd = sd->child;
- -                      continue;
- -              }
- -
- -              group = find_idlest_group(sd, p, cpu, sd_flag);
- -              if (!group) {
- -                      sd = sd->child;
- -                      continue;
- -              }
- -
- -              new_cpu = find_idlest_cpu(group, p, cpu);
- -              if (new_cpu == -1 || new_cpu == cpu) {
- -                      /* Now try balancing at a lower domain level of cpu */
- -                      sd = sd->child;
- -                      continue;
- -              }
- -
- -              /* Now try balancing at a lower domain level of new_cpu */
- -              cpu = new_cpu;
- -              weight = sd->span_weight;
- -              sd = NULL;
- -              for_each_domain(cpu, tmp) {
- -                      if (weight <= tmp->span_weight)
- -                              break;
- -                      if (tmp->flags & sd_flag)
- -                              sd = tmp;
- -              }
- -              /* while loop will break here if sd == NULL */
+ +      } else {
+ +              new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
         }
         rcu_read_unlock();
   
         return new_cpu;
   }
   
+ +static void detach_entity_cfs_rq(struct sched_entity *se);
+ +
   /*
    * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
    * cfs_rq_of(p) references at time of call are still valid and identify the
@@@ -6372,25 -6096,14 +6372,25 @@@ static void migrate_task_rq_fair(struc
                 se->vruntime -= min_vruntime;
         }
   
- -      /*
- -       * We are supposed to update the task to "current" time, then its up to date
- -       * and ready to go to new CPU/cfs_rq. But we have difficulty in getting
- -       * what current time is, so simply throw away the out-of-date time. This
- -       * will result in the wakee task is less decayed, but giving the wakee more
- -       * load sounds not bad.
- -       */
- -      remove_entity_load_avg(&p->se);
+ +      if (p->on_rq == TASK_ON_RQ_MIGRATING) {
+ +              /*
+ +               * In case of TASK_ON_RQ_MIGRATING we in fact hold the 'old'
+ +               * rq->lock and can modify state directly.
+ +               */
+ +              lockdep_assert_held(&task_rq(p)->lock);
+ +              detach_entity_cfs_rq(&p->se);
+ +
+ +      } else {
+ +              /*
+ +               * We are supposed to update the task to "current" time, then
+ +               * it's up to date and ready to go to new CPU/cfs_rq. But we
+ +               * have difficulty in getting what current time is, so simply
+ +               * throw away the out-of-date time. This will result in the
+ +               * wakee task being less decayed, but giving the wakee more
+ +               * load sounds not bad.
+ +               */
+ +              remove_entity_load_avg(&p->se);
+ +      }
   
         /* Tell new CPU we are migrated */
         p->se.avg.last_update_time = 0;
@@@ -6658,7 -6371,10 +6658,7 @@@ again
                 set_next_entity(cfs_rq, se);
         }
   
- -      if (hrtick_enabled(rq))
- -              hrtick_start_fair(rq, p);
- -
- -      return p;
+ +      goto done;
   simple:
   #endif
   
@@@ -6672,16 -6388,6 +6672,16 @@@
   
         p = task_of(se);
   
+ +done: __maybe_unused
+ +#ifdef CONFIG_SMP
+ +      /*
+ +       * Move the next running task to the front of
+ +       * the list, so our cfs_tasks list becomes MRU
+ +       * one.
+ +       */
+ +      list_move(&p->se.group_node, &rq->cfs_tasks);
+ +#endif
+ +
         if (hrtick_enabled(rq))
                 hrtick_start_fair(rq, p);
   
@@@ -7117,12 -6823,11 +7117,12 @@@ static void detach_task(struct task_str
    */
   static struct task_struct *detach_one_task(struct lb_env *env)
   {
- -      struct task_struct *p, *n;
+ +      struct task_struct *p;
   
         lockdep_assert_held(&env->src_rq->lock);
   
- -      list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
+ +      list_for_each_entry_reverse(p,
+ +                      &env->src_rq->cfs_tasks, se.group_node) {
                 if (!can_migrate_task(p, env))
                         continue;
   
@@@ -7168,7 -6873,7 +7168,7 @@@ static int detach_tasks(struct lb_env *
                 if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1)
                         break;
   
- -              p = list_first_entry(tasks, struct task_struct, se.group_node);
+ +              p = list_last_entry(tasks, struct task_struct, se.group_node);
   
                 env->loop++;
                 /* We've more or less seen every task there is, call it quits */
@@@ -7218,7 -6923,7 +7218,7 @@@
   
                 continue;
   next:
- -              list_move_tail(&p->se.group_node, tasks);
+ +              list_move(&p->se.group_node, tasks);
         }
   
         /*
@@@ -7294,7 -6999,7 +7294,7 @@@ static inline bool cfs_rq_is_decayed(st
         if (cfs_rq->avg.util_sum)
                 return false;
   
- -      if (cfs_rq->runnable_load_sum)
+ +      if (cfs_rq->avg.runnable_load_sum)
                 return false;
   
         return true;
@@@ -7326,7 -7031,7 +7326,7 @@@ static void update_blocked_averages(in
                 /* Propagate pending load changes to the parent, if any: */
                 se = cfs_rq->tg->se[cpu];
                 if (se && !skip_blocked_update(se))
- -                      update_load_avg(se, 0);
+ +                      update_load_avg(cfs_rq_of(se), se, 0);
   
                 /*
                  * There can be a lot of idle CPU cgroups.  Don't let fully
@@@ -7895,6 -7600,7 +7895,6 @@@ static inline enum fbq_type fbq_classif
    */
   static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
   {
- -      struct sched_domain_shared *shared = env->sd->shared;
         struct sched_domain *child = env->sd->child;
         struct sched_group *sg = env->sd->groups;
         struct sg_lb_stats *local = &sds->local_stat;
@@@ -7966,6 -7672,22 +7966,6 @@@ next_group
                 if (env->dst_rq->rd->overload != overload)
                         env->dst_rq->rd->overload = overload;
         }
- -
- -      if (!shared)
- -              return;
- -
- -      /*
- -       * Since these are sums over groups they can contain some CPUs
- -       * multiple times for the NUMA domains.
- -       *
- -       * Currently only wake_affine_llc() and find_busiest_group()
- -       * uses these numbers, only the last is affected by this problem.
- -       *
- -       * XXX fix that.
- -       */
- -      WRITE_ONCE(shared->nr_running,  sds->total_running);
- -      WRITE_ONCE(shared->load,        sds->total_load);
- -      WRITE_ONCE(shared->capacity,    sds->total_capacity);
   }
   
   /**
@@@ -8207,11 -7929,8 +8207,11 @@@ static struct sched_group *find_busiest
         if (busiest->group_type == group_imbalanced)
                 goto force_balance;
   
- -      /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
- -      if (env->idle == CPU_NEWLY_IDLE && group_has_capacity(env, local) &&
+ +      /*
+ +       * When dst_cpu is idle, prevent SMP nice and/or asymmetric group
+ +       * capacities from resulting in underutilization due to avg_load.
+ +       */
+ +      if (env->idle != CPU_NOT_IDLE && group_has_capacity(env, local) &&
             busiest->group_no_capacity)
                 goto force_balance;
   
@@@ -8378,13 -8097,6 +8378,13 @@@ static int should_we_balance(struct lb_
         struct sched_group *sg = env->sd->groups;
         int cpu, balance_cpu = -1;
   
+ +      /*
+ +       * Ensure the balancing environment is consistent; can happen
+ +       * when the softirq triggers 'during' hotplug.
+ +       */
+ +      if (!cpumask_test_cpu(env->dst_cpu, env->cpus))
+ +              return 0;
+ +
         /*
          * In the newly idle case, we will allow all the cpu's
          * to do the newly idle load balance.
@@@ -9028,7 -8740,7 +9028,7 @@@ void nohz_balance_enter_idle(int cpu
                 return;
   
         /* Spare idle load balancing on CPUs that don't want to be disturbed: */
- -      if (!is_housekeeping_cpu(cpu))
+ +      if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
                 return;
   
         if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
@@@ -9493,7 -9205,7 +9493,7 @@@ static void propagate_entity_cfs_rq(str
                 if (cfs_rq_throttled(cfs_rq))
                         break;
   
- -              update_load_avg(se, UPDATE_TG);
+ +              update_load_avg(cfs_rq, se, UPDATE_TG);
         }
   }
   #else
@@@ -9505,7 -9217,7 +9505,7 @@@ static void detach_entity_cfs_rq(struc
         struct cfs_rq *cfs_rq = cfs_rq_of(se);
   
         /* Catch up with the cfs_rq and remove our load when we leave */
- -      update_load_avg(se, 0);
+ +      update_load_avg(cfs_rq, se, 0);
         detach_entity_load_avg(cfs_rq, se);
         update_tg_load_avg(cfs_rq, false);
         propagate_entity_cfs_rq(se);
@@@ -9524,7 -9236,7 +9524,7 @@@ static void attach_entity_cfs_rq(struc
   #endif
   
         /* Synchronize entity with its cfs_rq */
- -      update_load_avg(se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
+ +      update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
         attach_entity_load_avg(cfs_rq, se);
         update_tg_load_avg(cfs_rq, false);
         propagate_entity_cfs_rq(se);
@@@ -9606,7 -9318,11 +9606,7 @@@ void init_cfs_rq(struct cfs_rq *cfs_rq
         cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
   #endif
   #ifdef CONFIG_SMP
- -#ifdef CONFIG_FAIR_GROUP_SCHED
- -      cfs_rq->propagate_avg = 0;
- -#endif
- -      atomic_long_set(&cfs_rq->removed_load_avg, 0);
- -      atomic_long_set(&cfs_rq->removed_util_avg, 0);
+ +      raw_spin_lock_init(&cfs_rq->removed.lock);
   #endif
   }
   
@@@ -9804,8 -9520,8 +9804,8 @@@ int sched_group_set_shares(struct task_
                 rq_lock_irqsave(rq, &rf);
                 update_rq_clock(rq);
                 for_each_sched_entity(se) {
- -                      update_load_avg(se, UPDATE_TG);
- -                      update_cfs_shares(se);
+ +                      update_load_avg(cfs_rq_of(se), se, UPDATE_TG);
+ +                      update_cfs_group(se);
                 }
                 rq_unlock_irqrestore(rq, &rf);
         }
diff --combined kernel/sched/rt.c

index d8c43d73e078806ac468450732b76282fb13f798,fdc2c5d1f82eebeda7677887771b6c6b972c98c8..4056c19ca3f00efbc7592a1b4b071426fabf2124
--- 1/kernel/sched/rt.c
--- 2/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@@ -1,4 -1,3 +1,4 @@@
+ +// SPDX-License-Identifier: GPL-2.0
   /*
    * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
    * policies)
@@@ -74,6 -73,10 +74,6 @@@ static void start_rt_bandwidth(struct r
         raw_spin_unlock(&rt_b->rt_runtime_lock);
   }
   
- -#if defined(CONFIG_SMP) && defined(HAVE_RT_PUSH_IPI)
- -static void push_irq_work_func(struct irq_work *work);
- -#endif
- -
   void init_rt_rq(struct rt_rq *rt_rq)
   {
         struct rt_prio_array *array;
@@@ -93,6 -96,13 +93,6 @@@
         rt_rq->rt_nr_migratory = 0;
         rt_rq->overloaded = 0;
         plist_head_init(&rt_rq->pushable_tasks);
- -
- -#ifdef HAVE_RT_PUSH_IPI
- -      rt_rq->push_flags = 0;
- -      rt_rq->push_cpu = nr_cpu_ids;
- -      raw_spin_lock_init(&rt_rq->push_lock);
- -      init_irq_work(&rt_rq->push_work, push_irq_work_func);
- -#endif
   #endif /* CONFIG_SMP */
         /* We start is dequeued state, because no RT tasks are queued */
         rt_rq->rt_queued = 0;
@@@ -969,7 -979,7 +969,7 @@@ static void update_curr_rt(struct rq *r
         account_group_exec_runtime(curr, delta_exec);
   
         curr->se.exec_start = rq_clock_task(rq);
-       cpuacct_charge(curr, delta_exec);
+       cgroup_account_cputime(curr, delta_exec);
   
         sched_rt_avg_update(rq, delta_exec);
   
@@@ -1865,166 -1875,241 +1865,166 @@@ static void push_rt_tasks(struct rq *rq
   }
   
   #ifdef HAVE_RT_PUSH_IPI
+ +
   /*
- - * The search for the next cpu always starts at rq->cpu and ends
- - * when we reach rq->cpu again. It will never return rq->cpu.
- - * This returns the next cpu to check, or nr_cpu_ids if the loop
- - * is complete.
+ + * When a high priority task schedules out from a CPU and a lower priority
+ + * task is scheduled in, a check is made to see if there's any RT tasks
+ + * on other CPUs that are waiting to run because a higher priority RT task
+ + * is currently running on its CPU. In this case, the CPU with multiple RT
+ + * tasks queued on it (overloaded) needs to be notified that a CPU has opened
+ + * up that may be able to run one of its non-running queued RT tasks.
+ + *
+ + * All CPUs with overloaded RT tasks need to be notified as there is currently
+ + * no way to know which of these CPUs have the highest priority task waiting
+ + * to run. Instead of trying to take a spinlock on each of these CPUs,
+ + * which has shown to cause large latency when done on machines with many
+ + * CPUs, sending an IPI to the CPUs to have them push off the overloaded
+ + * RT tasks waiting to run.
+ + *
+ + * Just sending an IPI to each of the CPUs is also an issue, as on large
+ + * count CPU machines, this can cause an IPI storm on a CPU, especially
+ + * if it's the only CPU with multiple RT tasks queued, and a large number
+ + * of CPUs scheduling a lower priority task at the same time.
+ + *
+ + * Each root domain has its own irq work function that can iterate over
+ + * all CPUs with RT overloaded tasks. Since all CPUs with overloaded RT
+ + * tasks must be checked if there's one or many CPUs that are lowering
+ + * their priority, there's a single irq work iterator that will try to
+ + * push off RT tasks that are waiting to run.
+ + *
+ + * When a CPU schedules a lower priority task, it will kick off the
+ + * irq work iterator that will jump to each CPU with overloaded RT tasks.
+ + * As it only takes the first CPU that schedules a lower priority task
+ + * to start the process, rto_loop_start is atomically flipped from zero to
+ + * one, and only the CPU that succeeds will try to take the rto_lock.
+ + * This prevents high contention on the lock as the process handles all
+ + * CPUs scheduling lower priority tasks.
+ + *
+ + * All CPUs that are scheduling a lower priority task will increment the
+ + * rto_loop_next variable. This will make sure that the irq work iterator
+ + * checks all RT overloaded CPUs whenever a CPU schedules a new lower
+ + * priority task, even if the iterator is in the middle of a scan. Incrementing
+ + * the rto_loop_next will cause the iterator to perform another scan.
    *
- - * rq->rt.push_cpu holds the last cpu returned by this function,
- - * or if this is the first instance, it must hold rq->cpu.
    */
   static int rto_next_cpu(struct rq *rq)
   {
- -      int prev_cpu = rq->rt.push_cpu;
+ +      struct root_domain *rd = rq->rd;
+ +      int next;
         int cpu;
   
- -      cpu = cpumask_next(prev_cpu, rq->rd->rto_mask);
- -
         /*
- -       * If the previous cpu is less than the rq's CPU, then it already
- -       * passed the end of the mask, and has started from the beginning.
- -       * We end if the next CPU is greater or equal to rq's CPU.
+ +       * When starting the IPI RT pushing, the rto_cpu is set to -1,
+ +       * rto_next_cpu() will simply return the first CPU found in
+ +       * the rto_mask.
+ +       *
+ +       * If rto_next_cpu() is called while rto_cpu is a valid cpu, it
+ +       * will return the next CPU found in the rto_mask.
+ +       *
+ +       * If there are no more CPUs left in the rto_mask, then a check is made
+ +       * against rto_loop and rto_loop_next. rto_loop is only updated with
+ +       * the rto_lock held, but any CPU may increment the rto_loop_next
+ +       * without any locking.
          */
- -      if (prev_cpu < rq->cpu) {
- -              if (cpu >= rq->cpu)
- -                      return nr_cpu_ids;
+ +      for (;;) {
   
- -      } else if (cpu >= nr_cpu_ids) {
- -              /*
- -               * We passed the end of the mask, start at the beginning.
- -               * If the result is greater or equal to the rq's CPU, then
- -               * the loop is finished.
- -               */
- -              cpu = cpumask_first(rq->rd->rto_mask);
- -              if (cpu >= rq->cpu)
- -                      return nr_cpu_ids;
- -      }
- -      rq->rt.push_cpu = cpu;
+ +              /* When rto_cpu is -1 this acts like cpumask_first() */
+ +              cpu = cpumask_next(rd->rto_cpu, rd->rto_mask);
   
- -      /* Return cpu to let the caller know if the loop is finished or not */
- -      return cpu;
- -}
+ +              rd->rto_cpu = cpu;
   
- -static int find_next_push_cpu(struct rq *rq)
- -{
- -      struct rq *next_rq;
- -      int cpu;
+ +              if (cpu < nr_cpu_ids)
+ +                      return cpu;
   
- -      while (1) {
- -              cpu = rto_next_cpu(rq);
- -              if (cpu >= nr_cpu_ids)
- -                      break;
- -              next_rq = cpu_rq(cpu);
+ +              rd->rto_cpu = -1;
+ +
+ +              /*
+ +               * ACQUIRE ensures we see the @rto_mask changes
+ +               * made prior to the @next value observed.
+ +               *
+ +               * Matches WMB in rt_set_overload().
+ +               */
+ +              next = atomic_read_acquire(&rd->rto_loop_next);
   
- -              /* Make sure the next rq can push to this rq */
- -              if (next_rq->rt.highest_prio.next < rq->rt.highest_prio.curr)
+ +              if (rd->rto_loop == next)
                         break;
+ +
+ +              rd->rto_loop = next;
         }
   
- -      return cpu;
+ +      return -1;
   }
   
- -#define RT_PUSH_IPI_EXECUTING         1
- -#define RT_PUSH_IPI_RESTART           2
+ +static inline bool rto_start_trylock(atomic_t *v)
+ +{
+ +      return !atomic_cmpxchg_acquire(v, 0, 1);
+ +}
   
- -/*
- - * When a high priority task schedules out from a CPU and a lower priority
- - * task is scheduled in, a check is made to see if there's any RT tasks
- - * on other CPUs that are waiting to run because a higher priority RT task
- - * is currently running on its CPU. In this case, the CPU with multiple RT
- - * tasks queued on it (overloaded) needs to be notified that a CPU has opened
- - * up that may be able to run one of its non-running queued RT tasks.
- - *
- - * On large CPU boxes, there's the case that several CPUs could schedule
- - * a lower priority task at the same time, in which case it will look for
- - * any overloaded CPUs that it could pull a task from. To do this, the runqueue
- - * lock must be taken from that overloaded CPU. Having 10s of CPUs all fighting
- - * for a single overloaded CPU's runqueue lock can produce a large latency.
- - * (This has actually been observed on large boxes running cyclictest).
- - * Instead of taking the runqueue lock of the overloaded CPU, each of the
- - * CPUs that scheduled a lower priority task simply sends an IPI to the
- - * overloaded CPU. An IPI is much cheaper than taking an runqueue lock with
- - * lots of contention. The overloaded CPU will look to push its non-running
- - * RT task off, and if it does, it can then ignore the other IPIs coming
- - * in, and just pass those IPIs off to any other overloaded CPU.
- - *
- - * When a CPU schedules a lower priority task, it only sends an IPI to
- - * the "next" CPU that has overloaded RT tasks. This prevents IPI storms,
- - * as having 10 CPUs scheduling lower priority tasks and 10 CPUs with
- - * RT overloaded tasks, would cause 100 IPIs to go out at once.
- - *
- - * The overloaded RT CPU, when receiving an IPI, will try to push off its
- - * overloaded RT tasks and then send an IPI to the next CPU that has
- - * overloaded RT tasks. This stops when all CPUs with overloaded RT tasks
- - * have completed. Just because a CPU may have pushed off its own overloaded
- - * RT task does not mean it should stop sending the IPI around to other
- - * overloaded CPUs. There may be another RT task waiting to run on one of
- - * those CPUs that are of higher priority than the one that was just
- - * pushed.
- - *
- - * An optimization that could possibly be made is to make a CPU array similar
- - * to the cpupri array mask of all running RT tasks, but for the overloaded
- - * case, then the IPI could be sent to only the CPU with the highest priority
- - * RT task waiting, and that CPU could send off further IPIs to the CPU with
- - * the next highest waiting task. Since the overloaded case is much less likely
- - * to happen, the complexity of this implementation may not be worth it.
- - * Instead, just send an IPI around to all overloaded CPUs.
- - *
- - * The rq->rt.push_flags holds the status of the IPI that is going around.
- - * A run queue can only send out a single IPI at a time. The possible flags
- - * for rq->rt.push_flags are:
- - *
- - *    (None or zero):         No IPI is going around for the current rq
- - *    RT_PUSH_IPI_EXECUTING:  An IPI for the rq is being passed around
- - *    RT_PUSH_IPI_RESTART:    The priority of the running task for the rq
- - *                            has changed, and the IPI should restart
- - *                            circulating the overloaded CPUs again.
- - *
- - * rq->rt.push_cpu contains the CPU that is being sent the IPI. It is updated
- - * before sending to the next CPU.
- - *
- - * Instead of having all CPUs that schedule a lower priority task send
- - * an IPI to the same "first" CPU in the RT overload mask, they send it
- - * to the next overloaded CPU after their own CPU. This helps distribute
- - * the work when there's more than one overloaded CPU and multiple CPUs
- - * scheduling in lower priority tasks.
- - *
- - * When a rq schedules a lower priority task than what was currently
- - * running, the next CPU with overloaded RT tasks is examined first.
- - * That is, if CPU 1 and 5 are overloaded, and CPU 3 schedules a lower
- - * priority task, it will send an IPI first to CPU 5, then CPU 5 will
- - * send to CPU 1 if it is still overloaded. CPU 1 will clear the
- - * rq->rt.push_flags if RT_PUSH_IPI_RESTART is not set.
- - *
- - * The first CPU to notice IPI_RESTART is set, will clear that flag and then
- - * send an IPI to the next overloaded CPU after the rq->cpu and not the next
- - * CPU after push_cpu. That is, if CPU 1, 4 and 5 are overloaded when CPU 3
- - * schedules a lower priority task, and the IPI_RESTART gets set while the
- - * handling is being done on CPU 5, it will clear the flag and send it back to
- - * CPU 4 instead of CPU 1.
- - *
- - * Note, the above logic can be disabled by turning off the sched_feature
- - * RT_PUSH_IPI. Then the rq lock of the overloaded CPU will simply be
- - * taken by the CPU requesting a pull and the waiting RT task will be pulled
- - * by that CPU. This may be fine for machines with few CPUs.
- - */
- -static void tell_cpu_to_push(struct rq *rq)
+ +static inline void rto_start_unlock(atomic_t *v)
   {
- -      int cpu;
+ +      atomic_set_release(v, 0);
+ +}
   
- -      if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) {
- -              raw_spin_lock(&rq->rt.push_lock);
- -              /* Make sure it's still executing */
- -              if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) {
- -                      /*
- -                       * Tell the IPI to restart the loop as things have
- -                       * changed since it started.
- -                       */
- -                      rq->rt.push_flags |= RT_PUSH_IPI_RESTART;
- -                      raw_spin_unlock(&rq->rt.push_lock);
- -                      return;
- -              }
- -              raw_spin_unlock(&rq->rt.push_lock);
- -      }
+ +static void tell_cpu_to_push(struct rq *rq)
+ +{
+ +      int cpu = -1;
   
- -      /* When here, there's no IPI going around */
+ +      /* Keep the loop going if the IPI is currently active */
+ +      atomic_inc(&rq->rd->rto_loop_next);
   
- -      rq->rt.push_cpu = rq->cpu;
- -      cpu = find_next_push_cpu(rq);
- -      if (cpu >= nr_cpu_ids)
+ +      /* Only one CPU can initiate a loop at a time */
+ +      if (!rto_start_trylock(&rq->rd->rto_loop_start))
                 return;
   
- -      rq->rt.push_flags = RT_PUSH_IPI_EXECUTING;
+ +      raw_spin_lock(&rq->rd->rto_lock);
   
- -      irq_work_queue_on(&rq->rt.push_work, cpu);
+ +      /*
+ +       * The rto_cpu is updated under the lock, if it has a valid cpu
+ +       * then the IPI is still running and will continue due to the
+ +       * update to loop_next, and nothing needs to be done here.
+ +       * Otherwise it is finishing up and an ipi needs to be sent.
+ +       */
+ +      if (rq->rd->rto_cpu < 0)
+ +              cpu = rto_next_cpu(rq);
+ +
+ +      raw_spin_unlock(&rq->rd->rto_lock);
+ +
+ +      rto_start_unlock(&rq->rd->rto_loop_start);
+ +
+ +      if (cpu >= 0)
+ +              irq_work_queue_on(&rq->rd->rto_push_work, cpu);
   }
   
   /* Called from hardirq context */
- -static void try_to_push_tasks(void *arg)
+ +void rto_push_irq_work_func(struct irq_work *work)
   {
- -      struct rt_rq *rt_rq = arg;
- -      struct rq *rq, *src_rq;
- -      int this_cpu;
+ +      struct rq *rq;
         int cpu;
   
- -      this_cpu = rt_rq->push_cpu;
+ +      rq = this_rq();
   
- -      /* Paranoid check */
- -      BUG_ON(this_cpu != smp_processor_id());
- -
- -      rq = cpu_rq(this_cpu);
- -      src_rq = rq_of_rt_rq(rt_rq);
- -
- -again:
+ +      /*
+ +       * We do not need to grab the lock to check for has_pushable_tasks.
+ +       * When it gets updated, a check is made if a push is possible.
+ +       */
         if (has_pushable_tasks(rq)) {
                 raw_spin_lock(&rq->lock);
- -              push_rt_task(rq);
+ +              push_rt_tasks(rq);
                 raw_spin_unlock(&rq->lock);
         }
   
- -      /* Pass the IPI to the next rt overloaded queue */
- -      raw_spin_lock(&rt_rq->push_lock);
- -      /*
- -       * If the source queue changed since the IPI went out,
- -       * we need to restart the search from that CPU again.
- -       */
- -      if (rt_rq->push_flags & RT_PUSH_IPI_RESTART) {
- -              rt_rq->push_flags &= ~RT_PUSH_IPI_RESTART;
- -              rt_rq->push_cpu = src_rq->cpu;
- -      }
+ +      raw_spin_lock(&rq->rd->rto_lock);
   
- -      cpu = find_next_push_cpu(src_rq);
+ +      /* Pass the IPI to the next rt overloaded queue */
+ +      cpu = rto_next_cpu(rq);
   
- -      if (cpu >= nr_cpu_ids)
- -              rt_rq->push_flags &= ~RT_PUSH_IPI_EXECUTING;
- -      raw_spin_unlock(&rt_rq->push_lock);
+ +      raw_spin_unlock(&rq->rd->rto_lock);
   
- -      if (cpu >= nr_cpu_ids)
+ +      if (cpu < 0)
                 return;
   
- -      /*
- -       * It is possible that a restart caused this CPU to be
- -       * chosen again. Don't bother with an IPI, just see if we
- -       * have more to push.
- -       */
- -      if (unlikely(cpu == rq->cpu))
- -              goto again;
- -
         /* Try the next RT overloaded CPU */
- -      irq_work_queue_on(&rt_rq->push_work, cpu);
- -}
- -
- -static void push_irq_work_func(struct irq_work *work)
- -{
- -      struct rt_rq *rt_rq = container_of(work, struct rt_rq, push_work);
- -
- -      try_to_push_tasks(rt_rq);
+ +      irq_work_queue_on(&rq->rd->rto_push_work, cpu);
   }
   #endif /* HAVE_RT_PUSH_IPI */
   
diff --combined kernel/sched/sched.h

index 45ab0bf564e7abde39013518754e96c3e72f5c3e,f0b98f97884301e78451202fe1121d5bf48eaab8..b19552a212de379f8a06589249fd0d78af4482dc
--- 1/kernel/sched/sched.h
--- 2/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@@ -1,4 -1,3 +1,4 @@@
+ +/* SPDX-License-Identifier: GPL-2.0 */
   
   #include <linux/sched.h>
   #include <linux/sched/autogroup.h>
@@@ -30,6 -29,7 +30,7 @@@
   #include <linux/irq_work.h>
   #include <linux/tick.h>
   #include <linux/slab.h>
+ #include <linux/cgroup.h>
   
   #ifdef CONFIG_PARAVIRT
   #include <asm/paravirt.h>
@@@ -37,7 -37,6 +38,6 @@@
   
   #include "cpupri.h"
   #include "cpudeadline.h"
- #include "cpuacct.h"
   
   #ifdef CONFIG_SCHED_DEBUG
   # define SCHED_WARN_ON(x)     WARN_ONCE(x, #x)
@@@ -227,7 -226,7 +227,7 @@@ struct dl_bw 
   static inline void __dl_update(struct dl_bw *dl_b, s64 bw);
   
   static inline
- -void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw, int cpus)
+ +void __dl_sub(struct dl_bw *dl_b, u64 tsk_bw, int cpus)
   {
         dl_b->total_bw -= tsk_bw;
         __dl_update(dl_b, (s32)tsk_bw / cpus);
@@@ -256,6 -255,7 +256,6 @@@ extern int sched_dl_overflow(struct tas
   extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr);
   extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr);
   extern bool __checkparam_dl(const struct sched_attr *attr);
- -extern void __dl_clear_params(struct task_struct *p);
   extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr);
   extern int dl_task_can_attach(struct task_struct *p,
                               const struct cpumask *cs_cpus_allowed);
@@@ -418,7 -418,6 +418,7 @@@ struct cfs_bandwidth { }
   /* CFS-related fields in a runqueue */
   struct cfs_rq {
         struct load_weight load;
+ +      unsigned long runnable_weight;
         unsigned int nr_running, h_nr_running;
   
         u64 exec_clock;
@@@ -444,22 -443,18 +444,22 @@@
          * CFS load tracking
          */
         struct sched_avg avg;
- -      u64 runnable_load_sum;
- -      unsigned long runnable_load_avg;
- -#ifdef CONFIG_FAIR_GROUP_SCHED
- -      unsigned long tg_load_avg_contrib;
- -      unsigned long propagate_avg;
- -#endif
- -      atomic_long_t removed_load_avg, removed_util_avg;
   #ifndef CONFIG_64BIT
         u64 load_last_update_time_copy;
   #endif
+ +      struct {
+ +              raw_spinlock_t  lock ____cacheline_aligned;
+ +              int             nr;
+ +              unsigned long   load_avg;
+ +              unsigned long   util_avg;
+ +              unsigned long   runnable_sum;
+ +      } removed;
   
   #ifdef CONFIG_FAIR_GROUP_SCHED
+ +      unsigned long tg_load_avg_contrib;
+ +      long propagate;
+ +      long prop_runnable_sum;
+ +
         /*
          *   h_load = weight * f(tg)
          *
@@@ -506,7 -501,7 +506,7 @@@ static inline int rt_bandwidth_enabled(
   }
   
   /* RT IPI pull logic requires IRQ_WORK */
- -#ifdef CONFIG_IRQ_WORK
+ +#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_SMP)
   # define HAVE_RT_PUSH_IPI
   #endif
   
@@@ -528,6 -523,12 +528,6 @@@ struct rt_rq 
         unsigned long rt_nr_total;
         int overloaded;
         struct plist_head pushable_tasks;
- -#ifdef HAVE_RT_PUSH_IPI
- -      int push_flags;
- -      int push_cpu;
- -      struct irq_work push_work;
- -      raw_spinlock_t push_lock;
- -#endif
   #endif /* CONFIG_SMP */
         int rt_queued;
   
@@@ -636,19 -637,6 +636,19 @@@ struct root_domain 
         struct dl_bw dl_bw;
         struct cpudl cpudl;
   
+ +#ifdef HAVE_RT_PUSH_IPI
+ +      /*
+ +       * For IPI pull requests, loop across the rto_mask.
+ +       */
+ +      struct irq_work rto_push_work;
+ +      raw_spinlock_t rto_lock;
+ +      /* These are only updated and read within rto_lock */
+ +      int rto_loop;
+ +      int rto_cpu;
+ +      /* These atomics are updated outside of a lock */
+ +      atomic_t rto_loop_next;
+ +      atomic_t rto_loop_start;
+ +#endif
         /*
          * The "RT overload" flag: it gets set if a CPU has more than
          * one runnable RT task.
@@@ -666,9 -654,6 +666,9 @@@ extern void init_defrootdomain(void)
   extern int sched_init_domains(const struct cpumask *cpu_map);
   extern void rq_attach_root(struct rq *rq, struct root_domain *rd);
   
+ +#ifdef HAVE_RT_PUSH_IPI
+ +extern void rto_push_irq_work_func(struct irq_work *work);
+ +#endif
   #endif /* CONFIG_SMP */
   
   /*
@@@ -1233,6 -1218,8 +1233,6 @@@ static inline void __set_task_cpu(struc
   # define const_debug const
   #endif
   
- -extern const_debug unsigned int sysctl_sched_features;
- -
   #define SCHED_FEAT(name, enabled)     \
         __SCHED_FEAT_##name ,
   
@@@ -1244,13 -1231,6 +1244,13 @@@ enum 
   #undef SCHED_FEAT
   
   #if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL)
+ +
+ +/*
+ + * To support run-time toggling of sched features, all the translation units
+ + * (but core.c) reference the sysctl_sched_features defined in core.c.
+ + */
+ +extern const_debug unsigned int sysctl_sched_features;
+ +
   #define SCHED_FEAT(name, enabled)                                     \
   static __always_inline bool static_branch_##name(struct static_key *key) \
   {                                                                     \
@@@ -1258,27 -1238,13 +1258,27 @@@
   }
   
   #include "features.h"
- -
   #undef SCHED_FEAT
   
   extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];
   #define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x]))
+ +
   #else /* !(SCHED_DEBUG && HAVE_JUMP_LABEL) */
+ +
+ +/*
+ + * Each translation unit has its own copy of sysctl_sched_features to allow
+ + * constants propagation at compile time and compiler optimization based on
+ + * features default.
+ + */
+ +#define SCHED_FEAT(name, enabled)     \
+ +      (1UL << __SCHED_FEAT_##name) * enabled |
+ +static const_debug __maybe_unused unsigned int sysctl_sched_features =
+ +#include "features.h"
+ +      0;
+ +#undef SCHED_FEAT
+ +
   #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
+ +
   #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
   
   extern struct static_key_false sched_numa_balancing;
@@@ -1563,8 -1529,6 +1563,8 @@@ extern void init_sched_dl_class(void)
   extern void init_sched_rt_class(void);
   extern void init_sched_fair_class(void);
   
+ +extern void reweight_task(struct task_struct *p, int prio);
+ +
   extern void resched_curr(struct rq *rq);
   extern void resched_cpu(int cpu);
   
diff --combined kernel/sched/stop_task.c

index 45caf90b24cd9693a72b943220ecd0176580f748,ec0bb5ab9024417818dd6bc5fefae192e80dfd07..210b1f2146ff2f44b7ee1021b3a99b262857c841
--- 1/kernel/sched/stop_task.c
--- 2/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@@ -1,4 -1,3 +1,4 @@@
+ +// SPDX-License-Identifier: GPL-2.0
   #include "sched.h"
   
   /*
@@@ -72,7 -71,7 +72,7 @@@ static void put_prev_task_stop(struct r
         account_group_exec_runtime(curr, delta_exec);
   
         curr->se.exec_start = rq_clock_task(rq);
-       cpuacct_charge(curr, delta_exec);
+       cgroup_account_cputime(curr, delta_exec);
   }
   
   static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued)
author	Linus Torvalds <torvalds@linux-foundation.org>
	Wed, 15 Nov 2017 22:29:44 +0000 (14:29 -0800)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Wed, 15 Nov 2017 22:29:44 +0000 (14:29 -0800)
		1	2
MAINTAINERS	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/cgroup-defs.h	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/cgroup.h	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/sched/cputime.h	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/cgroup/Makefile	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/cgroup/cgroup-internal.h	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/cgroup/cgroup.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sched/core.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sched/cputime.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sched/deadline.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sched/fair.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sched/rt.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sched/sched.h	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sched/stop_task.c	patch \|	diff1 \|	diff2 \|	blob \| history