gcc-4.5 on). It is based on the commit "Add fuzzing coverage support"
by Dmitry Vyukov <dvyukov@google.com>.
+ config GCC_PLUGIN_LATENT_ENTROPY
+ bool "Generate some entropy during boot and runtime"
+ depends on GCC_PLUGINS
+ help
+ By saying Y here the kernel will instrument some kernel code to
+ extract some entropy from both original and artificially created
+ program state. This will help especially embedded systems where
+ there is little 'natural' source of entropy normally. The cost
+ is some slowdown of the boot process (about 0.5%) and fork and
+ irq processing.
+
+ Note that entropy extracted this way is not cryptographically
+ secure!
+
+ This plugin was ported from grsecurity/PaX. More information at:
+ * https://grsecurity.net/
+ * https://pax.grsecurity.net/
+
config HAVE_CC_STACKPROTECTOR
bool
help
endchoice
+config THIN_ARCHIVES
+ bool
+ help
+ Select this if the architecture wants to use thin archives
+ instead of ld -r to create the built-in.o files.
+
+config LD_DEAD_CODE_DATA_ELIMINATION
+ bool
+ help
+ Select this if the architecture wants to do dead code and
+ data elimination with the linker by compiling with
+ -ffunction-sections -fdata-sections and linking with
+ --gc-sections.
+
+ This requires that the arch annotates or otherwise protects
+ its external entry points from being discarded. Linker scripts
+ must also merge .text.*, .data.*, and .bss.* correctly into
+ output sections. Care must be taken not to pull in unrelated
+ sections (e.g., '.text.init'). Typically '.' in section names
+ is used to distinguish them from label names / C identifiers.
+
config HAVE_ARCH_WITHIN_STACK_FRAMES
bool
help
config CPU_NO_EFFICIENT_FFS
def_bool n
+config HAVE_ARCH_VMAP_STACK
+ def_bool n
+ help
+ An arch should select this symbol if it can support kernel stacks
+ in vmalloc space. This means:
+
+ - vmalloc space must be large enough to hold many kernel stacks.
+ This may rule out many 32-bit architectures.
+
+ - Stacks in vmalloc space need to work reliably. For example, if
+ vmap page tables are created on demand, either this mechanism
+ needs to work while the stack points to a virtual address with
+ unpopulated page tables or arch code (switch_to() and switch_mm(),
+ most likely) needs to ensure that the stack's page table entries
+ are populated before running on a possibly unpopulated stack.
+
+ - If the stack overflows into a guard page, something reasonable
+ should happen. The definition of "reasonable" is flexible, but
+ instantly rebooting without logging anything would be unfriendly.
+
+config VMAP_STACK
+ default y
+ bool "Use a virtually-mapped stack"
+ depends on HAVE_ARCH_VMAP_STACK && !KASAN
+ ---help---
+ Enable this if you want the use virtually-mapped kernel stacks
+ with guard pages. This causes kernel stack overflows to be
+ caught immediately rather than causing difficult-to-diagnose
+ corruption.
+
+ This is presently incompatible with KASAN because KASAN expects
+ the stack to map directly to the KASAN shadow map using a formula
+ that is incorrect if the stack is in vmalloc space.
+
source "kernel/gcov/Kconfig"
CFLAGS_btext.o += -fPIC
endif
+ CFLAGS_cputable.o += $(DISABLE_LATENT_ENTROPY_PLUGIN)
+ CFLAGS_init.o += $(DISABLE_LATENT_ENTROPY_PLUGIN)
+ CFLAGS_btext.o += $(DISABLE_LATENT_ENTROPY_PLUGIN)
+ CFLAGS_prom.o += $(DISABLE_LATENT_ENTROPY_PLUGIN)
+
ifdef CONFIG_FUNCTION_TRACER
# Do not trace early boot code
CFLAGS_REMOVE_cputable.o = -mno-sched-epilog $(CC_FLAGS_FTRACE)
process.o systbl.o idle.o \
signal.o sysfs.o cacheinfo.o time.o \
prom.o traps.o setup-common.o \
- udbg.o misc.o io.o dma.o \
- misc_$(CONFIG_WORD_SIZE).o \
+ udbg.o misc.o io.o dma.o misc_$(BITS).o \
of_platform.o prom_parse.o
obj-$(CONFIG_PPC64) += setup_64.o sys_ppc32.o \
signal_64.o ptrace32.o \
ifeq ($(CONFIG_FSL_BOOKE),y)
obj-$(CONFIG_HIBERNATION) += swsusp_booke.o
else
-obj-$(CONFIG_HIBERNATION) += swsusp_$(CONFIG_WORD_SIZE).o
+obj-$(CONFIG_HIBERNATION) += swsusp_$(BITS).o
endif
obj64-$(CONFIG_HIBERNATION) += swsusp_asm64.o
-obj-$(CONFIG_MODULES) += module.o module_$(CONFIG_WORD_SIZE).o
+obj-$(CONFIG_MODULES) += module.o module_$(BITS).o
obj-$(CONFIG_44x) += cpu_setup_44x.o
obj-$(CONFIG_PPC_FSL_BOOK3E) += cpu_setup_fsl_booke.o
obj-$(CONFIG_PPC_DOORBELL) += dbell.o
obj-$(CONFIG_JUMP_LABEL) += jump_label.o
-extra-y := head_$(CONFIG_WORD_SIZE).o
+extra-y := head_$(BITS).o
extra-$(CONFIG_40x) := head_40x.o
extra-$(CONFIG_44x) := head_44x.o
extra-$(CONFIG_FSL_BOOKE) := head_fsl_booke.o
extra-$(CONFIG_8xx) := head_8xx.o
extra-y += vmlinux.lds
-obj-$(CONFIG_RELOCATABLE) += reloc_$(CONFIG_WORD_SIZE).o
+obj-$(CONFIG_RELOCATABLE) += reloc_$(BITS).o
obj-$(CONFIG_PPC32) += entry_32.o setup_32.o
obj-$(CONFIG_PPC64) += dma-iommu.o iommu.o
obj-$(CONFIG_KGDB) += kgdb.o
-obj-$(CONFIG_MODULES) += ppc_ksyms.o
-ifeq ($(CONFIG_PPC32),y)
-obj-$(CONFIG_MODULES) += ppc_ksyms_32.o
-endif
obj-$(CONFIG_BOOTX_TEXT) += btext.o
obj-$(CONFIG_SMP) += smp.o
obj-$(CONFIG_KPROBES) += kprobes.o
obj-$(CONFIG_SWIOTLB) += dma-swiotlb.o
pci64-$(CONFIG_PPC64) += pci_dn.o pci-hotplug.o isa-bridge.o
-obj-$(CONFIG_PCI) += pci_$(CONFIG_WORD_SIZE).o $(pci64-y) \
+obj-$(CONFIG_PCI) += pci_$(BITS).o $(pci64-y) \
pci-common.o pci_of_scan.o
obj-$(CONFIG_PCI_MSI) += msi.o
obj-$(CONFIG_KEXEC) += machine_kexec.o crash.o \
- machine_kexec_$(CONFIG_WORD_SIZE).o
+ machine_kexec_$(BITS).o
obj-$(CONFIG_AUDIT) += audit.o
obj64-$(CONFIG_AUDIT) += compat_audit.o
* Softirq action handler - move entries to local list and loop over them
* while passing them to the queue registered handler.
*/
- static void blk_done_softirq(struct softirq_action *h)
+ static __latent_entropy void blk_done_softirq(struct softirq_action *h)
{
struct list_head *cpu_list, local_list;
}
#endif
-static int blk_cpu_notify(struct notifier_block *self, unsigned long action,
- void *hcpu)
+static int blk_softirq_cpu_dead(unsigned int cpu)
{
/*
* If a CPU goes away, splice its entries to the current CPU
* and trigger a run of the softirq
*/
- if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
- int cpu = (unsigned long) hcpu;
-
- local_irq_disable();
- list_splice_init(&per_cpu(blk_cpu_done, cpu),
- this_cpu_ptr(&blk_cpu_done));
- raise_softirq_irqoff(BLOCK_SOFTIRQ);
- local_irq_enable();
- }
+ local_irq_disable();
+ list_splice_init(&per_cpu(blk_cpu_done, cpu),
+ this_cpu_ptr(&blk_cpu_done));
+ raise_softirq_irqoff(BLOCK_SOFTIRQ);
+ local_irq_enable();
- return NOTIFY_OK;
+ return 0;
}
-static struct notifier_block blk_cpu_notifier = {
- .notifier_call = blk_cpu_notify,
-};
-
void __blk_complete_request(struct request *req)
{
int ccpu, cpu;
INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i));
open_softirq(BLOCK_SOFTIRQ, blk_done_softirq);
- register_hotcpu_notifier(&blk_cpu_notifier);
+ cpuhp_setup_state_nocalls(CPUHP_BLOCK_SOFTIRQ_DEAD,
+ "block/softirq:dead", NULL,
+ blk_softirq_cpu_dead);
return 0;
}
subsys_initcall(blk_softirq_init);
static void crng_reseed(struct crng_state *crng, struct entropy_store *r);
static void push_to_pool(struct work_struct *work);
- static __u32 input_pool_data[INPUT_POOL_WORDS];
- static __u32 blocking_pool_data[OUTPUT_POOL_WORDS];
+ static __u32 input_pool_data[INPUT_POOL_WORDS] __latent_entropy;
+ static __u32 blocking_pool_data[OUTPUT_POOL_WORDS] __latent_entropy;
static struct entropy_store input_pool = {
.poolinfo = &poolinfo_table[0],
}
EXPORT_SYMBOL(get_random_long);
-/*
- * randomize_range() returns a start address such that
+/**
+ * randomize_page - Generate a random, page aligned address
+ * @start: The smallest acceptable address the caller will take.
+ * @range: The size of the area, starting at @start, within which the
+ * random address must fall.
+ *
+ * If @start + @range would overflow, @range is capped.
*
- * [...... <range> .....]
- * start end
+ * NOTE: Historical use of randomize_range, which this replaces, presumed that
+ * @start was already page aligned. We now align it regardless.
*
- * a <range> with size "len" starting at the return value is inside in the
- * area defined by [start, end], but is otherwise randomized.
+ * Return: A page aligned address within [start, start + range). On error,
+ * @start is returned.
*/
unsigned long
-randomize_range(unsigned long start, unsigned long end, unsigned long len)
+randomize_page(unsigned long start, unsigned long range)
{
- unsigned long range = end - len - start;
+ if (!PAGE_ALIGNED(start)) {
+ range -= PAGE_ALIGN(start) - start;
+ start = PAGE_ALIGN(start);
+ }
- if (end <= start + len)
- return 0;
- return PAGE_ALIGN(get_random_int() % range + start);
+ if (start > ULONG_MAX - range)
+ range = ULONG_MAX - start;
+
+ range >>= PAGE_SHIFT;
+
+ if (range == 0)
+ return start;
+
+ return start + (get_random_long() % range << PAGE_SHIFT);
}
/* Interface for in-kernel drivers of true hardware RNGs.
#include "pnode.h"
#include "internal.h"
+/* Maximum number of mounts in a mount namespace */
+unsigned int sysctl_mount_max __read_mostly = 100000;
+
static unsigned int m_hash_mask __read_mostly;
static unsigned int m_hash_shift __read_mostly;
static unsigned int mp_hash_mask __read_mostly;
list_splice(&head, n->list.prev);
+ n->mounts += n->pending_mounts;
+ n->pending_mounts = 0;
+
attach_shadowed(mnt, parent, shadows);
touch_mnt_namespace(n);
}
propagate_umount(&tmp_list);
while (!list_empty(&tmp_list)) {
+ struct mnt_namespace *ns;
bool disconnect;
p = list_first_entry(&tmp_list, struct mount, mnt_list);
list_del_init(&p->mnt_expire);
list_del_init(&p->mnt_list);
- __touch_mnt_namespace(p->mnt_ns);
+ ns = p->mnt_ns;
+ if (ns) {
+ ns->mounts--;
+ __touch_mnt_namespace(ns);
+ }
p->mnt_ns = NULL;
if (how & UMOUNT_SYNC)
p->mnt.mnt_flags |= MNT_SYNC_UMOUNT;
return 0;
}
+int count_mounts(struct mnt_namespace *ns, struct mount *mnt)
+{
+ unsigned int max = READ_ONCE(sysctl_mount_max);
+ unsigned int mounts = 0, old, pending, sum;
+ struct mount *p;
+
+ for (p = mnt; p; p = next_mnt(p, mnt))
+ mounts++;
+
+ old = ns->mounts;
+ pending = ns->pending_mounts;
+ sum = old + pending;
+ if ((old > sum) ||
+ (pending > sum) ||
+ (max < sum) ||
+ (mounts > (max - sum)))
+ return -ENOSPC;
+
+ ns->pending_mounts = pending + mounts;
+ return 0;
+}
+
/*
* @source_mnt : mount tree to be attached
* @nd : place the mount tree @source_mnt is attached
struct path *parent_path)
{
HLIST_HEAD(tree_list);
+ struct mnt_namespace *ns = dest_mnt->mnt_ns;
struct mount *child, *p;
struct hlist_node *n;
int err;
+ /* Is there space to add these mounts to the mount namespace? */
+ if (!parent_path) {
+ err = count_mounts(ns, source_mnt);
+ if (err)
+ goto out;
+ }
+
if (IS_MNT_SHARED(dest_mnt)) {
err = invent_group_ids(source_mnt, true);
if (err)
out_cleanup_ids:
while (!hlist_empty(&tree_list)) {
child = hlist_entry(tree_list.first, struct mount, mnt_hash);
+ child->mnt_parent->mnt_ns->pending_mounts = 0;
umount_tree(child, UMOUNT_SYNC);
}
unlock_mount_hash();
cleanup_group_ids(source_mnt, NULL);
out:
+ ns->pending_mounts = 0;
return err;
}
flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN |
MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT |
- MS_STRICTATIME);
+ MS_STRICTATIME | MS_NOREMOTELOCK);
if (flags & MS_REMOUNT)
retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags,
return retval;
}
+static struct ucounts *inc_mnt_namespaces(struct user_namespace *ns)
+{
+ return inc_ucount(ns, current_euid(), UCOUNT_MNT_NAMESPACES);
+}
+
+static void dec_mnt_namespaces(struct ucounts *ucounts)
+{
+ dec_ucount(ucounts, UCOUNT_MNT_NAMESPACES);
+}
+
static void free_mnt_ns(struct mnt_namespace *ns)
{
ns_free_inum(&ns->ns);
+ dec_mnt_namespaces(ns->ucounts);
put_user_ns(ns->user_ns);
kfree(ns);
}
static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns)
{
struct mnt_namespace *new_ns;
+ struct ucounts *ucounts;
int ret;
+ ucounts = inc_mnt_namespaces(user_ns);
+ if (!ucounts)
+ return ERR_PTR(-ENOSPC);
+
new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL);
- if (!new_ns)
+ if (!new_ns) {
+ dec_mnt_namespaces(ucounts);
return ERR_PTR(-ENOMEM);
+ }
ret = ns_alloc_inum(&new_ns->ns);
if (ret) {
kfree(new_ns);
+ dec_mnt_namespaces(ucounts);
return ERR_PTR(ret);
}
new_ns->ns.ops = &mntns_operations;
init_waitqueue_head(&new_ns->poll);
new_ns->event = 0;
new_ns->user_ns = get_user_ns(user_ns);
+ new_ns->ucounts = ucounts;
+ new_ns->mounts = 0;
+ new_ns->pending_mounts = 0;
return new_ns;
}
+ __latent_entropy
struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
struct user_namespace *user_ns, struct fs_struct *new_fs)
{
q = new;
while (p) {
q->mnt_ns = new_ns;
+ new_ns->mounts++;
if (new_fs) {
if (&p->mnt == new_fs->root.mnt) {
new_fs->root.mnt = mntget(&q->mnt);
struct mount *mnt = real_mount(m);
mnt->mnt_ns = new_ns;
new_ns->root = mnt;
+ new_ns->mounts++;
list_add(&mnt->mnt_list, &new_ns->list);
} else {
mntput(m);
return 0;
}
+static struct user_namespace *mntns_owner(struct ns_common *ns)
+{
+ return to_mnt_ns(ns)->user_ns;
+}
+
const struct proc_ns_operations mntns_operations = {
.name = "mnt",
.type = CLONE_NEWNS,
.get = mntns_get,
.put = mntns_put,
.install = mntns_install,
+ .owner = mntns_owner,
};
# define unreachable() do { } while (1)
#endif
+/*
+ * KENTRY - kernel entry point
+ * This can be used to annotate symbols (functions or data) that are used
+ * without their linker symbol being referenced explicitly. For example,
+ * interrupt vector handlers, or functions in the kernel image that are found
+ * programatically.
+ *
+ * Not required for symbols exported with EXPORT_SYMBOL, or initcalls. Those
+ * are handled in their own way (with KEEP() in linker scripts).
+ *
+ * KENTRY can be avoided if the symbols in question are marked as KEEP() in the
+ * linker script. For example an architecture could KEEP() its entire
+ * boot/exception vector code rather than annotate each function and data.
+ */
+#ifndef KENTRY
+# define KENTRY(sym) \
+ extern typeof(sym) sym; \
+ static const unsigned long __kentry_##sym \
+ __used \
+ __attribute__((section("___kentry" "+" #sym ), used)) \
+ = (unsigned long)&sym;
+#endif
+
#ifndef RELOC_HIDE
# define RELOC_HIDE(ptr, off) \
({ unsigned long __ptr; \
# define __attribute_const__ /* unimplemented */
#endif
+ #ifndef __latent_entropy
+ # define __latent_entropy
+ #endif
+
/*
* Tell gcc if a function is cold. The compiler will assume any path
* directly leading to the call is unlikely.
struct rcu_head rcu;
};
-static inline bool close_on_exec(int fd, const struct fdtable *fdt)
+static inline bool close_on_exec(unsigned int fd, const struct fdtable *fdt)
{
return test_bit(fd, fdt->close_on_exec);
}
-static inline bool fd_is_open(int fd, const struct fdtable *fdt)
+static inline bool fd_is_open(unsigned int fd, const struct fdtable *fdt)
{
return test_bit(fd, fdt->open_fds);
}
* written part on a separate cache line in SMP
*/
spinlock_t file_lock ____cacheline_aligned_in_smp;
- int next_fd;
+ unsigned int next_fd;
unsigned long close_on_exec_init[1];
unsigned long open_fds_init[1];
unsigned long full_fds_bits_init[1];
void put_files_struct(struct files_struct *fs);
void reset_files_struct(struct files_struct *);
int unshare_files(struct files_struct **);
- struct files_struct *dup_fd(struct files_struct *, int *);
+ struct files_struct *dup_fd(struct files_struct *, int *) __latent_entropy;
void do_close_on_exec(struct files_struct *);
int iterate_fd(struct files_struct *, unsigned,
int (*)(const void *, struct file *, unsigned),
/* These are for everybody (although not all archs will actually
discard it in modules) */
- #define __init __section(.init.text) __cold notrace
+ #define __init __section(.init.text) __cold notrace __latent_entropy
#define __initdata __section(.init.data)
-#define __initconst __constsection(.init.rodata)
+#define __initconst __section(.init.rodata)
#define __exitdata __section(.exit.data)
#define __exit_call __used __section(.exitcall.exit)
-/*
- * Some architecture have tool chains which do not handle rodata attributes
- * correctly. For those disable special sections for const, so that other
- * architectures can annotate correctly.
- */
-#ifdef CONFIG_BROKEN_RODATA
-#define __constsection(x)
-#else
-#define __constsection(x) __section(x)
-#endif
-
/*
* modpost check for section mismatches during the kernel build.
* A section mismatch happens when there are references from a
*/
#define __ref __section(.ref.text) noinline
#define __refdata __section(.ref.data)
-#define __refconst __constsection(.ref.rodata)
+#define __refconst __section(.ref.rodata)
#ifdef MODULE
#define __exitused
#define __exit __section(.exit.text) __exitused __cold notrace
/* Used for MEMORY_HOTPLUG */
- #define __meminit __section(.meminit.text) __cold notrace
+ #define __meminit __section(.meminit.text) __cold notrace \
+ __latent_entropy
#define __meminitdata __section(.meminit.data)
-#define __meminitconst __constsection(.meminit.rodata)
+#define __meminitconst __section(.meminit.rodata)
#define __memexit __section(.memexit.text) __exitused __cold notrace
#define __memexitdata __section(.memexit.data)
-#define __memexitconst __constsection(.memexit.rodata)
+#define __memexitconst __section(.memexit.rodata)
/* For assembly routines */
#define __HEAD .section ".head.text","ax"
#ifndef __ASSEMBLY__
-#ifdef CONFIG_LTO
-/* Work around a LTO gcc problem: when there is no reference to a variable
- * in a module it will be moved to the end of the program. This causes
- * reordering of initcalls which the kernel does not like.
- * Add a dummy reference function to avoid this. The function is
- * deleted by the linker.
- */
-#define LTO_REFERENCE_INITCALL(x) \
- ; /* yes this is needed */ \
- static __used __exit void *reference_##x(void) \
- { \
- return &x; \
- }
-#else
-#define LTO_REFERENCE_INITCALL(x)
-#endif
-
-/* initcalls are now grouped by functionality into separate
+/*
+ * initcalls are now grouped by functionality into separate
* subsections. Ordering inside the subsections is determined
* by link order.
* For backwards compatibility, initcall() puts the call in
*
* The `id' arg to __define_initcall() is needed so that multiple initcalls
* can point at the same handler without causing duplicate-symbol build errors.
+ *
+ * Initcalls are run by placing pointers in initcall sections that the
+ * kernel iterates at runtime. The linker can do dead code / data elimination
+ * and remove that completely, so the initcall sections have to be marked
+ * as KEEP() in the linker script.
*/
#define __define_initcall(fn, id) \
static initcall_t __initcall_##fn##id __used \
- __attribute__((__section__(".initcall" #id ".init"))) = fn; \
- LTO_REFERENCE_INITCALL(__initcall_##fn##id)
+ __attribute__((__section__(".initcall" #id ".init"))) = fn;
/*
* Early initcalls run before initializing SMP.
#define __initcall(fn) device_initcall(fn)
-#define __exitcall(fn) \
+#define __exitcall(fn) \
static exitcall_t __exitcall_##fn __exit_call = fn
-#define console_initcall(fn) \
- static initcall_t __initcall_##fn \
+#define console_initcall(fn) \
+ static initcall_t __initcall_##fn \
__used __section(.con_initcall.init) = fn
-#define security_initcall(fn) \
- static initcall_t __initcall_##fn \
+#define security_initcall(fn) \
+ static initcall_t __initcall_##fn \
__used __section(.security_initcall.init) = fn
struct obs_kernel_param {
};
extern void add_device_randomness(const void *, unsigned int);
+
+ #if defined(CONFIG_GCC_PLUGIN_LATENT_ENTROPY) && !defined(__CHECKER__)
+ static inline void add_latent_entropy(void)
+ {
+ add_device_randomness((const void *)&latent_entropy,
+ sizeof(latent_entropy));
+ }
+ #else
+ static inline void add_latent_entropy(void) {}
+ #endif
+
extern void add_input_randomness(unsigned int type, unsigned int code,
- unsigned int value);
- extern void add_interrupt_randomness(int irq, int irq_flags);
+ unsigned int value) __latent_entropy;
+ extern void add_interrupt_randomness(int irq, int irq_flags) __latent_entropy;
extern void get_random_bytes(void *buf, int nbytes);
extern int add_random_ready_callback(struct random_ready_callback *rdy);
unsigned int get_random_int(void);
unsigned long get_random_long(void);
-unsigned long randomize_range(unsigned long start, unsigned long end, unsigned long len);
+unsigned long randomize_page(unsigned long start, unsigned long range);
u32 prandom_u32(void);
void prandom_bytes(void *buf, size_t nbytes);
* Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
* kmemcache based allocator.
*/
-# if THREAD_SIZE >= PAGE_SIZE
-static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
- int node)
+# if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)
+
+#ifdef CONFIG_VMAP_STACK
+/*
+ * vmalloc() is a bit slow, and calling vfree() enough times will force a TLB
+ * flush. Try to minimize the number of calls by caching stacks.
+ */
+#define NR_CACHED_STACKS 2
+static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]);
+#endif
+
+static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
{
+#ifdef CONFIG_VMAP_STACK
+ void *stack;
+ int i;
+
+ local_irq_disable();
+ for (i = 0; i < NR_CACHED_STACKS; i++) {
+ struct vm_struct *s = this_cpu_read(cached_stacks[i]);
+
+ if (!s)
+ continue;
+ this_cpu_write(cached_stacks[i], NULL);
+
+ tsk->stack_vm_area = s;
+ local_irq_enable();
+ return s->addr;
+ }
+ local_irq_enable();
+
+ stack = __vmalloc_node_range(THREAD_SIZE, THREAD_SIZE,
+ VMALLOC_START, VMALLOC_END,
+ THREADINFO_GFP | __GFP_HIGHMEM,
+ PAGE_KERNEL,
+ 0, node, __builtin_return_address(0));
+
+ /*
+ * We can't call find_vm_area() in interrupt context, and
+ * free_thread_stack() can be called in interrupt context,
+ * so cache the vm_struct.
+ */
+ if (stack)
+ tsk->stack_vm_area = find_vm_area(stack);
+ return stack;
+#else
struct page *page = alloc_pages_node(node, THREADINFO_GFP,
THREAD_SIZE_ORDER);
return page ? page_address(page) : NULL;
+#endif
}
-static inline void free_thread_stack(unsigned long *stack)
+static inline void free_thread_stack(struct task_struct *tsk)
{
- __free_pages(virt_to_page(stack), THREAD_SIZE_ORDER);
+#ifdef CONFIG_VMAP_STACK
+ if (task_stack_vm_area(tsk)) {
+ unsigned long flags;
+ int i;
+
+ local_irq_save(flags);
+ for (i = 0; i < NR_CACHED_STACKS; i++) {
+ if (this_cpu_read(cached_stacks[i]))
+ continue;
+
+ this_cpu_write(cached_stacks[i], tsk->stack_vm_area);
+ local_irq_restore(flags);
+ return;
+ }
+ local_irq_restore(flags);
+
+ vfree(tsk->stack);
+ return;
+ }
+#endif
+
+ __free_pages(virt_to_page(tsk->stack), THREAD_SIZE_ORDER);
}
# else
static struct kmem_cache *thread_stack_cache;
return kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node);
}
-static void free_thread_stack(unsigned long *stack)
+static void free_thread_stack(struct task_struct *tsk)
{
- kmem_cache_free(thread_stack_cache, stack);
+ kmem_cache_free(thread_stack_cache, tsk->stack);
}
void thread_stack_cache_init(void)
/* SLAB cache for mm_struct structures (tsk->mm) */
static struct kmem_cache *mm_cachep;
-static void account_kernel_stack(unsigned long *stack, int account)
+static void account_kernel_stack(struct task_struct *tsk, int account)
{
- /* All stack pages are in the same zone and belong to the same memcg. */
- struct page *first_page = virt_to_page(stack);
+ void *stack = task_stack_page(tsk);
+ struct vm_struct *vm = task_stack_vm_area(tsk);
+
+ BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0);
+
+ if (vm) {
+ int i;
+
+ BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);
+
+ for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
+ mod_zone_page_state(page_zone(vm->pages[i]),
+ NR_KERNEL_STACK_KB,
+ PAGE_SIZE / 1024 * account);
+ }
+
+ /* All stack pages belong to the same memcg. */
+ memcg_kmem_update_page_stat(vm->pages[0], MEMCG_KERNEL_STACK_KB,
+ account * (THREAD_SIZE / 1024));
+ } else {
+ /*
+ * All stack pages are in the same zone and belong to the
+ * same memcg.
+ */
+ struct page *first_page = virt_to_page(stack);
- mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB,
- THREAD_SIZE / 1024 * account);
+ mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB,
+ THREAD_SIZE / 1024 * account);
- memcg_kmem_update_page_stat(
- first_page, MEMCG_KERNEL_STACK_KB,
- account * (THREAD_SIZE / 1024));
+ memcg_kmem_update_page_stat(first_page, MEMCG_KERNEL_STACK_KB,
+ account * (THREAD_SIZE / 1024));
+ }
}
-void free_task(struct task_struct *tsk)
+static void release_task_stack(struct task_struct *tsk)
{
- account_kernel_stack(tsk->stack, -1);
+ account_kernel_stack(tsk, -1);
arch_release_thread_stack(tsk->stack);
- free_thread_stack(tsk->stack);
+ free_thread_stack(tsk);
+ tsk->stack = NULL;
+#ifdef CONFIG_VMAP_STACK
+ tsk->stack_vm_area = NULL;
+#endif
+}
+
+#ifdef CONFIG_THREAD_INFO_IN_TASK
+void put_task_stack(struct task_struct *tsk)
+{
+ if (atomic_dec_and_test(&tsk->stack_refcount))
+ release_task_stack(tsk);
+}
+#endif
+
+void free_task(struct task_struct *tsk)
+{
+#ifndef CONFIG_THREAD_INFO_IN_TASK
+ /*
+ * The task is finally done with both the stack and thread_info,
+ * so free both.
+ */
+ release_task_stack(tsk);
+#else
+ /*
+ * If the task had a separate stack allocation, it should be gone
+ * by now.
+ */
+ WARN_ON_ONCE(atomic_read(&tsk->stack_refcount) != 0);
+#endif
rt_mutex_debug_task_free(tsk);
ftrace_graph_exit_task(tsk);
put_seccomp_filter(tsk);
{
taskstats_tgid_free(sig);
sched_autogroup_exit(sig);
+ /*
+ * __mmdrop is not safe to call from softirq context on x86 due to
+ * pgd_dtor so postpone it to the async context
+ */
+ if (sig->oom_mm)
+ mmdrop_async(sig->oom_mm);
kmem_cache_free(signal_cachep, sig);
}
void __init fork_init(void)
{
+ int i;
#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
#ifndef ARCH_MIN_TASKALIGN
#define ARCH_MIN_TASKALIGN L1_CACHE_BYTES
init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
init_task.signal->rlim[RLIMIT_SIGPENDING] =
init_task.signal->rlim[RLIMIT_NPROC];
+
+ for (i = 0; i < UCOUNT_COUNTS; i++) {
+ init_user_ns.ucount_max[i] = max_threads/2;
+ }
}
int __weak arch_dup_task_struct(struct task_struct *dst,
{
struct task_struct *tsk;
unsigned long *stack;
+ struct vm_struct *stack_vm_area;
int err;
if (node == NUMA_NO_NODE)
if (!stack)
goto free_tsk;
+ stack_vm_area = task_stack_vm_area(tsk);
+
err = arch_dup_task_struct(tsk, orig);
+
+ /*
+ * arch_dup_task_struct() clobbers the stack-related fields. Make
+ * sure they're properly initialized before using any stack-related
+ * functions again.
+ */
+ tsk->stack = stack;
+#ifdef CONFIG_VMAP_STACK
+ tsk->stack_vm_area = stack_vm_area;
+#endif
+#ifdef CONFIG_THREAD_INFO_IN_TASK
+ atomic_set(&tsk->stack_refcount, 1);
+#endif
+
if (err)
goto free_stack;
- tsk->stack = stack;
#ifdef CONFIG_SECCOMP
/*
* We must handle setting up seccomp filters once we're under
tsk->task_frag.page = NULL;
tsk->wake_q.next = NULL;
- account_kernel_stack(stack, 1);
+ account_kernel_stack(tsk, 1);
kcov_task_init(tsk);
return tsk;
free_stack:
- free_thread_stack(stack);
+ free_thread_stack(tsk);
free_tsk:
free_task_struct(tsk);
return NULL;
}
#ifdef CONFIG_MMU
- static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
+ static __latent_entropy int dup_mmap(struct mm_struct *mm,
+ struct mm_struct *oldmm)
{
struct vm_area_struct *mpnt, *tmp, *prev, **pprev;
struct rb_node **rb_link, *rb_parent;
ksm_exit(mm);
khugepaged_exit(mm); /* must run before exit_mmap */
exit_mmap(mm);
+ mm_put_huge_zero_page(mm);
set_mm_exe_file(mm, NULL);
if (!list_empty(&mm->mmlist)) {
spin_lock(&mmlist_lock);
}
if (mm->binfmt)
module_put(mm->binfmt->module);
+ set_bit(MMF_OOM_SKIP, &mm->flags);
mmdrop(mm);
}
* parts of the process environment (as per the clone
* flags). The actual kick-off is left to the caller.
*/
- static struct task_struct *copy_process(unsigned long clone_flags,
+ static __latent_entropy struct task_struct *copy_process(
+ unsigned long clone_flags,
unsigned long stack_start,
unsigned long stack_size,
int __user *child_tidptr,
atomic_dec(&p->cred->user->processes);
exit_creds(p);
bad_fork_free:
+ put_task_stack(p);
free_task(p);
fork_out:
return ERR_PTR(retval);
p = copy_process(clone_flags, stack_start, stack_size,
child_tidptr, NULL, trace, tls, NUMA_NO_NODE);
+ add_latent_entropy();
/*
* Do this prior waking up the new thread - the thread pointer
* might get invalid after that point, if the thread exits quickly.
#include <linux/export.h>
#include <linux/completion.h>
#include <linux/moduleparam.h>
-#include <linux/module.h>
#include <linux/percpu.h>
#include <linux/notifier.h>
#include <linux/cpu.h>
#include "tree.h"
#include "rcu.h"
-MODULE_ALIAS("rcutree");
#ifdef MODULE_PARAM_PREFIX
#undef MODULE_PARAM_PREFIX
#endif
struct rcu_data *rdp)
{
bool ret;
+ bool need_gp;
/* Handle the ends of any preceding grace periods first. */
if (rdp->completed == rnp->completed &&
*/
rdp->gpnum = rnp->gpnum;
trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart"));
- rdp->cpu_no_qs.b.norm = true;
+ need_gp = !!(rnp->qsmask & rdp->grpmask);
+ rdp->cpu_no_qs.b.norm = need_gp;
rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr);
- rdp->core_needs_qs = !!(rnp->qsmask & rdp->grpmask);
+ rdp->core_needs_qs = need_gp;
zero_cpu_stall_ticks(rdp);
WRITE_ONCE(rdp->gpwrap, false);
}
WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS);
raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(rsp), flags);
- swake_up(&rsp->gp_wq); /* Memory barrier implied by swake_up() path. */
+ rcu_gp_kthread_wake(rsp);
}
/*
}
WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS);
raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags);
- swake_up(&rsp->gp_wq); /* Memory barrier implied by swake_up() path. */
+ rcu_gp_kthread_wake(rsp);
}
/*
/*
* Do RCU core processing for the current CPU.
*/
- static void rcu_process_callbacks(struct softirq_action *unused)
+ static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused)
{
struct rcu_state *rsp;
rnp = rdp->mynode;
mask = rdp->grpmask;
raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
- rnp->qsmaskinitnext |= mask;
- rnp->expmaskinitnext |= mask;
if (!rdp->beenonline)
WRITE_ONCE(rsp->ncpus, READ_ONCE(rsp->ncpus) + 1);
rdp->beenonline = true; /* We have now been online. */
return 0;
}
+/*
+ * Mark the specified CPU as being online so that subsequent grace periods
+ * (both expedited and normal) will wait on it. Note that this means that
+ * incoming CPUs are not allowed to use RCU read-side critical sections
+ * until this function is called. Failing to observe this restriction
+ * will result in lockdep splats.
+ */
+void rcu_cpu_starting(unsigned int cpu)
+{
+ unsigned long flags;
+ unsigned long mask;
+ struct rcu_data *rdp;
+ struct rcu_node *rnp;
+ struct rcu_state *rsp;
+
+ for_each_rcu_flavor(rsp) {
+ rdp = this_cpu_ptr(rsp->rda);
+ rnp = rdp->mynode;
+ mask = rdp->grpmask;
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ rnp->qsmaskinitnext |= mask;
+ rnp->expmaskinitnext |= mask;
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ }
+}
+
#ifdef CONFIG_HOTPLUG_CPU
/*
* The CPU is exiting the idle loop into the arch_cpu_idle_dead()
* or the scheduler are operational.
*/
pm_notifier(rcu_pm_notify, 0);
- for_each_online_cpu(cpu)
+ for_each_online_cpu(cpu) {
rcutree_prepare_cpu(cpu);
+ rcu_cpu_starting(cpu);
+ }
}
#include "tree_exp.h"
unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
#endif
+/*
+ * The margin used when comparing utilization with CPU capacity:
+ * util * 1024 < capacity * margin
+ */
+unsigned int capacity_margin = 1280; /* ~20% */
+
static inline void update_load_add(struct load_weight *lw, unsigned long inc)
{
lw->weight += inc;
static inline struct task_struct *task_of(struct sched_entity *se)
{
-#ifdef CONFIG_SCHED_DEBUG
- WARN_ON_ONCE(!entity_is_task(se));
-#endif
+ SCHED_WARN_ON(!entity_is_task(se));
return container_of(se, struct task_struct, se);
}
static void update_min_vruntime(struct cfs_rq *cfs_rq)
{
+ struct sched_entity *curr = cfs_rq->curr;
+
u64 vruntime = cfs_rq->min_vruntime;
- if (cfs_rq->curr)
- vruntime = cfs_rq->curr->vruntime;
+ if (curr) {
+ if (curr->on_rq)
+ vruntime = curr->vruntime;
+ else
+ curr = NULL;
+ }
if (cfs_rq->rb_leftmost) {
struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
struct sched_entity,
run_node);
- if (!cfs_rq->curr)
+ if (!curr)
vruntime = se->vruntime;
else
vruntime = min_vruntime(vruntime, se->vruntime);
}
#ifdef CONFIG_SMP
-static int select_idle_sibling(struct task_struct *p, int cpu);
+static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
static unsigned long task_h_load(struct task_struct *p);
/*
struct sched_avg *sa = &se->avg;
long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2;
u64 now = cfs_rq_clock_task(cfs_rq);
- int tg_update;
if (cap > 0) {
if (cfs_rq->avg.util_avg != 0) {
}
}
- tg_update = update_cfs_rq_load_avg(now, cfs_rq, false);
+ update_cfs_rq_load_avg(now, cfs_rq, false);
attach_entity_load_avg(cfs_rq, se);
- if (tg_update)
- update_tg_load_avg(cfs_rq, false);
+ update_tg_load_avg(cfs_rq, false);
}
#else /* !CONFIG_SMP */
max(delta_exec, curr->statistics.exec_max));
curr->sum_exec_runtime += delta_exec;
- schedstat_add(cfs_rq, exec_clock, delta_exec);
+ schedstat_add(cfs_rq->exec_clock, delta_exec);
curr->vruntime += calc_delta_fair(delta_exec, curr);
update_min_vruntime(cfs_rq);
update_curr(cfs_rq_of(&rq->curr->se));
}
-#ifdef CONFIG_SCHEDSTATS
static inline void
update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- u64 wait_start = rq_clock(rq_of(cfs_rq));
+ u64 wait_start, prev_wait_start;
+
+ if (!schedstat_enabled())
+ return;
+
+ wait_start = rq_clock(rq_of(cfs_rq));
+ prev_wait_start = schedstat_val(se->statistics.wait_start);
if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) &&
- likely(wait_start > se->statistics.wait_start))
- wait_start -= se->statistics.wait_start;
+ likely(wait_start > prev_wait_start))
+ wait_start -= prev_wait_start;
- se->statistics.wait_start = wait_start;
+ schedstat_set(se->statistics.wait_start, wait_start);
}
-static void
+static inline void
update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
struct task_struct *p;
u64 delta;
- delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start;
+ if (!schedstat_enabled())
+ return;
+
+ delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(se->statistics.wait_start);
if (entity_is_task(se)) {
p = task_of(se);
* time stamp can be adjusted to accumulate wait time
* prior to migration.
*/
- se->statistics.wait_start = delta;
+ schedstat_set(se->statistics.wait_start, delta);
return;
}
trace_sched_stat_wait(p, delta);
}
- se->statistics.wait_max = max(se->statistics.wait_max, delta);
- se->statistics.wait_count++;
- se->statistics.wait_sum += delta;
- se->statistics.wait_start = 0;
+ schedstat_set(se->statistics.wait_max,
+ max(schedstat_val(se->statistics.wait_max), delta));
+ schedstat_inc(se->statistics.wait_count);
+ schedstat_add(se->statistics.wait_sum, delta);
+ schedstat_set(se->statistics.wait_start, 0);
+}
+
+static inline void
+update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ struct task_struct *tsk = NULL;
+ u64 sleep_start, block_start;
+
+ if (!schedstat_enabled())
+ return;
+
+ sleep_start = schedstat_val(se->statistics.sleep_start);
+ block_start = schedstat_val(se->statistics.block_start);
+
+ if (entity_is_task(se))
+ tsk = task_of(se);
+
+ if (sleep_start) {
+ u64 delta = rq_clock(rq_of(cfs_rq)) - sleep_start;
+
+ if ((s64)delta < 0)
+ delta = 0;
+
+ if (unlikely(delta > schedstat_val(se->statistics.sleep_max)))
+ schedstat_set(se->statistics.sleep_max, delta);
+
+ schedstat_set(se->statistics.sleep_start, 0);
+ schedstat_add(se->statistics.sum_sleep_runtime, delta);
+
+ if (tsk) {
+ account_scheduler_latency(tsk, delta >> 10, 1);
+ trace_sched_stat_sleep(tsk, delta);
+ }
+ }
+ if (block_start) {
+ u64 delta = rq_clock(rq_of(cfs_rq)) - block_start;
+
+ if ((s64)delta < 0)
+ delta = 0;
+
+ if (unlikely(delta > schedstat_val(se->statistics.block_max)))
+ schedstat_set(se->statistics.block_max, delta);
+
+ schedstat_set(se->statistics.block_start, 0);
+ schedstat_add(se->statistics.sum_sleep_runtime, delta);
+
+ if (tsk) {
+ if (tsk->in_iowait) {
+ schedstat_add(se->statistics.iowait_sum, delta);
+ schedstat_inc(se->statistics.iowait_count);
+ trace_sched_stat_iowait(tsk, delta);
+ }
+
+ trace_sched_stat_blocked(tsk, delta);
+
+ /*
+ * Blocking time is in units of nanosecs, so shift by
+ * 20 to get a milliseconds-range estimation of the
+ * amount of time that the task spent sleeping:
+ */
+ if (unlikely(prof_on == SLEEP_PROFILING)) {
+ profile_hits(SLEEP_PROFILING,
+ (void *)get_wchan(tsk),
+ delta >> 20);
+ }
+ account_scheduler_latency(tsk, delta >> 10, 0);
+ }
+ }
}
/*
* Task is being enqueued - update stats:
*/
static inline void
-update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
+update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
+ if (!schedstat_enabled())
+ return;
+
/*
* Are we enqueueing a waiting task? (for current tasks
* a dequeue/enqueue event is a NOP)
*/
if (se != cfs_rq->curr)
update_stats_wait_start(cfs_rq, se);
+
+ if (flags & ENQUEUE_WAKEUP)
+ update_stats_enqueue_sleeper(cfs_rq, se);
}
static inline void
update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
+
+ if (!schedstat_enabled())
+ return;
+
/*
* Mark the end of the wait period if dequeueing a
* waiting task:
if (se != cfs_rq->curr)
update_stats_wait_end(cfs_rq, se);
- if (flags & DEQUEUE_SLEEP) {
- if (entity_is_task(se)) {
- struct task_struct *tsk = task_of(se);
+ if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) {
+ struct task_struct *tsk = task_of(se);
- if (tsk->state & TASK_INTERRUPTIBLE)
- se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
- if (tsk->state & TASK_UNINTERRUPTIBLE)
- se->statistics.block_start = rq_clock(rq_of(cfs_rq));
- }
+ if (tsk->state & TASK_INTERRUPTIBLE)
+ schedstat_set(se->statistics.sleep_start,
+ rq_clock(rq_of(cfs_rq)));
+ if (tsk->state & TASK_UNINTERRUPTIBLE)
+ schedstat_set(se->statistics.block_start,
+ rq_clock(rq_of(cfs_rq)));
}
-
-}
-#else
-static inline void
-update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-}
-
-static inline void
-update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
}
-static inline void
-update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-}
-
-static inline void
-update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
-{
-}
-#endif
-
/*
* We are picking a new current task - update its stats:
*/
* One idle CPU per node is evaluated for a task numa move.
* Call select_idle_sibling to maybe find a better one.
*/
- if (!cur)
- env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
+ if (!cur) {
+ /*
+ * select_idle_siblings() uses an per-cpu cpumask that
+ * can be used from IRQ context.
+ */
+ local_irq_disable();
+ env->dst_cpu = select_idle_sibling(env->p, env->src_cpu,
+ env->dst_cpu);
+ local_irq_enable();
+ }
assign:
task_numa_assign(env, cur, imp);
unsigned long nr_pte_updates = 0;
long pages, virtpages;
- WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
+ SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work));
work->next = work; /* protect against double add */
/*
}
#ifdef CONFIG_FAIR_GROUP_SCHED
-/*
- * Updating tg's load_avg is necessary before update_cfs_share (which is done)
- * and effective_load (which is not done because it is too costly).
+/**
+ * update_tg_load_avg - update the tg's load avg
+ * @cfs_rq: the cfs_rq whose avg changed
+ * @force: update regardless of how small the difference
+ *
+ * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load.
+ * However, because tg->load_avg is a global value there are performance
+ * considerations.
+ *
+ * In order to avoid having to look at the other cfs_rq's, we use a
+ * differential update where we store the last value we propagated. This in
+ * turn allows skipping updates if the differential is 'small'.
+ *
+ * Updating tg's load_avg is necessary before update_cfs_share() (which is
+ * done) and effective_load() (which is not done because it is too costly).
*/
static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
{
static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
{
- struct rq *rq = rq_of(cfs_rq);
- int cpu = cpu_of(rq);
-
- if (cpu == smp_processor_id() && &rq->cfs == cfs_rq) {
- unsigned long max = rq->cpu_capacity_orig;
-
+ if (&this_rq()->cfs == cfs_rq) {
/*
* There are a few boundary cases this might miss but it should
* get called often enough that that should (hopefully) not be
*
* See cpu_util().
*/
- cpufreq_update_util(rq_clock(rq),
- min(cfs_rq->avg.util_avg, max), max);
+ cpufreq_update_util(rq_of(cfs_rq), 0);
}
}
*
* cfs_rq->avg is used for task_h_load() and update_cfs_share() for example.
*
- * Returns true if the load decayed or we removed utilization. It is expected
- * that one calls update_tg_load_avg() on this condition, but after you've
- * modified the cfs_rq avg (attach/detach), such that we propagate the new
- * avg up.
+ * Returns true if the load decayed or we removed load.
+ *
+ * Since both these conditions indicate a changed cfs_rq->avg.load we should
+ * call update_tg_load_avg() when this function returns true.
*/
static inline int
update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
static inline void update_load_avg(struct sched_entity *se, int not_used)
{
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
- struct rq *rq = rq_of(cfs_rq);
-
- cpufreq_trigger_update(rq_clock(rq));
+ cpufreq_update_util(rq_of(cfs_rq_of(se)), 0);
}
static inline void
#endif /* CONFIG_SMP */
-static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-#ifdef CONFIG_SCHEDSTATS
- struct task_struct *tsk = NULL;
-
- if (entity_is_task(se))
- tsk = task_of(se);
-
- if (se->statistics.sleep_start) {
- u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.sleep_start;
-
- if ((s64)delta < 0)
- delta = 0;
-
- if (unlikely(delta > se->statistics.sleep_max))
- se->statistics.sleep_max = delta;
-
- se->statistics.sleep_start = 0;
- se->statistics.sum_sleep_runtime += delta;
-
- if (tsk) {
- account_scheduler_latency(tsk, delta >> 10, 1);
- trace_sched_stat_sleep(tsk, delta);
- }
- }
- if (se->statistics.block_start) {
- u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.block_start;
-
- if ((s64)delta < 0)
- delta = 0;
-
- if (unlikely(delta > se->statistics.block_max))
- se->statistics.block_max = delta;
-
- se->statistics.block_start = 0;
- se->statistics.sum_sleep_runtime += delta;
-
- if (tsk) {
- if (tsk->in_iowait) {
- se->statistics.iowait_sum += delta;
- se->statistics.iowait_count++;
- trace_sched_stat_iowait(tsk, delta);
- }
-
- trace_sched_stat_blocked(tsk, delta);
-
- /*
- * Blocking time is in units of nanosecs, so shift by
- * 20 to get a milliseconds-range estimation of the
- * amount of time that the task spent sleeping:
- */
- if (unlikely(prof_on == SLEEP_PROFILING)) {
- profile_hits(SLEEP_PROFILING,
- (void *)get_wchan(tsk),
- delta >> 20);
- }
- account_scheduler_latency(tsk, delta >> 10, 0);
- }
- }
-#endif
-}
-
static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
#ifdef CONFIG_SCHED_DEBUG
d = -d;
if (d > 3*sysctl_sched_latency)
- schedstat_inc(cfs_rq, nr_spread_over);
+ schedstat_inc(cfs_rq->nr_spread_over);
#endif
}
account_entity_enqueue(cfs_rq, se);
update_cfs_shares(cfs_rq);
- if (flags & ENQUEUE_WAKEUP) {
+ if (flags & ENQUEUE_WAKEUP)
place_entity(cfs_rq, se, 0);
- if (schedstat_enabled())
- enqueue_sleeper(cfs_rq, se);
- }
check_schedstat_required();
- if (schedstat_enabled()) {
- update_stats_enqueue(cfs_rq, se);
- check_spread(cfs_rq, se);
- }
+ update_stats_enqueue(cfs_rq, se, flags);
+ check_spread(cfs_rq, se);
if (!curr)
__enqueue_entity(cfs_rq, se);
se->on_rq = 1;
update_curr(cfs_rq);
dequeue_entity_load_avg(cfs_rq, se);
- if (schedstat_enabled())
- update_stats_dequeue(cfs_rq, se, flags);
+ update_stats_dequeue(cfs_rq, se, flags);
clear_buddies(cfs_rq, se);
account_entity_dequeue(cfs_rq, se);
/*
- * Normalize the entity after updating the min_vruntime because the
- * update can refer to the ->curr item and we need to reflect this
- * movement in our normalized position.
+ * Normalize after update_curr(); which will also have moved
+ * min_vruntime if @se is the one holding it back. But before doing
+ * update_min_vruntime() again, which will discount @se's position and
+ * can move min_vruntime forward still more.
*/
if (!(flags & DEQUEUE_SLEEP))
se->vruntime -= cfs_rq->min_vruntime;
/* return excess runtime on last dequeue */
return_cfs_rq_runtime(cfs_rq);
- update_min_vruntime(cfs_rq);
update_cfs_shares(cfs_rq);
+
+ /*
+ * Now advance min_vruntime if @se was the entity holding it back,
+ * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be
+ * put back on, and if we advance min_vruntime, we'll be placed back
+ * further than we started -- ie. we'll be penalized.
+ */
+ if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE)
+ update_min_vruntime(cfs_rq);
}
/*
* a CPU. So account for the time it spent waiting on the
* runqueue.
*/
- if (schedstat_enabled())
- update_stats_wait_end(cfs_rq, se);
+ update_stats_wait_end(cfs_rq, se);
__dequeue_entity(cfs_rq, se);
update_load_avg(se, 1);
}
update_stats_curr_start(cfs_rq, se);
cfs_rq->curr = se;
-#ifdef CONFIG_SCHEDSTATS
+
/*
* Track our maximum slice length, if the CPU's load is at
* least twice that of our own weight (i.e. dont track it
* when there are only lesser-weight tasks around):
*/
if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
- se->statistics.slice_max = max(se->statistics.slice_max,
- se->sum_exec_runtime - se->prev_sum_exec_runtime);
+ schedstat_set(se->statistics.slice_max,
+ max((u64)schedstat_val(se->statistics.slice_max),
+ se->sum_exec_runtime - se->prev_sum_exec_runtime));
}
-#endif
+
se->prev_sum_exec_runtime = se->sum_exec_runtime;
}
/* throttle cfs_rqs exceeding runtime */
check_cfs_rq_runtime(cfs_rq);
- if (schedstat_enabled()) {
- check_spread(cfs_rq, prev);
- if (prev->on_rq)
- update_stats_wait_start(cfs_rq, prev);
- }
+ check_spread(cfs_rq, prev);
if (prev->on_rq) {
+ update_stats_wait_start(cfs_rq, prev);
/* Put 'current' back into the tree. */
__enqueue_entity(cfs_rq, prev);
/* in !on_rq case, update occurred at dequeue */
struct sched_entity *se = &p->se;
struct cfs_rq *cfs_rq = cfs_rq_of(se);
- WARN_ON(task_rq(p) != rq);
+ SCHED_WARN_ON(task_rq(p) != rq);
- if (cfs_rq->nr_running > 1) {
+ if (rq->cfs.h_nr_running > 1) {
u64 slice = sched_slice(cfs_rq, se);
u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
s64 delta = slice - ran;
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se;
+ /*
+ * If in_iowait is set, the code below may not trigger any cpufreq
+ * utilization updates, so do it here explicitly with the IOWAIT flag
+ * passed.
+ */
+ if (p->in_iowait)
+ cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_IOWAIT);
+
for_each_sched_entity(se) {
if (se->on_rq)
break;
}
#ifdef CONFIG_SMP
+
+/* Working cpumask for: load_balance, load_balance_newidle. */
+DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
+DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
+
#ifdef CONFIG_NO_HZ_COMMON
/*
* per rq 'load' arrray crap; XXX kill this.
* wl = S * s'_i; see (2)
*/
if (W > 0 && w < W)
- wl = (w * (long)tg->shares) / W;
+ wl = (w * (long)scale_load_down(tg->shares)) / W;
else
- wl = tg->shares;
+ wl = scale_load_down(tg->shares);
/*
* Per the above, wl is the new se->load.weight value; since
return 1;
}
-static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
+static int wake_affine(struct sched_domain *sd, struct task_struct *p,
+ int prev_cpu, int sync)
{
s64 this_load, load;
s64 this_eff_load, prev_eff_load;
- int idx, this_cpu, prev_cpu;
+ int idx, this_cpu;
struct task_group *tg;
unsigned long weight;
int balanced;
idx = sd->wake_idx;
this_cpu = smp_processor_id();
- prev_cpu = task_cpu(p);
load = source_load(prev_cpu, idx);
this_load = target_load(this_cpu, idx);
balanced = this_eff_load <= prev_eff_load;
- schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
+ schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
if (!balanced)
return 0;
- schedstat_inc(sd, ttwu_move_affine);
- schedstat_inc(p, se.statistics.nr_wakeups_affine);
+ schedstat_inc(sd->ttwu_move_affine);
+ schedstat_inc(p->se.statistics.nr_wakeups_affine);
return 1;
}
int shallowest_idle_cpu = -1;
int i;
+ /* Check if we have any choice: */
+ if (group->group_weight == 1)
+ return cpumask_first(sched_group_cpus(group));
+
/* Traverse only the allowed CPUs */
for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
if (idle_cpu(i)) {
}
/*
- * Try and locate an idle CPU in the sched_domain.
+ * Implement a for_each_cpu() variant that starts the scan at a given cpu
+ * (@start), and wraps around.
+ *
+ * This is used to scan for idle CPUs; such that not all CPUs looking for an
+ * idle CPU find the same CPU. The down-side is that tasks tend to cycle
+ * through the LLC domain.
+ *
+ * Especially tbench is found sensitive to this.
+ */
+
+static int cpumask_next_wrap(int n, const struct cpumask *mask, int start, int *wrapped)
+{
+ int next;
+
+again:
+ next = find_next_bit(cpumask_bits(mask), nr_cpumask_bits, n+1);
+
+ if (*wrapped) {
+ if (next >= start)
+ return nr_cpumask_bits;
+ } else {
+ if (next >= nr_cpumask_bits) {
+ *wrapped = 1;
+ n = -1;
+ goto again;
+ }
+ }
+
+ return next;
+}
+
+#define for_each_cpu_wrap(cpu, mask, start, wrap) \
+ for ((wrap) = 0, (cpu) = (start)-1; \
+ (cpu) = cpumask_next_wrap((cpu), (mask), (start), &(wrap)), \
+ (cpu) < nr_cpumask_bits; )
+
+#ifdef CONFIG_SCHED_SMT
+
+static inline void set_idle_cores(int cpu, int val)
+{
+ struct sched_domain_shared *sds;
+
+ sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
+ if (sds)
+ WRITE_ONCE(sds->has_idle_cores, val);
+}
+
+static inline bool test_idle_cores(int cpu, bool def)
+{
+ struct sched_domain_shared *sds;
+
+ sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
+ if (sds)
+ return READ_ONCE(sds->has_idle_cores);
+
+ return def;
+}
+
+/*
+ * Scans the local SMT mask to see if the entire core is idle, and records this
+ * information in sd_llc_shared->has_idle_cores.
+ *
+ * Since SMT siblings share all cache levels, inspecting this limited remote
+ * state should be fairly cheap.
+ */
+void __update_idle_core(struct rq *rq)
+{
+ int core = cpu_of(rq);
+ int cpu;
+
+ rcu_read_lock();
+ if (test_idle_cores(core, true))
+ goto unlock;
+
+ for_each_cpu(cpu, cpu_smt_mask(core)) {
+ if (cpu == core)
+ continue;
+
+ if (!idle_cpu(cpu))
+ goto unlock;
+ }
+
+ set_idle_cores(core, 1);
+unlock:
+ rcu_read_unlock();
+}
+
+/*
+ * Scan the entire LLC domain for idle cores; this dynamically switches off if
+ * there are no idle cores left in the system; tracked through
+ * sd_llc->shared->has_idle_cores and enabled through update_idle_core() above.
+ */
+static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
+{
+ struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
+ int core, cpu, wrap;
+
+ if (!static_branch_likely(&sched_smt_present))
+ return -1;
+
+ if (!test_idle_cores(target, false))
+ return -1;
+
+ cpumask_and(cpus, sched_domain_span(sd), tsk_cpus_allowed(p));
+
+ for_each_cpu_wrap(core, cpus, target, wrap) {
+ bool idle = true;
+
+ for_each_cpu(cpu, cpu_smt_mask(core)) {
+ cpumask_clear_cpu(cpu, cpus);
+ if (!idle_cpu(cpu))
+ idle = false;
+ }
+
+ if (idle)
+ return core;
+ }
+
+ /*
+ * Failed to find an idle core; stop looking for one.
+ */
+ set_idle_cores(target, 0);
+
+ return -1;
+}
+
+/*
+ * Scan the local SMT mask for idle CPUs.
*/
-static int select_idle_sibling(struct task_struct *p, int target)
+static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
+{
+ int cpu;
+
+ if (!static_branch_likely(&sched_smt_present))
+ return -1;
+
+ for_each_cpu(cpu, cpu_smt_mask(target)) {
+ if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
+ continue;
+ if (idle_cpu(cpu))
+ return cpu;
+ }
+
+ return -1;
+}
+
+#else /* CONFIG_SCHED_SMT */
+
+static inline int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
+{
+ return -1;
+}
+
+static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
+{
+ return -1;
+}
+
+#endif /* CONFIG_SCHED_SMT */
+
+/*
+ * Scan the LLC domain for idle CPUs; this is dynamically regulated by
+ * comparing the average scan cost (tracked in sd->avg_scan_cost) against the
+ * average idle time for this rq (as found in rq->avg_idle).
+ */
+static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
+{
+ struct sched_domain *this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
+ u64 avg_idle = this_rq()->avg_idle;
+ u64 avg_cost = this_sd->avg_scan_cost;
+ u64 time, cost;
+ s64 delta;
+ int cpu, wrap;
+
+ /*
+ * Due to large variance we need a large fuzz factor; hackbench in
+ * particularly is sensitive here.
+ */
+ if ((avg_idle / 512) < avg_cost)
+ return -1;
+
+ time = local_clock();
+
+ for_each_cpu_wrap(cpu, sched_domain_span(sd), target, wrap) {
+ if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
+ continue;
+ if (idle_cpu(cpu))
+ break;
+ }
+
+ time = local_clock() - time;
+ cost = this_sd->avg_scan_cost;
+ delta = (s64)(time - cost) / 8;
+ this_sd->avg_scan_cost += delta;
+
+ return cpu;
+}
+
+/*
+ * Try and locate an idle core/thread in the LLC cache domain.
+ */
+static int select_idle_sibling(struct task_struct *p, int prev, int target)
{
struct sched_domain *sd;
- struct sched_group *sg;
- int i = task_cpu(p);
+ int i;
if (idle_cpu(target))
return target;
/*
- * If the prevous cpu is cache affine and idle, don't be stupid.
+ * If the previous cpu is cache affine and idle, don't be stupid.
*/
- if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
- return i;
+ if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev))
+ return prev;
- /*
- * Otherwise, iterate the domains and find an eligible idle cpu.
- *
- * A completely idle sched group at higher domains is more
- * desirable than an idle group at a lower level, because lower
- * domains have smaller groups and usually share hardware
- * resources which causes tasks to contend on them, e.g. x86
- * hyperthread siblings in the lowest domain (SMT) can contend
- * on the shared cpu pipeline.
- *
- * However, while we prefer idle groups at higher domains
- * finding an idle cpu at the lowest domain is still better than
- * returning 'target', which we've already established, isn't
- * idle.
- */
sd = rcu_dereference(per_cpu(sd_llc, target));
- for_each_lower_domain(sd) {
- sg = sd->groups;
- do {
- if (!cpumask_intersects(sched_group_cpus(sg),
- tsk_cpus_allowed(p)))
- goto next;
-
- /* Ensure the entire group is idle */
- for_each_cpu(i, sched_group_cpus(sg)) {
- if (i == target || !idle_cpu(i))
- goto next;
- }
+ if (!sd)
+ return target;
+
+ i = select_idle_core(p, sd, target);
+ if ((unsigned)i < nr_cpumask_bits)
+ return i;
+
+ i = select_idle_cpu(p, sd, target);
+ if ((unsigned)i < nr_cpumask_bits)
+ return i;
+
+ i = select_idle_smt(p, sd, target);
+ if ((unsigned)i < nr_cpumask_bits)
+ return i;
- /*
- * It doesn't matter which cpu we pick, the
- * whole group is idle.
- */
- target = cpumask_first_and(sched_group_cpus(sg),
- tsk_cpus_allowed(p));
- goto done;
-next:
- sg = sg->next;
- } while (sg != sd->groups);
- }
-done:
return target;
}
return (util >= capacity) ? capacity : util;
}
+static inline int task_util(struct task_struct *p)
+{
+ return p->se.avg.util_avg;
+}
+
+/*
+ * Disable WAKE_AFFINE in the case where task @p doesn't fit in the
+ * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu.
+ *
+ * In that case WAKE_AFFINE doesn't make sense and we'll let
+ * BALANCE_WAKE sort things out.
+ */
+static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
+{
+ long min_cap, max_cap;
+
+ min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu));
+ max_cap = cpu_rq(cpu)->rd->max_cpu_capacity;
+
+ /* Minimum capacity is close to max, no need to abort wake_affine */
+ if (max_cap - min_cap < max_cap >> 3)
+ return 0;
+
+ return min_cap * 1024 < task_util(p) * capacity_margin;
+}
+
/*
* select_task_rq_fair: Select target runqueue for the waking task in domains
* that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
if (sd_flag & SD_BALANCE_WAKE) {
record_wakee(p);
- want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
+ want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu)
+ && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
}
rcu_read_lock();
if (affine_sd) {
sd = NULL; /* Prefer wake_affine over balance flags */
- if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
+ if (cpu != prev_cpu && wake_affine(affine_sd, p, prev_cpu, sync))
new_cpu = cpu;
}
if (!sd) {
if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
- new_cpu = select_idle_sibling(p, new_cpu);
+ new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
} else while (sd) {
struct sched_group *group;
*
* The adjacency matrix of the resulting graph is given by:
*
- * log_2 n
+ * log_2 n
* A_i,j = \Union (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1) (6)
* k = 0
*
*
* [XXX write more on how we solve this.. _after_ merging pjt's patches that
* rewrite all of this once again.]
- */
+ */
static unsigned long __read_mostly max_load_balance_interval = HZ/10;
if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
int cpu;
- schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
+ schedstat_inc(p->se.statistics.nr_failed_migrations_affine);
env->flags |= LBF_SOME_PINNED;
env->flags &= ~LBF_ALL_PINNED;
if (task_running(env->src_rq, p)) {
- schedstat_inc(p, se.statistics.nr_failed_migrations_running);
+ schedstat_inc(p->se.statistics.nr_failed_migrations_running);
return 0;
}
if (tsk_cache_hot <= 0 ||
env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
if (tsk_cache_hot == 1) {
- schedstat_inc(env->sd, lb_hot_gained[env->idle]);
- schedstat_inc(p, se.statistics.nr_forced_migrations);
+ schedstat_inc(env->sd->lb_hot_gained[env->idle]);
+ schedstat_inc(p->se.statistics.nr_forced_migrations);
}
return 1;
}
- schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
+ schedstat_inc(p->se.statistics.nr_failed_migrations_hot);
return 0;
}
* so we can safely collect stats here rather than
* inside detach_tasks().
*/
- schedstat_inc(env->sd, lb_gained[env->idle]);
+ schedstat_inc(env->sd->lb_gained[env->idle]);
return p;
}
return NULL;
* so we can safely collect detach_one_task() stats here rather
* than inside detach_one_task().
*/
- schedstat_add(env->sd, lb_gained[env->idle], detached);
+ schedstat_add(env->sd->lb_gained[env->idle], detached);
return detached;
}
/*
* !SD_OVERLAP domains can assume that child groups
* span the current group.
- */
+ */
group = child->groups;
do {
load_above_capacity = busiest->sum_nr_running * SCHED_CAPACITY_SCALE;
if (load_above_capacity > busiest->group_capacity) {
load_above_capacity -= busiest->group_capacity;
- load_above_capacity *= NICE_0_LOAD;
+ load_above_capacity *= scale_load_down(NICE_0_LOAD);
load_above_capacity /= busiest->group_capacity;
} else
load_above_capacity = ~0UL;
*/
#define MAX_PINNED_INTERVAL 512
-/* Working cpumask for load_balance and load_balance_newidle. */
-DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
-
static int need_active_balance(struct lb_env *env)
{
struct sched_domain *sd = env->sd;
cpumask_copy(cpus, cpu_active_mask);
- schedstat_inc(sd, lb_count[idle]);
+ schedstat_inc(sd->lb_count[idle]);
redo:
if (!should_we_balance(&env)) {
group = find_busiest_group(&env);
if (!group) {
- schedstat_inc(sd, lb_nobusyg[idle]);
+ schedstat_inc(sd->lb_nobusyg[idle]);
goto out_balanced;
}
busiest = find_busiest_queue(&env, group);
if (!busiest) {
- schedstat_inc(sd, lb_nobusyq[idle]);
+ schedstat_inc(sd->lb_nobusyq[idle]);
goto out_balanced;
}
BUG_ON(busiest == env.dst_rq);
- schedstat_add(sd, lb_imbalance[idle], env.imbalance);
+ schedstat_add(sd->lb_imbalance[idle], env.imbalance);
env.src_cpu = busiest->cpu;
env.src_rq = busiest;
}
if (!ld_moved) {
- schedstat_inc(sd, lb_failed[idle]);
+ schedstat_inc(sd->lb_failed[idle]);
/*
* Increment the failure counter only on periodic balance.
* We do not want newidle balance, which can be very
* we can't migrate them. Let the imbalance flag set so parent level
* can try to migrate them.
*/
- schedstat_inc(sd, lb_balanced[idle]);
+ schedstat_inc(sd->lb_balanced[idle]);
sd->nr_balance_failed = 0;
}
static inline void
-update_next_balance(struct sched_domain *sd, int cpu_busy, unsigned long *next_balance)
+update_next_balance(struct sched_domain *sd, unsigned long *next_balance)
{
unsigned long interval, next;
- interval = get_sd_balance_interval(sd, cpu_busy);
+ /* used by idle balance, so cpu_busy = 0 */
+ interval = get_sd_balance_interval(sd, 0);
next = sd->last_balance + interval;
if (time_after(*next_balance, next))
rcu_read_lock();
sd = rcu_dereference_check_sched_domain(this_rq->sd);
if (sd)
- update_next_balance(sd, 0, &next_balance);
+ update_next_balance(sd, &next_balance);
rcu_read_unlock();
goto out;
continue;
if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
- update_next_balance(sd, 0, &next_balance);
+ update_next_balance(sd, &next_balance);
break;
}
curr_cost += domain_cost;
}
- update_next_balance(sd, 0, &next_balance);
+ update_next_balance(sd, &next_balance);
/*
* Stop searching for tasks to pull if there are
.idle = CPU_IDLE,
};
- schedstat_inc(sd, alb_count);
+ schedstat_inc(sd->alb_count);
p = detach_one_task(&env);
if (p) {
- schedstat_inc(sd, alb_pushed);
+ schedstat_inc(sd->alb_pushed);
/* Active balancing done, reset the failure counter. */
sd->nr_balance_failed = 0;
} else {
- schedstat_inc(sd, alb_failed);
+ schedstat_inc(sd->alb_failed);
}
}
rcu_read_unlock();
int cpu = smp_processor_id();
rcu_read_lock();
- sd = rcu_dereference(per_cpu(sd_busy, cpu));
+ sd = rcu_dereference(per_cpu(sd_llc, cpu));
if (!sd || !sd->nohz_idle)
goto unlock;
sd->nohz_idle = 0;
- atomic_inc(&sd->groups->sgc->nr_busy_cpus);
+ atomic_inc(&sd->shared->nr_busy_cpus);
unlock:
rcu_read_unlock();
}
int cpu = smp_processor_id();
rcu_read_lock();
- sd = rcu_dereference(per_cpu(sd_busy, cpu));
+ sd = rcu_dereference(per_cpu(sd_llc, cpu));
if (!sd || sd->nohz_idle)
goto unlock;
sd->nohz_idle = 1;
- atomic_dec(&sd->groups->sgc->nr_busy_cpus);
+ atomic_dec(&sd->shared->nr_busy_cpus);
unlock:
rcu_read_unlock();
}
static inline bool nohz_kick_needed(struct rq *rq)
{
unsigned long now = jiffies;
+ struct sched_domain_shared *sds;
struct sched_domain *sd;
- struct sched_group_capacity *sgc;
int nr_busy, cpu = rq->cpu;
bool kick = false;
return true;
rcu_read_lock();
- sd = rcu_dereference(per_cpu(sd_busy, cpu));
- if (sd) {
- sgc = sd->groups->sgc;
- nr_busy = atomic_read(&sgc->nr_busy_cpus);
-
+ sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
+ if (sds) {
+ /*
+ * XXX: write a coherent comment on why we do this.
+ * See also: http://lkml.kernel.org/r/20111202010832.602203411@sbsiddha-desk.sc.intel.com
+ */
+ nr_busy = atomic_read(&sds->nr_busy_cpus);
if (nr_busy > 1) {
kick = true;
goto unlock;
* run_rebalance_domains is triggered when needed from the scheduler tick.
* Also triggered for nohz idle balancing (with nohz_balancing_kick set).
*/
- static void run_rebalance_domains(struct softirq_action *h)
+ static __latent_entropy void run_rebalance_domains(struct softirq_action *h)
{
struct rq *this_rq = this_rq();
enum cpu_idle_type idle = this_rq->idle_balance ?
struct sched_entity *se = &p->se;
struct cfs_rq *cfs_rq = cfs_rq_of(se);
u64 now = cfs_rq_clock_task(cfs_rq);
- int tg_update;
if (!vruntime_normalized(p)) {
/*
}
/* Catch up with the cfs_rq and remove our load when we leave */
- tg_update = update_cfs_rq_load_avg(now, cfs_rq, false);
+ update_cfs_rq_load_avg(now, cfs_rq, false);
detach_entity_load_avg(cfs_rq, se);
- if (tg_update)
- update_tg_load_avg(cfs_rq, false);
+ update_tg_load_avg(cfs_rq, false);
}
static void attach_task_cfs_rq(struct task_struct *p)
struct sched_entity *se = &p->se;
struct cfs_rq *cfs_rq = cfs_rq_of(se);
u64 now = cfs_rq_clock_task(cfs_rq);
- int tg_update;
#ifdef CONFIG_FAIR_GROUP_SCHED
/*
#endif
/* Synchronize task with its cfs_rq */
- tg_update = update_cfs_rq_load_avg(now, cfs_rq, false);
+ update_cfs_rq_load_avg(now, cfs_rq, false);
attach_entity_load_avg(cfs_rq, se);
- if (tg_update)
- update_tg_load_avg(cfs_rq, false);
+ update_tg_load_avg(cfs_rq, false);
if (!vruntime_normalized(p))
se->vruntime += cfs_rq->min_vruntime;
wake_up_process(tsk);
}
+/*
+ * If ksoftirqd is scheduled, we do not want to process pending softirqs
+ * right now. Let ksoftirqd handle this at its own rate, to get fairness.
+ */
+static bool ksoftirqd_running(void)
+{
+ struct task_struct *tsk = __this_cpu_read(ksoftirqd);
+
+ return tsk && (tsk->state == TASK_RUNNING);
+}
+
/*
* preempt_count and SOFTIRQ_OFFSET usage:
* - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving
pending = local_softirq_pending();
- if (pending)
+ if (pending && !ksoftirqd_running())
do_softirq_own_stack();
local_irq_restore(flags);
static inline void invoke_softirq(void)
{
+ if (ksoftirqd_running())
+ return;
+
if (!force_irqthreads) {
#ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK
/*
}
EXPORT_SYMBOL(__tasklet_hi_schedule_first);
- static void tasklet_action(struct softirq_action *a)
+ static __latent_entropy void tasklet_action(struct softirq_action *a)
{
struct tasklet_struct *list;
}
}
- static void tasklet_hi_action(struct softirq_action *a)
+ static __latent_entropy void tasklet_hi_action(struct softirq_action *a)
{
struct tasklet_struct *list;
BUG();
}
-static void takeover_tasklets(unsigned int cpu)
+static int takeover_tasklets(unsigned int cpu)
{
/* CPU is dead, so no lock needed. */
local_irq_disable();
raise_softirq_irqoff(HI_SOFTIRQ);
local_irq_enable();
+ return 0;
}
+#else
+#define takeover_tasklets NULL
#endif /* CONFIG_HOTPLUG_CPU */
-static int cpu_callback(struct notifier_block *nfb, unsigned long action,
- void *hcpu)
-{
- switch (action) {
-#ifdef CONFIG_HOTPLUG_CPU
- case CPU_DEAD:
- case CPU_DEAD_FROZEN:
- takeover_tasklets((unsigned long)hcpu);
- break;
-#endif /* CONFIG_HOTPLUG_CPU */
- }
- return NOTIFY_OK;
-}
-
-static struct notifier_block cpu_nfb = {
- .notifier_call = cpu_callback
-};
-
static struct smp_hotplug_thread softirq_threads = {
.store = &ksoftirqd,
.thread_should_run = ksoftirqd_should_run,
static __init int spawn_ksoftirqd(void)
{
- register_cpu_notifier(&cpu_nfb);
-
+ cpuhp_setup_state_nocalls(CPUHP_SOFTIRQ_DEAD, "softirq:dead", NULL,
+ takeover_tasklets);
BUG_ON(smpboot_register_percpu_thread(&softirq_threads));
return 0;
}
EXPORT_SYMBOL(irq_poll_complete);
- static void irq_poll_softirq(struct softirq_action *h)
+ static void __latent_entropy irq_poll_softirq(struct softirq_action *h)
{
struct list_head *list = this_cpu_ptr(&blk_cpu_iopoll);
int rearm = 0, budget = irq_poll_budget;
}
EXPORT_SYMBOL(irq_poll_init);
-static int irq_poll_cpu_notify(struct notifier_block *self,
- unsigned long action, void *hcpu)
+static int irq_poll_cpu_dead(unsigned int cpu)
{
/*
* If a CPU goes away, splice its entries to the current CPU
* and trigger a run of the softirq
*/
- if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
- int cpu = (unsigned long) hcpu;
-
- local_irq_disable();
- list_splice_init(&per_cpu(blk_cpu_iopoll, cpu),
- this_cpu_ptr(&blk_cpu_iopoll));
- __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ);
- local_irq_enable();
- }
+ local_irq_disable();
+ list_splice_init(&per_cpu(blk_cpu_iopoll, cpu),
+ this_cpu_ptr(&blk_cpu_iopoll));
+ __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ);
+ local_irq_enable();
- return NOTIFY_OK;
+ return 0;
}
-static struct notifier_block irq_poll_cpu_notifier = {
- .notifier_call = irq_poll_cpu_notify,
-};
-
static __init int irq_poll_setup(void)
{
int i;
INIT_LIST_HEAD(&per_cpu(blk_cpu_iopoll, i));
open_softirq(IRQ_POLL_SOFTIRQ, irq_poll_softirq);
- register_hotcpu_notifier(&irq_poll_cpu_notifier);
+ cpuhp_setup_state_nocalls(CPUHP_IRQ_POLL_DEAD, "irq_poll:dead", NULL,
+ irq_poll_cpu_dead);
return 0;
}
subsys_initcall(irq_poll_setup);
}
#endif
- static DEFINE_PER_CPU(struct rnd_state, net_rand_state);
+ static DEFINE_PER_CPU(struct rnd_state, net_rand_state) __latent_entropy;
/**
* prandom_u32_state - seeded pseudo-random number generator.
u32 res;
res = prandom_u32_state(state);
- put_cpu_var(state);
+ put_cpu_var(net_rand_state);
return res;
}
struct rnd_state *state = &get_cpu_var(net_rand_state);
prandom_bytes_state(state, buf, bytes);
- put_cpu_var(state);
+ put_cpu_var(net_rand_state);
}
EXPORT_SYMBOL(prandom_bytes);
int _node_numa_mem_[MAX_NUMNODES];
#endif
+ #ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY
+ volatile u64 latent_entropy __latent_entropy;
+ EXPORT_SYMBOL(latent_entropy);
+ #endif
+
/*
* Array of node states.
*/
if (!debug_pagealloc_enabled())
return false;
+ if (!debug_guardpage_minorder())
+ return false;
+
return true;
}
if (!debug_pagealloc_enabled())
return;
+ if (!debug_guardpage_minorder())
+ return;
+
_debug_guardpage_enabled = true;
}
pr_info("Setting debug_guardpage_minorder to %lu\n", res);
return 0;
}
-__setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup);
+early_param("debug_guardpage_minorder", debug_guardpage_minorder_setup);
-static inline void set_page_guard(struct zone *zone, struct page *page,
+static inline bool set_page_guard(struct zone *zone, struct page *page,
unsigned int order, int migratetype)
{
struct page_ext *page_ext;
if (!debug_guardpage_enabled())
- return;
+ return false;
+
+ if (order >= debug_guardpage_minorder())
+ return false;
page_ext = lookup_page_ext(page);
if (unlikely(!page_ext))
- return;
+ return false;
__set_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags);
set_page_private(page, order);
/* Guard pages are not available for any usage */
__mod_zone_freepage_state(zone, -(1 << order), migratetype);
+
+ return true;
}
static inline void clear_page_guard(struct zone *zone, struct page *page,
__mod_zone_freepage_state(zone, (1 << order), migratetype);
}
#else
-struct page_ext_operations debug_guardpage_ops = { NULL, };
-static inline void set_page_guard(struct zone *zone, struct page *page,
- unsigned int order, int migratetype) {}
+struct page_ext_operations debug_guardpage_ops;
+static inline bool set_page_guard(struct zone *zone, struct page *page,
+ unsigned int order, int migratetype) { return false; }
static inline void clear_page_guard(struct zone *zone, struct page *page,
unsigned int order, int migratetype) {}
#endif
return;
/* Free a large naturally-aligned chunk if possible */
- if (nr_pages == MAX_ORDER_NR_PAGES &&
- (pfn & (MAX_ORDER_NR_PAGES-1)) == 0) {
+ if (nr_pages == pageblock_nr_pages &&
+ (pfn & (pageblock_nr_pages - 1)) == 0) {
set_pageblock_migratetype(page, MIGRATE_MOVABLE);
- __free_pages_boot_core(page, MAX_ORDER-1);
+ __free_pages_boot_core(page, pageblock_order);
return;
}
- for (i = 0; i < nr_pages; i++, page++)
+ for (i = 0; i < nr_pages; i++, page++, pfn++) {
+ if ((pfn & (pageblock_nr_pages - 1)) == 0)
+ set_pageblock_migratetype(page, MIGRATE_MOVABLE);
__free_pages_boot_core(page, 0);
+ }
}
/* Completion tracking for deferred_init_memmap() threads */
/*
* Ensure pfn_valid is checked every
- * MAX_ORDER_NR_PAGES for memory holes
+ * pageblock_nr_pages for memory holes
*/
- if ((pfn & (MAX_ORDER_NR_PAGES - 1)) == 0) {
+ if ((pfn & (pageblock_nr_pages - 1)) == 0) {
if (!pfn_valid(pfn)) {
page = NULL;
goto free_range;
}
/* Minimise pfn page lookups and scheduler checks */
- if (page && (pfn & (MAX_ORDER_NR_PAGES - 1)) != 0) {
+ if (page && (pfn & (pageblock_nr_pages - 1)) != 0) {
page++;
} else {
nr_pages += nr_to_free;
free_base_page = NULL;
free_base_pfn = nr_to_free = 0;
}
+ /* Free the last block of pages to allocator */
+ nr_pages += nr_to_free;
+ deferred_free_range(free_base_page, free_base_pfn, nr_to_free);
first_init_pfn = max(end_pfn, first_init_pfn);
}
size >>= 1;
VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]);
- if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC) &&
- debug_guardpage_enabled() &&
- high < debug_guardpage_minorder()) {
- /*
- * Mark as guard pages (or page), that will allow to
- * merge back to allocator when buddy will be freed.
- * Corresponding page table entries will not be touched,
- * pages will stay not present in virtual address space
- */
- set_page_guard(zone, &page[size], high, migratetype);
+ /*
+ * Mark as guard pages (or page), that will allow to
+ * merge back to allocator when buddy will be freed.
+ * Corresponding page table entries will not be touched,
+ * pages will stay not present in virtual address space
+ */
+ if (set_page_guard(zone, &page[size], high, migratetype))
continue;
- }
+
list_add(&page[size].lru, &area->free_list[migratetype]);
area->nr_free++;
set_page_order(&page[size], high);
mt = get_pageblock_migratetype(page);
if (!is_migrate_isolate(mt)) {
- /* Obey watermarks as if the page was being allocated */
- watermark = low_wmark_pages(zone) + (1 << order);
- if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
+ /*
+ * Obey watermarks as if the page was being allocated. We can
+ * emulate a high-order watermark check with a raised order-0
+ * watermark, because we already know our high-order page
+ * exists.
+ */
+ watermark = min_wmark_pages(zone) + (1UL << order);
+ if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA))
return 0;
__mod_zone_freepage_state(zone, -(1UL << order), mt);
DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
-void warn_alloc_failed(gfp_t gfp_mask, unsigned int order, const char *fmt, ...)
+void warn_alloc(gfp_t gfp_mask, const char *fmt, ...)
{
unsigned int filter = SHOW_MEM_FILTER_NODES;
+ struct va_format vaf;
+ va_list args;
if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) ||
debug_guardpage_minorder() > 0)
if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM))
filter &= ~SHOW_MEM_FILTER_NODES;
- if (fmt) {
- struct va_format vaf;
- va_list args;
+ pr_warn("%s: ", current->comm);
- va_start(args, fmt);
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+ pr_cont("%pV", &vaf);
+ va_end(args);
- vaf.fmt = fmt;
- vaf.va = &args;
+ pr_cont(", mode:%#x(%pGg)\n", gfp_mask, &gfp_mask);
- pr_warn("%pV", &vaf);
-
- va_end(args);
- }
-
- pr_warn("%s: page allocation failure: order:%u, mode:%#x(%pGg)\n",
- current->comm, order, gfp_mask, &gfp_mask);
dump_stack();
if (!should_suppress_show_mem())
show_mem(filter);
return NULL;
}
+static inline bool
+should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
+ enum compact_result compact_result,
+ enum compact_priority *compact_priority,
+ int *compaction_retries)
+{
+ int max_retries = MAX_COMPACT_RETRIES;
+ int min_priority;
+
+ if (!order)
+ return false;
+
+ if (compaction_made_progress(compact_result))
+ (*compaction_retries)++;
+
+ /*
+ * compaction considers all the zone as desperately out of memory
+ * so it doesn't really make much sense to retry except when the
+ * failure could be caused by insufficient priority
+ */
+ if (compaction_failed(compact_result))
+ goto check_priority;
+
+ /*
+ * make sure the compaction wasn't deferred or didn't bail out early
+ * due to locks contention before we declare that we should give up.
+ * But do not retry if the given zonelist is not suitable for
+ * compaction.
+ */
+ if (compaction_withdrawn(compact_result))
+ return compaction_zonelist_suitable(ac, order, alloc_flags);
+
+ /*
+ * !costly requests are much more important than __GFP_REPEAT
+ * costly ones because they are de facto nofail and invoke OOM
+ * killer to move on while costly can fail and users are ready
+ * to cope with that. 1/4 retries is rather arbitrary but we
+ * would need much more detailed feedback from compaction to
+ * make a better decision.
+ */
+ if (order > PAGE_ALLOC_COSTLY_ORDER)
+ max_retries /= 4;
+ if (*compaction_retries <= max_retries)
+ return true;
+
+ /*
+ * Make sure there are attempts at the highest priority if we exhausted
+ * all retries or failed at the lower priorities.
+ */
+check_priority:
+ min_priority = (order > PAGE_ALLOC_COSTLY_ORDER) ?
+ MIN_COMPACT_COSTLY_PRIORITY : MIN_COMPACT_PRIORITY;
+ if (*compact_priority > min_priority) {
+ (*compact_priority)--;
+ *compaction_retries = 0;
+ return true;
+ }
+ return false;
+}
#else
static inline struct page *
__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
return NULL;
}
-#endif /* CONFIG_COMPACTION */
-
static inline bool
should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags,
enum compact_result compact_result,
enum compact_priority *compact_priority,
- int compaction_retries)
+ int *compaction_retries)
{
struct zone *zone;
struct zoneref *z;
}
return false;
}
+#endif /* CONFIG_COMPACTION */
/* Perform direct synchronous page reclaim */
static int
static inline bool
should_reclaim_retry(gfp_t gfp_mask, unsigned order,
struct alloc_context *ac, int alloc_flags,
- bool did_some_progress, int no_progress_loops)
+ bool did_some_progress, int *no_progress_loops)
{
struct zone *zone;
struct zoneref *z;
+ /*
+ * Costly allocations might have made a progress but this doesn't mean
+ * their order will become available due to high fragmentation so
+ * always increment the no progress counter for them
+ */
+ if (did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER)
+ *no_progress_loops = 0;
+ else
+ (*no_progress_loops)++;
+
/*
* Make sure we converge to OOM if we cannot make any progress
* several times in the row.
*/
- if (no_progress_loops > MAX_RECLAIM_RETRIES)
+ if (*no_progress_loops > MAX_RECLAIM_RETRIES)
return false;
/*
unsigned long reclaimable;
available = reclaimable = zone_reclaimable_pages(zone);
- available -= DIV_ROUND_UP(no_progress_loops * available,
+ available -= DIV_ROUND_UP((*no_progress_loops) * available,
MAX_RECLAIM_RETRIES);
available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
enum compact_result compact_result;
int compaction_retries = 0;
int no_progress_loops = 0;
+ unsigned long alloc_start = jiffies;
+ unsigned int stall_timeout = 10 * HZ;
/*
* In the slowpath, we sanity check order to avoid ever trying to
if (page)
goto got_pg;
- if (order && compaction_made_progress(compact_result))
- compaction_retries++;
-
/* Do not loop if specifically requested */
if (gfp_mask & __GFP_NORETRY)
goto nopage;
if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT))
goto nopage;
- /*
- * Costly allocations might have made a progress but this doesn't mean
- * their order will become available due to high fragmentation so
- * always increment the no progress counter for them
- */
- if (did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER)
- no_progress_loops = 0;
- else
- no_progress_loops++;
+ /* Make sure we know about allocations which stall for too long */
+ if (time_after(jiffies, alloc_start + stall_timeout)) {
+ warn_alloc(gfp_mask,
+ "page alloction stalls for %ums, order:%u\n",
+ jiffies_to_msecs(jiffies-alloc_start), order);
+ stall_timeout += 10 * HZ;
+ }
if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
- did_some_progress > 0, no_progress_loops))
+ did_some_progress > 0, &no_progress_loops))
goto retry;
/*
if (did_some_progress > 0 &&
should_compact_retry(ac, order, alloc_flags,
compact_result, &compact_priority,
- compaction_retries))
+ &compaction_retries))
goto retry;
/* Reclaim has failed us, start killing things */
}
nopage:
- warn_alloc_failed(gfp_mask, order, NULL);
+ warn_alloc(gfp_mask,
+ "page allocation failure: order:%u", order);
got_pg:
return page;
}
int j;
struct zonelist *zonelist;
- zonelist = &pgdat->node_zonelists[0];
+ zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK];
for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++)
;
j = build_zonelists_node(NODE_DATA(node), zonelist, j);
int j;
struct zonelist *zonelist;
- zonelist = &pgdat->node_zonelists[1];
+ zonelist = &pgdat->node_zonelists[ZONELIST_NOFALLBACK];
j = build_zonelists_node(pgdat, zonelist, 0);
zonelist->_zonerefs[j].zone = NULL;
zonelist->_zonerefs[j].zone_idx = 0;
struct zone *z;
struct zonelist *zonelist;
- zonelist = &pgdat->node_zonelists[0];
+ zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK];
pos = 0;
for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) {
for (j = 0; j < nr_nodes; j++) {
local_node = pgdat->node_id;
- zonelist = &pgdat->node_zonelists[0];
+ zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK];
j = build_zonelists_node(pgdat, zonelist, 0);
/*
break;
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
- /*
- * If not mirrored_kernelcore and ZONE_MOVABLE exists, range
- * from zone_movable_pfn[nid] to end of each node should be
- * ZONE_MOVABLE not ZONE_NORMAL. skip it.
- */
- if (!mirrored_kernelcore && zone_movable_pfn[nid])
- if (zone == ZONE_NORMAL && pfn >= zone_movable_pfn[nid])
- continue;
-
/*
* Check given memblock attribute by firmware which can affect
* kernel memory layout. If zone==ZONE_MOVABLE but memory is
*zone_end_pfn = min(node_end_pfn,
arch_zone_highest_possible_pfn[movable_zone]);
+ /* Adjust for ZONE_MOVABLE starting within this range */
+ } else if (!mirrored_kernelcore &&
+ *zone_start_pfn < zone_movable_pfn[nid] &&
+ *zone_end_pfn > zone_movable_pfn[nid]) {
+ *zone_end_pfn = zone_movable_pfn[nid];
+
/* Check if this whole range is within ZONE_MOVABLE */
} else if (*zone_start_pfn >= zone_movable_pfn[nid])
*zone_start_pfn = *zone_end_pfn;
* Treat pages to be ZONE_MOVABLE in ZONE_NORMAL as absent pages
* and vice versa.
*/
- if (zone_movable_pfn[nid]) {
- if (mirrored_kernelcore) {
- unsigned long start_pfn, end_pfn;
- struct memblock_region *r;
-
- for_each_memblock(memory, r) {
- start_pfn = clamp(memblock_region_memory_base_pfn(r),
- zone_start_pfn, zone_end_pfn);
- end_pfn = clamp(memblock_region_memory_end_pfn(r),
- zone_start_pfn, zone_end_pfn);
-
- if (zone_type == ZONE_MOVABLE &&
- memblock_is_mirror(r))
- nr_absent += end_pfn - start_pfn;
-
- if (zone_type == ZONE_NORMAL &&
- !memblock_is_mirror(r))
- nr_absent += end_pfn - start_pfn;
- }
- } else {
- if (zone_type == ZONE_NORMAL)
- nr_absent += node_end_pfn - zone_movable_pfn[nid];
+ if (mirrored_kernelcore && zone_movable_pfn[nid]) {
+ unsigned long start_pfn, end_pfn;
+ struct memblock_region *r;
+
+ for_each_memblock(memory, r) {
+ start_pfn = clamp(memblock_region_memory_base_pfn(r),
+ zone_start_pfn, zone_end_pfn);
+ end_pfn = clamp(memblock_region_memory_end_pfn(r),
+ zone_start_pfn, zone_end_pfn);
+
+ if (zone_type == ZONE_MOVABLE &&
+ memblock_is_mirror(r))
+ nr_absent += end_pfn - start_pfn;
+
+ if (zone_type == ZONE_NORMAL &&
+ !memblock_is_mirror(r))
+ nr_absent += end_pfn - start_pfn;
}
}
__setup("hashdist=", set_hashdist);
#endif
+#ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES
+/*
+ * Returns the number of pages that arch has reserved but
+ * is not known to alloc_large_system_hash().
+ */
+static unsigned long __init arch_reserved_kernel_pages(void)
+{
+ return 0;
+}
+#endif
+
/*
* allocate a large system hash table from bootmem
* - it is assumed that the hash table must contain an exact power-of-2
if (!numentries) {
/* round applicable memory size up to nearest megabyte */
numentries = nr_kernel_pages;
+ numentries -= arch_reserved_kernel_pages();
/* It isn't necessary when PAGE_SIZE >= 1MB */
if (PAGE_SHIFT < 20)
else
skb_dst_force(skb);
-#ifdef CONFIG_NET_SWITCHDEV
- /* Don't forward if offload device already forwarded */
- if (skb->offload_fwd_mark &&
- skb->offload_fwd_mark == dev->offload_fwd_mark) {
- consume_skb(skb);
- rc = NET_XMIT_SUCCESS;
- goto out;
- }
-#endif
-
txq = netdev_pick_tx(dev, skb, accel_priv);
q = rcu_dereference_bh(txq->qdisc);
}
EXPORT_SYMBOL(netif_rx_ni);
- static void net_tx_action(struct softirq_action *h)
+ static __latent_entropy void net_tx_action(struct softirq_action *h)
{
struct softnet_data *sd = this_cpu_ptr(&softnet_data);
}
}
-#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
- (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
+#if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE)
/* This hook is defined here for ATM LANE */
int (*br_fdb_test_addr_hook)(struct net_device *dev,
unsigned char *addr) __read_mostly;
{
#ifdef CONFIG_NETFILTER_INGRESS
if (nf_hook_ingress_active(skb)) {
+ int ingress_retval;
+
if (*pt_prev) {
*ret = deliver_skb(skb, *pt_prev, orig_dev);
*pt_prev = NULL;
}
- return nf_hook_ingress(skb);
+ rcu_read_lock();
+ ingress_retval = nf_hook_ingress(skb);
+ rcu_read_unlock();
+ return ingress_retval;
}
#endif /* CONFIG_NETFILTER_INGRESS */
return 0;
}
EXPORT_SYMBOL(netif_receive_skb);
-/* Network device is going away, flush any packets still pending
- * Called with irqs disabled.
- */
-static void flush_backlog(void *arg)
+DEFINE_PER_CPU(struct work_struct, flush_works);
+
+/* Network device is going away, flush any packets still pending */
+static void flush_backlog(struct work_struct *work)
{
- struct net_device *dev = arg;
- struct softnet_data *sd = this_cpu_ptr(&softnet_data);
struct sk_buff *skb, *tmp;
+ struct softnet_data *sd;
+
+ local_bh_disable();
+ sd = this_cpu_ptr(&softnet_data);
+ local_irq_disable();
rps_lock(sd);
skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
- if (skb->dev == dev) {
+ if (skb->dev->reg_state == NETREG_UNREGISTERING) {
__skb_unlink(skb, &sd->input_pkt_queue);
kfree_skb(skb);
input_queue_head_incr(sd);
}
}
rps_unlock(sd);
+ local_irq_enable();
skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
- if (skb->dev == dev) {
+ if (skb->dev->reg_state == NETREG_UNREGISTERING) {
__skb_unlink(skb, &sd->process_queue);
kfree_skb(skb);
input_queue_head_incr(sd);
}
}
+ local_bh_enable();
+}
+
+static void flush_all_backlogs(void)
+{
+ unsigned int cpu;
+
+ get_online_cpus();
+
+ for_each_online_cpu(cpu)
+ queue_work_on(cpu, system_highpri_wq,
+ per_cpu_ptr(&flush_works, cpu));
+
+ for_each_online_cpu(cpu)
+ flush_work(per_cpu_ptr(&flush_works, cpu));
+
+ put_online_cpus();
}
static int napi_gro_complete(struct sk_buff *skb)
static int process_backlog(struct napi_struct *napi, int quota)
{
- int work = 0;
struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
+ bool again = true;
+ int work = 0;
/* Check if we have pending ipi, its better to send them now,
* not waiting net_rx_action() end.
}
napi->weight = weight_p;
- local_irq_disable();
- while (1) {
+ while (again) {
struct sk_buff *skb;
while ((skb = __skb_dequeue(&sd->process_queue))) {
rcu_read_lock();
- local_irq_enable();
__netif_receive_skb(skb);
rcu_read_unlock();
- local_irq_disable();
input_queue_head_incr(sd);
- if (++work >= quota) {
- local_irq_enable();
+ if (++work >= quota)
return work;
- }
+
}
+ local_irq_disable();
rps_lock(sd);
if (skb_queue_empty(&sd->input_pkt_queue)) {
/*
* and we dont need an smp_mb() memory barrier.
*/
napi->state = 0;
- rps_unlock(sd);
-
- break;
+ again = false;
+ } else {
+ skb_queue_splice_tail_init(&sd->input_pkt_queue,
+ &sd->process_queue);
}
-
- skb_queue_splice_tail_init(&sd->input_pkt_queue,
- &sd->process_queue);
rps_unlock(sd);
+ local_irq_enable();
}
- local_irq_enable();
return work;
}
return work;
}
- static void net_rx_action(struct softirq_action *h)
+ static __latent_entropy void net_rx_action(struct softirq_action *h)
{
struct softnet_data *sd = this_cpu_ptr(&softnet_data);
unsigned long time_limit = jiffies + 2;
static int __netdev_adjacent_dev_insert(struct net_device *dev,
struct net_device *adj_dev,
+ u16 ref_nr,
struct list_head *dev_list,
void *private, bool master)
{
adj = __netdev_find_adj(adj_dev, dev_list);
if (adj) {
- adj->ref_nr++;
+ adj->ref_nr += ref_nr;
return 0;
}
adj->dev = adj_dev;
adj->master = master;
- adj->ref_nr = 1;
+ adj->ref_nr = ref_nr;
adj->private = private;
dev_hold(adj_dev);
static void __netdev_adjacent_dev_remove(struct net_device *dev,
struct net_device *adj_dev,
+ u16 ref_nr,
struct list_head *dev_list)
{
struct netdev_adjacent *adj;
BUG();
}
- if (adj->ref_nr > 1) {
- pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name,
- adj->ref_nr-1);
- adj->ref_nr--;
+ if (adj->ref_nr > ref_nr) {
+ pr_debug("%s to %s ref_nr-%d = %d\n", dev->name, adj_dev->name,
+ ref_nr, adj->ref_nr-ref_nr);
+ adj->ref_nr -= ref_nr;
return;
}
static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
struct net_device *upper_dev,
+ u16 ref_nr,
struct list_head *up_list,
struct list_head *down_list,
void *private, bool master)
{
int ret;
- ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private,
- master);
+ ret = __netdev_adjacent_dev_insert(dev, upper_dev, ref_nr, up_list,
+ private, master);
if (ret)
return ret;
- ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private,
- false);
+ ret = __netdev_adjacent_dev_insert(upper_dev, dev, ref_nr, down_list,
+ private, false);
if (ret) {
- __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
+ __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
return ret;
}
}
static int __netdev_adjacent_dev_link(struct net_device *dev,
- struct net_device *upper_dev)
+ struct net_device *upper_dev,
+ u16 ref_nr)
{
- return __netdev_adjacent_dev_link_lists(dev, upper_dev,
+ return __netdev_adjacent_dev_link_lists(dev, upper_dev, ref_nr,
&dev->all_adj_list.upper,
&upper_dev->all_adj_list.lower,
NULL, false);
static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
struct net_device *upper_dev,
+ u16 ref_nr,
struct list_head *up_list,
struct list_head *down_list)
{
- __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
- __netdev_adjacent_dev_remove(upper_dev, dev, down_list);
+ __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
+ __netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list);
}
static void __netdev_adjacent_dev_unlink(struct net_device *dev,
- struct net_device *upper_dev)
+ struct net_device *upper_dev,
+ u16 ref_nr)
{
- __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
+ __netdev_adjacent_dev_unlink_lists(dev, upper_dev, ref_nr,
&dev->all_adj_list.upper,
&upper_dev->all_adj_list.lower);
}
struct net_device *upper_dev,
void *private, bool master)
{
- int ret = __netdev_adjacent_dev_link(dev, upper_dev);
+ int ret = __netdev_adjacent_dev_link(dev, upper_dev, 1);
if (ret)
return ret;
- ret = __netdev_adjacent_dev_link_lists(dev, upper_dev,
+ ret = __netdev_adjacent_dev_link_lists(dev, upper_dev, 1,
&dev->adj_list.upper,
&upper_dev->adj_list.lower,
private, master);
if (ret) {
- __netdev_adjacent_dev_unlink(dev, upper_dev);
+ __netdev_adjacent_dev_unlink(dev, upper_dev, 1);
return ret;
}
static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
struct net_device *upper_dev)
{
- __netdev_adjacent_dev_unlink(dev, upper_dev);
- __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
+ __netdev_adjacent_dev_unlink(dev, upper_dev, 1);
+ __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1,
&dev->adj_list.upper,
&upper_dev->adj_list.lower);
}
list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
pr_debug("Interlinking %s with %s, non-neighbour\n",
i->dev->name, j->dev->name);
- ret = __netdev_adjacent_dev_link(i->dev, j->dev);
+ ret = __netdev_adjacent_dev_link(i->dev, j->dev, i->ref_nr);
if (ret)
goto rollback_mesh;
}
list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
pr_debug("linking %s's upper device %s with %s\n",
upper_dev->name, i->dev->name, dev->name);
- ret = __netdev_adjacent_dev_link(dev, i->dev);
+ ret = __netdev_adjacent_dev_link(dev, i->dev, i->ref_nr);
if (ret)
goto rollback_upper_mesh;
}
list_for_each_entry(i, &dev->all_adj_list.lower, list) {
pr_debug("linking %s's lower device %s with %s\n", dev->name,
i->dev->name, upper_dev->name);
- ret = __netdev_adjacent_dev_link(i->dev, upper_dev);
+ ret = __netdev_adjacent_dev_link(i->dev, upper_dev, i->ref_nr);
if (ret)
goto rollback_lower_mesh;
}
list_for_each_entry(i, &dev->all_adj_list.lower, list) {
if (i == to_i)
break;
- __netdev_adjacent_dev_unlink(i->dev, upper_dev);
+ __netdev_adjacent_dev_unlink(i->dev, upper_dev, i->ref_nr);
}
i = NULL;
list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
if (i == to_i)
break;
- __netdev_adjacent_dev_unlink(dev, i->dev);
+ __netdev_adjacent_dev_unlink(dev, i->dev, i->ref_nr);
}
i = j = NULL;
list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
if (i == to_i && j == to_j)
break;
- __netdev_adjacent_dev_unlink(i->dev, j->dev);
+ __netdev_adjacent_dev_unlink(i->dev, j->dev, i->ref_nr);
}
if (i == to_i)
break;
*/
list_for_each_entry(i, &dev->all_adj_list.lower, list)
list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
- __netdev_adjacent_dev_unlink(i->dev, j->dev);
+ __netdev_adjacent_dev_unlink(i->dev, j->dev, i->ref_nr);
/* remove also the devices itself from lower/upper device
* list
*/
list_for_each_entry(i, &dev->all_adj_list.lower, list)
- __netdev_adjacent_dev_unlink(i->dev, upper_dev);
+ __netdev_adjacent_dev_unlink(i->dev, upper_dev, i->ref_nr);
list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
- __netdev_adjacent_dev_unlink(dev, i->dev);
+ __netdev_adjacent_dev_unlink(dev, i->dev, i->ref_nr);
call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
&changeupper_info.info);
unlist_netdevice(dev);
dev->reg_state = NETREG_UNREGISTERING;
- on_each_cpu(flush_backlog, dev, 1);
}
+ flush_all_backlogs();
synchronize_net();
INIT_LIST_HEAD(&dev->all_adj_list.lower);
INIT_LIST_HEAD(&dev->ptype_all);
INIT_LIST_HEAD(&dev->ptype_specific);
+#ifdef CONFIG_NET_SCHED
+ hash_init(dev->qdisc_hash);
+#endif
dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
setup(dev);
*/
for_each_possible_cpu(i) {
+ struct work_struct *flush = per_cpu_ptr(&flush_works, i);
struct softnet_data *sd = &per_cpu(softnet_data, i);
+ INIT_WORK(flush, flush_backlog);
+
skb_queue_head_init(&sd->input_pkt_queue);
skb_queue_head_init(&sd->process_queue);
INIT_LIST_HEAD(&sd->poll_list);