summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Kconfig.locks12
-rw-r--r--kernel/Makefile1
-rw-r--r--kernel/acct.c4
-rw-r--r--kernel/bpf/Makefile4
-rw-r--r--kernel/bpf/arraymap.c2
-rw-r--r--kernel/bpf/bpf_struct_ops.c633
-rw-r--r--kernel/bpf/bpf_struct_ops_types.h9
-rw-r--r--kernel/bpf/btf.c512
-rw-r--r--kernel/bpf/cgroup.c97
-rw-r--r--kernel/bpf/core.c7
-rw-r--r--kernel/bpf/cpumap.c76
-rw-r--r--kernel/bpf/devmap.c190
-rw-r--r--kernel/bpf/dispatcher.c158
-rw-r--r--kernel/bpf/hashtab.c264
-rw-r--r--kernel/bpf/helpers.c12
-rw-r--r--kernel/bpf/inode.c57
-rw-r--r--kernel/bpf/map_in_map.c3
-rw-r--r--kernel/bpf/offload.c12
-rw-r--r--kernel/bpf/syscall.c695
-rw-r--r--kernel/bpf/trampoline.c157
-rw-r--r--kernel/bpf/verifier.c504
-rw-r--r--kernel/bpf/xskmap.c18
-rw-r--r--kernel/cgroup/cgroup-internal.h4
-rw-r--r--kernel/cgroup/cgroup-v1.c35
-rw-r--r--kernel/cgroup/cgroup.c18
-rw-r--r--kernel/configs.c9
-rw-r--r--kernel/cpu.c13
-rw-r--r--kernel/debug/kdb/kdb_bp.c1
-rw-r--r--kernel/debug/kdb/kdb_bt.c8
-rw-r--r--kernel/debug/kdb/kdb_io.c9
-rw-r--r--kernel/debug/kdb/kdb_main.c3
-rw-r--r--kernel/debug/kdb/kdb_private.h2
-rw-r--r--kernel/events/core.c61
-rw-r--r--kernel/events/internal.h34
-rw-r--r--kernel/events/ring_buffer.c54
-rw-r--r--kernel/extable.c7
-rw-r--r--kernel/fork.c4
-rw-r--r--kernel/gcov/Kconfig2
-rw-r--r--kernel/irq/manage.c7
-rw-r--r--kernel/irq/proc.c42
-rw-r--r--kernel/kallsyms.c12
-rw-r--r--kernel/kprobes.c71
-rw-r--r--kernel/latencytop.c14
-rw-r--r--kernel/locking/lockdep_proc.c19
-rw-r--r--kernel/locking/osq_lock.c23
-rw-r--r--kernel/locking/qspinlock.c13
-rw-r--r--kernel/module.c77
-rw-r--r--kernel/padata.c386
-rw-r--r--kernel/pid.c90
-rw-r--r--kernel/printk/printk.c4
-rw-r--r--kernel/profile.c24
-rw-r--r--kernel/rcu/Kconfig17
-rw-r--r--kernel/rcu/Makefile1
-rw-r--r--kernel/rcu/rcu.h33
-rw-r--r--kernel/rcu/rcu_segcblist.c25
-rw-r--r--kernel/rcu/rcu_segcblist.h25
-rw-r--r--kernel/rcu/rcuperf.c173
-rw-r--r--kernel/rcu/rcutorture.c141
-rw-r--r--kernel/rcu/srcutiny.c2
-rw-r--r--kernel/rcu/srcutree.c11
-rw-r--r--kernel/rcu/tiny.c28
-rw-r--r--kernel/rcu/tree.c324
-rw-r--r--kernel/rcu/tree.h18
-rw-r--r--kernel/rcu/tree_exp.h146
-rw-r--r--kernel/rcu/tree_plugin.h168
-rw-r--r--kernel/rcu/tree_stall.h34
-rw-r--r--kernel/rcu/update.c14
-rw-r--r--kernel/sched/clock.c6
-rw-r--r--kernel/sched/core.c34
-rw-r--r--kernel/sched/cpufreq_schedutil.c2
-rw-r--r--kernel/sched/cpupri.c25
-rw-r--r--kernel/sched/cpupri.h4
-rw-r--r--kernel/sched/cputime.c15
-rw-r--r--kernel/sched/debug.c11
-rw-r--r--kernel/sched/fair.c171
-rw-r--r--kernel/sched/idle.c2
-rw-r--r--kernel/sched/pelt.c20
-rw-r--r--kernel/sched/psi.c52
-rw-r--r--kernel/sched/rt.c83
-rw-r--r--kernel/sched/sched.h24
-rw-r--r--kernel/sched/topology.c39
-rw-r--r--kernel/sched/wait_bit.c1
-rw-r--r--kernel/signal.c2
-rw-r--r--kernel/stop_machine.c32
-rw-r--r--kernel/sys.c25
-rw-r--r--kernel/sysctl-test.c4
-rw-r--r--kernel/sysctl.c2
-rw-r--r--kernel/time/clocksource.c11
-rw-r--r--kernel/time/itimer.c18
-rw-r--r--kernel/time/time.c58
-rw-r--r--kernel/trace/Kconfig360
-rw-r--r--kernel/trace/Makefile3
-rw-r--r--kernel/trace/blktrace.c8
-rw-r--r--kernel/trace/bpf_trace.c27
-rw-r--r--kernel/trace/ftrace.c37
-rw-r--r--kernel/trace/kprobe_event_gen_test.c225
-rw-r--r--kernel/trace/ring_buffer.c135
-rw-r--r--kernel/trace/ring_buffer_benchmark.c2
-rw-r--r--kernel/trace/synth_event_gen_test.c523
-rw-r--r--kernel/trace/trace.c457
-rw-r--r--kernel/trace/trace.h129
-rw-r--r--kernel/trace/trace_boot.c334
-rw-r--r--kernel/trace/trace_branch.c6
-rw-r--r--kernel/trace/trace_dynevent.c212
-rw-r--r--kernel/trace/trace_dynevent.h32
-rw-r--r--kernel/trace/trace_entries.h68
-rw-r--r--kernel/trace/trace_events.c132
-rw-r--r--kernel/trace/trace_events_hist.c1038
-rw-r--r--kernel/trace/trace_events_trigger.c7
-rw-r--r--kernel/trace/trace_export.c106
-rw-r--r--kernel/trace/trace_functions.c8
-rw-r--r--kernel/trace/trace_functions_graph.c14
-rw-r--r--kernel/trace/trace_hwlat.c4
-rw-r--r--kernel/trace/trace_irqsoff.c8
-rw-r--r--kernel/trace/trace_kdb.c8
-rw-r--r--kernel/trace/trace_kprobe.c254
-rw-r--r--kernel/trace/trace_mmiotrace.c12
-rw-r--r--kernel/trace/trace_output.c2
-rw-r--r--kernel/trace/trace_probe.c6
-rw-r--r--kernel/trace/trace_sched_switch.c4
-rw-r--r--kernel/trace/trace_sched_wakeup.c20
-rw-r--r--kernel/trace/trace_selftest.c26
-rw-r--r--kernel/trace/trace_seq.c3
-rw-r--r--kernel/trace/trace_stat.c31
-rw-r--r--kernel/trace/trace_syscalls.c59
-rw-r--r--kernel/trace/trace_uprobe.c11
-rw-r--r--kernel/tsacct.c9
-rw-r--r--kernel/workqueue.c2
128 files changed, 7952 insertions, 2644 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index e0852dc333ac..3de8fd11873b 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -101,7 +101,7 @@ config UNINLINE_SPIN_UNLOCK
# unlock and unlock_irq functions are inlined when:
# - DEBUG_SPINLOCK=n and ARCH_INLINE_*LOCK=y
# or
-# - DEBUG_SPINLOCK=n and PREEMPT=n
+# - DEBUG_SPINLOCK=n and PREEMPTION=n
#
# unlock_bh and unlock_irqrestore functions are inlined when:
# - DEBUG_SPINLOCK=n and ARCH_INLINE_*LOCK=y
@@ -139,7 +139,7 @@ config INLINE_SPIN_UNLOCK_BH
config INLINE_SPIN_UNLOCK_IRQ
def_bool y
- depends on !PREEMPT || ARCH_INLINE_SPIN_UNLOCK_IRQ
+ depends on !PREEMPTION || ARCH_INLINE_SPIN_UNLOCK_IRQ
config INLINE_SPIN_UNLOCK_IRQRESTORE
def_bool y
@@ -168,7 +168,7 @@ config INLINE_READ_LOCK_IRQSAVE
config INLINE_READ_UNLOCK
def_bool y
- depends on !PREEMPT || ARCH_INLINE_READ_UNLOCK
+ depends on !PREEMPTION || ARCH_INLINE_READ_UNLOCK
config INLINE_READ_UNLOCK_BH
def_bool y
@@ -176,7 +176,7 @@ config INLINE_READ_UNLOCK_BH
config INLINE_READ_UNLOCK_IRQ
def_bool y
- depends on !PREEMPT || ARCH_INLINE_READ_UNLOCK_IRQ
+ depends on !PREEMPTION || ARCH_INLINE_READ_UNLOCK_IRQ
config INLINE_READ_UNLOCK_IRQRESTORE
def_bool y
@@ -205,7 +205,7 @@ config INLINE_WRITE_LOCK_IRQSAVE
config INLINE_WRITE_UNLOCK
def_bool y
- depends on !PREEMPT || ARCH_INLINE_WRITE_UNLOCK
+ depends on !PREEMPTION || ARCH_INLINE_WRITE_UNLOCK
config INLINE_WRITE_UNLOCK_BH
def_bool y
@@ -213,7 +213,7 @@ config INLINE_WRITE_UNLOCK_BH
config INLINE_WRITE_UNLOCK_IRQ
def_bool y
- depends on !PREEMPT || ARCH_INLINE_WRITE_UNLOCK_IRQ
+ depends on !PREEMPTION || ARCH_INLINE_WRITE_UNLOCK_IRQ
config INLINE_WRITE_UNLOCK_IRQRESTORE
def_bool y
diff --git a/kernel/Makefile b/kernel/Makefile
index f2cc0d118a0b..4cb4130ced32 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -27,6 +27,7 @@ KCOV_INSTRUMENT_softirq.o := n
# and produce insane amounts of uninteresting coverage.
KCOV_INSTRUMENT_module.o := n
KCOV_INSTRUMENT_extable.o := n
+KCOV_INSTRUMENT_stacktrace.o := n
# Don't self-instrument.
KCOV_INSTRUMENT_kcov.o := n
KASAN_SANITIZE_kcov.o := n
diff --git a/kernel/acct.c b/kernel/acct.c
index 81f9831a7859..11ff4a596d6b 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -416,6 +416,7 @@ static void fill_ac(acct_t *ac)
{
struct pacct_struct *pacct = &current->signal->pacct;
u64 elapsed, run_time;
+ time64_t btime;
struct tty_struct *tty;
/*
@@ -448,7 +449,8 @@ static void fill_ac(acct_t *ac)
}
#endif
do_div(elapsed, AHZ);
- ac->ac_btime = get_seconds() - elapsed;
+ btime = ktime_get_real_seconds() - elapsed;
+ ac->ac_btime = clamp_t(time64_t, btime, 0, U32_MAX);
#if ACCT_VERSION==2
ac->ac_ahz = AHZ;
#endif
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 3f671bf617e8..046ce5d98033 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -8,6 +8,7 @@ obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o
obj-$(CONFIG_BPF_SYSCALL) += disasm.o
obj-$(CONFIG_BPF_JIT) += trampoline.o
obj-$(CONFIG_BPF_SYSCALL) += btf.o
+obj-$(CONFIG_BPF_JIT) += dispatcher.o
ifeq ($(CONFIG_NET),y)
obj-$(CONFIG_BPF_SYSCALL) += devmap.o
obj-$(CONFIG_BPF_SYSCALL) += cpumap.o
@@ -26,3 +27,6 @@ endif
ifeq ($(CONFIG_SYSFS),y)
obj-$(CONFIG_DEBUG_INFO_BTF) += sysfs_btf.o
endif
+ifeq ($(CONFIG_BPF_JIT),y)
+obj-$(CONFIG_BPF_SYSCALL) += bpf_struct_ops.o
+endif
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index f0d19bbb9211..95d77770353c 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -503,6 +503,8 @@ const struct bpf_map_ops array_map_ops = {
.map_mmap = array_map_mmap,
.map_seq_show_elem = array_map_seq_show_elem,
.map_check_btf = array_map_check_btf,
+ .map_lookup_batch = generic_map_lookup_batch,
+ .map_update_batch = generic_map_update_batch,
};
const struct bpf_map_ops percpu_array_map_ops = {
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
new file mode 100644
index 000000000000..042f95534f86
--- /dev/null
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -0,0 +1,633 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2019 Facebook */
+
+#include <linux/bpf.h>
+#include <linux/bpf_verifier.h>
+#include <linux/btf.h>
+#include <linux/filter.h>
+#include <linux/slab.h>
+#include <linux/numa.h>
+#include <linux/seq_file.h>
+#include <linux/refcount.h>
+#include <linux/mutex.h>
+
+enum bpf_struct_ops_state {
+ BPF_STRUCT_OPS_STATE_INIT,
+ BPF_STRUCT_OPS_STATE_INUSE,
+ BPF_STRUCT_OPS_STATE_TOBEFREE,
+};
+
+#define BPF_STRUCT_OPS_COMMON_VALUE \
+ refcount_t refcnt; \
+ enum bpf_struct_ops_state state
+
+struct bpf_struct_ops_value {
+ BPF_STRUCT_OPS_COMMON_VALUE;
+ char data[0] ____cacheline_aligned_in_smp;
+};
+
+struct bpf_struct_ops_map {
+ struct bpf_map map;
+ const struct bpf_struct_ops *st_ops;
+ /* protect map_update */
+ struct mutex lock;
+ /* progs has all the bpf_prog that is populated
+ * to the func ptr of the kernel's struct
+ * (in kvalue.data).
+ */
+ struct bpf_prog **progs;
+ /* image is a page that has all the trampolines
+ * that stores the func args before calling the bpf_prog.
+ * A PAGE_SIZE "image" is enough to store all trampoline for
+ * "progs[]".
+ */
+ void *image;
+ /* uvalue->data stores the kernel struct
+ * (e.g. tcp_congestion_ops) that is more useful
+ * to userspace than the kvalue. For example,
+ * the bpf_prog's id is stored instead of the kernel
+ * address of a func ptr.
+ */
+ struct bpf_struct_ops_value *uvalue;
+ /* kvalue.data stores the actual kernel's struct
+ * (e.g. tcp_congestion_ops) that will be
+ * registered to the kernel subsystem.
+ */
+ struct bpf_struct_ops_value kvalue;
+};
+
+#define VALUE_PREFIX "bpf_struct_ops_"
+#define VALUE_PREFIX_LEN (sizeof(VALUE_PREFIX) - 1)
+
+/* bpf_struct_ops_##_name (e.g. bpf_struct_ops_tcp_congestion_ops) is
+ * the map's value exposed to the userspace and its btf-type-id is
+ * stored at the map->btf_vmlinux_value_type_id.
+ *
+ */
+#define BPF_STRUCT_OPS_TYPE(_name) \
+extern struct bpf_struct_ops bpf_##_name; \
+ \
+struct bpf_struct_ops_##_name { \
+ BPF_STRUCT_OPS_COMMON_VALUE; \
+ struct _name data ____cacheline_aligned_in_smp; \
+};
+#include "bpf_struct_ops_types.h"
+#undef BPF_STRUCT_OPS_TYPE
+
+enum {
+#define BPF_STRUCT_OPS_TYPE(_name) BPF_STRUCT_OPS_TYPE_##_name,
+#include "bpf_struct_ops_types.h"
+#undef BPF_STRUCT_OPS_TYPE
+ __NR_BPF_STRUCT_OPS_TYPE,
+};
+
+static struct bpf_struct_ops * const bpf_struct_ops[] = {
+#define BPF_STRUCT_OPS_TYPE(_name) \
+ [BPF_STRUCT_OPS_TYPE_##_name] = &bpf_##_name,
+#include "bpf_struct_ops_types.h"
+#undef BPF_STRUCT_OPS_TYPE
+};
+
+const struct bpf_verifier_ops bpf_struct_ops_verifier_ops = {
+};
+
+const struct bpf_prog_ops bpf_struct_ops_prog_ops = {
+};
+
+static const struct btf_type *module_type;
+
+void bpf_struct_ops_init(struct btf *btf, struct bpf_verifier_log *log)
+{
+ s32 type_id, value_id, module_id;
+ const struct btf_member *member;
+ struct bpf_struct_ops *st_ops;
+ const struct btf_type *t;
+ char value_name[128];
+ const char *mname;
+ u32 i, j;
+
+ /* Ensure BTF type is emitted for "struct bpf_struct_ops_##_name" */
+#define BPF_STRUCT_OPS_TYPE(_name) BTF_TYPE_EMIT(struct bpf_struct_ops_##_name);
+#include "bpf_struct_ops_types.h"
+#undef BPF_STRUCT_OPS_TYPE
+
+ module_id = btf_find_by_name_kind(btf, "module", BTF_KIND_STRUCT);
+ if (module_id < 0) {
+ pr_warn("Cannot find struct module in btf_vmlinux\n");
+ return;
+ }
+ module_type = btf_type_by_id(btf, module_id);
+
+ for (i = 0; i < ARRAY_SIZE(bpf_struct_ops); i++) {
+ st_ops = bpf_struct_ops[i];
+
+ if (strlen(st_ops->name) + VALUE_PREFIX_LEN >=
+ sizeof(value_name)) {
+ pr_warn("struct_ops name %s is too long\n",
+ st_ops->name);
+ continue;
+ }
+ sprintf(value_name, "%s%s", VALUE_PREFIX, st_ops->name);
+
+ value_id = btf_find_by_name_kind(btf, value_name,
+ BTF_KIND_STRUCT);
+ if (value_id < 0) {
+ pr_warn("Cannot find struct %s in btf_vmlinux\n",
+ value_name);
+ continue;
+ }
+
+ type_id = btf_find_by_name_kind(btf, st_ops->name,
+ BTF_KIND_STRUCT);
+ if (type_id < 0) {
+ pr_warn("Cannot find struct %s in btf_vmlinux\n",
+ st_ops->name);
+ continue;
+ }
+ t = btf_type_by_id(btf, type_id);
+ if (btf_type_vlen(t) > BPF_STRUCT_OPS_MAX_NR_MEMBERS) {
+ pr_warn("Cannot support #%u members in struct %s\n",
+ btf_type_vlen(t), st_ops->name);
+ continue;
+ }
+
+ for_each_member(j, t, member) {
+ const struct btf_type *func_proto;
+
+ mname = btf_name_by_offset(btf, member->name_off);
+ if (!*mname) {
+ pr_warn("anon member in struct %s is not supported\n",
+ st_ops->name);
+ break;
+ }
+
+ if (btf_member_bitfield_size(t, member)) {
+ pr_warn("bit field member %s in struct %s is not supported\n",
+ mname, st_ops->name);
+ break;
+ }
+
+ func_proto = btf_type_resolve_func_ptr(btf,
+ member->type,
+ NULL);
+ if (func_proto &&
+ btf_distill_func_proto(log, btf,
+ func_proto, mname,
+ &st_ops->func_models[j])) {
+ pr_warn("Error in parsing func ptr %s in struct %s\n",
+ mname, st_ops->name);
+ break;
+ }
+ }
+
+ if (j == btf_type_vlen(t)) {
+ if (st_ops->init(btf)) {
+ pr_warn("Error in init bpf_struct_ops %s\n",
+ st_ops->name);
+ } else {
+ st_ops->type_id = type_id;
+ st_ops->type = t;
+ st_ops->value_id = value_id;
+ st_ops->value_type = btf_type_by_id(btf,
+ value_id);
+ }
+ }
+ }
+}
+
+extern struct btf *btf_vmlinux;
+
+static const struct bpf_struct_ops *
+bpf_struct_ops_find_value(u32 value_id)
+{
+ unsigned int i;
+
+ if (!value_id || !btf_vmlinux)
+ return NULL;
+
+ for (i = 0; i < ARRAY_SIZE(bpf_struct_ops); i++) {
+ if (bpf_struct_ops[i]->value_id == value_id)
+ return bpf_struct_ops[i];
+ }
+
+ return NULL;
+}
+
+const struct bpf_struct_ops *bpf_struct_ops_find(u32 type_id)
+{
+ unsigned int i;
+
+ if (!type_id || !btf_vmlinux)
+ return NULL;
+
+ for (i = 0; i < ARRAY_SIZE(bpf_struct_ops); i++) {
+ if (bpf_struct_ops[i]->type_id == type_id)
+ return bpf_struct_ops[i];
+ }
+
+ return NULL;
+}
+
+static int bpf_struct_ops_map_get_next_key(struct bpf_map *map, void *key,
+ void *next_key)
+{
+ if (key && *(u32 *)key == 0)
+ return -ENOENT;
+
+ *(u32 *)next_key = 0;
+ return 0;
+}
+
+int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key,
+ void *value)
+{
+ struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
+ struct bpf_struct_ops_value *uvalue, *kvalue;
+ enum bpf_struct_ops_state state;
+
+ if (unlikely(*(u32 *)key != 0))
+ return -ENOENT;
+
+ kvalue = &st_map->kvalue;
+ /* Pair with smp_store_release() during map_update */
+ state = smp_load_acquire(&kvalue->state);
+ if (state == BPF_STRUCT_OPS_STATE_INIT) {
+ memset(value, 0, map->value_size);
+ return 0;
+ }
+
+ /* No lock is needed. state and refcnt do not need
+ * to be updated together under atomic context.
+ */
+ uvalue = (struct bpf_struct_ops_value *)value;
+ memcpy(uvalue, st_map->uvalue, map->value_size);
+ uvalue->state = state;
+ refcount_set(&uvalue->refcnt, refcount_read(&kvalue->refcnt));
+
+ return 0;
+}
+
+static void *bpf_struct_ops_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ return ERR_PTR(-EINVAL);
+}
+
+static void bpf_struct_ops_map_put_progs(struct bpf_struct_ops_map *st_map)
+{
+ const struct btf_type *t = st_map->st_ops->type;
+ u32 i;
+
+ for (i = 0; i < btf_type_vlen(t); i++) {
+ if (st_map->progs[i]) {
+ bpf_prog_put(st_map->progs[i]);
+ st_map->progs[i] = NULL;
+ }
+ }
+}
+
+static int check_zero_holes(const struct btf_type *t, void *data)
+{
+ const struct btf_member *member;
+ u32 i, moff, msize, prev_mend = 0;
+ const struct btf_type *mtype;
+
+ for_each_member(i, t, member) {
+ moff = btf_member_bit_offset(t, member) / 8;
+ if (moff > prev_mend &&
+ memchr_inv(data + prev_mend, 0, moff - prev_mend))
+ return -EINVAL;
+
+ mtype = btf_type_by_id(btf_vmlinux, member->type);
+ mtype = btf_resolve_size(btf_vmlinux, mtype, &msize,
+ NULL, NULL);
+ if (IS_ERR(mtype))
+ return PTR_ERR(mtype);
+ prev_mend = moff + msize;
+ }
+
+ if (t->size > prev_mend &&
+ memchr_inv(data + prev_mend, 0, t->size - prev_mend))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 flags)
+{
+ struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
+ const struct bpf_struct_ops *st_ops = st_map->st_ops;
+ struct bpf_struct_ops_value *uvalue, *kvalue;
+ const struct btf_member *member;
+ const struct btf_type *t = st_ops->type;
+ void *udata, *kdata;
+ int prog_fd, err = 0;
+ void *image;
+ u32 i;
+
+ if (flags)
+ return -EINVAL;
+
+ if (*(u32 *)key != 0)
+ return -E2BIG;
+
+ err = check_zero_holes(st_ops->value_type, value);
+ if (err)
+ return err;
+
+ uvalue = (struct bpf_struct_ops_value *)value;
+ err = check_zero_holes(t, uvalue->data);
+ if (err)
+ return err;
+
+ if (uvalue->state || refcount_read(&uvalue->refcnt))
+ return -EINVAL;
+
+ uvalue = (struct bpf_struct_ops_value *)st_map->uvalue;
+ kvalue = (struct bpf_struct_ops_value *)&st_map->kvalue;
+
+ mutex_lock(&st_map->lock);
+
+ if (kvalue->state != BPF_STRUCT_OPS_STATE_INIT) {
+ err = -EBUSY;
+ goto unlock;
+ }
+
+ memcpy(uvalue, value, map->value_size);
+
+ udata = &uvalue->data;
+ kdata = &kvalue->data;
+ image = st_map->image;
+
+ for_each_member(i, t, member) {
+ const struct btf_type *mtype, *ptype;
+ struct bpf_prog *prog;
+ u32 moff;
+
+ moff = btf_member_bit_offset(t, member) / 8;
+ ptype = btf_type_resolve_ptr(btf_vmlinux, member->type, NULL);
+ if (ptype == module_type) {
+ if (*(void **)(udata + moff))
+ goto reset_unlock;
+ *(void **)(kdata + moff) = BPF_MODULE_OWNER;
+ continue;
+ }
+
+ err = st_ops->init_member(t, member, kdata, udata);
+ if (err < 0)
+ goto reset_unlock;
+
+ /* The ->init_member() has handled this member */
+ if (err > 0)
+ continue;
+
+ /* If st_ops->init_member does not handle it,
+ * we will only handle func ptrs and zero-ed members
+ * here. Reject everything else.
+ */
+
+ /* All non func ptr member must be 0 */
+ if (!ptype || !btf_type_is_func_proto(ptype)) {
+ u32 msize;
+
+ mtype = btf_type_by_id(btf_vmlinux, member->type);
+ mtype = btf_resolve_size(btf_vmlinux, mtype, &msize,
+ NULL, NULL);
+ if (IS_ERR(mtype)) {
+ err = PTR_ERR(mtype);
+ goto reset_unlock;
+ }
+
+ if (memchr_inv(udata + moff, 0, msize)) {
+ err = -EINVAL;
+ goto reset_unlock;
+ }
+
+ continue;
+ }
+
+ prog_fd = (int)(*(unsigned long *)(udata + moff));
+ /* Similar check as the attr->attach_prog_fd */
+ if (!prog_fd)
+ continue;
+
+ prog = bpf_prog_get(prog_fd);
+ if (IS_ERR(prog)) {
+ err = PTR_ERR(prog);
+ goto reset_unlock;
+ }
+ st_map->progs[i] = prog;
+
+ if (prog->type != BPF_PROG_TYPE_STRUCT_OPS ||
+ prog->aux->attach_btf_id != st_ops->type_id ||
+ prog->expected_attach_type != i) {
+ err = -EINVAL;
+ goto reset_unlock;
+ }
+
+ err = arch_prepare_bpf_trampoline(image,
+ st_map->image + PAGE_SIZE,
+ &st_ops->func_models[i], 0,
+ &prog, 1, NULL, 0, NULL);
+ if (err < 0)
+ goto reset_unlock;
+
+ *(void **)(kdata + moff) = image;
+ image += err;
+
+ /* put prog_id to udata */
+ *(unsigned long *)(udata + moff) = prog->aux->id;
+ }
+
+ refcount_set(&kvalue->refcnt, 1);
+ bpf_map_inc(map);
+
+ set_memory_ro((long)st_map->image, 1);
+ set_memory_x((long)st_map->image, 1);
+ err = st_ops->reg(kdata);
+ if (likely(!err)) {
+ /* Pair with smp_load_acquire() during lookup_elem().
+ * It ensures the above udata updates (e.g. prog->aux->id)
+ * can be seen once BPF_STRUCT_OPS_STATE_INUSE is set.
+ */
+ smp_store_release(&kvalue->state, BPF_STRUCT_OPS_STATE_INUSE);
+ goto unlock;
+ }
+
+ /* Error during st_ops->reg(). It is very unlikely since
+ * the above init_member() should have caught it earlier
+ * before reg(). The only possibility is if there was a race
+ * in registering the struct_ops (under the same name) to
+ * a sub-system through different struct_ops's maps.
+ */
+ set_memory_nx((long)st_map->image, 1);
+ set_memory_rw((long)st_map->image, 1);
+ bpf_map_put(map);
+
+reset_unlock:
+ bpf_struct_ops_map_put_progs(st_map);
+ memset(uvalue, 0, map->value_size);
+ memset(kvalue, 0, map->value_size);
+unlock:
+ mutex_unlock(&st_map->lock);
+ return err;
+}
+
+static int bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key)
+{
+ enum bpf_struct_ops_state prev_state;
+ struct bpf_struct_ops_map *st_map;
+
+ st_map = (struct bpf_struct_ops_map *)map;
+ prev_state = cmpxchg(&st_map->kvalue.state,
+ BPF_STRUCT_OPS_STATE_INUSE,
+ BPF_STRUCT_OPS_STATE_TOBEFREE);
+ if (prev_state == BPF_STRUCT_OPS_STATE_INUSE) {
+ st_map->st_ops->unreg(&st_map->kvalue.data);
+ if (refcount_dec_and_test(&st_map->kvalue.refcnt))
+ bpf_map_put(map);
+ }
+
+ return 0;
+}
+
+static void bpf_struct_ops_map_seq_show_elem(struct bpf_map *map, void *key,
+ struct seq_file *m)
+{
+ void *value;
+ int err;
+
+ value = kmalloc(map->value_size, GFP_USER | __GFP_NOWARN);
+ if (!value)
+ return;
+
+ err = bpf_struct_ops_map_sys_lookup_elem(map, key, value);
+ if (!err) {
+ btf_type_seq_show(btf_vmlinux, map->btf_vmlinux_value_type_id,
+ value, m);
+ seq_puts(m, "\n");
+ }
+
+ kfree(value);
+}
+
+static void bpf_struct_ops_map_free(struct bpf_map *map)
+{
+ struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
+
+ if (st_map->progs)
+ bpf_struct_ops_map_put_progs(st_map);
+ bpf_map_area_free(st_map->progs);
+ bpf_jit_free_exec(st_map->image);
+ bpf_map_area_free(st_map->uvalue);
+ bpf_map_area_free(st_map);
+}
+
+static int bpf_struct_ops_map_alloc_check(union bpf_attr *attr)
+{
+ if (attr->key_size != sizeof(unsigned int) || attr->max_entries != 1 ||
+ attr->map_flags || !attr->btf_vmlinux_value_type_id)
+ return -EINVAL;
+ return 0;
+}
+
+static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
+{
+ const struct bpf_struct_ops *st_ops;
+ size_t map_total_size, st_map_size;
+ struct bpf_struct_ops_map *st_map;
+ const struct btf_type *t, *vt;
+ struct bpf_map_memory mem;
+ struct bpf_map *map;
+ int err;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return ERR_PTR(-EPERM);
+
+ st_ops = bpf_struct_ops_find_value(attr->btf_vmlinux_value_type_id);
+ if (!st_ops)
+ return ERR_PTR(-ENOTSUPP);
+
+ vt = st_ops->value_type;
+ if (attr->value_size != vt->size)
+ return ERR_PTR(-EINVAL);
+
+ t = st_ops->type;
+
+ st_map_size = sizeof(*st_map) +
+ /* kvalue stores the
+ * struct bpf_struct_ops_tcp_congestions_ops
+ */
+ (vt->size - sizeof(struct bpf_struct_ops_value));
+ map_total_size = st_map_size +
+ /* uvalue */
+ sizeof(vt->size) +
+ /* struct bpf_progs **progs */
+ btf_type_vlen(t) * sizeof(struct bpf_prog *);
+ err = bpf_map_charge_init(&mem, map_total_size);
+ if (err < 0)
+ return ERR_PTR(err);
+
+ st_map = bpf_map_area_alloc(st_map_size, NUMA_NO_NODE);
+ if (!st_map) {
+ bpf_map_charge_finish(&mem);
+ return ERR_PTR(-ENOMEM);
+ }
+ st_map->st_ops = st_ops;
+ map = &st_map->map;
+
+ st_map->uvalue = bpf_map_area_alloc(vt->size, NUMA_NO_NODE);
+ st_map->progs =
+ bpf_map_area_alloc(btf_type_vlen(t) * sizeof(struct bpf_prog *),
+ NUMA_NO_NODE);
+ st_map->image = bpf_jit_alloc_exec(PAGE_SIZE);
+ if (!st_map->uvalue || !st_map->progs || !st_map->image) {
+ bpf_struct_ops_map_free(map);
+ bpf_map_charge_finish(&mem);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ mutex_init(&st_map->lock);
+ set_vm_flush_reset_perms(st_map->image);
+ bpf_map_init_from_attr(map, attr);
+ bpf_map_charge_move(&map->memory, &mem);
+
+ return map;
+}
+
+const struct bpf_map_ops bpf_struct_ops_map_ops = {
+ .map_alloc_check = bpf_struct_ops_map_alloc_check,
+ .map_alloc = bpf_struct_ops_map_alloc,
+ .map_free = bpf_struct_ops_map_free,
+ .map_get_next_key = bpf_struct_ops_map_get_next_key,
+ .map_lookup_elem = bpf_struct_ops_map_lookup_elem,
+ .map_delete_elem = bpf_struct_ops_map_delete_elem,
+ .map_update_elem = bpf_struct_ops_map_update_elem,
+ .map_seq_show_elem = bpf_struct_ops_map_seq_show_elem,
+};
+
+/* "const void *" because some subsystem is
+ * passing a const (e.g. const struct tcp_congestion_ops *)
+ */
+bool bpf_struct_ops_get(const void *kdata)
+{
+ struct bpf_struct_ops_value *kvalue;
+
+ kvalue = container_of(kdata, struct bpf_struct_ops_value, data);
+
+ return refcount_inc_not_zero(&kvalue->refcnt);
+}
+
+void bpf_struct_ops_put(const void *kdata)
+{
+ struct bpf_struct_ops_value *kvalue;
+
+ kvalue = container_of(kdata, struct bpf_struct_ops_value, data);
+ if (refcount_dec_and_test(&kvalue->refcnt)) {
+ struct bpf_struct_ops_map *st_map;
+
+ st_map = container_of(kvalue, struct bpf_struct_ops_map,
+ kvalue);
+ bpf_map_put(&st_map->map);
+ }
+}
diff --git a/kernel/bpf/bpf_struct_ops_types.h b/kernel/bpf/bpf_struct_ops_types.h
new file mode 100644
index 000000000000..066d83ea1c99
--- /dev/null
+++ b/kernel/bpf/bpf_struct_ops_types.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* internal file - do not include directly */
+
+#ifdef CONFIG_BPF_JIT
+#ifdef CONFIG_INET
+#include <net/tcp.h>
+BPF_STRUCT_OPS_TYPE(tcp_congestion_ops)
+#endif
+#endif
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index ed2075884724..805c43b083e9 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -180,11 +180,6 @@
*/
#define BTF_MAX_SIZE (16 * 1024 * 1024)
-#define for_each_member(i, struct_type, member) \
- for (i = 0, member = btf_type_member(struct_type); \
- i < btf_type_vlen(struct_type); \
- i++, member++)
-
#define for_each_member_from(i, from, struct_type, member) \
for (i = from, member = btf_type_member(struct_type) + from; \
i < btf_type_vlen(struct_type); \
@@ -281,6 +276,11 @@ static const char * const btf_kind_str[NR_BTF_KINDS] = {
[BTF_KIND_DATASEC] = "DATASEC",
};
+static const char *btf_type_str(const struct btf_type *t)
+{
+ return btf_kind_str[BTF_INFO_KIND(t->info)];
+}
+
struct btf_kind_operations {
s32 (*check_meta)(struct btf_verifier_env *env,
const struct btf_type *t,
@@ -382,6 +382,65 @@ static bool btf_type_is_datasec(const struct btf_type *t)
return BTF_INFO_KIND(t->info) == BTF_KIND_DATASEC;
}
+s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind)
+{
+ const struct btf_type *t;
+ const char *tname;
+ u32 i;
+
+ for (i = 1; i <= btf->nr_types; i++) {
+ t = btf->types[i];
+ if (BTF_INFO_KIND(t->info) != kind)
+ continue;
+
+ tname = btf_name_by_offset(btf, t->name_off);
+ if (!strcmp(tname, name))
+ return i;
+ }
+
+ return -ENOENT;
+}
+
+const struct btf_type *btf_type_skip_modifiers(const struct btf *btf,
+ u32 id, u32 *res_id)
+{
+ const struct btf_type *t = btf_type_by_id(btf, id);
+
+ while (btf_type_is_modifier(t)) {
+ id = t->type;
+ t = btf_type_by_id(btf, t->type);
+ }
+
+ if (res_id)
+ *res_id = id;
+
+ return t;
+}
+
+const struct btf_type *btf_type_resolve_ptr(const struct btf *btf,
+ u32 id, u32 *res_id)
+{
+ const struct btf_type *t;
+
+ t = btf_type_skip_modifiers(btf, id, NULL);
+ if (!btf_type_is_ptr(t))
+ return NULL;
+
+ return btf_type_skip_modifiers(btf, t->type, res_id);
+}
+
+const struct btf_type *btf_type_resolve_func_ptr(const struct btf *btf,
+ u32 id, u32 *res_id)
+{
+ const struct btf_type *ptype;
+
+ ptype = btf_type_resolve_ptr(btf, id, res_id);
+ if (ptype && btf_type_is_func_proto(ptype))
+ return ptype;
+
+ return NULL;
+}
+
/* Types that act only as a source, not sink or intermediate
* type when resolving.
*/
@@ -446,30 +505,6 @@ static const char *btf_int_encoding_str(u8 encoding)
return "UNKN";
}
-static u16 btf_type_vlen(const struct btf_type *t)
-{
- return BTF_INFO_VLEN(t->info);
-}
-
-static bool btf_type_kflag(const struct btf_type *t)
-{
- return BTF_INFO_KFLAG(t->info);
-}
-
-static u32 btf_member_bit_offset(const struct btf_type *struct_type,
- const struct btf_member *member)
-{
- return btf_type_kflag(struct_type) ? BTF_MEMBER_BIT_OFFSET(member->offset)
- : member->offset;
-}
-
-static u32 btf_member_bitfield_size(const struct btf_type *struct_type,
- const struct btf_member *member)
-{
- return btf_type_kflag(struct_type) ? BTF_MEMBER_BITFIELD_SIZE(member->offset)
- : 0;
-}
-
static u32 btf_type_int(const struct btf_type *t)
{
return *(u32 *)(t + 1);
@@ -480,11 +515,6 @@ static const struct btf_array *btf_type_array(const struct btf_type *t)
return (const struct btf_array *)(t + 1);
}
-static const struct btf_member *btf_type_member(const struct btf_type *t)
-{
- return (const struct btf_member *)(t + 1);
-}
-
static const struct btf_enum *btf_type_enum(const struct btf_type *t)
{
return (const struct btf_enum *)(t + 1);
@@ -1057,7 +1087,7 @@ static const struct resolve_vertex *env_stack_peak(struct btf_verifier_env *env)
* *elem_type: same as return type ("struct X")
* *total_nelems: 1
*/
-static const struct btf_type *
+const struct btf_type *
btf_resolve_size(const struct btf *btf, const struct btf_type *type,
u32 *type_size, const struct btf_type **elem_type,
u32 *total_nelems)
@@ -1111,8 +1141,10 @@ resolved:
return ERR_PTR(-EINVAL);
*type_size = nelems * size;
- *total_nelems = nelems;
- *elem_type = type;
+ if (total_nelems)
+ *total_nelems = nelems;
+ if (elem_type)
+ *elem_type = type;
return array_type ? : type;
}
@@ -1826,7 +1858,10 @@ static void btf_modifier_seq_show(const struct btf *btf,
u32 type_id, void *data,
u8 bits_offset, struct seq_file *m)
{
- t = btf_type_id_resolve(btf, &type_id);
+ if (btf->resolved_ids)
+ t = btf_type_id_resolve(btf, &type_id);
+ else
+ t = btf_type_skip_modifiers(btf, type_id, NULL);
btf_type_ops(t)->seq_show(btf, t, type_id, data, bits_offset, m);
}
@@ -2621,8 +2656,8 @@ static s32 btf_func_check_meta(struct btf_verifier_env *env,
return -EINVAL;
}
- if (btf_type_vlen(t)) {
- btf_verifier_log_type(env, t, "vlen != 0");
+ if (btf_type_vlen(t) > BTF_FUNC_GLOBAL) {
+ btf_verifier_log_type(env, t, "Invalid func linkage");
return -EINVAL;
}
@@ -3476,7 +3511,8 @@ static u8 bpf_ctx_convert_map[] = {
static const struct btf_member *
btf_get_prog_ctx_type(struct bpf_verifier_log *log, struct btf *btf,
- const struct btf_type *t, enum bpf_prog_type prog_type)
+ const struct btf_type *t, enum bpf_prog_type prog_type,
+ int arg)
{
const struct btf_type *conv_struct;
const struct btf_type *ctx_struct;
@@ -3497,12 +3533,13 @@ btf_get_prog_ctx_type(struct bpf_verifier_log *log, struct btf *btf,
* is not supported yet.
* BPF_PROG_TYPE_RAW_TRACEPOINT is fine.
*/
- bpf_log(log, "BPF program ctx type is not a struct\n");
+ if (log->level & BPF_LOG_LEVEL)
+ bpf_log(log, "arg#%d type is not a struct\n", arg);
return NULL;
}
tname = btf_name_by_offset(btf, t->name_off);
if (!tname) {
- bpf_log(log, "BPF program ctx struct doesn't have a name\n");
+ bpf_log(log, "arg#%d struct doesn't have a name\n", arg);
return NULL;
}
/* prog_type is valid bpf program type. No need for bounds check. */
@@ -3535,11 +3572,12 @@ btf_get_prog_ctx_type(struct bpf_verifier_log *log, struct btf *btf,
static int btf_translate_to_vmlinux(struct bpf_verifier_log *log,
struct btf *btf,
const struct btf_type *t,
- enum bpf_prog_type prog_type)
+ enum bpf_prog_type prog_type,
+ int arg)
{
const struct btf_member *prog_ctx_type, *kern_ctx_type;
- prog_ctx_type = btf_get_prog_ctx_type(log, btf, t, prog_type);
+ prog_ctx_type = btf_get_prog_ctx_type(log, btf, t, prog_type, arg);
if (!prog_ctx_type)
return -ENOENT;
kern_ctx_type = prog_ctx_type + 1;
@@ -3605,6 +3643,8 @@ struct btf *btf_parse_vmlinux(void)
goto errout;
}
+ bpf_struct_ops_init(btf, log);
+
btf_verifier_env_free(env);
refcount_set(&btf->refcnt, 1);
return btf;
@@ -3629,6 +3669,19 @@ struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog)
}
}
+static bool is_string_ptr(struct btf *btf, const struct btf_type *t)
+{
+ /* t comes in already as a pointer */
+ t = btf_type_by_id(btf, t->type);
+
+ /* allow const */
+ if (BTF_INFO_KIND(t->info) == BTF_KIND_CONST)
+ t = btf_type_by_id(btf, t->type);
+
+ /* char, signed char, unsigned char */
+ return btf_type_is_int(t) && t->size == 1;
+}
+
bool btf_ctx_access(int off, int size, enum bpf_access_type type,
const struct bpf_prog *prog,
struct bpf_insn_access_aux *info)
@@ -3677,7 +3730,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
/* skip modifiers */
while (btf_type_is_modifier(t))
t = btf_type_by_id(btf, t->type);
- if (btf_type_is_int(t))
+ if (btf_type_is_int(t) || btf_type_is_enum(t))
/* accessing a scalar */
return true;
if (!btf_type_is_ptr(t)) {
@@ -3695,12 +3748,14 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
*/
return true;
+ if (is_string_ptr(btf, t))
+ return true;
+
/* this is a pointer to another type */
info->reg_type = PTR_TO_BTF_ID;
- info->btf_id = t->type;
if (tgt_prog) {
- ret = btf_translate_to_vmlinux(log, btf, t, tgt_prog->type);
+ ret = btf_translate_to_vmlinux(log, btf, t, tgt_prog->type, arg);
if (ret > 0) {
info->btf_id = ret;
return true;
@@ -3708,10 +3763,14 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
return false;
}
}
+
+ info->btf_id = t->type;
t = btf_type_by_id(btf, t->type);
/* skip modifiers */
- while (btf_type_is_modifier(t))
+ while (btf_type_is_modifier(t)) {
+ info->btf_id = t->type;
t = btf_type_by_id(btf, t->type);
+ }
if (!btf_type_is_struct(t)) {
bpf_log(log,
"func '%s' arg%d type %s is not a struct\n",
@@ -3737,23 +3796,57 @@ int btf_struct_access(struct bpf_verifier_log *log,
again:
tname = __btf_name_by_offset(btf_vmlinux, t->name_off);
if (!btf_type_is_struct(t)) {
- bpf_log(log, "Type '%s' is not a struct", tname);
+ bpf_log(log, "Type '%s' is not a struct\n", tname);
return -EINVAL;
}
- for_each_member(i, t, member) {
- if (btf_member_bitfield_size(t, member))
- /* bitfields are not supported yet */
- continue;
+ if (off + size > t->size) {
+ bpf_log(log, "access beyond struct %s at off %u size %u\n",
+ tname, off, size);
+ return -EACCES;
+ }
+ for_each_member(i, t, member) {
/* offset of the field in bytes */
moff = btf_member_bit_offset(t, member) / 8;
if (off + size <= moff)
/* won't find anything, field is already too far */
break;
+
+ if (btf_member_bitfield_size(t, member)) {
+ u32 end_bit = btf_member_bit_offset(t, member) +
+ btf_member_bitfield_size(t, member);
+
+ /* off <= moff instead of off == moff because clang
+ * does not generate a BTF member for anonymous
+ * bitfield like the ":16" here:
+ * struct {
+ * int :16;
+ * int x:8;
+ * };
+ */
+ if (off <= moff &&
+ BITS_ROUNDUP_BYTES(end_bit) <= off + size)
+ return SCALAR_VALUE;
+
+ /* off may be accessing a following member
+ *
+ * or
+ *
+ * Doing partial access at either end of this
+ * bitfield. Continue on this case also to
+ * treat it as not accessing this bitfield
+ * and eventually error out as field not
+ * found to keep it simple.
+ * It could be relaxed if there was a legit
+ * partial access case later.
+ */
+ continue;
+ }
+
/* In case of "off" is pointing to holes of a struct */
if (off < moff)
- continue;
+ break;
/* type of the field */
mtype = btf_type_by_id(btf_vmlinux, member->type);
@@ -3838,6 +3931,7 @@ again:
if (btf_type_is_ptr(mtype)) {
const struct btf_type *stype;
+ u32 id;
if (msize != size || off != moff) {
bpf_log(log,
@@ -3846,12 +3940,9 @@ again:
return -EACCES;
}
- stype = btf_type_by_id(btf_vmlinux, mtype->type);
- /* skip modifiers */
- while (btf_type_is_modifier(stype))
- stype = btf_type_by_id(btf_vmlinux, stype->type);
+ stype = btf_type_skip_modifiers(btf_vmlinux, mtype->type, &id);
if (btf_type_is_struct(stype)) {
- *next_btf_id = mtype->type;
+ *next_btf_id = id;
return PTR_TO_BTF_ID;
}
}
@@ -4043,11 +4134,158 @@ int btf_distill_func_proto(struct bpf_verifier_log *log,
return 0;
}
-int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog)
+/* Compare BTFs of two functions assuming only scalars and pointers to context.
+ * t1 points to BTF_KIND_FUNC in btf1
+ * t2 points to BTF_KIND_FUNC in btf2
+ * Returns:
+ * EINVAL - function prototype mismatch
+ * EFAULT - verifier bug
+ * 0 - 99% match. The last 1% is validated by the verifier.
+ */
+int btf_check_func_type_match(struct bpf_verifier_log *log,
+ struct btf *btf1, const struct btf_type *t1,
+ struct btf *btf2, const struct btf_type *t2)
+{
+ const struct btf_param *args1, *args2;
+ const char *fn1, *fn2, *s1, *s2;
+ u32 nargs1, nargs2, i;
+
+ fn1 = btf_name_by_offset(btf1, t1->name_off);
+ fn2 = btf_name_by_offset(btf2, t2->name_off);
+
+ if (btf_func_linkage(t1) != BTF_FUNC_GLOBAL) {
+ bpf_log(log, "%s() is not a global function\n", fn1);
+ return -EINVAL;
+ }
+ if (btf_func_linkage(t2) != BTF_FUNC_GLOBAL) {
+ bpf_log(log, "%s() is not a global function\n", fn2);
+ return -EINVAL;
+ }
+
+ t1 = btf_type_by_id(btf1, t1->type);
+ if (!t1 || !btf_type_is_func_proto(t1))
+ return -EFAULT;
+ t2 = btf_type_by_id(btf2, t2->type);
+ if (!t2 || !btf_type_is_func_proto(t2))
+ return -EFAULT;
+
+ args1 = (const struct btf_param *)(t1 + 1);
+ nargs1 = btf_type_vlen(t1);
+ args2 = (const struct btf_param *)(t2 + 1);
+ nargs2 = btf_type_vlen(t2);
+
+ if (nargs1 != nargs2) {
+ bpf_log(log, "%s() has %d args while %s() has %d args\n",
+ fn1, nargs1, fn2, nargs2);
+ return -EINVAL;
+ }
+
+ t1 = btf_type_skip_modifiers(btf1, t1->type, NULL);
+ t2 = btf_type_skip_modifiers(btf2, t2->type, NULL);
+ if (t1->info != t2->info) {
+ bpf_log(log,
+ "Return type %s of %s() doesn't match type %s of %s()\n",
+ btf_type_str(t1), fn1,
+ btf_type_str(t2), fn2);
+ return -EINVAL;
+ }
+
+ for (i = 0; i < nargs1; i++) {
+ t1 = btf_type_skip_modifiers(btf1, args1[i].type, NULL);
+ t2 = btf_type_skip_modifiers(btf2, args2[i].type, NULL);
+
+ if (t1->info != t2->info) {
+ bpf_log(log, "arg%d in %s() is %s while %s() has %s\n",
+ i, fn1, btf_type_str(t1),
+ fn2, btf_type_str(t2));
+ return -EINVAL;
+ }
+ if (btf_type_has_size(t1) && t1->size != t2->size) {
+ bpf_log(log,
+ "arg%d in %s() has size %d while %s() has %d\n",
+ i, fn1, t1->size,
+ fn2, t2->size);
+ return -EINVAL;
+ }
+
+ /* global functions are validated with scalars and pointers
+ * to context only. And only global functions can be replaced.
+ * Hence type check only those types.
+ */
+ if (btf_type_is_int(t1) || btf_type_is_enum(t1))
+ continue;
+ if (!btf_type_is_ptr(t1)) {
+ bpf_log(log,
+ "arg%d in %s() has unrecognized type\n",
+ i, fn1);
+ return -EINVAL;
+ }
+ t1 = btf_type_skip_modifiers(btf1, t1->type, NULL);
+ t2 = btf_type_skip_modifiers(btf2, t2->type, NULL);
+ if (!btf_type_is_struct(t1)) {
+ bpf_log(log,
+ "arg%d in %s() is not a pointer to context\n",
+ i, fn1);
+ return -EINVAL;
+ }
+ if (!btf_type_is_struct(t2)) {
+ bpf_log(log,
+ "arg%d in %s() is not a pointer to context\n",
+ i, fn2);
+ return -EINVAL;
+ }
+ /* This is an optional check to make program writing easier.
+ * Compare names of structs and report an error to the user.
+ * btf_prepare_func_args() already checked that t2 struct
+ * is a context type. btf_prepare_func_args() will check
+ * later that t1 struct is a context type as well.
+ */
+ s1 = btf_name_by_offset(btf1, t1->name_off);
+ s2 = btf_name_by_offset(btf2, t2->name_off);
+ if (strcmp(s1, s2)) {
+ bpf_log(log,
+ "arg%d %s(struct %s *) doesn't match %s(struct %s *)\n",
+ i, fn1, s1, fn2, s2);
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
+
+/* Compare BTFs of given program with BTF of target program */
+int btf_check_type_match(struct bpf_verifier_env *env, struct bpf_prog *prog,
+ struct btf *btf2, const struct btf_type *t2)
+{
+ struct btf *btf1 = prog->aux->btf;
+ const struct btf_type *t1;
+ u32 btf_id = 0;
+
+ if (!prog->aux->func_info) {
+ bpf_log(&env->log, "Program extension requires BTF\n");
+ return -EINVAL;
+ }
+
+ btf_id = prog->aux->func_info[0].type_id;
+ if (!btf_id)
+ return -EFAULT;
+
+ t1 = btf_type_by_id(btf1, btf_id);
+ if (!t1 || !btf_type_is_func(t1))
+ return -EFAULT;
+
+ return btf_check_func_type_match(&env->log, btf1, t1, btf2, t2);
+}
+
+/* Compare BTF of a function with given bpf_reg_state.
+ * Returns:
+ * EFAULT - there is a verifier bug. Abort verification.
+ * EINVAL - there is a type mismatch or BTF is not available.
+ * 0 - BTF matches with what bpf_reg_state expects.
+ * Only PTR_TO_CTX and SCALAR_VALUE states are recognized.
+ */
+int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog,
+ struct bpf_reg_state *reg)
{
- struct bpf_verifier_state *st = env->cur_state;
- struct bpf_func_state *func = st->frame[st->curframe];
- struct bpf_reg_state *reg = func->regs;
struct bpf_verifier_log *log = &env->log;
struct bpf_prog *prog = env->prog;
struct btf *btf = prog->aux->btf;
@@ -4057,27 +4295,30 @@ int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog)
const char *tname;
if (!prog->aux->func_info)
- return 0;
+ return -EINVAL;
btf_id = prog->aux->func_info[subprog].type_id;
if (!btf_id)
- return 0;
+ return -EFAULT;
if (prog->aux->func_info_aux[subprog].unreliable)
- return 0;
+ return -EINVAL;
t = btf_type_by_id(btf, btf_id);
if (!t || !btf_type_is_func(t)) {
- bpf_log(log, "BTF of subprog %d doesn't point to KIND_FUNC\n",
+ /* These checks were already done by the verifier while loading
+ * struct bpf_func_info
+ */
+ bpf_log(log, "BTF of func#%d doesn't point to KIND_FUNC\n",
subprog);
- return -EINVAL;
+ return -EFAULT;
}
tname = btf_name_by_offset(btf, t->name_off);
t = btf_type_by_id(btf, t->type);
if (!t || !btf_type_is_func_proto(t)) {
- bpf_log(log, "Invalid type of func %s\n", tname);
- return -EINVAL;
+ bpf_log(log, "Invalid BTF of func %s\n", tname);
+ return -EFAULT;
}
args = (const struct btf_param *)(t + 1);
nargs = btf_type_vlen(t);
@@ -4103,25 +4344,130 @@ int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog)
bpf_log(log, "R%d is not a pointer\n", i + 1);
goto out;
}
- /* If program is passing PTR_TO_CTX into subprogram
- * check that BTF type matches.
+ /* If function expects ctx type in BTF check that caller
+ * is passing PTR_TO_CTX.
*/
- if (reg[i + 1].type == PTR_TO_CTX &&
- !btf_get_prog_ctx_type(log, btf, t, prog->type))
- goto out;
- /* All other pointers are ok */
- continue;
+ if (btf_get_prog_ctx_type(log, btf, t, prog->type, i)) {
+ if (reg[i + 1].type != PTR_TO_CTX) {
+ bpf_log(log,
+ "arg#%d expected pointer to ctx, but got %s\n",
+ i, btf_kind_str[BTF_INFO_KIND(t->info)]);
+ goto out;
+ }
+ if (check_ctx_reg(env, &reg[i + 1], i + 1))
+ goto out;
+ continue;
+ }
}
- bpf_log(log, "Unrecognized argument type %s\n",
- btf_kind_str[BTF_INFO_KIND(t->info)]);
+ bpf_log(log, "Unrecognized arg#%d type %s\n",
+ i, btf_kind_str[BTF_INFO_KIND(t->info)]);
goto out;
}
return 0;
out:
- /* LLVM optimizations can remove arguments from static functions. */
- bpf_log(log,
- "Type info disagrees with actual arguments due to compiler optimizations\n");
+ /* Compiler optimizations can remove arguments from static functions
+ * or mismatched type can be passed into a global function.
+ * In such cases mark the function as unreliable from BTF point of view.
+ */
prog->aux->func_info_aux[subprog].unreliable = true;
+ return -EINVAL;
+}
+
+/* Convert BTF of a function into bpf_reg_state if possible
+ * Returns:
+ * EFAULT - there is a verifier bug. Abort verification.
+ * EINVAL - cannot convert BTF.
+ * 0 - Successfully converted BTF into bpf_reg_state
+ * (either PTR_TO_CTX or SCALAR_VALUE).
+ */
+int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
+ struct bpf_reg_state *reg)
+{
+ struct bpf_verifier_log *log = &env->log;
+ struct bpf_prog *prog = env->prog;
+ enum bpf_prog_type prog_type = prog->type;
+ struct btf *btf = prog->aux->btf;
+ const struct btf_param *args;
+ const struct btf_type *t;
+ u32 i, nargs, btf_id;
+ const char *tname;
+
+ if (!prog->aux->func_info ||
+ prog->aux->func_info_aux[subprog].linkage != BTF_FUNC_GLOBAL) {
+ bpf_log(log, "Verifier bug\n");
+ return -EFAULT;
+ }
+
+ btf_id = prog->aux->func_info[subprog].type_id;
+ if (!btf_id) {
+ bpf_log(log, "Global functions need valid BTF\n");
+ return -EFAULT;
+ }
+
+ t = btf_type_by_id(btf, btf_id);
+ if (!t || !btf_type_is_func(t)) {
+ /* These checks were already done by the verifier while loading
+ * struct bpf_func_info
+ */
+ bpf_log(log, "BTF of func#%d doesn't point to KIND_FUNC\n",
+ subprog);
+ return -EFAULT;
+ }
+ tname = btf_name_by_offset(btf, t->name_off);
+
+ if (log->level & BPF_LOG_LEVEL)
+ bpf_log(log, "Validating %s() func#%d...\n",
+ tname, subprog);
+
+ if (prog->aux->func_info_aux[subprog].unreliable) {
+ bpf_log(log, "Verifier bug in function %s()\n", tname);
+ return -EFAULT;
+ }
+ if (prog_type == BPF_PROG_TYPE_EXT)
+ prog_type = prog->aux->linked_prog->type;
+
+ t = btf_type_by_id(btf, t->type);
+ if (!t || !btf_type_is_func_proto(t)) {
+ bpf_log(log, "Invalid type of function %s()\n", tname);
+ return -EFAULT;
+ }
+ args = (const struct btf_param *)(t + 1);
+ nargs = btf_type_vlen(t);
+ if (nargs > 5) {
+ bpf_log(log, "Global function %s() with %d > 5 args. Buggy compiler.\n",
+ tname, nargs);
+ return -EINVAL;
+ }
+ /* check that function returns int */
+ t = btf_type_by_id(btf, t->type);
+ while (btf_type_is_modifier(t))
+ t = btf_type_by_id(btf, t->type);
+ if (!btf_type_is_int(t) && !btf_type_is_enum(t)) {
+ bpf_log(log,
+ "Global function %s() doesn't return scalar. Only those are supported.\n",
+ tname);
+ return -EINVAL;
+ }
+ /* Convert BTF function arguments into verifier types.
+ * Only PTR_TO_CTX and SCALAR are supported atm.
+ */
+ for (i = 0; i < nargs; i++) {
+ t = btf_type_by_id(btf, args[i].type);
+ while (btf_type_is_modifier(t))
+ t = btf_type_by_id(btf, t->type);
+ if (btf_type_is_int(t) || btf_type_is_enum(t)) {
+ reg[i + 1].type = SCALAR_VALUE;
+ continue;
+ }
+ if (btf_type_is_ptr(t) &&
+ btf_get_prog_ctx_type(log, btf, t, prog_type, i)) {
+ reg[i + 1].type = PTR_TO_CTX;
+ continue;
+ }
+ bpf_log(log, "Arg#%d type %s in %s() is not supported yet.\n",
+ i, btf_kind_str[BTF_INFO_KIND(t->info)], tname);
+ return -EINVAL;
+ }
return 0;
}
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 9e43b72eb619..9a500fadbef5 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -106,8 +106,7 @@ static u32 prog_list_length(struct list_head *head)
* if parent has overridable or multi-prog, allow attaching
*/
static bool hierarchy_allows_attach(struct cgroup *cgrp,
- enum bpf_attach_type type,
- u32 new_flags)
+ enum bpf_attach_type type)
{
struct cgroup *p;
@@ -290,31 +289,34 @@ cleanup:
* propagate the change to descendants
* @cgrp: The cgroup which descendants to traverse
* @prog: A program to attach
+ * @replace_prog: Previously attached program to replace if BPF_F_REPLACE is set
* @type: Type of attach operation
* @flags: Option flags
*
* Must be called with cgroup_mutex held.
*/
int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
+ struct bpf_prog *replace_prog,
enum bpf_attach_type type, u32 flags)
{
+ u32 saved_flags = (flags & (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI));
struct list_head *progs = &cgrp->bpf.progs[type];
struct bpf_prog *old_prog = NULL;
struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE],
*old_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {NULL};
+ struct bpf_prog_list *pl, *replace_pl = NULL;
enum bpf_cgroup_storage_type stype;
- struct bpf_prog_list *pl;
- bool pl_was_allocated;
int err;
- if ((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI))
+ if (((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) ||
+ ((flags & BPF_F_REPLACE) && !(flags & BPF_F_ALLOW_MULTI)))
/* invalid combination */
return -EINVAL;
- if (!hierarchy_allows_attach(cgrp, type, flags))
+ if (!hierarchy_allows_attach(cgrp, type))
return -EPERM;
- if (!list_empty(progs) && cgrp->bpf.flags[type] != flags)
+ if (!list_empty(progs) && cgrp->bpf.flags[type] != saved_flags)
/* Disallow attaching non-overridable on top
* of existing overridable in this cgroup.
* Disallow attaching multi-prog if overridable or none
@@ -324,6 +326,21 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS)
return -E2BIG;
+ if (flags & BPF_F_ALLOW_MULTI) {
+ list_for_each_entry(pl, progs, node) {
+ if (pl->prog == prog)
+ /* disallow attaching the same prog twice */
+ return -EINVAL;
+ if (pl->prog == replace_prog)
+ replace_pl = pl;
+ }
+ if ((flags & BPF_F_REPLACE) && !replace_pl)
+ /* prog to replace not found for cgroup */
+ return -ENOENT;
+ } else if (!list_empty(progs)) {
+ replace_pl = list_first_entry(progs, typeof(*pl), node);
+ }
+
for_each_cgroup_storage_type(stype) {
storage[stype] = bpf_cgroup_storage_alloc(prog, stype);
if (IS_ERR(storage[stype])) {
@@ -334,53 +351,28 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
}
}
- if (flags & BPF_F_ALLOW_MULTI) {
- list_for_each_entry(pl, progs, node) {
- if (pl->prog == prog) {
- /* disallow attaching the same prog twice */
- for_each_cgroup_storage_type(stype)
- bpf_cgroup_storage_free(storage[stype]);
- return -EINVAL;
- }
+ if (replace_pl) {
+ pl = replace_pl;
+ old_prog = pl->prog;
+ for_each_cgroup_storage_type(stype) {
+ old_storage[stype] = pl->storage[stype];
+ bpf_cgroup_storage_unlink(old_storage[stype]);
}
-
+ } else {
pl = kmalloc(sizeof(*pl), GFP_KERNEL);
if (!pl) {
for_each_cgroup_storage_type(stype)
bpf_cgroup_storage_free(storage[stype]);
return -ENOMEM;
}
-
- pl_was_allocated = true;
- pl->prog = prog;
- for_each_cgroup_storage_type(stype)
- pl->storage[stype] = storage[stype];
list_add_tail(&pl->node, progs);
- } else {
- if (list_empty(progs)) {
- pl = kmalloc(sizeof(*pl), GFP_KERNEL);
- if (!pl) {
- for_each_cgroup_storage_type(stype)
- bpf_cgroup_storage_free(storage[stype]);
- return -ENOMEM;
- }
- pl_was_allocated = true;
- list_add_tail(&pl->node, progs);
- } else {
- pl = list_first_entry(progs, typeof(*pl), node);
- old_prog = pl->prog;
- for_each_cgroup_storage_type(stype) {
- old_storage[stype] = pl->storage[stype];
- bpf_cgroup_storage_unlink(old_storage[stype]);
- }
- pl_was_allocated = false;
- }
- pl->prog = prog;
- for_each_cgroup_storage_type(stype)
- pl->storage[stype] = storage[stype];
}
- cgrp->bpf.flags[type] = flags;
+ pl->prog = prog;
+ for_each_cgroup_storage_type(stype)
+ pl->storage[stype] = storage[stype];
+
+ cgrp->bpf.flags[type] = saved_flags;
err = update_effective_progs(cgrp, type);
if (err)
@@ -408,7 +400,7 @@ cleanup:
pl->storage[stype] = old_storage[stype];
bpf_cgroup_storage_link(old_storage[stype], cgrp, type);
}
- if (pl_was_allocated) {
+ if (!replace_pl) {
list_del(&pl->node);
kfree(pl);
}
@@ -546,6 +538,7 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
int cgroup_bpf_prog_attach(const union bpf_attr *attr,
enum bpf_prog_type ptype, struct bpf_prog *prog)
{
+ struct bpf_prog *replace_prog = NULL;
struct cgroup *cgrp;
int ret;
@@ -553,8 +546,20 @@ int cgroup_bpf_prog_attach(const union bpf_attr *attr,
if (IS_ERR(cgrp))
return PTR_ERR(cgrp);
- ret = cgroup_bpf_attach(cgrp, prog, attr->attach_type,
+ if ((attr->attach_flags & BPF_F_ALLOW_MULTI) &&
+ (attr->attach_flags & BPF_F_REPLACE)) {
+ replace_prog = bpf_prog_get_type(attr->replace_bpf_fd, ptype);
+ if (IS_ERR(replace_prog)) {
+ cgroup_put(cgrp);
+ return PTR_ERR(replace_prog);
+ }
+ }
+
+ ret = cgroup_bpf_attach(cgrp, prog, replace_prog, attr->attach_type,
attr->attach_flags);
+
+ if (replace_prog)
+ bpf_prog_put(replace_prog);
cgroup_put(cgrp);
return ret;
}
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index af6b738cf435..973a20d49749 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -222,8 +222,6 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
u32 pages, delta;
int ret;
- BUG_ON(fp_old == NULL);
-
size = round_up(size, PAGE_SIZE);
pages = size / PAGE_SIZE;
if (pages <= fp_old->pages)
@@ -520,9 +518,9 @@ void bpf_prog_kallsyms_del_all(struct bpf_prog *fp)
#ifdef CONFIG_BPF_JIT
/* All BPF JIT sysctl knobs here. */
-int bpf_jit_enable __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_ALWAYS_ON);
+int bpf_jit_enable __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON);
+int bpf_jit_kallsyms __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON);
int bpf_jit_harden __read_mostly;
-int bpf_jit_kallsyms __read_mostly;
long bpf_jit_limit __read_mostly;
static __always_inline void
@@ -2139,6 +2137,7 @@ const struct bpf_func_proto bpf_map_pop_elem_proto __weak;
const struct bpf_func_proto bpf_map_peek_elem_proto __weak;
const struct bpf_func_proto bpf_spin_lock_proto __weak;
const struct bpf_func_proto bpf_spin_unlock_proto __weak;
+const struct bpf_func_proto bpf_jiffies64_proto __weak;
const struct bpf_func_proto bpf_get_prandom_u32_proto __weak;
const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak;
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index ef49e17ae47c..70f71b154fa5 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -72,17 +72,18 @@ struct bpf_cpu_map {
struct bpf_map map;
/* Below members specific for map type */
struct bpf_cpu_map_entry **cpu_map;
- struct list_head __percpu *flush_list;
};
-static int bq_flush_to_queue(struct xdp_bulk_queue *bq, bool in_napi_ctx);
+static DEFINE_PER_CPU(struct list_head, cpu_map_flush_list);
+
+static int bq_flush_to_queue(struct xdp_bulk_queue *bq);
static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
{
struct bpf_cpu_map *cmap;
int err = -ENOMEM;
- int ret, cpu;
u64 cost;
+ int ret;
if (!capable(CAP_SYS_ADMIN))
return ERR_PTR(-EPERM);
@@ -106,7 +107,6 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
/* make sure page count doesn't overflow */
cost = (u64) cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *);
- cost += sizeof(struct list_head) * num_possible_cpus();
/* Notice returns -EPERM on if map size is larger than memlock limit */
ret = bpf_map_charge_init(&cmap->map.memory, cost);
@@ -115,23 +115,14 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
goto free_cmap;
}
- cmap->flush_list = alloc_percpu(struct list_head);
- if (!cmap->flush_list)
- goto free_charge;
-
- for_each_possible_cpu(cpu)
- INIT_LIST_HEAD(per_cpu_ptr(cmap->flush_list, cpu));
-
/* Alloc array for possible remote "destination" CPUs */
cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries *
sizeof(struct bpf_cpu_map_entry *),
cmap->map.numa_node);
if (!cmap->cpu_map)
- goto free_percpu;
+ goto free_charge;
return &cmap->map;
-free_percpu:
- free_percpu(cmap->flush_list);
free_charge:
bpf_map_charge_finish(&cmap->map.memory);
free_cmap:
@@ -399,22 +390,14 @@ free_rcu:
static void __cpu_map_entry_free(struct rcu_head *rcu)
{
struct bpf_cpu_map_entry *rcpu;
- int cpu;
/* This cpu_map_entry have been disconnected from map and one
- * RCU graze-period have elapsed. Thus, XDP cannot queue any
+ * RCU grace-period have elapsed. Thus, XDP cannot queue any
* new packets and cannot change/set flush_needed that can
* find this entry.
*/
rcpu = container_of(rcu, struct bpf_cpu_map_entry, rcu);
- /* Flush remaining packets in percpu bulkq */
- for_each_online_cpu(cpu) {
- struct xdp_bulk_queue *bq = per_cpu_ptr(rcpu->bulkq, cpu);
-
- /* No concurrent bq_enqueue can run at this point */
- bq_flush_to_queue(bq, false);
- }
free_percpu(rcpu->bulkq);
/* Cannot kthread_stop() here, last put free rcpu resources */
put_cpu_map_entry(rcpu);
@@ -436,7 +419,7 @@ static void __cpu_map_entry_free(struct rcu_head *rcu)
* percpu bulkq to queue. Due to caller map_delete_elem() disable
* preemption, cannot call kthread_stop() to make sure queue is empty.
* Instead a work_queue is started for stopping kthread,
- * cpu_map_kthread_stop, which waits for an RCU graze period before
+ * cpu_map_kthread_stop, which waits for an RCU grace period before
* stopping kthread, emptying the queue.
*/
static void __cpu_map_entry_replace(struct bpf_cpu_map *cmap,
@@ -507,7 +490,6 @@ static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value,
static void cpu_map_free(struct bpf_map *map)
{
struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
- int cpu;
u32 i;
/* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
@@ -522,18 +504,6 @@ static void cpu_map_free(struct bpf_map *map)
bpf_clear_redirect_map(map);
synchronize_rcu();
- /* To ensure all pending flush operations have completed wait for flush
- * list be empty on _all_ cpus. Because the above synchronize_rcu()
- * ensures the map is disconnected from the program we can assume no new
- * items will be added to the list.
- */
- for_each_online_cpu(cpu) {
- struct list_head *flush_list = per_cpu_ptr(cmap->flush_list, cpu);
-
- while (!list_empty(flush_list))
- cond_resched();
- }
-
/* For cpu_map the remote CPUs can still be using the entries
* (struct bpf_cpu_map_entry).
*/
@@ -544,10 +514,9 @@ static void cpu_map_free(struct bpf_map *map)
if (!rcpu)
continue;
- /* bq flush and cleanup happens after RCU graze-period */
+ /* bq flush and cleanup happens after RCU grace-period */
__cpu_map_entry_replace(cmap, i, NULL); /* call_rcu */
}
- free_percpu(cmap->flush_list);
bpf_map_area_free(cmap->cpu_map);
kfree(cmap);
}
@@ -599,7 +568,7 @@ const struct bpf_map_ops cpu_map_ops = {
.map_check_btf = map_check_no_btf,
};
-static int bq_flush_to_queue(struct xdp_bulk_queue *bq, bool in_napi_ctx)
+static int bq_flush_to_queue(struct xdp_bulk_queue *bq)
{
struct bpf_cpu_map_entry *rcpu = bq->obj;
unsigned int processed = 0, drops = 0;
@@ -620,10 +589,7 @@ static int bq_flush_to_queue(struct xdp_bulk_queue *bq, bool in_napi_ctx)
err = __ptr_ring_produce(q, xdpf);
if (err) {
drops++;
- if (likely(in_napi_ctx))
- xdp_return_frame_rx_napi(xdpf);
- else
- xdp_return_frame(xdpf);
+ xdp_return_frame_rx_napi(xdpf);
}
processed++;
}
@@ -642,11 +608,11 @@ static int bq_flush_to_queue(struct xdp_bulk_queue *bq, bool in_napi_ctx)
*/
static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf)
{
- struct list_head *flush_list = this_cpu_ptr(rcpu->cmap->flush_list);
+ struct list_head *flush_list = this_cpu_ptr(&cpu_map_flush_list);
struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq);
if (unlikely(bq->count == CPU_MAP_BULK_SIZE))
- bq_flush_to_queue(bq, true);
+ bq_flush_to_queue(bq);
/* Notice, xdp_buff/page MUST be queued here, long enough for
* driver to code invoking us to finished, due to driver
@@ -681,16 +647,26 @@ int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
return 0;
}
-void __cpu_map_flush(struct bpf_map *map)
+void __cpu_map_flush(void)
{
- struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
- struct list_head *flush_list = this_cpu_ptr(cmap->flush_list);
+ struct list_head *flush_list = this_cpu_ptr(&cpu_map_flush_list);
struct xdp_bulk_queue *bq, *tmp;
list_for_each_entry_safe(bq, tmp, flush_list, flush_node) {
- bq_flush_to_queue(bq, true);
+ bq_flush_to_queue(bq);
/* If already running, costs spin_lock_irqsave + smb_mb */
wake_up_process(bq->obj->kthread);
}
}
+
+static int __init cpu_map_init(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ INIT_LIST_HEAD(&per_cpu(cpu_map_flush_list, cpu));
+ return 0;
+}
+
+subsys_initcall(cpu_map_init);
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 3d3d61b5985b..58bdca5d978a 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -53,13 +53,11 @@
(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
#define DEV_MAP_BULK_SIZE 16
-struct bpf_dtab_netdev;
-
-struct xdp_bulk_queue {
+struct xdp_dev_bulk_queue {
struct xdp_frame *q[DEV_MAP_BULK_SIZE];
struct list_head flush_node;
+ struct net_device *dev;
struct net_device *dev_rx;
- struct bpf_dtab_netdev *obj;
unsigned int count;
};
@@ -67,15 +65,13 @@ struct bpf_dtab_netdev {
struct net_device *dev; /* must be first member, due to tracepoint */
struct hlist_node index_hlist;
struct bpf_dtab *dtab;
- struct xdp_bulk_queue __percpu *bulkq;
struct rcu_head rcu;
- unsigned int idx; /* keep track of map index for tracepoint */
+ unsigned int idx;
};
struct bpf_dtab {
struct bpf_map map;
struct bpf_dtab_netdev **netdev_map; /* DEVMAP type only */
- struct list_head __percpu *flush_list;
struct list_head list;
/* these are only used for DEVMAP_HASH type maps */
@@ -85,6 +81,7 @@ struct bpf_dtab {
u32 n_buckets;
};
+static DEFINE_PER_CPU(struct list_head, dev_flush_list);
static DEFINE_SPINLOCK(dev_map_lock);
static LIST_HEAD(dev_map_list);
@@ -109,8 +106,8 @@ static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab,
static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr)
{
- int err, cpu;
- u64 cost;
+ u64 cost = 0;
+ int err;
/* check sanity of attributes */
if (attr->max_entries == 0 || attr->key_size != 4 ||
@@ -125,9 +122,6 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr)
bpf_map_init_from_attr(&dtab->map, attr);
- /* make sure page count doesn't overflow */
- cost = (u64) sizeof(struct list_head) * num_possible_cpus();
-
if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
dtab->n_buckets = roundup_pow_of_two(dtab->map.max_entries);
@@ -143,17 +137,10 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr)
if (err)
return -EINVAL;
- dtab->flush_list = alloc_percpu(struct list_head);
- if (!dtab->flush_list)
- goto free_charge;
-
- for_each_possible_cpu(cpu)
- INIT_LIST_HEAD(per_cpu_ptr(dtab->flush_list, cpu));
-
if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets);
if (!dtab->dev_index_head)
- goto free_percpu;
+ goto free_charge;
spin_lock_init(&dtab->index_lock);
} else {
@@ -161,13 +148,11 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr)
sizeof(struct bpf_dtab_netdev *),
dtab->map.numa_node);
if (!dtab->netdev_map)
- goto free_percpu;
+ goto free_charge;
}
return 0;
-free_percpu:
- free_percpu(dtab->flush_list);
free_charge:
bpf_map_charge_finish(&dtab->map.memory);
return -ENOMEM;
@@ -201,14 +186,16 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
static void dev_map_free(struct bpf_map *map)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
- int i, cpu;
+ int i;
/* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
* so the programs (can be more than one that used this map) were
- * disconnected from events. Wait for outstanding critical sections in
- * these programs to complete. The rcu critical section only guarantees
- * no further reads against netdev_map. It does __not__ ensure pending
- * flush operations (if any) are complete.
+ * disconnected from events. The following synchronize_rcu() guarantees
+ * both rcu read critical sections complete and waits for
+ * preempt-disable regions (NAPI being the relevant context here) so we
+ * are certain there will be no further reads against the netdev_map and
+ * all flush operations are complete. Flush operations can only be done
+ * from NAPI context for this reason.
*/
spin_lock(&dev_map_lock);
@@ -221,18 +208,6 @@ static void dev_map_free(struct bpf_map *map)
/* Make sure prior __dev_map_entry_free() have completed. */
rcu_barrier();
- /* To ensure all pending flush operations have completed wait for flush
- * list to empty on _all_ cpus.
- * Because the above synchronize_rcu() ensures the map is disconnected
- * from the program we can assume no new items will be added.
- */
- for_each_online_cpu(cpu) {
- struct list_head *flush_list = per_cpu_ptr(dtab->flush_list, cpu);
-
- while (!list_empty(flush_list))
- cond_resched();
- }
-
if (dtab->map.map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
for (i = 0; i < dtab->n_buckets; i++) {
struct bpf_dtab_netdev *dev;
@@ -243,7 +218,6 @@ static void dev_map_free(struct bpf_map *map)
hlist_for_each_entry_safe(dev, next, head, index_hlist) {
hlist_del_rcu(&dev->index_hlist);
- free_percpu(dev->bulkq);
dev_put(dev->dev);
kfree(dev);
}
@@ -258,7 +232,6 @@ static void dev_map_free(struct bpf_map *map)
if (!dev)
continue;
- free_percpu(dev->bulkq);
dev_put(dev->dev);
kfree(dev);
}
@@ -266,7 +239,6 @@ static void dev_map_free(struct bpf_map *map)
bpf_map_area_free(dtab->netdev_map);
}
- free_percpu(dtab->flush_list);
kfree(dtab);
}
@@ -293,7 +265,8 @@ struct bpf_dtab_netdev *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key)
struct hlist_head *head = dev_map_index_hash(dtab, key);
struct bpf_dtab_netdev *dev;
- hlist_for_each_entry_rcu(dev, head, index_hlist)
+ hlist_for_each_entry_rcu(dev, head, index_hlist,
+ lockdep_is_held(&dtab->index_lock))
if (dev->idx == key)
return dev;
@@ -345,11 +318,9 @@ static int dev_map_hash_get_next_key(struct bpf_map *map, void *key,
return -ENOENT;
}
-static int bq_xmit_all(struct xdp_bulk_queue *bq, u32 flags,
- bool in_napi_ctx)
+static int bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags)
{
- struct bpf_dtab_netdev *obj = bq->obj;
- struct net_device *dev = obj->dev;
+ struct net_device *dev = bq->dev;
int sent = 0, drops = 0, err = 0;
int i;
@@ -372,8 +343,7 @@ static int bq_xmit_all(struct xdp_bulk_queue *bq, u32 flags,
out:
bq->count = 0;
- trace_xdp_devmap_xmit(&obj->dtab->map, obj->idx,
- sent, drops, bq->dev_rx, dev, err);
+ trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, drops, err);
bq->dev_rx = NULL;
__list_del_clearprev(&bq->flush_node);
return 0;
@@ -384,33 +354,29 @@ error:
for (i = 0; i < bq->count; i++) {
struct xdp_frame *xdpf = bq->q[i];
- /* RX path under NAPI protection, can return frames faster */
- if (likely(in_napi_ctx))
- xdp_return_frame_rx_napi(xdpf);
- else
- xdp_return_frame(xdpf);
+ xdp_return_frame_rx_napi(xdpf);
drops++;
}
goto out;
}
-/* __dev_map_flush is called from xdp_do_flush_map() which _must_ be signaled
+/* __dev_flush is called from xdp_do_flush() which _must_ be signaled
* from the driver before returning from its napi->poll() routine. The poll()
* routine is called either from busy_poll context or net_rx_action signaled
* from NET_RX_SOFTIRQ. Either way the poll routine must complete before the
* net device can be torn down. On devmap tear down we ensure the flush list
* is empty before completing to ensure all flush operations have completed.
+ * When drivers update the bpf program they may need to ensure any flush ops
+ * are also complete. Using synchronize_rcu or call_rcu will suffice for this
+ * because both wait for napi context to exit.
*/
-void __dev_map_flush(struct bpf_map *map)
+void __dev_flush(void)
{
- struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
- struct list_head *flush_list = this_cpu_ptr(dtab->flush_list);
- struct xdp_bulk_queue *bq, *tmp;
+ struct list_head *flush_list = this_cpu_ptr(&dev_flush_list);
+ struct xdp_dev_bulk_queue *bq, *tmp;
- rcu_read_lock();
list_for_each_entry_safe(bq, tmp, flush_list, flush_node)
- bq_xmit_all(bq, XDP_XMIT_FLUSH, true);
- rcu_read_unlock();
+ bq_xmit_all(bq, XDP_XMIT_FLUSH);
}
/* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete and/or
@@ -432,15 +398,14 @@ struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
/* Runs under RCU-read-side, plus in softirq under NAPI protection.
* Thus, safe percpu variable access.
*/
-static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf,
+static int bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
struct net_device *dev_rx)
-
{
- struct list_head *flush_list = this_cpu_ptr(obj->dtab->flush_list);
- struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq);
+ struct list_head *flush_list = this_cpu_ptr(&dev_flush_list);
+ struct xdp_dev_bulk_queue *bq = this_cpu_ptr(dev->xdp_bulkq);
if (unlikely(bq->count == DEV_MAP_BULK_SIZE))
- bq_xmit_all(bq, 0, true);
+ bq_xmit_all(bq, 0);
/* Ingress dev_rx will be the same for all xdp_frame's in
* bulk_queue, because bq stored per-CPU and must be flushed
@@ -457,10 +422,9 @@ static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf,
return 0;
}
-int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
- struct net_device *dev_rx)
+static inline int __xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
+ struct net_device *dev_rx)
{
- struct net_device *dev = dst->dev;
struct xdp_frame *xdpf;
int err;
@@ -475,7 +439,21 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
if (unlikely(!xdpf))
return -EOVERFLOW;
- return bq_enqueue(dst, xdpf, dev_rx);
+ return bq_enqueue(dev, xdpf, dev_rx);
+}
+
+int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
+ struct net_device *dev_rx)
+{
+ return __xdp_enqueue(dev, xdp, dev_rx);
+}
+
+int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
+ struct net_device *dev_rx)
+{
+ struct net_device *dev = dst->dev;
+
+ return __xdp_enqueue(dev, xdp, dev_rx);
}
int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
@@ -509,28 +487,11 @@ static void *dev_map_hash_lookup_elem(struct bpf_map *map, void *key)
return dev ? &dev->ifindex : NULL;
}
-static void dev_map_flush_old(struct bpf_dtab_netdev *dev)
-{
- if (dev->dev->netdev_ops->ndo_xdp_xmit) {
- struct xdp_bulk_queue *bq;
- int cpu;
-
- rcu_read_lock();
- for_each_online_cpu(cpu) {
- bq = per_cpu_ptr(dev->bulkq, cpu);
- bq_xmit_all(bq, XDP_XMIT_FLUSH, false);
- }
- rcu_read_unlock();
- }
-}
-
static void __dev_map_entry_free(struct rcu_head *rcu)
{
struct bpf_dtab_netdev *dev;
dev = container_of(rcu, struct bpf_dtab_netdev, rcu);
- dev_map_flush_old(dev);
- free_percpu(dev->bulkq);
dev_put(dev->dev);
kfree(dev);
}
@@ -545,12 +506,11 @@ static int dev_map_delete_elem(struct bpf_map *map, void *key)
return -EINVAL;
/* Use call_rcu() here to ensure any rcu critical sections have
- * completed, but this does not guarantee a flush has happened
- * yet. Because driver side rcu_read_lock/unlock only protects the
- * running XDP program. However, for pending flush operations the
- * dev and ctx are stored in another per cpu map. And additionally,
- * the driver tear down ensures all soft irqs are complete before
- * removing the net device in the case of dev_put equals zero.
+ * completed as well as any flush operations because call_rcu
+ * will wait for preempt-disable region to complete, NAPI in this
+ * context. And additionally, the driver tear down ensures all
+ * soft irqs are complete before removing the net device in the
+ * case of dev_put equals zero.
*/
old_dev = xchg(&dtab->netdev_map[k], NULL);
if (old_dev)
@@ -585,30 +545,15 @@ static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net,
u32 ifindex,
unsigned int idx)
{
- gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN;
struct bpf_dtab_netdev *dev;
- struct xdp_bulk_queue *bq;
- int cpu;
- dev = kmalloc_node(sizeof(*dev), gfp, dtab->map.numa_node);
+ dev = kmalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN,
+ dtab->map.numa_node);
if (!dev)
return ERR_PTR(-ENOMEM);
- dev->bulkq = __alloc_percpu_gfp(sizeof(*dev->bulkq),
- sizeof(void *), gfp);
- if (!dev->bulkq) {
- kfree(dev);
- return ERR_PTR(-ENOMEM);
- }
-
- for_each_possible_cpu(cpu) {
- bq = per_cpu_ptr(dev->bulkq, cpu);
- bq->obj = dev;
- }
-
dev->dev = dev_get_by_index(net, ifindex);
if (!dev->dev) {
- free_percpu(dev->bulkq);
kfree(dev);
return ERR_PTR(-EINVAL);
}
@@ -768,9 +713,23 @@ static int dev_map_notification(struct notifier_block *notifier,
{
struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
struct bpf_dtab *dtab;
- int i;
+ int i, cpu;
switch (event) {
+ case NETDEV_REGISTER:
+ if (!netdev->netdev_ops->ndo_xdp_xmit || netdev->xdp_bulkq)
+ break;
+
+ /* will be freed in free_netdev() */
+ netdev->xdp_bulkq =
+ __alloc_percpu_gfp(sizeof(struct xdp_dev_bulk_queue),
+ sizeof(void *), GFP_ATOMIC);
+ if (!netdev->xdp_bulkq)
+ return NOTIFY_BAD;
+
+ for_each_possible_cpu(cpu)
+ per_cpu_ptr(netdev->xdp_bulkq, cpu)->dev = netdev;
+ break;
case NETDEV_UNREGISTER:
/* This rcu_read_lock/unlock pair is needed because
* dev_map_list is an RCU list AND to ensure a delete
@@ -810,10 +769,15 @@ static struct notifier_block dev_map_notifier = {
static int __init dev_map_init(void)
{
+ int cpu;
+
/* Assure tracepoint shadow struct _bpf_dtab_netdev is in sync */
BUILD_BUG_ON(offsetof(struct bpf_dtab_netdev, dev) !=
offsetof(struct _bpf_dtab_netdev, dev));
register_netdevice_notifier(&dev_map_notifier);
+
+ for_each_possible_cpu(cpu)
+ INIT_LIST_HEAD(&per_cpu(dev_flush_list, cpu));
return 0;
}
diff --git a/kernel/bpf/dispatcher.c b/kernel/bpf/dispatcher.c
new file mode 100644
index 000000000000..b3e5b214fed8
--- /dev/null
+++ b/kernel/bpf/dispatcher.c
@@ -0,0 +1,158 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright(c) 2019 Intel Corporation. */
+
+#include <linux/hash.h>
+#include <linux/bpf.h>
+#include <linux/filter.h>
+
+/* The BPF dispatcher is a multiway branch code generator. The
+ * dispatcher is a mechanism to avoid the performance penalty of an
+ * indirect call, which is expensive when retpolines are enabled. A
+ * dispatch client registers a BPF program into the dispatcher, and if
+ * there is available room in the dispatcher a direct call to the BPF
+ * program will be generated. All calls to the BPF programs called via
+ * the dispatcher will then be a direct call, instead of an
+ * indirect. The dispatcher hijacks a trampoline function it via the
+ * __fentry__ of the trampoline. The trampoline function has the
+ * following signature:
+ *
+ * unsigned int trampoline(const void *ctx, const struct bpf_insn *insnsi,
+ * unsigned int (*bpf_func)(const void *,
+ * const struct bpf_insn *));
+ */
+
+static struct bpf_dispatcher_prog *bpf_dispatcher_find_prog(
+ struct bpf_dispatcher *d, struct bpf_prog *prog)
+{
+ int i;
+
+ for (i = 0; i < BPF_DISPATCHER_MAX; i++) {
+ if (prog == d->progs[i].prog)
+ return &d->progs[i];
+ }
+ return NULL;
+}
+
+static struct bpf_dispatcher_prog *bpf_dispatcher_find_free(
+ struct bpf_dispatcher *d)
+{
+ return bpf_dispatcher_find_prog(d, NULL);
+}
+
+static bool bpf_dispatcher_add_prog(struct bpf_dispatcher *d,
+ struct bpf_prog *prog)
+{
+ struct bpf_dispatcher_prog *entry;
+
+ if (!prog)
+ return false;
+
+ entry = bpf_dispatcher_find_prog(d, prog);
+ if (entry) {
+ refcount_inc(&entry->users);
+ return false;
+ }
+
+ entry = bpf_dispatcher_find_free(d);
+ if (!entry)
+ return false;
+
+ bpf_prog_inc(prog);
+ entry->prog = prog;
+ refcount_set(&entry->users, 1);
+ d->num_progs++;
+ return true;
+}
+
+static bool bpf_dispatcher_remove_prog(struct bpf_dispatcher *d,
+ struct bpf_prog *prog)
+{
+ struct bpf_dispatcher_prog *entry;
+
+ if (!prog)
+ return false;
+
+ entry = bpf_dispatcher_find_prog(d, prog);
+ if (!entry)
+ return false;
+
+ if (refcount_dec_and_test(&entry->users)) {
+ entry->prog = NULL;
+ bpf_prog_put(prog);
+ d->num_progs--;
+ return true;
+ }
+ return false;
+}
+
+int __weak arch_prepare_bpf_dispatcher(void *image, s64 *funcs, int num_funcs)
+{
+ return -ENOTSUPP;
+}
+
+static int bpf_dispatcher_prepare(struct bpf_dispatcher *d, void *image)
+{
+ s64 ips[BPF_DISPATCHER_MAX] = {}, *ipsp = &ips[0];
+ int i;
+
+ for (i = 0; i < BPF_DISPATCHER_MAX; i++) {
+ if (d->progs[i].prog)
+ *ipsp++ = (s64)(uintptr_t)d->progs[i].prog->bpf_func;
+ }
+ return arch_prepare_bpf_dispatcher(image, &ips[0], d->num_progs);
+}
+
+static void bpf_dispatcher_update(struct bpf_dispatcher *d, int prev_num_progs)
+{
+ void *old, *new;
+ u32 noff;
+ int err;
+
+ if (!prev_num_progs) {
+ old = NULL;
+ noff = 0;
+ } else {
+ old = d->image + d->image_off;
+ noff = d->image_off ^ (BPF_IMAGE_SIZE / 2);
+ }
+
+ new = d->num_progs ? d->image + noff : NULL;
+ if (new) {
+ if (bpf_dispatcher_prepare(d, new))
+ return;
+ }
+
+ err = bpf_arch_text_poke(d->func, BPF_MOD_JUMP, old, new);
+ if (err || !new)
+ return;
+
+ d->image_off = noff;
+}
+
+void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from,
+ struct bpf_prog *to)
+{
+ bool changed = false;
+ int prev_num_progs;
+
+ if (from == to)
+ return;
+
+ mutex_lock(&d->mutex);
+ if (!d->image) {
+ d->image = bpf_image_alloc();
+ if (!d->image)
+ goto out;
+ }
+
+ prev_num_progs = d->num_progs;
+ changed |= bpf_dispatcher_remove_prog(d, from);
+ changed |= bpf_dispatcher_add_prog(d, to);
+
+ if (!changed)
+ goto out;
+
+ bpf_dispatcher_update(d, prev_num_progs);
+out:
+ mutex_unlock(&d->mutex);
+}
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 22066a62c8c9..2d182c4ee9d9 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -17,6 +17,16 @@
(BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE | \
BPF_F_ACCESS_MASK | BPF_F_ZERO_SEED)
+#define BATCH_OPS(_name) \
+ .map_lookup_batch = \
+ _name##_map_lookup_batch, \
+ .map_lookup_and_delete_batch = \
+ _name##_map_lookup_and_delete_batch, \
+ .map_update_batch = \
+ generic_map_update_batch, \
+ .map_delete_batch = \
+ generic_map_delete_batch
+
struct bucket {
struct hlist_nulls_head head;
raw_spinlock_t lock;
@@ -1232,6 +1242,256 @@ static void htab_map_seq_show_elem(struct bpf_map *map, void *key,
rcu_read_unlock();
}
+static int
+__htab_map_lookup_and_delete_batch(struct bpf_map *map,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr,
+ bool do_delete, bool is_lru_map,
+ bool is_percpu)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ u32 bucket_cnt, total, key_size, value_size, roundup_key_size;
+ void *keys = NULL, *values = NULL, *value, *dst_key, *dst_val;
+ void __user *uvalues = u64_to_user_ptr(attr->batch.values);
+ void __user *ukeys = u64_to_user_ptr(attr->batch.keys);
+ void *ubatch = u64_to_user_ptr(attr->batch.in_batch);
+ u32 batch, max_count, size, bucket_size;
+ u64 elem_map_flags, map_flags;
+ struct hlist_nulls_head *head;
+ struct hlist_nulls_node *n;
+ unsigned long flags;
+ struct htab_elem *l;
+ struct bucket *b;
+ int ret = 0;
+
+ elem_map_flags = attr->batch.elem_flags;
+ if ((elem_map_flags & ~BPF_F_LOCK) ||
+ ((elem_map_flags & BPF_F_LOCK) && !map_value_has_spin_lock(map)))
+ return -EINVAL;
+
+ map_flags = attr->batch.flags;
+ if (map_flags)
+ return -EINVAL;
+
+ max_count = attr->batch.count;
+ if (!max_count)
+ return 0;
+
+ if (put_user(0, &uattr->batch.count))
+ return -EFAULT;
+
+ batch = 0;
+ if (ubatch && copy_from_user(&batch, ubatch, sizeof(batch)))
+ return -EFAULT;
+
+ if (batch >= htab->n_buckets)
+ return -ENOENT;
+
+ key_size = htab->map.key_size;
+ roundup_key_size = round_up(htab->map.key_size, 8);
+ value_size = htab->map.value_size;
+ size = round_up(value_size, 8);
+ if (is_percpu)
+ value_size = size * num_possible_cpus();
+ total = 0;
+ /* while experimenting with hash tables with sizes ranging from 10 to
+ * 1000, it was observed that a bucket can have upto 5 entries.
+ */
+ bucket_size = 5;
+
+alloc:
+ /* We cannot do copy_from_user or copy_to_user inside
+ * the rcu_read_lock. Allocate enough space here.
+ */
+ keys = kvmalloc(key_size * bucket_size, GFP_USER | __GFP_NOWARN);
+ values = kvmalloc(value_size * bucket_size, GFP_USER | __GFP_NOWARN);
+ if (!keys || !values) {
+ ret = -ENOMEM;
+ goto after_loop;
+ }
+
+again:
+ preempt_disable();
+ this_cpu_inc(bpf_prog_active);
+ rcu_read_lock();
+again_nocopy:
+ dst_key = keys;
+ dst_val = values;
+ b = &htab->buckets[batch];
+ head = &b->head;
+ raw_spin_lock_irqsave(&b->lock, flags);
+
+ bucket_cnt = 0;
+ hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
+ bucket_cnt++;
+
+ if (bucket_cnt > (max_count - total)) {
+ if (total == 0)
+ ret = -ENOSPC;
+ raw_spin_unlock_irqrestore(&b->lock, flags);
+ rcu_read_unlock();
+ this_cpu_dec(bpf_prog_active);
+ preempt_enable();
+ goto after_loop;
+ }
+
+ if (bucket_cnt > bucket_size) {
+ bucket_size = bucket_cnt;
+ raw_spin_unlock_irqrestore(&b->lock, flags);
+ rcu_read_unlock();
+ this_cpu_dec(bpf_prog_active);
+ preempt_enable();
+ kvfree(keys);
+ kvfree(values);
+ goto alloc;
+ }
+
+ hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
+ memcpy(dst_key, l->key, key_size);
+
+ if (is_percpu) {
+ int off = 0, cpu;
+ void __percpu *pptr;
+
+ pptr = htab_elem_get_ptr(l, map->key_size);
+ for_each_possible_cpu(cpu) {
+ bpf_long_memcpy(dst_val + off,
+ per_cpu_ptr(pptr, cpu), size);
+ off += size;
+ }
+ } else {
+ value = l->key + roundup_key_size;
+ if (elem_map_flags & BPF_F_LOCK)
+ copy_map_value_locked(map, dst_val, value,
+ true);
+ else
+ copy_map_value(map, dst_val, value);
+ check_and_init_map_lock(map, dst_val);
+ }
+ if (do_delete) {
+ hlist_nulls_del_rcu(&l->hash_node);
+ if (is_lru_map)
+ bpf_lru_push_free(&htab->lru, &l->lru_node);
+ else
+ free_htab_elem(htab, l);
+ }
+ dst_key += key_size;
+ dst_val += value_size;
+ }
+
+ raw_spin_unlock_irqrestore(&b->lock, flags);
+ /* If we are not copying data, we can go to next bucket and avoid
+ * unlocking the rcu.
+ */
+ if (!bucket_cnt && (batch + 1 < htab->n_buckets)) {
+ batch++;
+ goto again_nocopy;
+ }
+
+ rcu_read_unlock();
+ this_cpu_dec(bpf_prog_active);
+ preempt_enable();
+ if (bucket_cnt && (copy_to_user(ukeys + total * key_size, keys,
+ key_size * bucket_cnt) ||
+ copy_to_user(uvalues + total * value_size, values,
+ value_size * bucket_cnt))) {
+ ret = -EFAULT;
+ goto after_loop;
+ }
+
+ total += bucket_cnt;
+ batch++;
+ if (batch >= htab->n_buckets) {
+ ret = -ENOENT;
+ goto after_loop;
+ }
+ goto again;
+
+after_loop:
+ if (ret == -EFAULT)
+ goto out;
+
+ /* copy # of entries and next batch */
+ ubatch = u64_to_user_ptr(attr->batch.out_batch);
+ if (copy_to_user(ubatch, &batch, sizeof(batch)) ||
+ put_user(total, &uattr->batch.count))
+ ret = -EFAULT;
+
+out:
+ kvfree(keys);
+ kvfree(values);
+ return ret;
+}
+
+static int
+htab_percpu_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
+ false, true);
+}
+
+static int
+htab_percpu_map_lookup_and_delete_batch(struct bpf_map *map,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
+ false, true);
+}
+
+static int
+htab_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
+ false, false);
+}
+
+static int
+htab_map_lookup_and_delete_batch(struct bpf_map *map,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
+ false, false);
+}
+
+static int
+htab_lru_percpu_map_lookup_batch(struct bpf_map *map,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
+ true, true);
+}
+
+static int
+htab_lru_percpu_map_lookup_and_delete_batch(struct bpf_map *map,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
+ true, true);
+}
+
+static int
+htab_lru_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
+ true, false);
+}
+
+static int
+htab_lru_map_lookup_and_delete_batch(struct bpf_map *map,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
+ true, false);
+}
+
const struct bpf_map_ops htab_map_ops = {
.map_alloc_check = htab_map_alloc_check,
.map_alloc = htab_map_alloc,
@@ -1242,6 +1502,7 @@ const struct bpf_map_ops htab_map_ops = {
.map_delete_elem = htab_map_delete_elem,
.map_gen_lookup = htab_map_gen_lookup,
.map_seq_show_elem = htab_map_seq_show_elem,
+ BATCH_OPS(htab),
};
const struct bpf_map_ops htab_lru_map_ops = {
@@ -1255,6 +1516,7 @@ const struct bpf_map_ops htab_lru_map_ops = {
.map_delete_elem = htab_lru_map_delete_elem,
.map_gen_lookup = htab_lru_map_gen_lookup,
.map_seq_show_elem = htab_map_seq_show_elem,
+ BATCH_OPS(htab_lru),
};
/* Called from eBPF program */
@@ -1368,6 +1630,7 @@ const struct bpf_map_ops htab_percpu_map_ops = {
.map_update_elem = htab_percpu_map_update_elem,
.map_delete_elem = htab_map_delete_elem,
.map_seq_show_elem = htab_percpu_map_seq_show_elem,
+ BATCH_OPS(htab_percpu),
};
const struct bpf_map_ops htab_lru_percpu_map_ops = {
@@ -1379,6 +1642,7 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = {
.map_update_elem = htab_lru_percpu_map_update_elem,
.map_delete_elem = htab_lru_map_delete_elem,
.map_seq_show_elem = htab_percpu_map_seq_show_elem,
+ BATCH_OPS(htab_lru_percpu),
};
static int fd_htab_map_alloc_check(union bpf_attr *attr)
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index cada974c9f4e..d8b7b110a1c5 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -11,6 +11,7 @@
#include <linux/uidgid.h>
#include <linux/filter.h>
#include <linux/ctype.h>
+#include <linux/jiffies.h>
#include "../../lib/kstrtox.h"
@@ -312,6 +313,17 @@ void copy_map_value_locked(struct bpf_map *map, void *dst, void *src,
preempt_enable();
}
+BPF_CALL_0(bpf_jiffies64)
+{
+ return get_jiffies_64();
+}
+
+const struct bpf_func_proto bpf_jiffies64_proto = {
+ .func = bpf_jiffies64,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+};
+
#ifdef CONFIG_CGROUPS
BPF_CALL_0(bpf_get_current_cgroup_id)
{
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index ecf42bec38c0..5e40e7fccc21 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -196,6 +196,7 @@ static void *map_seq_next(struct seq_file *m, void *v, loff_t *pos)
void *key = map_iter(m)->key;
void *prev_key;
+ (*pos)++;
if (map_iter(m)->done)
return NULL;
@@ -208,8 +209,6 @@ static void *map_seq_next(struct seq_file *m, void *v, loff_t *pos)
map_iter(m)->done = true;
return NULL;
}
-
- ++(*pos);
return key;
}
@@ -380,7 +379,7 @@ static const struct inode_operations bpf_dir_iops = {
.unlink = simple_unlink,
};
-static int bpf_obj_do_pin(const struct filename *pathname, void *raw,
+static int bpf_obj_do_pin(const char __user *pathname, void *raw,
enum bpf_type type)
{
struct dentry *dentry;
@@ -389,7 +388,7 @@ static int bpf_obj_do_pin(const struct filename *pathname, void *raw,
umode_t mode;
int ret;
- dentry = kern_path_create(AT_FDCWD, pathname->name, &path, 0);
+ dentry = user_path_create(AT_FDCWD, pathname, &path, 0);
if (IS_ERR(dentry))
return PTR_ERR(dentry);
@@ -422,30 +421,22 @@ out:
int bpf_obj_pin_user(u32 ufd, const char __user *pathname)
{
- struct filename *pname;
enum bpf_type type;
void *raw;
int ret;
- pname = getname(pathname);
- if (IS_ERR(pname))
- return PTR_ERR(pname);
-
raw = bpf_fd_probe_obj(ufd, &type);
- if (IS_ERR(raw)) {
- ret = PTR_ERR(raw);
- goto out;
- }
+ if (IS_ERR(raw))
+ return PTR_ERR(raw);
- ret = bpf_obj_do_pin(pname, raw, type);
+ ret = bpf_obj_do_pin(pathname, raw, type);
if (ret != 0)
bpf_any_put(raw, type);
-out:
- putname(pname);
+
return ret;
}
-static void *bpf_obj_do_get(const struct filename *pathname,
+static void *bpf_obj_do_get(const char __user *pathname,
enum bpf_type *type, int flags)
{
struct inode *inode;
@@ -453,7 +444,7 @@ static void *bpf_obj_do_get(const struct filename *pathname,
void *raw;
int ret;
- ret = kern_path(pathname->name, LOOKUP_FOLLOW, &path);
+ ret = user_path_at(AT_FDCWD, pathname, LOOKUP_FOLLOW, &path);
if (ret)
return ERR_PTR(ret);
@@ -480,36 +471,27 @@ out:
int bpf_obj_get_user(const char __user *pathname, int flags)
{
enum bpf_type type = BPF_TYPE_UNSPEC;
- struct filename *pname;
- int ret = -ENOENT;
int f_flags;
void *raw;
+ int ret;
f_flags = bpf_get_file_flag(flags);
if (f_flags < 0)
return f_flags;
- pname = getname(pathname);
- if (IS_ERR(pname))
- return PTR_ERR(pname);
-
- raw = bpf_obj_do_get(pname, &type, f_flags);
- if (IS_ERR(raw)) {
- ret = PTR_ERR(raw);
- goto out;
- }
+ raw = bpf_obj_do_get(pathname, &type, f_flags);
+ if (IS_ERR(raw))
+ return PTR_ERR(raw);
if (type == BPF_TYPE_PROG)
ret = bpf_prog_new_fd(raw);
else if (type == BPF_TYPE_MAP)
ret = bpf_map_new_fd(raw, f_flags);
else
- goto out;
+ return -ENOENT;
if (ret < 0)
bpf_any_put(raw, type);
-out:
- putname(pname);
return ret;
}
@@ -587,16 +569,11 @@ enum {
OPT_MODE,
};
-static const struct fs_parameter_spec bpf_param_specs[] = {
+static const struct fs_parameter_spec bpf_fs_parameters[] = {
fsparam_u32oct ("mode", OPT_MODE),
{}
};
-static const struct fs_parameter_description bpf_fs_parameters = {
- .name = "bpf",
- .specs = bpf_param_specs,
-};
-
struct bpf_mount_opts {
umode_t mode;
};
@@ -607,7 +584,7 @@ static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param)
struct fs_parse_result result;
int opt;
- opt = fs_parse(fc, &bpf_fs_parameters, param, &result);
+ opt = fs_parse(fc, bpf_fs_parameters, param, &result);
if (opt < 0)
/* We might like to report bad mount options here, but
* traditionally we've ignored all mount options, so we'd
@@ -683,7 +660,7 @@ static struct file_system_type bpf_fs_type = {
.owner = THIS_MODULE,
.name = "bpf",
.init_fs_context = bpf_init_fs_context,
- .parameters = &bpf_fs_parameters,
+ .parameters = bpf_fs_parameters,
.kill_sb = kill_litter_super,
};
diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
index 5e9366b33f0f..b3c48d1533cb 100644
--- a/kernel/bpf/map_in_map.c
+++ b/kernel/bpf/map_in_map.c
@@ -22,7 +22,8 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
*/
if (inner_map->map_type == BPF_MAP_TYPE_PROG_ARRAY ||
inner_map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE ||
- inner_map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
+ inner_map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE ||
+ inner_map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
fdput(f);
return ERR_PTR(-ENOTSUPP);
}
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index 5b9da0954a27..2c5dc6541ece 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -302,14 +302,14 @@ int bpf_prog_offload_info_fill(struct bpf_prog_info *info,
struct inode *ns_inode;
struct path ns_path;
char __user *uinsns;
- void *res;
+ int res;
u32 ulen;
res = ns_get_path_cb(&ns_path, bpf_prog_offload_info_fill_ns, &args);
- if (IS_ERR(res)) {
+ if (res) {
if (!info->ifindex)
return -ENODEV;
- return PTR_ERR(res);
+ return res;
}
down_read(&bpf_devs_lock);
@@ -526,13 +526,13 @@ int bpf_map_offload_info_fill(struct bpf_map_info *info, struct bpf_map *map)
};
struct inode *ns_inode;
struct path ns_path;
- void *res;
+ int res;
res = ns_get_path_cb(&ns_path, bpf_map_offload_info_fill_ns, &args);
- if (IS_ERR(res)) {
+ if (res) {
if (!info->ifindex)
return -ENODEV;
- return PTR_ERR(res);
+ return res;
}
ns_inode = ns_path.dentry->d_inode;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index e3461ec59570..a91ad518c050 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -23,6 +23,7 @@
#include <linux/timekeeping.h>
#include <linux/ctype.h>
#include <linux/nospec.h>
+#include <linux/audit.h>
#include <uapi/linux/btf.h>
#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
@@ -128,6 +129,152 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
return map;
}
+static u32 bpf_map_value_size(struct bpf_map *map)
+{
+ if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+ map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
+ map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY ||
+ map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
+ return round_up(map->value_size, 8) * num_possible_cpus();
+ else if (IS_FD_MAP(map))
+ return sizeof(u32);
+ else
+ return map->value_size;
+}
+
+static void maybe_wait_bpf_programs(struct bpf_map *map)
+{
+ /* Wait for any running BPF programs to complete so that
+ * userspace, when we return to it, knows that all programs
+ * that could be running use the new map value.
+ */
+ if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS ||
+ map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS)
+ synchronize_rcu();
+}
+
+static int bpf_map_update_value(struct bpf_map *map, struct fd f, void *key,
+ void *value, __u64 flags)
+{
+ int err;
+
+ /* Need to create a kthread, thus must support schedule */
+ if (bpf_map_is_dev_bound(map)) {
+ return bpf_map_offload_update_elem(map, key, value, flags);
+ } else if (map->map_type == BPF_MAP_TYPE_CPUMAP ||
+ map->map_type == BPF_MAP_TYPE_SOCKHASH ||
+ map->map_type == BPF_MAP_TYPE_SOCKMAP ||
+ map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
+ return map->ops->map_update_elem(map, key, value, flags);
+ } else if (IS_FD_PROG_ARRAY(map)) {
+ return bpf_fd_array_map_update_elem(map, f.file, key, value,
+ flags);
+ }
+
+ /* must increment bpf_prog_active to avoid kprobe+bpf triggering from
+ * inside bpf map update or delete otherwise deadlocks are possible
+ */
+ preempt_disable();
+ __this_cpu_inc(bpf_prog_active);
+ if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+ map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
+ err = bpf_percpu_hash_update(map, key, value, flags);
+ } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
+ err = bpf_percpu_array_update(map, key, value, flags);
+ } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
+ err = bpf_percpu_cgroup_storage_update(map, key, value,
+ flags);
+ } else if (IS_FD_ARRAY(map)) {
+ rcu_read_lock();
+ err = bpf_fd_array_map_update_elem(map, f.file, key, value,
+ flags);
+ rcu_read_unlock();
+ } else if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) {
+ rcu_read_lock();
+ err = bpf_fd_htab_map_update_elem(map, f.file, key, value,
+ flags);
+ rcu_read_unlock();
+ } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
+ /* rcu_read_lock() is not needed */
+ err = bpf_fd_reuseport_array_update_elem(map, key, value,
+ flags);
+ } else if (map->map_type == BPF_MAP_TYPE_QUEUE ||
+ map->map_type == BPF_MAP_TYPE_STACK) {
+ err = map->ops->map_push_elem(map, value, flags);
+ } else {
+ rcu_read_lock();
+ err = map->ops->map_update_elem(map, key, value, flags);
+ rcu_read_unlock();
+ }
+ __this_cpu_dec(bpf_prog_active);
+ preempt_enable();
+ maybe_wait_bpf_programs(map);
+
+ return err;
+}
+
+static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value,
+ __u64 flags)
+{
+ void *ptr;
+ int err;
+
+ if (bpf_map_is_dev_bound(map))
+ return bpf_map_offload_lookup_elem(map, key, value);
+
+ preempt_disable();
+ this_cpu_inc(bpf_prog_active);
+ if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+ map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
+ err = bpf_percpu_hash_copy(map, key, value);
+ } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
+ err = bpf_percpu_array_copy(map, key, value);
+ } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
+ err = bpf_percpu_cgroup_storage_copy(map, key, value);
+ } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) {
+ err = bpf_stackmap_copy(map, key, value);
+ } else if (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map)) {
+ err = bpf_fd_array_map_lookup_elem(map, key, value);
+ } else if (IS_FD_HASH(map)) {
+ err = bpf_fd_htab_map_lookup_elem(map, key, value);
+ } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
+ err = bpf_fd_reuseport_array_lookup_elem(map, key, value);
+ } else if (map->map_type == BPF_MAP_TYPE_QUEUE ||
+ map->map_type == BPF_MAP_TYPE_STACK) {
+ err = map->ops->map_peek_elem(map, value);
+ } else if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
+ /* struct_ops map requires directly updating "value" */
+ err = bpf_struct_ops_map_sys_lookup_elem(map, key, value);
+ } else {
+ rcu_read_lock();
+ if (map->ops->map_lookup_elem_sys_only)
+ ptr = map->ops->map_lookup_elem_sys_only(map, key);
+ else
+ ptr = map->ops->map_lookup_elem(map, key);
+ if (IS_ERR(ptr)) {
+ err = PTR_ERR(ptr);
+ } else if (!ptr) {
+ err = -ENOENT;
+ } else {
+ err = 0;
+ if (flags & BPF_F_LOCK)
+ /* lock 'ptr' and copy everything but lock */
+ copy_map_value_locked(map, value, ptr, true);
+ else
+ copy_map_value(map, value, ptr);
+ /* mask lock, since value wasn't zero inited */
+ check_and_init_map_lock(map, value);
+ }
+ rcu_read_unlock();
+ }
+
+ this_cpu_dec(bpf_prog_active);
+ preempt_enable();
+ maybe_wait_bpf_programs(map);
+
+ return err;
+}
+
static void *__bpf_map_area_alloc(u64 size, int numa_node, bool mmapable)
{
/* We really just want to fail instead of triggering OOM killer
@@ -627,7 +774,7 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
return ret;
}
-#define BPF_MAP_CREATE_LAST_FIELD btf_value_type_id
+#define BPF_MAP_CREATE_LAST_FIELD btf_vmlinux_value_type_id
/* called via syscall */
static int map_create(union bpf_attr *attr)
{
@@ -641,6 +788,14 @@ static int map_create(union bpf_attr *attr)
if (err)
return -EINVAL;
+ if (attr->btf_vmlinux_value_type_id) {
+ if (attr->map_type != BPF_MAP_TYPE_STRUCT_OPS ||
+ attr->btf_key_type_id || attr->btf_value_type_id)
+ return -EINVAL;
+ } else if (attr->btf_key_type_id && !attr->btf_value_type_id) {
+ return -EINVAL;
+ }
+
f_flags = bpf_get_file_flag(attr->map_flags);
if (f_flags < 0)
return f_flags;
@@ -663,32 +818,35 @@ static int map_create(union bpf_attr *attr)
atomic64_set(&map->usercnt, 1);
mutex_init(&map->freeze_mutex);
- if (attr->btf_key_type_id || attr->btf_value_type_id) {
+ map->spin_lock_off = -EINVAL;
+ if (attr->btf_key_type_id || attr->btf_value_type_id ||
+ /* Even the map's value is a kernel's struct,
+ * the bpf_prog.o must have BTF to begin with
+ * to figure out the corresponding kernel's
+ * counter part. Thus, attr->btf_fd has
+ * to be valid also.
+ */
+ attr->btf_vmlinux_value_type_id) {
struct btf *btf;
- if (!attr->btf_value_type_id) {
- err = -EINVAL;
- goto free_map;
- }
-
btf = btf_get_by_fd(attr->btf_fd);
if (IS_ERR(btf)) {
err = PTR_ERR(btf);
goto free_map;
}
+ map->btf = btf;
- err = map_check_btf(map, btf, attr->btf_key_type_id,
- attr->btf_value_type_id);
- if (err) {
- btf_put(btf);
- goto free_map;
+ if (attr->btf_value_type_id) {
+ err = map_check_btf(map, btf, attr->btf_key_type_id,
+ attr->btf_value_type_id);
+ if (err)
+ goto free_map;
}
- map->btf = btf;
map->btf_key_type_id = attr->btf_key_type_id;
map->btf_value_type_id = attr->btf_value_type_id;
- } else {
- map->spin_lock_off = -EINVAL;
+ map->btf_vmlinux_value_type_id =
+ attr->btf_vmlinux_value_type_id;
}
err = security_bpf_map_alloc(map);
@@ -815,7 +973,7 @@ static int map_lookup_elem(union bpf_attr *attr)
void __user *uvalue = u64_to_user_ptr(attr->value);
int ufd = attr->map_fd;
struct bpf_map *map;
- void *key, *value, *ptr;
+ void *key, *value;
u32 value_size;
struct fd f;
int err;
@@ -847,72 +1005,14 @@ static int map_lookup_elem(union bpf_attr *attr)
goto err_put;
}
- if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
- map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
- map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY ||
- map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
- value_size = round_up(map->value_size, 8) * num_possible_cpus();
- else if (IS_FD_MAP(map))
- value_size = sizeof(u32);
- else
- value_size = map->value_size;
+ value_size = bpf_map_value_size(map);
err = -ENOMEM;
value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
if (!value)
goto free_key;
- if (bpf_map_is_dev_bound(map)) {
- err = bpf_map_offload_lookup_elem(map, key, value);
- goto done;
- }
-
- preempt_disable();
- this_cpu_inc(bpf_prog_active);
- if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
- map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
- err = bpf_percpu_hash_copy(map, key, value);
- } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
- err = bpf_percpu_array_copy(map, key, value);
- } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
- err = bpf_percpu_cgroup_storage_copy(map, key, value);
- } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) {
- err = bpf_stackmap_copy(map, key, value);
- } else if (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map)) {
- err = bpf_fd_array_map_lookup_elem(map, key, value);
- } else if (IS_FD_HASH(map)) {
- err = bpf_fd_htab_map_lookup_elem(map, key, value);
- } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
- err = bpf_fd_reuseport_array_lookup_elem(map, key, value);
- } else if (map->map_type == BPF_MAP_TYPE_QUEUE ||
- map->map_type == BPF_MAP_TYPE_STACK) {
- err = map->ops->map_peek_elem(map, value);
- } else {
- rcu_read_lock();
- if (map->ops->map_lookup_elem_sys_only)
- ptr = map->ops->map_lookup_elem_sys_only(map, key);
- else
- ptr = map->ops->map_lookup_elem(map, key);
- if (IS_ERR(ptr)) {
- err = PTR_ERR(ptr);
- } else if (!ptr) {
- err = -ENOENT;
- } else {
- err = 0;
- if (attr->flags & BPF_F_LOCK)
- /* lock 'ptr' and copy everything but lock */
- copy_map_value_locked(map, value, ptr, true);
- else
- copy_map_value(map, value, ptr);
- /* mask lock, since value wasn't zero inited */
- check_and_init_map_lock(map, value);
- }
- rcu_read_unlock();
- }
- this_cpu_dec(bpf_prog_active);
- preempt_enable();
-
-done:
+ err = bpf_map_copy_value(map, key, value, attr->flags);
if (err)
goto free_value;
@@ -931,16 +1031,6 @@ err_put:
return err;
}
-static void maybe_wait_bpf_programs(struct bpf_map *map)
-{
- /* Wait for any running BPF programs to complete so that
- * userspace, when we return to it, knows that all programs
- * that could be running use the new map value.
- */
- if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS ||
- map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS)
- synchronize_rcu();
-}
#define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags
@@ -996,60 +1086,8 @@ static int map_update_elem(union bpf_attr *attr)
if (copy_from_user(value, uvalue, value_size) != 0)
goto free_value;
- /* Need to create a kthread, thus must support schedule */
- if (bpf_map_is_dev_bound(map)) {
- err = bpf_map_offload_update_elem(map, key, value, attr->flags);
- goto out;
- } else if (map->map_type == BPF_MAP_TYPE_CPUMAP ||
- map->map_type == BPF_MAP_TYPE_SOCKHASH ||
- map->map_type == BPF_MAP_TYPE_SOCKMAP) {
- err = map->ops->map_update_elem(map, key, value, attr->flags);
- goto out;
- } else if (IS_FD_PROG_ARRAY(map)) {
- err = bpf_fd_array_map_update_elem(map, f.file, key, value,
- attr->flags);
- goto out;
- }
+ err = bpf_map_update_value(map, f, key, value, attr->flags);
- /* must increment bpf_prog_active to avoid kprobe+bpf triggering from
- * inside bpf map update or delete otherwise deadlocks are possible
- */
- preempt_disable();
- __this_cpu_inc(bpf_prog_active);
- if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
- map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
- err = bpf_percpu_hash_update(map, key, value, attr->flags);
- } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
- err = bpf_percpu_array_update(map, key, value, attr->flags);
- } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
- err = bpf_percpu_cgroup_storage_update(map, key, value,
- attr->flags);
- } else if (IS_FD_ARRAY(map)) {
- rcu_read_lock();
- err = bpf_fd_array_map_update_elem(map, f.file, key, value,
- attr->flags);
- rcu_read_unlock();
- } else if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) {
- rcu_read_lock();
- err = bpf_fd_htab_map_update_elem(map, f.file, key, value,
- attr->flags);
- rcu_read_unlock();
- } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
- /* rcu_read_lock() is not needed */
- err = bpf_fd_reuseport_array_update_elem(map, key, value,
- attr->flags);
- } else if (map->map_type == BPF_MAP_TYPE_QUEUE ||
- map->map_type == BPF_MAP_TYPE_STACK) {
- err = map->ops->map_push_elem(map, value, attr->flags);
- } else {
- rcu_read_lock();
- err = map->ops->map_update_elem(map, key, value, attr->flags);
- rcu_read_unlock();
- }
- __this_cpu_dec(bpf_prog_active);
- preempt_enable();
- maybe_wait_bpf_programs(map);
-out:
free_value:
kfree(value);
free_key:
@@ -1091,7 +1129,9 @@ static int map_delete_elem(union bpf_attr *attr)
if (bpf_map_is_dev_bound(map)) {
err = bpf_map_offload_delete_elem(map, key);
goto out;
- } else if (IS_FD_PROG_ARRAY(map)) {
+ } else if (IS_FD_PROG_ARRAY(map) ||
+ map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
+ /* These maps require sleepable context */
err = map->ops->map_delete_elem(map, key);
goto out;
}
@@ -1178,6 +1218,220 @@ err_put:
return err;
}
+int generic_map_delete_batch(struct bpf_map *map,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ void __user *keys = u64_to_user_ptr(attr->batch.keys);
+ u32 cp, max_count;
+ int err = 0;
+ void *key;
+
+ if (attr->batch.elem_flags & ~BPF_F_LOCK)
+ return -EINVAL;
+
+ if ((attr->batch.elem_flags & BPF_F_LOCK) &&
+ !map_value_has_spin_lock(map)) {
+ return -EINVAL;
+ }
+
+ max_count = attr->batch.count;
+ if (!max_count)
+ return 0;
+
+ key = kmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
+ if (!key)
+ return -ENOMEM;
+
+ for (cp = 0; cp < max_count; cp++) {
+ err = -EFAULT;
+ if (copy_from_user(key, keys + cp * map->key_size,
+ map->key_size))
+ break;
+
+ if (bpf_map_is_dev_bound(map)) {
+ err = bpf_map_offload_delete_elem(map, key);
+ break;
+ }
+
+ preempt_disable();
+ __this_cpu_inc(bpf_prog_active);
+ rcu_read_lock();
+ err = map->ops->map_delete_elem(map, key);
+ rcu_read_unlock();
+ __this_cpu_dec(bpf_prog_active);
+ preempt_enable();
+ maybe_wait_bpf_programs(map);
+ if (err)
+ break;
+ }
+ if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp)))
+ err = -EFAULT;
+
+ kfree(key);
+ return err;
+}
+
+int generic_map_update_batch(struct bpf_map *map,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ void __user *values = u64_to_user_ptr(attr->batch.values);
+ void __user *keys = u64_to_user_ptr(attr->batch.keys);
+ u32 value_size, cp, max_count;
+ int ufd = attr->map_fd;
+ void *key, *value;
+ struct fd f;
+ int err = 0;
+
+ f = fdget(ufd);
+ if (attr->batch.elem_flags & ~BPF_F_LOCK)
+ return -EINVAL;
+
+ if ((attr->batch.elem_flags & BPF_F_LOCK) &&
+ !map_value_has_spin_lock(map)) {
+ return -EINVAL;
+ }
+
+ value_size = bpf_map_value_size(map);
+
+ max_count = attr->batch.count;
+ if (!max_count)
+ return 0;
+
+ key = kmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
+ if (!key)
+ return -ENOMEM;
+
+ value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
+ if (!value) {
+ kfree(key);
+ return -ENOMEM;
+ }
+
+ for (cp = 0; cp < max_count; cp++) {
+ err = -EFAULT;
+ if (copy_from_user(key, keys + cp * map->key_size,
+ map->key_size) ||
+ copy_from_user(value, values + cp * value_size, value_size))
+ break;
+
+ err = bpf_map_update_value(map, f, key, value,
+ attr->batch.elem_flags);
+
+ if (err)
+ break;
+ }
+
+ if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp)))
+ err = -EFAULT;
+
+ kfree(value);
+ kfree(key);
+ return err;
+}
+
+#define MAP_LOOKUP_RETRIES 3
+
+int generic_map_lookup_batch(struct bpf_map *map,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ void __user *uobatch = u64_to_user_ptr(attr->batch.out_batch);
+ void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch);
+ void __user *values = u64_to_user_ptr(attr->batch.values);
+ void __user *keys = u64_to_user_ptr(attr->batch.keys);
+ void *buf, *buf_prevkey, *prev_key, *key, *value;
+ int err, retry = MAP_LOOKUP_RETRIES;
+ u32 value_size, cp, max_count;
+
+ if (attr->batch.elem_flags & ~BPF_F_LOCK)
+ return -EINVAL;
+
+ if ((attr->batch.elem_flags & BPF_F_LOCK) &&
+ !map_value_has_spin_lock(map))
+ return -EINVAL;
+
+ value_size = bpf_map_value_size(map);
+
+ max_count = attr->batch.count;
+ if (!max_count)
+ return 0;
+
+ if (put_user(0, &uattr->batch.count))
+ return -EFAULT;
+
+ buf_prevkey = kmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
+ if (!buf_prevkey)
+ return -ENOMEM;
+
+ buf = kmalloc(map->key_size + value_size, GFP_USER | __GFP_NOWARN);
+ if (!buf) {
+ kvfree(buf_prevkey);
+ return -ENOMEM;
+ }
+
+ err = -EFAULT;
+ prev_key = NULL;
+ if (ubatch && copy_from_user(buf_prevkey, ubatch, map->key_size))
+ goto free_buf;
+ key = buf;
+ value = key + map->key_size;
+ if (ubatch)
+ prev_key = buf_prevkey;
+
+ for (cp = 0; cp < max_count;) {
+ rcu_read_lock();
+ err = map->ops->map_get_next_key(map, prev_key, key);
+ rcu_read_unlock();
+ if (err)
+ break;
+ err = bpf_map_copy_value(map, key, value,
+ attr->batch.elem_flags);
+
+ if (err == -ENOENT) {
+ if (retry) {
+ retry--;
+ continue;
+ }
+ err = -EINTR;
+ break;
+ }
+
+ if (err)
+ goto free_buf;
+
+ if (copy_to_user(keys + cp * map->key_size, key,
+ map->key_size)) {
+ err = -EFAULT;
+ goto free_buf;
+ }
+ if (copy_to_user(values + cp * value_size, value, value_size)) {
+ err = -EFAULT;
+ goto free_buf;
+ }
+
+ if (!prev_key)
+ prev_key = buf_prevkey;
+
+ swap(prev_key, key);
+ retry = MAP_LOOKUP_RETRIES;
+ cp++;
+ }
+
+ if (err == -EFAULT)
+ goto free_buf;
+
+ if ((copy_to_user(&uattr->batch.count, &cp, sizeof(cp)) ||
+ (cp && copy_to_user(uobatch, prev_key, map->key_size))))
+ err = -EFAULT;
+
+free_buf:
+ kfree(buf_prevkey);
+ kfree(buf);
+ return err;
+}
+
#define BPF_MAP_LOOKUP_AND_DELETE_ELEM_LAST_FIELD value
static int map_lookup_and_delete_elem(union bpf_attr *attr)
@@ -1306,6 +1560,36 @@ static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)
return 0;
}
+enum bpf_audit {
+ BPF_AUDIT_LOAD,
+ BPF_AUDIT_UNLOAD,
+ BPF_AUDIT_MAX,
+};
+
+static const char * const bpf_audit_str[BPF_AUDIT_MAX] = {
+ [BPF_AUDIT_LOAD] = "LOAD",
+ [BPF_AUDIT_UNLOAD] = "UNLOAD",
+};
+
+static void bpf_audit_prog(const struct bpf_prog *prog, unsigned int op)
+{
+ struct audit_context *ctx = NULL;
+ struct audit_buffer *ab;
+
+ if (WARN_ON_ONCE(op >= BPF_AUDIT_MAX))
+ return;
+ if (audit_enabled == AUDIT_OFF)
+ return;
+ if (op == BPF_AUDIT_LOAD)
+ ctx = audit_context();
+ ab = audit_log_start(ctx, GFP_ATOMIC, AUDIT_BPF);
+ if (unlikely(!ab))
+ return;
+ audit_log_format(ab, "prog-id=%u op=%s",
+ prog->aux->id, bpf_audit_str[op]);
+ audit_log_end(ab);
+}
+
int __bpf_prog_charge(struct user_struct *user, u32 pages)
{
unsigned long memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
@@ -1421,6 +1705,7 @@ static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)
{
if (atomic64_dec_and_test(&prog->aux->refcnt)) {
perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0);
+ bpf_audit_prog(prog, BPF_AUDIT_UNLOAD);
/* bpf_prog_free_id() must be called first */
bpf_prog_free_id(prog, do_idr_lock);
__bpf_prog_put_noref(prog, true);
@@ -1640,17 +1925,24 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
enum bpf_attach_type expected_attach_type,
u32 btf_id, u32 prog_fd)
{
- switch (prog_type) {
- case BPF_PROG_TYPE_TRACING:
+ if (btf_id) {
if (btf_id > BTF_MAX_TYPE)
return -EINVAL;
- break;
- default:
- if (btf_id || prog_fd)
+
+ switch (prog_type) {
+ case BPF_PROG_TYPE_TRACING:
+ case BPF_PROG_TYPE_STRUCT_OPS:
+ case BPF_PROG_TYPE_EXT:
+ break;
+ default:
return -EINVAL;
- break;
+ }
}
+ if (prog_fd && prog_type != BPF_PROG_TYPE_TRACING &&
+ prog_type != BPF_PROG_TYPE_EXT)
+ return -EINVAL;
+
switch (prog_type) {
case BPF_PROG_TYPE_CGROUP_SOCK:
switch (expected_attach_type) {
@@ -1691,6 +1983,10 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
default:
return -EINVAL;
}
+ case BPF_PROG_TYPE_EXT:
+ if (expected_attach_type)
+ return -EINVAL;
+ /* fallthrough */
default:
return 0;
}
@@ -1830,6 +2126,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
*/
bpf_prog_kallsyms_add(prog);
perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_LOAD, 0);
+ bpf_audit_prog(prog, BPF_AUDIT_LOAD);
err = bpf_prog_new_fd(prog);
if (err < 0)
@@ -1892,7 +2189,8 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog)
int tr_fd, err;
if (prog->expected_attach_type != BPF_TRACE_FENTRY &&
- prog->expected_attach_type != BPF_TRACE_FEXIT) {
+ prog->expected_attach_type != BPF_TRACE_FEXIT &&
+ prog->type != BPF_PROG_TYPE_EXT) {
err = -EINVAL;
goto out_put_prog;
}
@@ -1959,12 +2257,14 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
if (prog->type != BPF_PROG_TYPE_RAW_TRACEPOINT &&
prog->type != BPF_PROG_TYPE_TRACING &&
+ prog->type != BPF_PROG_TYPE_EXT &&
prog->type != BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE) {
err = -EINVAL;
goto out_put_prog;
}
- if (prog->type == BPF_PROG_TYPE_TRACING) {
+ if (prog->type == BPF_PROG_TYPE_TRACING ||
+ prog->type == BPF_PROG_TYPE_EXT) {
if (attr->raw_tracepoint.name) {
/* The attach point for this category of programs
* should be specified via btf_id during program load.
@@ -2040,10 +2340,10 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
}
}
-#define BPF_PROG_ATTACH_LAST_FIELD attach_flags
+#define BPF_PROG_ATTACH_LAST_FIELD replace_bpf_fd
#define BPF_F_ATTACH_MASK \
- (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI)
+ (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI | BPF_F_REPLACE)
static int bpf_prog_attach(const union bpf_attr *attr)
{
@@ -2305,17 +2605,12 @@ static int bpf_obj_get_next_id(const union bpf_attr *attr,
#define BPF_PROG_GET_FD_BY_ID_LAST_FIELD prog_id
-static int bpf_prog_get_fd_by_id(const union bpf_attr *attr)
+struct bpf_prog *bpf_prog_by_id(u32 id)
{
struct bpf_prog *prog;
- u32 id = attr->prog_id;
- int fd;
- if (CHECK_ATTR(BPF_PROG_GET_FD_BY_ID))
- return -EINVAL;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
+ if (!id)
+ return ERR_PTR(-ENOENT);
spin_lock_bh(&prog_idr_lock);
prog = idr_find(&prog_idr, id);
@@ -2324,7 +2619,22 @@ static int bpf_prog_get_fd_by_id(const union bpf_attr *attr)
else
prog = ERR_PTR(-ENOENT);
spin_unlock_bh(&prog_idr_lock);
+ return prog;
+}
+
+static int bpf_prog_get_fd_by_id(const union bpf_attr *attr)
+{
+ struct bpf_prog *prog;
+ u32 id = attr->prog_id;
+ int fd;
+
+ if (CHECK_ATTR(BPF_PROG_GET_FD_BY_ID))
+ return -EINVAL;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ prog = bpf_prog_by_id(id);
if (IS_ERR(prog))
return PTR_ERR(prog);
@@ -2774,6 +3084,7 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map,
info.btf_key_type_id = map->btf_key_type_id;
info.btf_value_type_id = map->btf_value_type_id;
}
+ info.btf_vmlinux_value_type_id = map->btf_vmlinux_value_type_id;
if (bpf_map_is_dev_bound(map)) {
err = bpf_map_offload_info_fill(&info, map);
@@ -2986,6 +3297,61 @@ out:
return err;
}
+#define BPF_MAP_BATCH_LAST_FIELD batch.flags
+
+#define BPF_DO_BATCH(fn) \
+ do { \
+ if (!fn) { \
+ err = -ENOTSUPP; \
+ goto err_put; \
+ } \
+ err = fn(map, attr, uattr); \
+ } while (0)
+
+static int bpf_map_do_batch(const union bpf_attr *attr,
+ union bpf_attr __user *uattr,
+ int cmd)
+{
+ struct bpf_map *map;
+ int err, ufd;
+ struct fd f;
+
+ if (CHECK_ATTR(BPF_MAP_BATCH))
+ return -EINVAL;
+
+ ufd = attr->batch.map_fd;
+ f = fdget(ufd);
+ map = __bpf_map_get(f);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+
+ if ((cmd == BPF_MAP_LOOKUP_BATCH ||
+ cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH) &&
+ !(map_get_sys_perms(map, f) & FMODE_CAN_READ)) {
+ err = -EPERM;
+ goto err_put;
+ }
+
+ if (cmd != BPF_MAP_LOOKUP_BATCH &&
+ !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
+ err = -EPERM;
+ goto err_put;
+ }
+
+ if (cmd == BPF_MAP_LOOKUP_BATCH)
+ BPF_DO_BATCH(map->ops->map_lookup_batch);
+ else if (cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH)
+ BPF_DO_BATCH(map->ops->map_lookup_and_delete_batch);
+ else if (cmd == BPF_MAP_UPDATE_BATCH)
+ BPF_DO_BATCH(map->ops->map_update_batch);
+ else
+ BPF_DO_BATCH(map->ops->map_delete_batch);
+
+err_put:
+ fdput(f);
+ return err;
+}
+
SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
{
union bpf_attr attr = {};
@@ -3083,6 +3449,19 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
case BPF_MAP_LOOKUP_AND_DELETE_ELEM:
err = map_lookup_and_delete_elem(&attr);
break;
+ case BPF_MAP_LOOKUP_BATCH:
+ err = bpf_map_do_batch(&attr, uattr, BPF_MAP_LOOKUP_BATCH);
+ break;
+ case BPF_MAP_LOOKUP_AND_DELETE_BATCH:
+ err = bpf_map_do_batch(&attr, uattr,
+ BPF_MAP_LOOKUP_AND_DELETE_BATCH);
+ break;
+ case BPF_MAP_UPDATE_BATCH:
+ err = bpf_map_do_batch(&attr, uattr, BPF_MAP_UPDATE_BATCH);
+ break;
+ case BPF_MAP_DELETE_BATCH:
+ err = bpf_map_do_batch(&attr, uattr, BPF_MAP_DELETE_BATCH);
+ break;
default:
err = -EINVAL;
break;
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index 23b0d5cfd47e..6b264a92064b 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -4,16 +4,98 @@
#include <linux/bpf.h>
#include <linux/filter.h>
#include <linux/ftrace.h>
+#include <linux/rbtree_latch.h>
+
+/* dummy _ops. The verifier will operate on target program's ops. */
+const struct bpf_verifier_ops bpf_extension_verifier_ops = {
+};
+const struct bpf_prog_ops bpf_extension_prog_ops = {
+};
/* btf_vmlinux has ~22k attachable functions. 1k htab is enough. */
#define TRAMPOLINE_HASH_BITS 10
#define TRAMPOLINE_TABLE_SIZE (1 << TRAMPOLINE_HASH_BITS)
static struct hlist_head trampoline_table[TRAMPOLINE_TABLE_SIZE];
+static struct latch_tree_root image_tree __cacheline_aligned;
-/* serializes access to trampoline_table */
+/* serializes access to trampoline_table and image_tree */
static DEFINE_MUTEX(trampoline_mutex);
+static void *bpf_jit_alloc_exec_page(void)
+{
+ void *image;
+
+ image = bpf_jit_alloc_exec(PAGE_SIZE);
+ if (!image)
+ return NULL;
+
+ set_vm_flush_reset_perms(image);
+ /* Keep image as writeable. The alternative is to keep flipping ro/rw
+ * everytime new program is attached or detached.
+ */
+ set_memory_x((long)image, 1);
+ return image;
+}
+
+static __always_inline bool image_tree_less(struct latch_tree_node *a,
+ struct latch_tree_node *b)
+{
+ struct bpf_image *ia = container_of(a, struct bpf_image, tnode);
+ struct bpf_image *ib = container_of(b, struct bpf_image, tnode);
+
+ return ia < ib;
+}
+
+static __always_inline int image_tree_comp(void *addr, struct latch_tree_node *n)
+{
+ void *image = container_of(n, struct bpf_image, tnode);
+
+ if (addr < image)
+ return -1;
+ if (addr >= image + PAGE_SIZE)
+ return 1;
+
+ return 0;
+}
+
+static const struct latch_tree_ops image_tree_ops = {
+ .less = image_tree_less,
+ .comp = image_tree_comp,
+};
+
+static void *__bpf_image_alloc(bool lock)
+{
+ struct bpf_image *image;
+
+ image = bpf_jit_alloc_exec_page();
+ if (!image)
+ return NULL;
+
+ if (lock)
+ mutex_lock(&trampoline_mutex);
+ latch_tree_insert(&image->tnode, &image_tree, &image_tree_ops);
+ if (lock)
+ mutex_unlock(&trampoline_mutex);
+ return image->data;
+}
+
+void *bpf_image_alloc(void)
+{
+ return __bpf_image_alloc(true);
+}
+
+bool is_bpf_image_address(unsigned long addr)
+{
+ bool ret;
+
+ rcu_read_lock();
+ ret = latch_tree_find((void *) addr, &image_tree, &image_tree_ops) != NULL;
+ rcu_read_unlock();
+
+ return ret;
+}
+
struct bpf_trampoline *bpf_trampoline_lookup(u64 key)
{
struct bpf_trampoline *tr;
@@ -34,7 +116,7 @@ struct bpf_trampoline *bpf_trampoline_lookup(u64 key)
goto out;
/* is_root was checked earlier. No need for bpf_jit_charge_modmem() */
- image = bpf_jit_alloc_exec(PAGE_SIZE);
+ image = __bpf_image_alloc(false);
if (!image) {
kfree(tr);
tr = NULL;
@@ -48,12 +130,6 @@ struct bpf_trampoline *bpf_trampoline_lookup(u64 key)
mutex_init(&tr->mutex);
for (i = 0; i < BPF_TRAMP_MAX; i++)
INIT_HLIST_HEAD(&tr->progs_hlist[i]);
-
- set_vm_flush_reset_perms(image);
- /* Keep image as writeable. The alternative is to keep flipping ro/rw
- * everytime new program is attached or detached.
- */
- set_memory_x((long)image, 1);
tr->image = image;
out:
mutex_unlock(&trampoline_mutex);
@@ -115,14 +191,14 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr)
}
/* Each call __bpf_prog_enter + call bpf_func + call __bpf_prog_exit is ~50
- * bytes on x86. Pick a number to fit into PAGE_SIZE / 2
+ * bytes on x86. Pick a number to fit into BPF_IMAGE_SIZE / 2
*/
#define BPF_MAX_TRAMP_PROGS 40
static int bpf_trampoline_update(struct bpf_trampoline *tr)
{
- void *old_image = tr->image + ((tr->selector + 1) & 1) * PAGE_SIZE/2;
- void *new_image = tr->image + (tr->selector & 1) * PAGE_SIZE/2;
+ void *old_image = tr->image + ((tr->selector + 1) & 1) * BPF_IMAGE_SIZE/2;
+ void *new_image = tr->image + (tr->selector & 1) * BPF_IMAGE_SIZE/2;
struct bpf_prog *progs_to_run[BPF_MAX_TRAMP_PROGS];
int fentry_cnt = tr->progs_cnt[BPF_TRAMP_FENTRY];
int fexit_cnt = tr->progs_cnt[BPF_TRAMP_FEXIT];
@@ -150,11 +226,20 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr)
if (fexit_cnt)
flags = BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME;
- err = arch_prepare_bpf_trampoline(new_image, &tr->func.model, flags,
+ /* Though the second half of trampoline page is unused a task could be
+ * preempted in the middle of the first half of trampoline and two
+ * updates to trampoline would change the code from underneath the
+ * preempted task. Hence wait for tasks to voluntarily schedule or go
+ * to userspace.
+ */
+ synchronize_rcu_tasks();
+
+ err = arch_prepare_bpf_trampoline(new_image, new_image + BPF_IMAGE_SIZE / 2,
+ &tr->func.model, flags,
fentry, fentry_cnt,
fexit, fexit_cnt,
tr->func.addr);
- if (err)
+ if (err < 0)
goto out;
if (tr->selector)
@@ -175,8 +260,10 @@ static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(enum bpf_attach_type t)
switch (t) {
case BPF_TRACE_FENTRY:
return BPF_TRAMP_FENTRY;
- default:
+ case BPF_TRACE_FEXIT:
return BPF_TRAMP_FEXIT;
+ default:
+ return BPF_TRAMP_REPLACE;
}
}
@@ -185,12 +272,31 @@ int bpf_trampoline_link_prog(struct bpf_prog *prog)
enum bpf_tramp_prog_type kind;
struct bpf_trampoline *tr;
int err = 0;
+ int cnt;
tr = prog->aux->trampoline;
kind = bpf_attach_type_to_tramp(prog->expected_attach_type);
mutex_lock(&tr->mutex);
- if (tr->progs_cnt[BPF_TRAMP_FENTRY] + tr->progs_cnt[BPF_TRAMP_FEXIT]
- >= BPF_MAX_TRAMP_PROGS) {
+ if (tr->extension_prog) {
+ /* cannot attach fentry/fexit if extension prog is attached.
+ * cannot overwrite extension prog either.
+ */
+ err = -EBUSY;
+ goto out;
+ }
+ cnt = tr->progs_cnt[BPF_TRAMP_FENTRY] + tr->progs_cnt[BPF_TRAMP_FEXIT];
+ if (kind == BPF_TRAMP_REPLACE) {
+ /* Cannot attach extension if fentry/fexit are in use. */
+ if (cnt) {
+ err = -EBUSY;
+ goto out;
+ }
+ tr->extension_prog = prog;
+ err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP, NULL,
+ prog->bpf_func);
+ goto out;
+ }
+ if (cnt >= BPF_MAX_TRAMP_PROGS) {
err = -E2BIG;
goto out;
}
@@ -221,15 +327,25 @@ int bpf_trampoline_unlink_prog(struct bpf_prog *prog)
tr = prog->aux->trampoline;
kind = bpf_attach_type_to_tramp(prog->expected_attach_type);
mutex_lock(&tr->mutex);
+ if (kind == BPF_TRAMP_REPLACE) {
+ WARN_ON_ONCE(!tr->extension_prog);
+ err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP,
+ tr->extension_prog->bpf_func, NULL);
+ tr->extension_prog = NULL;
+ goto out;
+ }
hlist_del(&prog->aux->tramp_hlist);
tr->progs_cnt[kind]--;
err = bpf_trampoline_update(prog->aux->trampoline);
+out:
mutex_unlock(&tr->mutex);
return err;
}
void bpf_trampoline_put(struct bpf_trampoline *tr)
{
+ struct bpf_image *image;
+
if (!tr)
return;
mutex_lock(&trampoline_mutex);
@@ -240,7 +356,11 @@ void bpf_trampoline_put(struct bpf_trampoline *tr)
goto out;
if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[BPF_TRAMP_FEXIT])))
goto out;
- bpf_jit_free_exec(tr->image);
+ image = container_of(tr->image, struct bpf_image, data);
+ latch_tree_erase(&image->tnode, &image_tree, &image_tree_ops);
+ /* wait for tasks to get out of trampoline before freeing it */
+ synchronize_rcu_tasks();
+ bpf_jit_free_exec(image);
hlist_del(&tr->hlist);
kfree(tr);
out:
@@ -286,7 +406,8 @@ void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start)
}
int __weak
-arch_prepare_bpf_trampoline(void *image, struct btf_func_model *m, u32 flags,
+arch_prepare_bpf_trampoline(void *image, void *image_end,
+ const struct btf_func_model *m, u32 flags,
struct bpf_prog **fentry_progs, int fentry_cnt,
struct bpf_prog **fexit_progs, int fexit_cnt,
void *orig_call)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 7d530ce8719d..1cc945daa9c8 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1122,10 +1122,6 @@ static void init_reg_state(struct bpf_verifier_env *env,
regs[BPF_REG_FP].type = PTR_TO_STACK;
mark_reg_known_zero(env, regs, BPF_REG_FP);
regs[BPF_REG_FP].frameno = state->frameno;
-
- /* 1st arg to a function */
- regs[BPF_REG_1].type = PTR_TO_CTX;
- mark_reg_known_zero(env, regs, BPF_REG_1);
}
#define BPF_MAIN_FUNC (-1)
@@ -1916,6 +1912,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
case PTR_TO_TCP_SOCK:
case PTR_TO_TCP_SOCK_OR_NULL:
case PTR_TO_XDP_SOCK:
+ case PTR_TO_BTF_ID:
return true;
default:
return false;
@@ -2738,8 +2735,8 @@ static int get_callee_stack_depth(struct bpf_verifier_env *env,
}
#endif
-static int check_ctx_reg(struct bpf_verifier_env *env,
- const struct bpf_reg_state *reg, int regno)
+int check_ctx_reg(struct bpf_verifier_env *env,
+ const struct bpf_reg_state *reg, int regno)
{
/* Access to ctx or passing it to a helper is only allowed in
* its original, unmodified form.
@@ -2858,11 +2855,6 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
u32 btf_id;
int ret;
- if (atype != BPF_READ) {
- verbose(env, "only read is supported\n");
- return -EACCES;
- }
-
if (off < 0) {
verbose(env,
"R%d is ptr_%s invalid negative access: off=%d\n",
@@ -2879,17 +2871,32 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
return -EACCES;
}
- ret = btf_struct_access(&env->log, t, off, size, atype, &btf_id);
+ if (env->ops->btf_struct_access) {
+ ret = env->ops->btf_struct_access(&env->log, t, off, size,
+ atype, &btf_id);
+ } else {
+ if (atype != BPF_READ) {
+ verbose(env, "only read is supported\n");
+ return -EACCES;
+ }
+
+ ret = btf_struct_access(&env->log, t, off, size, atype,
+ &btf_id);
+ }
+
if (ret < 0)
return ret;
- if (ret == SCALAR_VALUE) {
- mark_reg_unknown(env, regs, value_regno);
- return 0;
+ if (atype == BPF_READ) {
+ if (ret == SCALAR_VALUE) {
+ mark_reg_unknown(env, regs, value_regno);
+ return 0;
+ }
+ mark_reg_known_zero(env, regs, value_regno);
+ regs[value_regno].type = PTR_TO_BTF_ID;
+ regs[value_regno].btf_id = btf_id;
}
- mark_reg_known_zero(env, regs, value_regno);
- regs[value_regno].type = PTR_TO_BTF_ID;
- regs[value_regno].btf_id = btf_id;
+
return 0;
}
@@ -3945,12 +3952,26 @@ static int release_reference(struct bpf_verifier_env *env,
return 0;
}
+static void clear_caller_saved_regs(struct bpf_verifier_env *env,
+ struct bpf_reg_state *regs)
+{
+ int i;
+
+ /* after the call registers r0 - r5 were scratched */
+ for (i = 0; i < CALLER_SAVED_REGS; i++) {
+ mark_reg_not_init(env, regs, caller_saved[i]);
+ check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
+ }
+}
+
static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
int *insn_idx)
{
struct bpf_verifier_state *state = env->cur_state;
+ struct bpf_func_info_aux *func_info_aux;
struct bpf_func_state *caller, *callee;
int i, err, subprog, target_insn;
+ bool is_global = false;
if (state->curframe + 1 >= MAX_CALL_FRAMES) {
verbose(env, "the call stack of %d frames is too deep\n",
@@ -3973,6 +3994,32 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
return -EFAULT;
}
+ func_info_aux = env->prog->aux->func_info_aux;
+ if (func_info_aux)
+ is_global = func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL;
+ err = btf_check_func_arg_match(env, subprog, caller->regs);
+ if (err == -EFAULT)
+ return err;
+ if (is_global) {
+ if (err) {
+ verbose(env, "Caller passes invalid args into func#%d\n",
+ subprog);
+ return err;
+ } else {
+ if (env->log.level & BPF_LOG_LEVEL)
+ verbose(env,
+ "Func#%d is global and valid. Skipping.\n",
+ subprog);
+ clear_caller_saved_regs(env, caller->regs);
+
+ /* All global functions return SCALAR_VALUE */
+ mark_reg_unknown(env, caller->regs, BPF_REG_0);
+
+ /* continue with next insn after call */
+ return 0;
+ }
+ }
+
callee = kzalloc(sizeof(*callee), GFP_KERNEL);
if (!callee)
return -ENOMEM;
@@ -3999,18 +4046,11 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
for (i = BPF_REG_1; i <= BPF_REG_5; i++)
callee->regs[i] = caller->regs[i];
- /* after the call registers r0 - r5 were scratched */
- for (i = 0; i < CALLER_SAVED_REGS; i++) {
- mark_reg_not_init(env, caller->regs, caller_saved[i]);
- check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
- }
+ clear_caller_saved_regs(env, caller->regs);
/* only increment it after check_reg_arg() finished */
state->curframe++;
- if (btf_check_func_arg_match(env, subprog))
- return -EINVAL;
-
/* and go analyze first insn of the callee */
*insn_idx = target_insn;
@@ -6360,8 +6400,30 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
static int check_return_code(struct bpf_verifier_env *env)
{
struct tnum enforce_attach_type_range = tnum_unknown;
+ const struct bpf_prog *prog = env->prog;
struct bpf_reg_state *reg;
struct tnum range = tnum_range(0, 1);
+ int err;
+
+ /* The struct_ops func-ptr's return type could be "void" */
+ if (env->prog->type == BPF_PROG_TYPE_STRUCT_OPS &&
+ !prog->aux->attach_func_proto->type)
+ return 0;
+
+ /* eBPF calling convetion is such that R0 is used
+ * to return the value from eBPF program.
+ * Make sure that it's readable at this time
+ * of bpf_exit, which means that program wrote
+ * something into it earlier
+ */
+ err = check_reg_arg(env, BPF_REG_0, SRC_OP);
+ if (err)
+ return err;
+
+ if (is_pointer_value(env, BPF_REG_0)) {
+ verbose(env, "R0 leaks addr as return value\n");
+ return -EACCES;
+ }
switch (env->prog->type) {
case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
@@ -6750,12 +6812,13 @@ static int check_btf_func(struct bpf_verifier_env *env,
/* check type_id */
type = btf_type_by_id(btf, krecord[i].type_id);
- if (!type || BTF_INFO_KIND(type->info) != BTF_KIND_FUNC) {
+ if (!type || !btf_type_is_func(type)) {
verbose(env, "invalid type id %d in func info",
krecord[i].type_id);
ret = -EINVAL;
goto err_free;
}
+ info_aux[i].linkage = BTF_INFO_VLEN(type->info);
prev_offset = krecord[i].insn_off;
urecord += urec_size;
}
@@ -7735,35 +7798,13 @@ static bool reg_type_mismatch(enum bpf_reg_type src, enum bpf_reg_type prev)
static int do_check(struct bpf_verifier_env *env)
{
- struct bpf_verifier_state *state;
+ struct bpf_verifier_state *state = env->cur_state;
struct bpf_insn *insns = env->prog->insnsi;
struct bpf_reg_state *regs;
int insn_cnt = env->prog->len;
bool do_print_state = false;
int prev_insn_idx = -1;
- env->prev_linfo = NULL;
-
- state = kzalloc(sizeof(struct bpf_verifier_state), GFP_KERNEL);
- if (!state)
- return -ENOMEM;
- state->curframe = 0;
- state->speculative = false;
- state->branches = 1;
- state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL);
- if (!state->frame[0]) {
- kfree(state);
- return -ENOMEM;
- }
- env->cur_state = state;
- init_func_state(env, state->frame[0],
- BPF_MAIN_FUNC /* callsite */,
- 0 /* frameno */,
- 0 /* subprogno, zero == main subprog */);
-
- if (btf_check_func_arg_match(env, 0))
- return -EINVAL;
-
for (;;) {
struct bpf_insn *insn;
u8 class;
@@ -7841,7 +7882,7 @@ static int do_check(struct bpf_verifier_env *env)
}
regs = cur_regs(env);
- env->insn_aux_data[env->insn_idx].seen = true;
+ env->insn_aux_data[env->insn_idx].seen = env->pass_cnt;
prev_insn_idx = env->insn_idx;
if (class == BPF_ALU || class == BPF_ALU64) {
@@ -8027,21 +8068,6 @@ static int do_check(struct bpf_verifier_env *env)
if (err)
return err;
- /* eBPF calling convetion is such that R0 is used
- * to return the value from eBPF program.
- * Make sure that it's readable at this time
- * of bpf_exit, which means that program wrote
- * something into it earlier
- */
- err = check_reg_arg(env, BPF_REG_0, SRC_OP);
- if (err)
- return err;
-
- if (is_pointer_value(env, BPF_REG_0)) {
- verbose(env, "R0 leaks addr as return value\n");
- return -EACCES;
- }
-
err = check_return_code(env);
if (err)
return err;
@@ -8076,7 +8102,7 @@ process_bpf_exit:
return err;
env->insn_idx++;
- env->insn_aux_data[env->insn_idx].seen = true;
+ env->insn_aux_data[env->insn_idx].seen = env->pass_cnt;
} else {
verbose(env, "invalid BPF_LD mode\n");
return -EINVAL;
@@ -8089,7 +8115,6 @@ process_bpf_exit:
env->insn_idx++;
}
- env->prog->aux->stack_depth = env->subprog_info[0].stack_depth;
return 0;
}
@@ -8149,6 +8174,11 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
return -EINVAL;
}
+ if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
+ verbose(env, "bpf_struct_ops map cannot be used in prog\n");
+ return -EINVAL;
+ }
+
return 0;
}
@@ -8361,7 +8391,7 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env,
memcpy(new_data + off + cnt - 1, old_data + off,
sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1));
for (i = off; i < off + cnt - 1; i++) {
- new_data[i].seen = true;
+ new_data[i].seen = env->pass_cnt;
new_data[i].zext_dst = insn_has_def32(env, insn + i);
}
env->insn_aux_data = new_data;
@@ -8840,12 +8870,14 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
convert_ctx_access = bpf_xdp_sock_convert_ctx_access;
break;
case PTR_TO_BTF_ID:
- if (type == BPF_WRITE) {
+ if (type == BPF_READ) {
+ insn->code = BPF_LDX | BPF_PROBE_MEM |
+ BPF_SIZE((insn)->code);
+ env->prog->aux->num_exentries++;
+ } else if (env->prog->type != BPF_PROG_TYPE_STRUCT_OPS) {
verbose(env, "Writes through BTF pointers are not allowed\n");
return -EINVAL;
}
- insn->code = BPF_LDX | BPF_PROBE_MEM | BPF_SIZE((insn)->code);
- env->prog->aux->num_exentries++;
continue;
default:
continue;
@@ -9425,6 +9457,30 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
goto patch_call_imm;
}
+ if (prog->jit_requested && BITS_PER_LONG == 64 &&
+ insn->imm == BPF_FUNC_jiffies64) {
+ struct bpf_insn ld_jiffies_addr[2] = {
+ BPF_LD_IMM64(BPF_REG_0,
+ (unsigned long)&jiffies),
+ };
+
+ insn_buf[0] = ld_jiffies_addr[0];
+ insn_buf[1] = ld_jiffies_addr[1];
+ insn_buf[2] = BPF_LDX_MEM(BPF_DW, BPF_REG_0,
+ BPF_REG_0, 0);
+ cnt = 3;
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf,
+ cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ continue;
+ }
+
patch_call_imm:
fn = env->ops->get_func_proto(insn->imm, env->prog);
/* all functions that have prototype and verifier allowed
@@ -9471,6 +9527,7 @@ static void free_states(struct bpf_verifier_env *env)
kfree(sl);
sl = sln;
}
+ env->free_list = NULL;
if (!env->explored_states)
return;
@@ -9484,11 +9541,164 @@ static void free_states(struct bpf_verifier_env *env)
kfree(sl);
sl = sln;
}
+ env->explored_states[i] = NULL;
}
+}
- kvfree(env->explored_states);
+/* The verifier is using insn_aux_data[] to store temporary data during
+ * verification and to store information for passes that run after the
+ * verification like dead code sanitization. do_check_common() for subprogram N
+ * may analyze many other subprograms. sanitize_insn_aux_data() clears all
+ * temporary data after do_check_common() finds that subprogram N cannot be
+ * verified independently. pass_cnt counts the number of times
+ * do_check_common() was run and insn->aux->seen tells the pass number
+ * insn_aux_data was touched. These variables are compared to clear temporary
+ * data from failed pass. For testing and experiments do_check_common() can be
+ * run multiple times even when prior attempt to verify is unsuccessful.
+ */
+static void sanitize_insn_aux_data(struct bpf_verifier_env *env)
+{
+ struct bpf_insn *insn = env->prog->insnsi;
+ struct bpf_insn_aux_data *aux;
+ int i, class;
+
+ for (i = 0; i < env->prog->len; i++) {
+ class = BPF_CLASS(insn[i].code);
+ if (class != BPF_LDX && class != BPF_STX)
+ continue;
+ aux = &env->insn_aux_data[i];
+ if (aux->seen != env->pass_cnt)
+ continue;
+ memset(aux, 0, offsetof(typeof(*aux), orig_idx));
+ }
}
+static int do_check_common(struct bpf_verifier_env *env, int subprog)
+{
+ struct bpf_verifier_state *state;
+ struct bpf_reg_state *regs;
+ int ret, i;
+
+ env->prev_linfo = NULL;
+ env->pass_cnt++;
+
+ state = kzalloc(sizeof(struct bpf_verifier_state), GFP_KERNEL);
+ if (!state)
+ return -ENOMEM;
+ state->curframe = 0;
+ state->speculative = false;
+ state->branches = 1;
+ state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL);
+ if (!state->frame[0]) {
+ kfree(state);
+ return -ENOMEM;
+ }
+ env->cur_state = state;
+ init_func_state(env, state->frame[0],
+ BPF_MAIN_FUNC /* callsite */,
+ 0 /* frameno */,
+ subprog);
+
+ regs = state->frame[state->curframe]->regs;
+ if (subprog || env->prog->type == BPF_PROG_TYPE_EXT) {
+ ret = btf_prepare_func_args(env, subprog, regs);
+ if (ret)
+ goto out;
+ for (i = BPF_REG_1; i <= BPF_REG_5; i++) {
+ if (regs[i].type == PTR_TO_CTX)
+ mark_reg_known_zero(env, regs, i);
+ else if (regs[i].type == SCALAR_VALUE)
+ mark_reg_unknown(env, regs, i);
+ }
+ } else {
+ /* 1st arg to a function */
+ regs[BPF_REG_1].type = PTR_TO_CTX;
+ mark_reg_known_zero(env, regs, BPF_REG_1);
+ ret = btf_check_func_arg_match(env, subprog, regs);
+ if (ret == -EFAULT)
+ /* unlikely verifier bug. abort.
+ * ret == 0 and ret < 0 are sadly acceptable for
+ * main() function due to backward compatibility.
+ * Like socket filter program may be written as:
+ * int bpf_prog(struct pt_regs *ctx)
+ * and never dereference that ctx in the program.
+ * 'struct pt_regs' is a type mismatch for socket
+ * filter that should be using 'struct __sk_buff'.
+ */
+ goto out;
+ }
+
+ ret = do_check(env);
+out:
+ /* check for NULL is necessary, since cur_state can be freed inside
+ * do_check() under memory pressure.
+ */
+ if (env->cur_state) {
+ free_verifier_state(env->cur_state, true);
+ env->cur_state = NULL;
+ }
+ while (!pop_stack(env, NULL, NULL));
+ free_states(env);
+ if (ret)
+ /* clean aux data in case subprog was rejected */
+ sanitize_insn_aux_data(env);
+ return ret;
+}
+
+/* Verify all global functions in a BPF program one by one based on their BTF.
+ * All global functions must pass verification. Otherwise the whole program is rejected.
+ * Consider:
+ * int bar(int);
+ * int foo(int f)
+ * {
+ * return bar(f);
+ * }
+ * int bar(int b)
+ * {
+ * ...
+ * }
+ * foo() will be verified first for R1=any_scalar_value. During verification it
+ * will be assumed that bar() already verified successfully and call to bar()
+ * from foo() will be checked for type match only. Later bar() will be verified
+ * independently to check that it's safe for R1=any_scalar_value.
+ */
+static int do_check_subprogs(struct bpf_verifier_env *env)
+{
+ struct bpf_prog_aux *aux = env->prog->aux;
+ int i, ret;
+
+ if (!aux->func_info)
+ return 0;
+
+ for (i = 1; i < env->subprog_cnt; i++) {
+ if (aux->func_info_aux[i].linkage != BTF_FUNC_GLOBAL)
+ continue;
+ env->insn_idx = env->subprog_info[i].start;
+ WARN_ON_ONCE(env->insn_idx == 0);
+ ret = do_check_common(env, i);
+ if (ret) {
+ return ret;
+ } else if (env->log.level & BPF_LOG_LEVEL) {
+ verbose(env,
+ "Func#%d is safe for any args that match its prototype\n",
+ i);
+ }
+ }
+ return 0;
+}
+
+static int do_check_main(struct bpf_verifier_env *env)
+{
+ int ret;
+
+ env->insn_idx = 0;
+ ret = do_check_common(env, 0);
+ if (!ret)
+ env->prog->aux->stack_depth = env->subprog_info[0].stack_depth;
+ return ret;
+}
+
+
static void print_verification_stats(struct bpf_verifier_env *env)
{
int i;
@@ -9513,9 +9723,62 @@ static void print_verification_stats(struct bpf_verifier_env *env)
env->peak_states, env->longest_mark_read_walk);
}
+static int check_struct_ops_btf_id(struct bpf_verifier_env *env)
+{
+ const struct btf_type *t, *func_proto;
+ const struct bpf_struct_ops *st_ops;
+ const struct btf_member *member;
+ struct bpf_prog *prog = env->prog;
+ u32 btf_id, member_idx;
+ const char *mname;
+
+ btf_id = prog->aux->attach_btf_id;
+ st_ops = bpf_struct_ops_find(btf_id);
+ if (!st_ops) {
+ verbose(env, "attach_btf_id %u is not a supported struct\n",
+ btf_id);
+ return -ENOTSUPP;
+ }
+
+ t = st_ops->type;
+ member_idx = prog->expected_attach_type;
+ if (member_idx >= btf_type_vlen(t)) {
+ verbose(env, "attach to invalid member idx %u of struct %s\n",
+ member_idx, st_ops->name);
+ return -EINVAL;
+ }
+
+ member = &btf_type_member(t)[member_idx];
+ mname = btf_name_by_offset(btf_vmlinux, member->name_off);
+ func_proto = btf_type_resolve_func_ptr(btf_vmlinux, member->type,
+ NULL);
+ if (!func_proto) {
+ verbose(env, "attach to invalid member %s(@idx %u) of struct %s\n",
+ mname, member_idx, st_ops->name);
+ return -EINVAL;
+ }
+
+ if (st_ops->check_member) {
+ int err = st_ops->check_member(t, member);
+
+ if (err) {
+ verbose(env, "attach to unsupported member %s of struct %s\n",
+ mname, st_ops->name);
+ return err;
+ }
+ }
+
+ prog->aux->attach_func_proto = func_proto;
+ prog->aux->attach_func_name = mname;
+ env->ops = st_ops->verifier_ops;
+
+ return 0;
+}
+
static int check_attach_btf_id(struct bpf_verifier_env *env)
{
struct bpf_prog *prog = env->prog;
+ bool prog_extension = prog->type == BPF_PROG_TYPE_EXT;
struct bpf_prog *tgt_prog = prog->aux->linked_prog;
u32 btf_id = prog->aux->attach_btf_id;
const char prefix[] = "btf_trace_";
@@ -9528,7 +9791,10 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
long addr;
u64 key;
- if (prog->type != BPF_PROG_TYPE_TRACING)
+ if (prog->type == BPF_PROG_TYPE_STRUCT_OPS)
+ return check_struct_ops_btf_id(env);
+
+ if (prog->type != BPF_PROG_TYPE_TRACING && !prog_extension)
return 0;
if (!btf_id) {
@@ -9564,8 +9830,59 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
return -EINVAL;
}
conservative = aux->func_info_aux[subprog].unreliable;
+ if (prog_extension) {
+ if (conservative) {
+ verbose(env,
+ "Cannot replace static functions\n");
+ return -EINVAL;
+ }
+ if (!prog->jit_requested) {
+ verbose(env,
+ "Extension programs should be JITed\n");
+ return -EINVAL;
+ }
+ env->ops = bpf_verifier_ops[tgt_prog->type];
+ }
+ if (!tgt_prog->jited) {
+ verbose(env, "Can attach to only JITed progs\n");
+ return -EINVAL;
+ }
+ if (tgt_prog->type == prog->type) {
+ /* Cannot fentry/fexit another fentry/fexit program.
+ * Cannot attach program extension to another extension.
+ * It's ok to attach fentry/fexit to extension program.
+ */
+ verbose(env, "Cannot recursively attach\n");
+ return -EINVAL;
+ }
+ if (tgt_prog->type == BPF_PROG_TYPE_TRACING &&
+ prog_extension &&
+ (tgt_prog->expected_attach_type == BPF_TRACE_FENTRY ||
+ tgt_prog->expected_attach_type == BPF_TRACE_FEXIT)) {
+ /* Program extensions can extend all program types
+ * except fentry/fexit. The reason is the following.
+ * The fentry/fexit programs are used for performance
+ * analysis, stats and can be attached to any program
+ * type except themselves. When extension program is
+ * replacing XDP function it is necessary to allow
+ * performance analysis of all functions. Both original
+ * XDP program and its program extension. Hence
+ * attaching fentry/fexit to BPF_PROG_TYPE_EXT is
+ * allowed. If extending of fentry/fexit was allowed it
+ * would be possible to create long call chain
+ * fentry->extension->fentry->extension beyond
+ * reasonable stack size. Hence extending fentry is not
+ * allowed.
+ */
+ verbose(env, "Cannot extend fentry/fexit\n");
+ return -EINVAL;
+ }
key = ((u64)aux->id) << 32 | btf_id;
} else {
+ if (prog_extension) {
+ verbose(env, "Cannot replace kernel functions\n");
+ return -EINVAL;
+ }
key = btf_id;
}
@@ -9603,6 +9920,10 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
prog->aux->attach_func_proto = t;
prog->aux->attach_btf_trace = true;
return 0;
+ default:
+ if (!prog_extension)
+ return -EINVAL;
+ /* fallthrough */
case BPF_TRACE_FENTRY:
case BPF_TRACE_FEXIT:
if (!btf_type_is_func(t)) {
@@ -9610,6 +9931,9 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
btf_id);
return -EINVAL;
}
+ if (prog_extension &&
+ btf_check_type_match(env, prog, btf, t))
+ return -EINVAL;
t = btf_type_by_id(btf, t->type);
if (!btf_type_is_func_proto(t))
return -EINVAL;
@@ -9633,18 +9957,6 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
if (ret < 0)
goto out;
if (tgt_prog) {
- if (!tgt_prog->jited) {
- /* for now */
- verbose(env, "Can trace only JITed BPF progs\n");
- ret = -EINVAL;
- goto out;
- }
- if (tgt_prog->type == BPF_PROG_TYPE_TRACING) {
- /* prevent cycles */
- verbose(env, "Cannot recursively attach\n");
- ret = -EINVAL;
- goto out;
- }
if (subprog == 0)
addr = (long) tgt_prog->bpf_func;
else
@@ -9666,8 +9978,6 @@ out:
if (ret)
bpf_trampoline_put(tr);
return ret;
- default:
- return -EINVAL;
}
}
@@ -9737,10 +10047,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
goto skip_full_check;
}
- ret = check_attach_btf_id(env);
- if (ret)
- goto skip_full_check;
-
env->strict_alignment = !!(attr->prog_flags & BPF_F_STRICT_ALIGNMENT);
if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
env->strict_alignment = true;
@@ -9777,22 +10083,22 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
if (ret < 0)
goto skip_full_check;
+ ret = check_attach_btf_id(env);
+ if (ret)
+ goto skip_full_check;
+
ret = check_cfg(env);
if (ret < 0)
goto skip_full_check;
- ret = do_check(env);
- if (env->cur_state) {
- free_verifier_state(env->cur_state, true);
- env->cur_state = NULL;
- }
+ ret = do_check_subprogs(env);
+ ret = ret ?: do_check_main(env);
if (ret == 0 && bpf_prog_is_dev_bound(env->prog->aux))
ret = bpf_prog_offload_finalize(env);
skip_full_check:
- while (!pop_stack(env, NULL, NULL));
- free_states(env);
+ kvfree(env->explored_states);
if (ret == 0)
ret = check_max_stack_depth(env);
diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c
index 90c4fce1c981..2cc5c8f4c800 100644
--- a/kernel/bpf/xskmap.c
+++ b/kernel/bpf/xskmap.c
@@ -72,9 +72,9 @@ static void xsk_map_sock_delete(struct xdp_sock *xs,
static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
{
struct bpf_map_memory mem;
- int cpu, err, numa_node;
+ int err, numa_node;
struct xsk_map *m;
- u64 cost, size;
+ u64 size;
if (!capable(CAP_NET_ADMIN))
return ERR_PTR(-EPERM);
@@ -86,9 +86,8 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
numa_node = bpf_map_attr_numa_node(attr);
size = struct_size(m, xsk_map, attr->max_entries);
- cost = size + array_size(sizeof(*m->flush_list), num_possible_cpus());
- err = bpf_map_charge_init(&mem, cost);
+ err = bpf_map_charge_init(&mem, size);
if (err < 0)
return ERR_PTR(err);
@@ -102,16 +101,6 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
bpf_map_charge_move(&m->map.memory, &mem);
spin_lock_init(&m->lock);
- m->flush_list = alloc_percpu(struct list_head);
- if (!m->flush_list) {
- bpf_map_charge_finish(&m->map.memory);
- bpf_map_area_free(m);
- return ERR_PTR(-ENOMEM);
- }
-
- for_each_possible_cpu(cpu)
- INIT_LIST_HEAD(per_cpu_ptr(m->flush_list, cpu));
-
return &m->map;
}
@@ -121,7 +110,6 @@ static void xsk_map_free(struct bpf_map *map)
bpf_clear_redirect_map(map);
synchronize_net();
- free_percpu(m->flush_list);
bpf_map_area_free(m);
}
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index 90d1710fef6c..bfbeabc17a9d 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -7,7 +7,7 @@
#include <linux/workqueue.h>
#include <linux/list.h>
#include <linux/refcount.h>
-#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
#define TRACE_CGROUP_PATH_LEN 1024
extern spinlock_t trace_cgroup_path_lock;
@@ -265,7 +265,7 @@ extern const struct proc_ns_operations cgroupns_operations;
*/
extern struct cftype cgroup1_base_files[];
extern struct kernfs_syscall_ops cgroup1_kf_syscall_ops;
-extern const struct fs_parameter_description cgroup1_fs_parameters;
+extern const struct fs_parameter_spec cgroup1_fs_parameters[];
int proc_cgroupstats_show(struct seq_file *m, void *v);
bool cgroup1_ssid_disabled(int ssid);
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 09f3a413f6f8..be1a1c83cdd1 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -18,8 +18,6 @@
#include <trace/events/cgroup.h>
-#define cg_invalf(fc, fmt, ...) invalf(fc, fmt, ## __VA_ARGS__)
-
/*
* pidlists linger the following amount before being destroyed. The goal
* is avoiding frequent destruction in the middle of consecutive read calls
@@ -887,7 +885,7 @@ enum cgroup1_param {
Opt_xattr,
};
-static const struct fs_parameter_spec cgroup1_param_specs[] = {
+const struct fs_parameter_spec cgroup1_fs_parameters[] = {
fsparam_flag ("all", Opt_all),
fsparam_flag ("clone_children", Opt_clone_children),
fsparam_flag ("cpuset_v2_mode", Opt_cpuset_v2_mode),
@@ -899,11 +897,6 @@ static const struct fs_parameter_spec cgroup1_param_specs[] = {
{}
};
-const struct fs_parameter_description cgroup1_fs_parameters = {
- .name = "cgroup1",
- .specs = cgroup1_param_specs,
-};
-
int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
@@ -911,7 +904,7 @@ int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param)
struct fs_parse_result result;
int opt, i;
- opt = fs_parse(fc, &cgroup1_fs_parameters, param, &result);
+ opt = fs_parse(fc, cgroup1_fs_parameters, param, &result);
if (opt == -ENOPARAM) {
if (strcmp(param->key, "source") == 0) {
fc->source = param->string;
@@ -924,7 +917,7 @@ int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param)
ctx->subsys_mask |= (1 << i);
return 0;
}
- return cg_invalf(fc, "cgroup1: Unknown subsys name '%s'", param->key);
+ return invalfc(fc, "Unknown subsys name '%s'", param->key);
}
if (opt < 0)
return opt;
@@ -952,7 +945,7 @@ int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param)
case Opt_release_agent:
/* Specifying two release agents is forbidden */
if (ctx->release_agent)
- return cg_invalf(fc, "cgroup1: release_agent respecified");
+ return invalfc(fc, "release_agent respecified");
ctx->release_agent = param->string;
param->string = NULL;
break;
@@ -962,9 +955,9 @@ int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param)
return -ENOENT;
/* Can't specify an empty name */
if (!param->size)
- return cg_invalf(fc, "cgroup1: Empty name");
+ return invalfc(fc, "Empty name");
if (param->size > MAX_CGROUP_ROOT_NAMELEN - 1)
- return cg_invalf(fc, "cgroup1: Name too long");
+ return invalfc(fc, "Name too long");
/* Must match [\w.-]+ */
for (i = 0; i < param->size; i++) {
char c = param->string[i];
@@ -972,11 +965,11 @@ int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param)
continue;
if ((c == '.') || (c == '-') || (c == '_'))
continue;
- return cg_invalf(fc, "cgroup1: Invalid name");
+ return invalfc(fc, "Invalid name");
}
/* Specifying two names is forbidden */
if (ctx->name)
- return cg_invalf(fc, "cgroup1: name respecified");
+ return invalfc(fc, "name respecified");
ctx->name = param->string;
param->string = NULL;
break;
@@ -1011,7 +1004,7 @@ static int check_cgroupfs_options(struct fs_context *fc)
if (ctx->all_ss) {
/* Mutually exclusive option 'all' + subsystem name */
if (ctx->subsys_mask)
- return cg_invalf(fc, "cgroup1: subsys name conflicts with all");
+ return invalfc(fc, "subsys name conflicts with all");
/* 'all' => select all the subsystems */
ctx->subsys_mask = enabled;
}
@@ -1021,7 +1014,7 @@ static int check_cgroupfs_options(struct fs_context *fc)
* empty hierarchies must have a name).
*/
if (!ctx->subsys_mask && !ctx->name)
- return cg_invalf(fc, "cgroup1: Need name or subsystem set");
+ return invalfc(fc, "Need name or subsystem set");
/*
* Option noprefix was introduced just for backward compatibility
@@ -1029,11 +1022,11 @@ static int check_cgroupfs_options(struct fs_context *fc)
* the cpuset subsystem.
*/
if ((ctx->flags & CGRP_ROOT_NOPREFIX) && (ctx->subsys_mask & mask))
- return cg_invalf(fc, "cgroup1: noprefix used incorrectly");
+ return invalfc(fc, "noprefix used incorrectly");
/* Can't specify "none" and some subsystems */
if (ctx->subsys_mask && ctx->none)
- return cg_invalf(fc, "cgroup1: none used incorrectly");
+ return invalfc(fc, "none used incorrectly");
return 0;
}
@@ -1063,7 +1056,7 @@ int cgroup1_reconfigure(struct fs_context *fc)
/* Don't allow flags or name to change at remount */
if ((ctx->flags ^ root->flags) ||
(ctx->name && strcmp(ctx->name, root->name))) {
- cg_invalf(fc, "option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"",
+ errorfc(fc, "option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"",
ctx->flags, ctx->name ?: "", root->flags, root->name);
ret = -EINVAL;
goto out_unlock;
@@ -1180,7 +1173,7 @@ static int cgroup1_root_to_use(struct fs_context *fc)
* can't create new one without subsys specification.
*/
if (!ctx->subsys_mask && !ctx->none)
- return cg_invalf(fc, "cgroup1: No subsys list or none specified");
+ return invalfc(fc, "No subsys list or none specified");
/* Hierarchies may only be created in the initial cgroup namespace. */
if (ctx->ns != &init_cgroup_ns)
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 1e12e6928bca..db552b9f9377 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1816,24 +1816,19 @@ enum cgroup2_param {
nr__cgroup2_params
};
-static const struct fs_parameter_spec cgroup2_param_specs[] = {
+static const struct fs_parameter_spec cgroup2_fs_parameters[] = {
fsparam_flag("nsdelegate", Opt_nsdelegate),
fsparam_flag("memory_localevents", Opt_memory_localevents),
{}
};
-static const struct fs_parameter_description cgroup2_fs_parameters = {
- .name = "cgroup2",
- .specs = cgroup2_param_specs,
-};
-
static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
struct fs_parse_result result;
int opt;
- opt = fs_parse(fc, &cgroup2_fs_parameters, param, &result);
+ opt = fs_parse(fc, cgroup2_fs_parameters, param, &result);
if (opt < 0)
return opt;
@@ -2156,7 +2151,7 @@ static void cgroup_kill_sb(struct super_block *sb)
struct file_system_type cgroup_fs_type = {
.name = "cgroup",
.init_fs_context = cgroup_init_fs_context,
- .parameters = &cgroup1_fs_parameters,
+ .parameters = cgroup1_fs_parameters,
.kill_sb = cgroup_kill_sb,
.fs_flags = FS_USERNS_MOUNT,
};
@@ -2164,7 +2159,7 @@ struct file_system_type cgroup_fs_type = {
static struct file_system_type cgroup2_fs_type = {
.name = "cgroup2",
.init_fs_context = cgroup_init_fs_context,
- .parameters = &cgroup2_fs_parameters,
+ .parameters = cgroup2_fs_parameters,
.kill_sb = cgroup_kill_sb,
.fs_flags = FS_USERNS_MOUNT,
};
@@ -6289,12 +6284,13 @@ void cgroup_sk_free(struct sock_cgroup_data *skcd)
#ifdef CONFIG_CGROUP_BPF
int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
- enum bpf_attach_type type, u32 flags)
+ struct bpf_prog *replace_prog, enum bpf_attach_type type,
+ u32 flags)
{
int ret;
mutex_lock(&cgroup_mutex);
- ret = __cgroup_bpf_attach(cgrp, prog, type, flags);
+ ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, type, flags);
mutex_unlock(&cgroup_mutex);
return ret;
}
diff --git a/kernel/configs.c b/kernel/configs.c
index c09ea4c995e1..a28c79c5f713 100644
--- a/kernel/configs.c
+++ b/kernel/configs.c
@@ -47,10 +47,9 @@ ikconfig_read_current(struct file *file, char __user *buf,
&kernel_config_data);
}
-static const struct file_operations ikconfig_file_ops = {
- .owner = THIS_MODULE,
- .read = ikconfig_read_current,
- .llseek = default_llseek,
+static const struct proc_ops config_gz_proc_ops = {
+ .proc_read = ikconfig_read_current,
+ .proc_lseek = default_llseek,
};
static int __init ikconfig_init(void)
@@ -59,7 +58,7 @@ static int __init ikconfig_init(void)
/* create the current config file */
entry = proc_create("config.gz", S_IFREG | S_IRUGO, NULL,
- &ikconfig_file_ops);
+ &config_gz_proc_ops);
if (!entry)
return -ENOMEM;
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 4dc279ed3b2d..9c706af713fb 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -525,8 +525,7 @@ static int bringup_wait_for_ap(unsigned int cpu)
if (WARN_ON_ONCE((!cpu_online(cpu))))
return -ECANCELED;
- /* Unpark the stopper thread and the hotplug thread of the target cpu */
- stop_machine_unpark(cpu);
+ /* Unpark the hotplug thread of the target cpu */
kthread_unpark(st->thread);
/*
@@ -1089,8 +1088,8 @@ void notify_cpu_starting(unsigned int cpu)
/*
* Called from the idle task. Wake up the controlling task which brings the
- * stopper and the hotplug thread of the upcoming CPU up and then delegates
- * the rest of the online bringup to the hotplug thread.
+ * hotplug thread of the upcoming CPU up and then delegates the rest of the
+ * online bringup to the hotplug thread.
*/
void cpuhp_online_idle(enum cpuhp_state state)
{
@@ -1100,6 +1099,12 @@ void cpuhp_online_idle(enum cpuhp_state state)
if (state != CPUHP_AP_ONLINE_IDLE)
return;
+ /*
+ * Unpart the stopper thread before we start the idle loop (and start
+ * scheduling); this ensures the stopper task is always available.
+ */
+ stop_machine_unpark(smp_processor_id());
+
st->state = CPUHP_AP_ONLINE_IDLE;
complete_ap_thread(st, true);
}
diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c
index 62c301ad0773..d7ebb2c79cb8 100644
--- a/kernel/debug/kdb/kdb_bp.c
+++ b/kernel/debug/kdb/kdb_bp.c
@@ -412,7 +412,6 @@ static int kdb_bc(int argc, const char **argv)
* assume that the breakpoint number is desired.
*/
if (addr < KDB_MAXBPT) {
- bp = &kdb_breakpoints[addr];
lowbp = highbp = addr;
highbp++;
} else {
diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c
index 4af48ac53625..3de0cc780c16 100644
--- a/kernel/debug/kdb/kdb_bt.c
+++ b/kernel/debug/kdb/kdb_bt.c
@@ -119,7 +119,6 @@ kdb_bt_cpu(unsigned long cpu)
return;
}
- kdb_set_current_task(kdb_tsk);
kdb_bt1(kdb_tsk, ~0UL, false);
}
@@ -166,10 +165,8 @@ kdb_bt(int argc, const char **argv)
if (diag)
return diag;
p = find_task_by_pid_ns(pid, &init_pid_ns);
- if (p) {
- kdb_set_current_task(p);
+ if (p)
return kdb_bt1(p, ~0UL, false);
- }
kdb_printf("No process with pid == %ld found\n", pid);
return 0;
} else if (strcmp(argv[0], "btt") == 0) {
@@ -178,11 +175,9 @@ kdb_bt(int argc, const char **argv)
diag = kdbgetularg((char *)argv[1], &addr);
if (diag)
return diag;
- kdb_set_current_task((struct task_struct *)addr);
return kdb_bt1((struct task_struct *)addr, ~0UL, false);
} else if (strcmp(argv[0], "btc") == 0) {
unsigned long cpu = ~0;
- struct task_struct *save_current_task = kdb_current_task;
if (argc > 1)
return KDB_ARGCOUNT;
if (argc == 1) {
@@ -204,7 +199,6 @@ kdb_bt(int argc, const char **argv)
kdb_bt_cpu(cpu);
touch_nmi_watchdog();
}
- kdb_set_current_task(save_current_task);
}
return 0;
} else {
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index 8bcdded5d61f..924bc9298a42 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -553,7 +553,7 @@ int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap)
int this_cpu, old_cpu;
char *cp, *cp2, *cphold = NULL, replaced_byte = ' ';
char *moreprompt = "more> ";
- struct console *c = console_drivers;
+ struct console *c;
unsigned long uninitialized_var(flags);
/* Serialize kdb_printf if multiple cpus try to write at once.
@@ -698,10 +698,9 @@ kdb_printit:
cp2++;
}
}
- while (c) {
+ for_each_console(c) {
c->write(c, cp, retlen - (cp - kdb_buffer));
touch_nmi_watchdog();
- c = c->next;
}
}
if (logging) {
@@ -752,7 +751,6 @@ kdb_printit:
moreprompt = "more> ";
kdb_input_flush();
- c = console_drivers;
if (dbg_io_ops && !dbg_io_ops->is_console) {
len = strlen(moreprompt);
@@ -762,10 +760,9 @@ kdb_printit:
cp++;
}
}
- while (c) {
+ for_each_console(c) {
c->write(c, moreprompt, strlen(moreprompt));
touch_nmi_watchdog();
- c = c->next;
}
if (logging)
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 4567fe998c30..ba12e9f4661e 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -73,7 +73,6 @@ int kdb_nextline = 1;
int kdb_state; /* General KDB state */
struct task_struct *kdb_current_task;
-EXPORT_SYMBOL(kdb_current_task);
struct pt_regs *kdb_current_regs;
const char *kdb_diemsg;
@@ -1139,7 +1138,7 @@ static void kdb_dumpregs(struct pt_regs *regs)
console_loglevel = old_lvl;
}
-void kdb_set_current_task(struct task_struct *p)
+static void kdb_set_current_task(struct task_struct *p)
{
kdb_current_task = p;
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index 55d052061ef9..2e296e4a234c 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -240,8 +240,8 @@ extern void *debug_kmalloc(size_t size, gfp_t flags);
extern void debug_kfree(void *);
extern void debug_kusage(void);
-extern void kdb_set_current_task(struct task_struct *);
extern struct task_struct *kdb_current_task;
+extern struct pt_regs *kdb_current_regs;
#ifdef CONFIG_KDB_KEYBOARD
extern void kdb_kbd_cleanup_state(void);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 2173c23c25b4..e453589da97c 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -951,9 +951,9 @@ list_update_cgroup_event(struct perf_event *event,
/*
* Because cgroup events are always per-cpu events,
- * this will always be called from the right CPU.
+ * @ctx == &cpuctx->ctx.
*/
- cpuctx = __get_cpu_context(ctx);
+ cpuctx = container_of(ctx, struct perf_cpu_context, ctx);
/*
* Since setting cpuctx->cgrp is conditional on the current @cgrp
@@ -979,7 +979,8 @@ list_update_cgroup_event(struct perf_event *event,
cpuctx_entry = &cpuctx->cgrp_cpuctx_entry;
if (add)
- list_add(cpuctx_entry, this_cpu_ptr(&cgrp_cpuctx_list));
+ list_add(cpuctx_entry,
+ per_cpu_ptr(&cgrp_cpuctx_list, event->cpu));
else
list_del(cpuctx_entry);
}
@@ -4373,7 +4374,7 @@ static void free_event_rcu(struct rcu_head *head)
}
static void ring_buffer_attach(struct perf_event *event,
- struct ring_buffer *rb);
+ struct perf_buffer *rb);
static void detach_sb_event(struct perf_event *event)
{
@@ -5054,7 +5055,7 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
static __poll_t perf_poll(struct file *file, poll_table *wait)
{
struct perf_event *event = file->private_data;
- struct ring_buffer *rb;
+ struct perf_buffer *rb;
__poll_t events = EPOLLHUP;
poll_wait(file, &event->waitq, wait);
@@ -5296,7 +5297,7 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon
return perf_event_set_bpf_prog(event, arg);
case PERF_EVENT_IOC_PAUSE_OUTPUT: {
- struct ring_buffer *rb;
+ struct perf_buffer *rb;
rcu_read_lock();
rb = rcu_dereference(event->rb);
@@ -5432,7 +5433,7 @@ static void calc_timer_values(struct perf_event *event,
static void perf_event_init_userpage(struct perf_event *event)
{
struct perf_event_mmap_page *userpg;
- struct ring_buffer *rb;
+ struct perf_buffer *rb;
rcu_read_lock();
rb = rcu_dereference(event->rb);
@@ -5464,7 +5465,7 @@ void __weak arch_perf_update_userpage(
void perf_event_update_userpage(struct perf_event *event)
{
struct perf_event_mmap_page *userpg;
- struct ring_buffer *rb;
+ struct perf_buffer *rb;
u64 enabled, running, now;
rcu_read_lock();
@@ -5515,7 +5516,7 @@ EXPORT_SYMBOL_GPL(perf_event_update_userpage);
static vm_fault_t perf_mmap_fault(struct vm_fault *vmf)
{
struct perf_event *event = vmf->vma->vm_file->private_data;
- struct ring_buffer *rb;
+ struct perf_buffer *rb;
vm_fault_t ret = VM_FAULT_SIGBUS;
if (vmf->flags & FAULT_FLAG_MKWRITE) {
@@ -5548,9 +5549,9 @@ unlock:
}
static void ring_buffer_attach(struct perf_event *event,
- struct ring_buffer *rb)
+ struct perf_buffer *rb)
{
- struct ring_buffer *old_rb = NULL;
+ struct perf_buffer *old_rb = NULL;
unsigned long flags;
if (event->rb) {
@@ -5608,7 +5609,7 @@ static void ring_buffer_attach(struct perf_event *event,
static void ring_buffer_wakeup(struct perf_event *event)
{
- struct ring_buffer *rb;
+ struct perf_buffer *rb;
rcu_read_lock();
rb = rcu_dereference(event->rb);
@@ -5619,9 +5620,9 @@ static void ring_buffer_wakeup(struct perf_event *event)
rcu_read_unlock();
}
-struct ring_buffer *ring_buffer_get(struct perf_event *event)
+struct perf_buffer *ring_buffer_get(struct perf_event *event)
{
- struct ring_buffer *rb;
+ struct perf_buffer *rb;
rcu_read_lock();
rb = rcu_dereference(event->rb);
@@ -5634,7 +5635,7 @@ struct ring_buffer *ring_buffer_get(struct perf_event *event)
return rb;
}
-void ring_buffer_put(struct ring_buffer *rb)
+void ring_buffer_put(struct perf_buffer *rb)
{
if (!refcount_dec_and_test(&rb->refcount))
return;
@@ -5672,7 +5673,7 @@ static void perf_mmap_close(struct vm_area_struct *vma)
{
struct perf_event *event = vma->vm_file->private_data;
- struct ring_buffer *rb = ring_buffer_get(event);
+ struct perf_buffer *rb = ring_buffer_get(event);
struct user_struct *mmap_user = rb->mmap_user;
int mmap_locked = rb->mmap_locked;
unsigned long size = perf_data_size(rb);
@@ -5790,8 +5791,8 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
struct perf_event *event = file->private_data;
unsigned long user_locked, user_lock_limit;
struct user_struct *user = current_user();
+ struct perf_buffer *rb = NULL;
unsigned long locked, lock_limit;
- struct ring_buffer *rb = NULL;
unsigned long vma_size;
unsigned long nr_pages;
long user_extra = 0, extra = 0;
@@ -5916,7 +5917,15 @@ accounting:
*/
user_lock_limit *= num_online_cpus();
- user_locked = atomic_long_read(&user->locked_vm) + user_extra;
+ user_locked = atomic_long_read(&user->locked_vm);
+
+ /*
+ * sysctl_perf_event_mlock may have changed, so that
+ * user->locked_vm > user_lock_limit
+ */
+ if (user_locked > user_lock_limit)
+ user_locked = user_lock_limit;
+ user_locked += user_extra;
if (user_locked > user_lock_limit) {
/*
@@ -6266,7 +6275,7 @@ static unsigned long perf_prepare_sample_aux(struct perf_event *event,
size_t size)
{
struct perf_event *sampler = event->aux_event;
- struct ring_buffer *rb;
+ struct perf_buffer *rb;
data->aux_size = 0;
@@ -6299,7 +6308,7 @@ out:
return data->aux_size;
}
-long perf_pmu_snapshot_aux(struct ring_buffer *rb,
+long perf_pmu_snapshot_aux(struct perf_buffer *rb,
struct perf_event *event,
struct perf_output_handle *handle,
unsigned long size)
@@ -6338,8 +6347,8 @@ static void perf_aux_sample_output(struct perf_event *event,
struct perf_sample_data *data)
{
struct perf_event *sampler = event->aux_event;
+ struct perf_buffer *rb;
unsigned long pad;
- struct ring_buffer *rb;
long size;
if (WARN_ON_ONCE(!sampler || !data->aux_size))
@@ -6707,7 +6716,7 @@ void perf_output_sample(struct perf_output_handle *handle,
int wakeup_events = event->attr.wakeup_events;
if (wakeup_events) {
- struct ring_buffer *rb = handle->rb;
+ struct perf_buffer *rb = handle->rb;
int events = local_inc_return(&rb->events);
if (events >= wakeup_events) {
@@ -7150,7 +7159,7 @@ void perf_event_exec(void)
}
struct remote_output {
- struct ring_buffer *rb;
+ struct perf_buffer *rb;
int err;
};
@@ -7158,7 +7167,7 @@ static void __perf_event_output_stop(struct perf_event *event, void *data)
{
struct perf_event *parent = event->parent;
struct remote_output *ro = data;
- struct ring_buffer *rb = ro->rb;
+ struct perf_buffer *rb = ro->rb;
struct stop_event_data sd = {
.event = event,
};
@@ -7495,7 +7504,7 @@ static void perf_fill_ns_link_info(struct perf_ns_link_info *ns_link_info,
{
struct path ns_path;
struct inode *ns_inode;
- void *error;
+ int error;
error = ns_get_path(&ns_path, task, ns_ops);
if (!error) {
@@ -10998,7 +11007,7 @@ err_size:
static int
perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
{
- struct ring_buffer *rb = NULL;
+ struct perf_buffer *rb = NULL;
int ret = -EINVAL;
if (!output_event)
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 747d67f130cb..f16f66b6b655 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -10,7 +10,7 @@
#define RING_BUFFER_WRITABLE 0x01
-struct ring_buffer {
+struct perf_buffer {
refcount_t refcount;
struct rcu_head rcu_head;
#ifdef CONFIG_PERF_USE_VMALLOC
@@ -58,17 +58,17 @@ struct ring_buffer {
void *data_pages[0];
};
-extern void rb_free(struct ring_buffer *rb);
+extern void rb_free(struct perf_buffer *rb);
static inline void rb_free_rcu(struct rcu_head *rcu_head)
{
- struct ring_buffer *rb;
+ struct perf_buffer *rb;
- rb = container_of(rcu_head, struct ring_buffer, rcu_head);
+ rb = container_of(rcu_head, struct perf_buffer, rcu_head);
rb_free(rb);
}
-static inline void rb_toggle_paused(struct ring_buffer *rb, bool pause)
+static inline void rb_toggle_paused(struct perf_buffer *rb, bool pause)
{
if (!pause && rb->nr_pages)
rb->paused = 0;
@@ -76,16 +76,16 @@ static inline void rb_toggle_paused(struct ring_buffer *rb, bool pause)
rb->paused = 1;
}
-extern struct ring_buffer *
+extern struct perf_buffer *
rb_alloc(int nr_pages, long watermark, int cpu, int flags);
extern void perf_event_wakeup(struct perf_event *event);
-extern int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,
+extern int rb_alloc_aux(struct perf_buffer *rb, struct perf_event *event,
pgoff_t pgoff, int nr_pages, long watermark, int flags);
-extern void rb_free_aux(struct ring_buffer *rb);
-extern struct ring_buffer *ring_buffer_get(struct perf_event *event);
-extern void ring_buffer_put(struct ring_buffer *rb);
+extern void rb_free_aux(struct perf_buffer *rb);
+extern struct perf_buffer *ring_buffer_get(struct perf_event *event);
+extern void ring_buffer_put(struct perf_buffer *rb);
-static inline bool rb_has_aux(struct ring_buffer *rb)
+static inline bool rb_has_aux(struct perf_buffer *rb)
{
return !!rb->aux_nr_pages;
}
@@ -94,7 +94,7 @@ void perf_event_aux_event(struct perf_event *event, unsigned long head,
unsigned long size, u64 flags);
extern struct page *
-perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff);
+perf_mmap_to_page(struct perf_buffer *rb, unsigned long pgoff);
#ifdef CONFIG_PERF_USE_VMALLOC
/*
@@ -103,25 +103,25 @@ perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff);
* Required for architectures that have d-cache aliasing issues.
*/
-static inline int page_order(struct ring_buffer *rb)
+static inline int page_order(struct perf_buffer *rb)
{
return rb->page_order;
}
#else
-static inline int page_order(struct ring_buffer *rb)
+static inline int page_order(struct perf_buffer *rb)
{
return 0;
}
#endif
-static inline unsigned long perf_data_size(struct ring_buffer *rb)
+static inline unsigned long perf_data_size(struct perf_buffer *rb)
{
return rb->nr_pages << (PAGE_SHIFT + page_order(rb));
}
-static inline unsigned long perf_aux_size(struct ring_buffer *rb)
+static inline unsigned long perf_aux_size(struct perf_buffer *rb)
{
return rb->aux_nr_pages << PAGE_SHIFT;
}
@@ -141,7 +141,7 @@ static inline unsigned long perf_aux_size(struct ring_buffer *rb)
buf += written; \
handle->size -= written; \
if (!handle->size) { \
- struct ring_buffer *rb = handle->rb; \
+ struct perf_buffer *rb = handle->rb; \
\
handle->page++; \
handle->page &= rb->nr_pages - 1; \
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 7ffd5c763f93..192b8abc6330 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -35,7 +35,7 @@ static void perf_output_wakeup(struct perf_output_handle *handle)
*/
static void perf_output_get_handle(struct perf_output_handle *handle)
{
- struct ring_buffer *rb = handle->rb;
+ struct perf_buffer *rb = handle->rb;
preempt_disable();
@@ -49,7 +49,7 @@ static void perf_output_get_handle(struct perf_output_handle *handle)
static void perf_output_put_handle(struct perf_output_handle *handle)
{
- struct ring_buffer *rb = handle->rb;
+ struct perf_buffer *rb = handle->rb;
unsigned long head;
unsigned int nest;
@@ -150,7 +150,7 @@ __perf_output_begin(struct perf_output_handle *handle,
struct perf_event *event, unsigned int size,
bool backward)
{
- struct ring_buffer *rb;
+ struct perf_buffer *rb;
unsigned long tail, offset, head;
int have_lost, page_shift;
struct {
@@ -301,7 +301,7 @@ void perf_output_end(struct perf_output_handle *handle)
}
static void
-ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
+ring_buffer_init(struct perf_buffer *rb, long watermark, int flags)
{
long max_size = perf_data_size(rb);
@@ -361,7 +361,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,
{
struct perf_event *output_event = event;
unsigned long aux_head, aux_tail;
- struct ring_buffer *rb;
+ struct perf_buffer *rb;
unsigned int nest;
if (output_event->parent)
@@ -449,7 +449,7 @@ err:
}
EXPORT_SYMBOL_GPL(perf_aux_output_begin);
-static __always_inline bool rb_need_aux_wakeup(struct ring_buffer *rb)
+static __always_inline bool rb_need_aux_wakeup(struct perf_buffer *rb)
{
if (rb->aux_overwrite)
return false;
@@ -475,7 +475,7 @@ static __always_inline bool rb_need_aux_wakeup(struct ring_buffer *rb)
void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)
{
bool wakeup = !!(handle->aux_flags & PERF_AUX_FLAG_TRUNCATED);
- struct ring_buffer *rb = handle->rb;
+ struct perf_buffer *rb = handle->rb;
unsigned long aux_head;
/* in overwrite mode, driver provides aux_head via handle */
@@ -532,7 +532,7 @@ EXPORT_SYMBOL_GPL(perf_aux_output_end);
*/
int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size)
{
- struct ring_buffer *rb = handle->rb;
+ struct perf_buffer *rb = handle->rb;
if (size > handle->size)
return -ENOSPC;
@@ -569,8 +569,8 @@ long perf_output_copy_aux(struct perf_output_handle *aux_handle,
struct perf_output_handle *handle,
unsigned long from, unsigned long to)
{
+ struct perf_buffer *rb = aux_handle->rb;
unsigned long tocopy, remainder, len = 0;
- struct ring_buffer *rb = aux_handle->rb;
void *addr;
from &= (rb->aux_nr_pages << PAGE_SHIFT) - 1;
@@ -626,7 +626,7 @@ static struct page *rb_alloc_aux_page(int node, int order)
return page;
}
-static void rb_free_aux_page(struct ring_buffer *rb, int idx)
+static void rb_free_aux_page(struct perf_buffer *rb, int idx)
{
struct page *page = virt_to_page(rb->aux_pages[idx]);
@@ -635,7 +635,7 @@ static void rb_free_aux_page(struct ring_buffer *rb, int idx)
__free_page(page);
}
-static void __rb_free_aux(struct ring_buffer *rb)
+static void __rb_free_aux(struct perf_buffer *rb)
{
int pg;
@@ -662,7 +662,7 @@ static void __rb_free_aux(struct ring_buffer *rb)
}
}
-int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,
+int rb_alloc_aux(struct perf_buffer *rb, struct perf_event *event,
pgoff_t pgoff, int nr_pages, long watermark, int flags)
{
bool overwrite = !(flags & RING_BUFFER_WRITABLE);
@@ -753,7 +753,7 @@ out:
return ret;
}
-void rb_free_aux(struct ring_buffer *rb)
+void rb_free_aux(struct perf_buffer *rb)
{
if (refcount_dec_and_test(&rb->aux_refcount))
__rb_free_aux(rb);
@@ -766,7 +766,7 @@ void rb_free_aux(struct ring_buffer *rb)
*/
static struct page *
-__perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
+__perf_mmap_to_page(struct perf_buffer *rb, unsigned long pgoff)
{
if (pgoff > rb->nr_pages)
return NULL;
@@ -798,13 +798,13 @@ static void perf_mmap_free_page(void *addr)
__free_page(page);
}
-struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
+struct perf_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
{
- struct ring_buffer *rb;
+ struct perf_buffer *rb;
unsigned long size;
int i;
- size = sizeof(struct ring_buffer);
+ size = sizeof(struct perf_buffer);
size += nr_pages * sizeof(void *);
if (order_base_2(size) >= PAGE_SHIFT+MAX_ORDER)
@@ -843,7 +843,7 @@ fail:
return NULL;
}
-void rb_free(struct ring_buffer *rb)
+void rb_free(struct perf_buffer *rb)
{
int i;
@@ -854,13 +854,13 @@ void rb_free(struct ring_buffer *rb)
}
#else
-static int data_page_nr(struct ring_buffer *rb)
+static int data_page_nr(struct perf_buffer *rb)
{
return rb->nr_pages << page_order(rb);
}
static struct page *
-__perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
+__perf_mmap_to_page(struct perf_buffer *rb, unsigned long pgoff)
{
/* The '>' counts in the user page. */
if (pgoff > data_page_nr(rb))
@@ -878,11 +878,11 @@ static void perf_mmap_unmark_page(void *addr)
static void rb_free_work(struct work_struct *work)
{
- struct ring_buffer *rb;
+ struct perf_buffer *rb;
void *base;
int i, nr;
- rb = container_of(work, struct ring_buffer, work);
+ rb = container_of(work, struct perf_buffer, work);
nr = data_page_nr(rb);
base = rb->user_page;
@@ -894,18 +894,18 @@ static void rb_free_work(struct work_struct *work)
kfree(rb);
}
-void rb_free(struct ring_buffer *rb)
+void rb_free(struct perf_buffer *rb)
{
schedule_work(&rb->work);
}
-struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
+struct perf_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
{
- struct ring_buffer *rb;
+ struct perf_buffer *rb;
unsigned long size;
void *all_buf;
- size = sizeof(struct ring_buffer);
+ size = sizeof(struct perf_buffer);
size += sizeof(void *);
rb = kzalloc(size, GFP_KERNEL);
@@ -939,7 +939,7 @@ fail:
#endif
struct page *
-perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
+perf_mmap_to_page(struct perf_buffer *rb, unsigned long pgoff)
{
if (rb->aux_nr_pages) {
/* above AUX space */
diff --git a/kernel/extable.c b/kernel/extable.c
index f6920a11e28a..a0024f27d3a1 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -131,8 +131,9 @@ int kernel_text_address(unsigned long addr)
* triggers a stack trace, or a WARN() that happens during
* coming back from idle, or cpu on or offlining.
*
- * is_module_text_address() as well as the kprobe slots
- * and is_bpf_text_address() require RCU to be watching.
+ * is_module_text_address() as well as the kprobe slots,
+ * is_bpf_text_address() and is_bpf_image_address require
+ * RCU to be watching.
*/
no_rcu = !rcu_is_watching();
@@ -148,6 +149,8 @@ int kernel_text_address(unsigned long addr)
goto out;
if (is_bpf_text_address(addr))
goto out;
+ if (is_bpf_image_address(addr))
+ goto out;
ret = 0;
out:
if (no_rcu)
diff --git a/kernel/fork.c b/kernel/fork.c
index ef82feb4bddc..60a1295f4384 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -692,7 +692,7 @@ void __mmdrop(struct mm_struct *mm)
WARN_ON_ONCE(mm == current->active_mm);
mm_free_pgd(mm);
destroy_context(mm);
- mmu_notifier_mm_destroy(mm);
+ mmu_notifier_subscriptions_destroy(mm);
check_mm(mm);
put_user_ns(mm->user_ns);
free_mm(mm);
@@ -1025,7 +1025,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
mm_init_aio(mm);
mm_init_owner(mm, p);
RCU_INIT_POINTER(mm->exe_file, NULL);
- mmu_notifier_mm_init(mm);
+ mmu_notifier_subscriptions_init(mm);
init_tlb_flush_pending(mm);
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
mm->pmd_huge_pte = NULL;
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index 060e8e726755..3941a9c48f83 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig
@@ -4,7 +4,7 @@ menu "GCOV-based kernel profiling"
config GCOV_KERNEL
bool "Enable gcov-based kernel profiling"
depends on DEBUG_FS
- select CONSTRUCTORS
+ select CONSTRUCTORS if !UML
default n
---help---
This option enables gcov-based code profiling (e.g. for code coverage
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 818b2802d3e7..3089a60ea8f9 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -731,6 +731,13 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on)
*
* Wakeup mode lets this IRQ wake the system from sleep
* states like "suspend to RAM".
+ *
+ * Note: irq enable/disable state is completely orthogonal
+ * to the enable/disable state of irq wake. An irq can be
+ * disabled with disable_irq() and still wake the system as
+ * long as the irq has wake enabled. If this does not hold,
+ * then the underlying irq chip and the related driver need
+ * to be investigated.
*/
int irq_set_irq_wake(unsigned int irq, unsigned int on)
{
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index cfc4f088a0e7..9e5783d98033 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -176,20 +176,20 @@ static int irq_affinity_list_proc_open(struct inode *inode, struct file *file)
return single_open(file, irq_affinity_list_proc_show, PDE_DATA(inode));
}
-static const struct file_operations irq_affinity_proc_fops = {
- .open = irq_affinity_proc_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
- .write = irq_affinity_proc_write,
+static const struct proc_ops irq_affinity_proc_ops = {
+ .proc_open = irq_affinity_proc_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_release = single_release,
+ .proc_write = irq_affinity_proc_write,
};
-static const struct file_operations irq_affinity_list_proc_fops = {
- .open = irq_affinity_list_proc_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
- .write = irq_affinity_list_proc_write,
+static const struct proc_ops irq_affinity_list_proc_ops = {
+ .proc_open = irq_affinity_list_proc_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_release = single_release,
+ .proc_write = irq_affinity_list_proc_write,
};
#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
@@ -246,12 +246,12 @@ static int default_affinity_open(struct inode *inode, struct file *file)
return single_open(file, default_affinity_show, PDE_DATA(inode));
}
-static const struct file_operations default_affinity_proc_fops = {
- .open = default_affinity_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
- .write = default_affinity_write,
+static const struct proc_ops default_affinity_proc_ops = {
+ .proc_open = default_affinity_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_release = single_release,
+ .proc_write = default_affinity_write,
};
static int irq_node_proc_show(struct seq_file *m, void *v)
@@ -342,7 +342,7 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
#ifdef CONFIG_SMP
/* create /proc/irq/<irq>/smp_affinity */
proc_create_data("smp_affinity", 0644, desc->dir,
- &irq_affinity_proc_fops, irqp);
+ &irq_affinity_proc_ops, irqp);
/* create /proc/irq/<irq>/affinity_hint */
proc_create_single_data("affinity_hint", 0444, desc->dir,
@@ -350,7 +350,7 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
/* create /proc/irq/<irq>/smp_affinity_list */
proc_create_data("smp_affinity_list", 0644, desc->dir,
- &irq_affinity_list_proc_fops, irqp);
+ &irq_affinity_list_proc_ops, irqp);
proc_create_single_data("node", 0444, desc->dir, irq_node_proc_show,
irqp);
@@ -401,7 +401,7 @@ static void register_default_affinity_proc(void)
{
#ifdef CONFIG_SMP
proc_create("irq/default_smp_affinity", 0644, NULL,
- &default_affinity_proc_fops);
+ &default_affinity_proc_ops);
#endif
}
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 136ce049c4ad..d812b90f4c86 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -698,16 +698,16 @@ const char *kdb_walk_kallsyms(loff_t *pos)
}
#endif /* CONFIG_KGDB_KDB */
-static const struct file_operations kallsyms_operations = {
- .open = kallsyms_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release_private,
+static const struct proc_ops kallsyms_proc_ops = {
+ .proc_open = kallsyms_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_release = seq_release_private,
};
static int __init kallsyms_init(void)
{
- proc_create("kallsyms", 0444, NULL, &kallsyms_operations);
+ proc_create("kallsyms", 0444, NULL, &kallsyms_proc_ops);
return 0;
}
device_initcall(kallsyms_init);
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 53534aa258a6..2625c241ac00 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -510,6 +510,8 @@ static void do_unoptimize_kprobes(void)
arch_unoptimize_kprobes(&unoptimizing_list, &freeing_list);
/* Loop free_list for disarming */
list_for_each_entry_safe(op, tmp, &freeing_list, list) {
+ /* Switching from detour code to origin */
+ op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
/* Disarm probes if marked disabled */
if (kprobe_disabled(&op->kp))
arch_disarm_kprobe(&op->kp);
@@ -610,6 +612,18 @@ void wait_for_kprobe_optimizer(void)
mutex_unlock(&kprobe_mutex);
}
+static bool optprobe_queued_unopt(struct optimized_kprobe *op)
+{
+ struct optimized_kprobe *_op;
+
+ list_for_each_entry(_op, &unoptimizing_list, list) {
+ if (op == _op)
+ return true;
+ }
+
+ return false;
+}
+
/* Optimize kprobe if p is ready to be optimized */
static void optimize_kprobe(struct kprobe *p)
{
@@ -631,17 +645,21 @@ static void optimize_kprobe(struct kprobe *p)
return;
/* Check if it is already optimized. */
- if (op->kp.flags & KPROBE_FLAG_OPTIMIZED)
+ if (op->kp.flags & KPROBE_FLAG_OPTIMIZED) {
+ if (optprobe_queued_unopt(op)) {
+ /* This is under unoptimizing. Just dequeue the probe */
+ list_del_init(&op->list);
+ }
return;
+ }
op->kp.flags |= KPROBE_FLAG_OPTIMIZED;
- if (!list_empty(&op->list))
- /* This is under unoptimizing. Just dequeue the probe */
- list_del_init(&op->list);
- else {
- list_add(&op->list, &optimizing_list);
- kick_kprobe_optimizer();
- }
+ /* On unoptimizing/optimizing_list, op must have OPTIMIZED flag */
+ if (WARN_ON_ONCE(!list_empty(&op->list)))
+ return;
+
+ list_add(&op->list, &optimizing_list);
+ kick_kprobe_optimizer();
}
/* Short cut to direct unoptimizing */
@@ -649,6 +667,7 @@ static void force_unoptimize_kprobe(struct optimized_kprobe *op)
{
lockdep_assert_cpus_held();
arch_unoptimize_kprobe(op);
+ op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
if (kprobe_disabled(&op->kp))
arch_disarm_kprobe(&op->kp);
}
@@ -662,31 +681,33 @@ static void unoptimize_kprobe(struct kprobe *p, bool force)
return; /* This is not an optprobe nor optimized */
op = container_of(p, struct optimized_kprobe, kp);
- if (!kprobe_optimized(p)) {
- /* Unoptimized or unoptimizing case */
- if (force && !list_empty(&op->list)) {
- /*
- * Only if this is unoptimizing kprobe and forced,
- * forcibly unoptimize it. (No need to unoptimize
- * unoptimized kprobe again :)
- */
- list_del_init(&op->list);
- force_unoptimize_kprobe(op);
- }
+ if (!kprobe_optimized(p))
return;
- }
- op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
if (!list_empty(&op->list)) {
- /* Dequeue from the optimization queue */
- list_del_init(&op->list);
+ if (optprobe_queued_unopt(op)) {
+ /* Queued in unoptimizing queue */
+ if (force) {
+ /*
+ * Forcibly unoptimize the kprobe here, and queue it
+ * in the freeing list for release afterwards.
+ */
+ force_unoptimize_kprobe(op);
+ list_move(&op->list, &freeing_list);
+ }
+ } else {
+ /* Dequeue from the optimizing queue */
+ list_del_init(&op->list);
+ op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
+ }
return;
}
+
/* Optimized kprobe case */
- if (force)
+ if (force) {
/* Forcibly update the code: this is a special case */
force_unoptimize_kprobe(op);
- else {
+ } else {
list_add(&op->list, &unoptimizing_list);
kick_kprobe_optimizer();
}
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index e3acead004e6..8d1c15832e55 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -255,17 +255,17 @@ static int lstats_open(struct inode *inode, struct file *filp)
return single_open(filp, lstats_show, NULL);
}
-static const struct file_operations lstats_fops = {
- .open = lstats_open,
- .read = seq_read,
- .write = lstats_write,
- .llseek = seq_lseek,
- .release = single_release,
+static const struct proc_ops lstats_proc_ops = {
+ .proc_open = lstats_open,
+ .proc_read = seq_read,
+ .proc_write = lstats_write,
+ .proc_lseek = seq_lseek,
+ .proc_release = single_release,
};
static int __init init_lstats_procfs(void)
{
- proc_create("latency_stats", 0644, NULL, &lstats_fops);
+ proc_create("latency_stats", 0644, NULL, &lstats_proc_ops);
return 0;
}
diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c
index dadb7b7fba37..231684cfc5ae 100644
--- a/kernel/locking/lockdep_proc.c
+++ b/kernel/locking/lockdep_proc.c
@@ -286,9 +286,9 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
seq_printf(m, " stack-trace entries: %11lu [max: %lu]\n",
nr_stack_trace_entries, MAX_STACK_TRACE_ENTRIES);
#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
- seq_printf(m, " number of stack traces: %llu\n",
+ seq_printf(m, " number of stack traces: %11llu\n",
lockdep_stack_trace_count());
- seq_printf(m, " number of stack hash chains: %llu\n",
+ seq_printf(m, " number of stack hash chains: %11llu\n",
lockdep_stack_hash_count());
#endif
seq_printf(m, " combined max dependencies: %11u\n",
@@ -643,12 +643,12 @@ static int lock_stat_release(struct inode *inode, struct file *file)
return seq_release(inode, file);
}
-static const struct file_operations proc_lock_stat_operations = {
- .open = lock_stat_open,
- .write = lock_stat_write,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = lock_stat_release,
+static const struct proc_ops lock_stat_proc_ops = {
+ .proc_open = lock_stat_open,
+ .proc_write = lock_stat_write,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_release = lock_stat_release,
};
#endif /* CONFIG_LOCK_STAT */
@@ -660,8 +660,7 @@ static int __init lockdep_proc_init(void)
#endif
proc_create_single("lockdep_stats", S_IRUSR, NULL, lockdep_stats_show);
#ifdef CONFIG_LOCK_STAT
- proc_create("lock_stat", S_IRUSR | S_IWUSR, NULL,
- &proc_lock_stat_operations);
+ proc_create("lock_stat", S_IRUSR | S_IWUSR, NULL, &lock_stat_proc_ops);
#endif
return 0;
diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c
index 6ef600aa0f47..1f7734949ac8 100644
--- a/kernel/locking/osq_lock.c
+++ b/kernel/locking/osq_lock.c
@@ -134,20 +134,17 @@ bool osq_lock(struct optimistic_spin_queue *lock)
* cmpxchg in an attempt to undo our queueing.
*/
- while (!READ_ONCE(node->locked)) {
- /*
- * If we need to reschedule bail... so we can block.
- * Use vcpu_is_preempted() to avoid waiting for a preempted
- * lock holder:
- */
- if (need_resched() || vcpu_is_preempted(node_cpu(node->prev)))
- goto unqueue;
-
- cpu_relax();
- }
- return true;
+ /*
+ * Wait to acquire the lock or cancelation. Note that need_resched()
+ * will come with an IPI, which will wake smp_cond_load_relaxed() if it
+ * is implemented with a monitor-wait. vcpu_is_preempted() relies on
+ * polling, be careful.
+ */
+ if (smp_cond_load_relaxed(&node->locked, VAL || need_resched() ||
+ vcpu_is_preempted(node_cpu(node->prev))))
+ return true;
-unqueue:
+ /* unqueue */
/*
* Step - A -- stabilize @prev
*
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index 2473f10c6956..b9515fcc9b29 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -31,14 +31,15 @@
/*
* The basic principle of a queue-based spinlock can best be understood
* by studying a classic queue-based spinlock implementation called the
- * MCS lock. The paper below provides a good description for this kind
- * of lock.
+ * MCS lock. A copy of the original MCS lock paper ("Algorithms for Scalable
+ * Synchronization on Shared-Memory Multiprocessors by Mellor-Crummey and
+ * Scott") is available at
*
- * http://www.cise.ufl.edu/tr/DOC/REP-1992-71.pdf
+ * https://bugzilla.kernel.org/show_bug.cgi?id=206115
*
- * This queued spinlock implementation is based on the MCS lock, however to make
- * it fit the 4 bytes we assume spinlock_t to be, and preserve its existing
- * API, we must modify it somehow.
+ * This queued spinlock implementation is based on the MCS lock, however to
+ * make it fit the 4 bytes we assume spinlock_t to be, and preserve its
+ * existing API, we must modify it somehow.
*
* In particular; where the traditional MCS lock consists of a tail pointer
* (8 bytes) and needs the next pointer (another 8 bytes) of its own node to
diff --git a/kernel/module.c b/kernel/module.c
index b56f3224b161..33569a01d6e1 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -214,7 +214,8 @@ static struct module *mod_find(unsigned long addr)
{
struct module *mod;
- list_for_each_entry_rcu(mod, &modules, list) {
+ list_for_each_entry_rcu(mod, &modules, list,
+ lockdep_is_held(&module_mutex)) {
if (within_module(addr, mod))
return mod;
}
@@ -448,7 +449,8 @@ bool each_symbol_section(bool (*fn)(const struct symsearch *arr,
if (each_symbol_in_section(arr, ARRAY_SIZE(arr), NULL, fn, data))
return true;
- list_for_each_entry_rcu(mod, &modules, list) {
+ list_for_each_entry_rcu(mod, &modules, list,
+ lockdep_is_held(&module_mutex)) {
struct symsearch arr[] = {
{ mod->syms, mod->syms + mod->num_syms, mod->crcs,
NOT_GPL_ONLY, false },
@@ -616,7 +618,8 @@ static struct module *find_module_all(const char *name, size_t len,
module_assert_mutex_or_preempt();
- list_for_each_entry_rcu(mod, &modules, list) {
+ list_for_each_entry_rcu(mod, &modules, list,
+ lockdep_is_held(&module_mutex)) {
if (!even_unformed && mod->state == MODULE_STATE_UNFORMED)
continue;
if (strlen(mod->name) == len && !memcmp(mod->name, name, len))
@@ -1781,6 +1784,8 @@ static int module_add_modinfo_attrs(struct module *mod)
error_out:
if (i > 0)
module_remove_modinfo_attrs(mod, --i);
+ else
+ kfree(mod->modinfo_attrs);
return error;
}
@@ -2031,49 +2036,6 @@ static void module_enable_nx(const struct module *mod)
frob_writable_data(&mod->init_layout, set_memory_nx);
}
-/* Iterate through all modules and set each module's text as RW */
-void set_all_modules_text_rw(void)
-{
- struct module *mod;
-
- if (!rodata_enabled)
- return;
-
- mutex_lock(&module_mutex);
- list_for_each_entry_rcu(mod, &modules, list) {
- if (mod->state == MODULE_STATE_UNFORMED)
- continue;
-
- frob_text(&mod->core_layout, set_memory_rw);
- frob_text(&mod->init_layout, set_memory_rw);
- }
- mutex_unlock(&module_mutex);
-}
-
-/* Iterate through all modules and set each module's text as RO */
-void set_all_modules_text_ro(void)
-{
- struct module *mod;
-
- if (!rodata_enabled)
- return;
-
- mutex_lock(&module_mutex);
- list_for_each_entry_rcu(mod, &modules, list) {
- /*
- * Ignore going modules since it's possible that ro
- * protection has already been disabled, otherwise we'll
- * run into protection faults at module deallocation.
- */
- if (mod->state == MODULE_STATE_UNFORMED ||
- mod->state == MODULE_STATE_GOING)
- continue;
-
- frob_text(&mod->core_layout, set_memory_ro);
- frob_text(&mod->init_layout, set_memory_ro);
- }
- mutex_unlock(&module_mutex);
-}
#else /* !CONFIG_STRICT_MODULE_RWX */
static void module_enable_nx(const struct module *mod) { }
#endif /* CONFIG_STRICT_MODULE_RWX */
@@ -2877,7 +2839,7 @@ static int module_sig_check(struct load_info *info, int flags)
reason = "Loading of module with unavailable key";
decide:
if (is_module_sig_enforced()) {
- pr_notice("%s is rejected\n", reason);
+ pr_notice("%s: %s is rejected\n", info->name, reason);
return -EKEYREJECTED;
}
@@ -3054,9 +3016,7 @@ static int setup_load_info(struct load_info *info, int flags)
/* Try to find a name early so we can log errors with a module name */
info->index.info = find_sec(info, ".modinfo");
- if (!info->index.info)
- info->name = "(missing .modinfo section)";
- else
+ if (info->index.info)
info->name = get_modinfo(info, "name");
/* Find internal symbols and strings. */
@@ -3071,14 +3031,15 @@ static int setup_load_info(struct load_info *info, int flags)
}
if (info->index.sym == 0) {
- pr_warn("%s: module has no symbols (stripped?)\n", info->name);
+ pr_warn("%s: module has no symbols (stripped?)\n",
+ info->name ?: "(missing .modinfo section or name field)");
return -ENOEXEC;
}
info->index.mod = find_sec(info, ".gnu.linkonce.this_module");
if (!info->index.mod) {
pr_warn("%s: No module found in object\n",
- info->name ?: "(missing .modinfo name field)");
+ info->name ?: "(missing .modinfo section or name field)");
return -ENOEXEC;
}
/* This is temporary: point mod into copy of data. */
@@ -4393,16 +4354,16 @@ static int modules_open(struct inode *inode, struct file *file)
return err;
}
-static const struct file_operations proc_modules_operations = {
- .open = modules_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
+static const struct proc_ops modules_proc_ops = {
+ .proc_open = modules_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_release = seq_release,
};
static int __init proc_modules_init(void)
{
- proc_create("modules", 0, NULL, &proc_modules_operations);
+ proc_create("modules", 0, NULL, &modules_proc_ops);
return 0;
}
module_init(proc_modules_init);
diff --git a/kernel/padata.c b/kernel/padata.c
index c3fec1413295..72777c10bb9c 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -2,7 +2,7 @@
/*
* padata.c - generic interface to process data streams in parallel
*
- * See Documentation/padata.txt for an api documentation.
+ * See Documentation/core-api/padata.rst for more information.
*
* Copyright (C) 2008, 2009 secunet Security Networks AG
* Copyright (C) 2008, 2009 Steffen Klassert <steffen.klassert@secunet.com>
@@ -35,6 +35,8 @@
#define MAX_OBJ_NUM 1000
+static void padata_free_pd(struct parallel_data *pd);
+
static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index)
{
int cpu, target_cpu;
@@ -87,7 +89,7 @@ static void padata_parallel_worker(struct work_struct *parallel_work)
/**
* padata_do_parallel - padata parallelization function
*
- * @pinst: padata instance
+ * @ps: padatashell
* @padata: object to be parallelized
* @cb_cpu: pointer to the CPU that the serialization callback function should
* run on. If it's not in the serial cpumask of @pinst
@@ -97,17 +99,20 @@ static void padata_parallel_worker(struct work_struct *parallel_work)
* The parallelization callback function will run with BHs off.
* Note: Every object which is parallelized by padata_do_parallel
* must be seen by padata_do_serial.
+ *
+ * Return: 0 on success or else negative error code.
*/
-int padata_do_parallel(struct padata_instance *pinst,
+int padata_do_parallel(struct padata_shell *ps,
struct padata_priv *padata, int *cb_cpu)
{
+ struct padata_instance *pinst = ps->pinst;
int i, cpu, cpu_index, target_cpu, err;
struct padata_parallel_queue *queue;
struct parallel_data *pd;
rcu_read_lock_bh();
- pd = rcu_dereference_bh(pinst->pd);
+ pd = rcu_dereference_bh(ps->pd);
err = -EINVAL;
if (!(pinst->flags & PADATA_INIT) || pinst->flags & PADATA_INVALID)
@@ -160,14 +165,12 @@ EXPORT_SYMBOL(padata_do_parallel);
/*
* padata_find_next - Find the next object that needs serialization.
*
- * Return values are:
- *
- * A pointer to the control struct of the next object that needs
- * serialization, if present in one of the percpu reorder queues.
- *
- * NULL, if the next object that needs serialization will
- * be parallel processed by another cpu and is not yet present in
- * the cpu's reorder queue.
+ * Return:
+ * * A pointer to the control struct of the next object that needs
+ * serialization, if present in one of the percpu reorder queues.
+ * * NULL, if the next object that needs serialization will
+ * be parallel processed by another cpu and is not yet present in
+ * the cpu's reorder queue.
*/
static struct padata_priv *padata_find_next(struct parallel_data *pd,
bool remove_object)
@@ -199,7 +202,6 @@ static struct padata_priv *padata_find_next(struct parallel_data *pd,
if (remove_object) {
list_del_init(&padata->list);
- atomic_dec(&pd->reorder_objects);
++pd->processed;
pd->cpu = cpumask_next_wrap(cpu, pd->cpumask.pcpu, -1, false);
}
@@ -210,10 +212,10 @@ static struct padata_priv *padata_find_next(struct parallel_data *pd,
static void padata_reorder(struct parallel_data *pd)
{
+ struct padata_instance *pinst = pd->ps->pinst;
int cb_cpu;
struct padata_priv *padata;
struct padata_serial_queue *squeue;
- struct padata_instance *pinst = pd->pinst;
struct padata_parallel_queue *next_queue;
/*
@@ -283,6 +285,7 @@ static void padata_serial_worker(struct work_struct *serial_work)
struct padata_serial_queue *squeue;
struct parallel_data *pd;
LIST_HEAD(local_list);
+ int cnt;
local_bh_disable();
squeue = container_of(serial_work, struct padata_serial_queue, work);
@@ -292,6 +295,8 @@ static void padata_serial_worker(struct work_struct *serial_work)
list_replace_init(&squeue->serial.list, &local_list);
spin_unlock(&squeue->serial.lock);
+ cnt = 0;
+
while (!list_empty(&local_list)) {
struct padata_priv *padata;
@@ -301,9 +306,12 @@ static void padata_serial_worker(struct work_struct *serial_work)
list_del_init(&padata->list);
padata->serial(padata);
- atomic_dec(&pd->refcnt);
+ cnt++;
}
local_bh_enable();
+
+ if (atomic_sub_and_test(cnt, &pd->refcnt))
+ padata_free_pd(pd);
}
/**
@@ -327,7 +335,6 @@ void padata_do_serial(struct padata_priv *padata)
if (cur->seq_nr < padata->seq_nr)
break;
list_add(&padata->list, &cur->list);
- atomic_inc(&pd->reorder_objects);
spin_unlock(&pqueue->reorder.lock);
/*
@@ -341,36 +348,39 @@ void padata_do_serial(struct padata_priv *padata)
}
EXPORT_SYMBOL(padata_do_serial);
-static int padata_setup_cpumasks(struct parallel_data *pd,
- const struct cpumask *pcpumask,
- const struct cpumask *cbcpumask)
+static int padata_setup_cpumasks(struct padata_instance *pinst)
{
struct workqueue_attrs *attrs;
+ int err;
+
+ attrs = alloc_workqueue_attrs();
+ if (!attrs)
+ return -ENOMEM;
+
+ /* Restrict parallel_wq workers to pd->cpumask.pcpu. */
+ cpumask_copy(attrs->cpumask, pinst->cpumask.pcpu);
+ err = apply_workqueue_attrs(pinst->parallel_wq, attrs);
+ free_workqueue_attrs(attrs);
+
+ return err;
+}
+
+static int pd_setup_cpumasks(struct parallel_data *pd,
+ const struct cpumask *pcpumask,
+ const struct cpumask *cbcpumask)
+{
int err = -ENOMEM;
if (!alloc_cpumask_var(&pd->cpumask.pcpu, GFP_KERNEL))
goto out;
- cpumask_and(pd->cpumask.pcpu, pcpumask, cpu_online_mask);
-
if (!alloc_cpumask_var(&pd->cpumask.cbcpu, GFP_KERNEL))
goto free_pcpu_mask;
- cpumask_and(pd->cpumask.cbcpu, cbcpumask, cpu_online_mask);
- attrs = alloc_workqueue_attrs();
- if (!attrs)
- goto free_cbcpu_mask;
-
- /* Restrict parallel_wq workers to pd->cpumask.pcpu. */
- cpumask_copy(attrs->cpumask, pd->cpumask.pcpu);
- err = apply_workqueue_attrs(pd->pinst->parallel_wq, attrs);
- free_workqueue_attrs(attrs);
- if (err < 0)
- goto free_cbcpu_mask;
+ cpumask_copy(pd->cpumask.pcpu, pcpumask);
+ cpumask_copy(pd->cpumask.cbcpu, cbcpumask);
return 0;
-free_cbcpu_mask:
- free_cpumask_var(pd->cpumask.cbcpu);
free_pcpu_mask:
free_cpumask_var(pd->cpumask.pcpu);
out:
@@ -414,12 +424,16 @@ static void padata_init_pqueues(struct parallel_data *pd)
}
/* Allocate and initialize the internal cpumask dependend resources. */
-static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
- const struct cpumask *pcpumask,
- const struct cpumask *cbcpumask)
+static struct parallel_data *padata_alloc_pd(struct padata_shell *ps)
{
+ struct padata_instance *pinst = ps->pinst;
+ const struct cpumask *cbcpumask;
+ const struct cpumask *pcpumask;
struct parallel_data *pd;
+ cbcpumask = pinst->rcpumask.cbcpu;
+ pcpumask = pinst->rcpumask.pcpu;
+
pd = kzalloc(sizeof(struct parallel_data), GFP_KERNEL);
if (!pd)
goto err;
@@ -432,15 +446,14 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
if (!pd->squeue)
goto err_free_pqueue;
- pd->pinst = pinst;
- if (padata_setup_cpumasks(pd, pcpumask, cbcpumask) < 0)
+ pd->ps = ps;
+ if (pd_setup_cpumasks(pd, pcpumask, cbcpumask))
goto err_free_squeue;
padata_init_pqueues(pd);
padata_init_squeues(pd);
atomic_set(&pd->seq_nr, -1);
- atomic_set(&pd->reorder_objects, 0);
- atomic_set(&pd->refcnt, 0);
+ atomic_set(&pd->refcnt, 1);
spin_lock_init(&pd->lock);
pd->cpu = cpumask_first(pd->cpumask.pcpu);
INIT_WORK(&pd->reorder_work, invoke_padata_reorder);
@@ -466,29 +479,6 @@ static void padata_free_pd(struct parallel_data *pd)
kfree(pd);
}
-/* Flush all objects out of the padata queues. */
-static void padata_flush_queues(struct parallel_data *pd)
-{
- int cpu;
- struct padata_parallel_queue *pqueue;
- struct padata_serial_queue *squeue;
-
- for_each_cpu(cpu, pd->cpumask.pcpu) {
- pqueue = per_cpu_ptr(pd->pqueue, cpu);
- flush_work(&pqueue->work);
- }
-
- if (atomic_read(&pd->reorder_objects))
- padata_reorder(pd);
-
- for_each_cpu(cpu, pd->cpumask.cbcpu) {
- squeue = per_cpu_ptr(pd->squeue, cpu);
- flush_work(&squeue->work);
- }
-
- BUG_ON(atomic_read(&pd->refcnt) != 0);
-}
-
static void __padata_start(struct padata_instance *pinst)
{
pinst->flags |= PADATA_INIT;
@@ -502,72 +492,52 @@ static void __padata_stop(struct padata_instance *pinst)
pinst->flags &= ~PADATA_INIT;
synchronize_rcu();
-
- get_online_cpus();
- padata_flush_queues(pinst->pd);
- put_online_cpus();
}
/* Replace the internal control structure with a new one. */
-static void padata_replace(struct padata_instance *pinst,
- struct parallel_data *pd_new)
+static int padata_replace_one(struct padata_shell *ps)
+{
+ struct parallel_data *pd_new;
+
+ pd_new = padata_alloc_pd(ps);
+ if (!pd_new)
+ return -ENOMEM;
+
+ ps->opd = rcu_dereference_protected(ps->pd, 1);
+ rcu_assign_pointer(ps->pd, pd_new);
+
+ return 0;
+}
+
+static int padata_replace(struct padata_instance *pinst)
{
- struct parallel_data *pd_old = pinst->pd;
- int notification_mask = 0;
+ struct padata_shell *ps;
+ int err;
pinst->flags |= PADATA_RESET;
- rcu_assign_pointer(pinst->pd, pd_new);
+ cpumask_and(pinst->rcpumask.pcpu, pinst->cpumask.pcpu,
+ cpu_online_mask);
- synchronize_rcu();
+ cpumask_and(pinst->rcpumask.cbcpu, pinst->cpumask.cbcpu,
+ cpu_online_mask);
- if (!cpumask_equal(pd_old->cpumask.pcpu, pd_new->cpumask.pcpu))
- notification_mask |= PADATA_CPU_PARALLEL;
- if (!cpumask_equal(pd_old->cpumask.cbcpu, pd_new->cpumask.cbcpu))
- notification_mask |= PADATA_CPU_SERIAL;
+ list_for_each_entry(ps, &pinst->pslist, list) {
+ err = padata_replace_one(ps);
+ if (err)
+ break;
+ }
- padata_flush_queues(pd_old);
- padata_free_pd(pd_old);
+ synchronize_rcu();
- if (notification_mask)
- blocking_notifier_call_chain(&pinst->cpumask_change_notifier,
- notification_mask,
- &pd_new->cpumask);
+ list_for_each_entry_continue_reverse(ps, &pinst->pslist, list)
+ if (atomic_dec_and_test(&ps->opd->refcnt))
+ padata_free_pd(ps->opd);
pinst->flags &= ~PADATA_RESET;
-}
-
-/**
- * padata_register_cpumask_notifier - Registers a notifier that will be called
- * if either pcpu or cbcpu or both cpumasks change.
- *
- * @pinst: A poineter to padata instance
- * @nblock: A pointer to notifier block.
- */
-int padata_register_cpumask_notifier(struct padata_instance *pinst,
- struct notifier_block *nblock)
-{
- return blocking_notifier_chain_register(&pinst->cpumask_change_notifier,
- nblock);
-}
-EXPORT_SYMBOL(padata_register_cpumask_notifier);
-/**
- * padata_unregister_cpumask_notifier - Unregisters cpumask notifier
- * registered earlier using padata_register_cpumask_notifier
- *
- * @pinst: A pointer to data instance.
- * @nlock: A pointer to notifier block.
- */
-int padata_unregister_cpumask_notifier(struct padata_instance *pinst,
- struct notifier_block *nblock)
-{
- return blocking_notifier_chain_unregister(
- &pinst->cpumask_change_notifier,
- nblock);
+ return err;
}
-EXPORT_SYMBOL(padata_unregister_cpumask_notifier);
-
/* If cpumask contains no active cpu, we mark the instance as invalid. */
static bool padata_validate_cpumask(struct padata_instance *pinst,
@@ -587,7 +557,7 @@ static int __padata_set_cpumasks(struct padata_instance *pinst,
cpumask_var_t cbcpumask)
{
int valid;
- struct parallel_data *pd;
+ int err;
valid = padata_validate_cpumask(pinst, pcpumask);
if (!valid) {
@@ -600,29 +570,26 @@ static int __padata_set_cpumasks(struct padata_instance *pinst,
__padata_stop(pinst);
out_replace:
- pd = padata_alloc_pd(pinst, pcpumask, cbcpumask);
- if (!pd)
- return -ENOMEM;
-
cpumask_copy(pinst->cpumask.pcpu, pcpumask);
cpumask_copy(pinst->cpumask.cbcpu, cbcpumask);
- padata_replace(pinst, pd);
+ err = padata_setup_cpumasks(pinst) ?: padata_replace(pinst);
if (valid)
__padata_start(pinst);
- return 0;
+ return err;
}
/**
- * padata_set_cpumask: Sets specified by @cpumask_type cpumask to the value
- * equivalent to @cpumask.
- *
+ * padata_set_cpumask - Sets specified by @cpumask_type cpumask to the value
+ * equivalent to @cpumask.
* @pinst: padata instance
* @cpumask_type: PADATA_CPU_SERIAL or PADATA_CPU_PARALLEL corresponding
* to parallel and serial cpumasks respectively.
* @cpumask: the cpumask to use
+ *
+ * Return: 0 on success or negative error code
*/
int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type,
cpumask_var_t cpumask)
@@ -630,8 +597,8 @@ int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type,
struct cpumask *serial_mask, *parallel_mask;
int err = -EINVAL;
- mutex_lock(&pinst->lock);
get_online_cpus();
+ mutex_lock(&pinst->lock);
switch (cpumask_type) {
case PADATA_CPU_PARALLEL:
@@ -649,8 +616,8 @@ int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type,
err = __padata_set_cpumasks(pinst, parallel_mask, serial_mask);
out:
- put_online_cpus();
mutex_unlock(&pinst->lock);
+ put_online_cpus();
return err;
}
@@ -660,6 +627,8 @@ EXPORT_SYMBOL(padata_set_cpumask);
* padata_start - start the parallel processing
*
* @pinst: padata instance to start
+ *
+ * Return: 0 on success or negative error code
*/
int padata_start(struct padata_instance *pinst)
{
@@ -695,82 +664,33 @@ EXPORT_SYMBOL(padata_stop);
static int __padata_add_cpu(struct padata_instance *pinst, int cpu)
{
- struct parallel_data *pd;
+ int err = 0;
if (cpumask_test_cpu(cpu, cpu_online_mask)) {
- pd = padata_alloc_pd(pinst, pinst->cpumask.pcpu,
- pinst->cpumask.cbcpu);
- if (!pd)
- return -ENOMEM;
-
- padata_replace(pinst, pd);
+ err = padata_replace(pinst);
if (padata_validate_cpumask(pinst, pinst->cpumask.pcpu) &&
padata_validate_cpumask(pinst, pinst->cpumask.cbcpu))
__padata_start(pinst);
}
- return 0;
+ return err;
}
static int __padata_remove_cpu(struct padata_instance *pinst, int cpu)
{
- struct parallel_data *pd = NULL;
-
- if (cpumask_test_cpu(cpu, cpu_online_mask)) {
+ int err = 0;
+ if (!cpumask_test_cpu(cpu, cpu_online_mask)) {
if (!padata_validate_cpumask(pinst, pinst->cpumask.pcpu) ||
!padata_validate_cpumask(pinst, pinst->cpumask.cbcpu))
__padata_stop(pinst);
- pd = padata_alloc_pd(pinst, pinst->cpumask.pcpu,
- pinst->cpumask.cbcpu);
- if (!pd)
- return -ENOMEM;
-
- padata_replace(pinst, pd);
-
- cpumask_clear_cpu(cpu, pd->cpumask.cbcpu);
- cpumask_clear_cpu(cpu, pd->cpumask.pcpu);
+ err = padata_replace(pinst);
}
- return 0;
-}
-
- /**
- * padata_remove_cpu - remove a cpu from the one or both(serial and parallel)
- * padata cpumasks.
- *
- * @pinst: padata instance
- * @cpu: cpu to remove
- * @mask: bitmask specifying from which cpumask @cpu should be removed
- * The @mask may be any combination of the following flags:
- * PADATA_CPU_SERIAL - serial cpumask
- * PADATA_CPU_PARALLEL - parallel cpumask
- */
-int padata_remove_cpu(struct padata_instance *pinst, int cpu, int mask)
-{
- int err;
-
- if (!(mask & (PADATA_CPU_SERIAL | PADATA_CPU_PARALLEL)))
- return -EINVAL;
-
- mutex_lock(&pinst->lock);
-
- get_online_cpus();
- if (mask & PADATA_CPU_SERIAL)
- cpumask_clear_cpu(cpu, pinst->cpumask.cbcpu);
- if (mask & PADATA_CPU_PARALLEL)
- cpumask_clear_cpu(cpu, pinst->cpumask.pcpu);
-
- err = __padata_remove_cpu(pinst, cpu);
- put_online_cpus();
-
- mutex_unlock(&pinst->lock);
-
return err;
}
-EXPORT_SYMBOL(padata_remove_cpu);
static inline int pinst_has_cpu(struct padata_instance *pinst, int cpu)
{
@@ -793,7 +713,7 @@ static int padata_cpu_online(unsigned int cpu, struct hlist_node *node)
return ret;
}
-static int padata_cpu_prep_down(unsigned int cpu, struct hlist_node *node)
+static int padata_cpu_dead(unsigned int cpu, struct hlist_node *node)
{
struct padata_instance *pinst;
int ret;
@@ -814,11 +734,15 @@ static enum cpuhp_state hp_online;
static void __padata_free(struct padata_instance *pinst)
{
#ifdef CONFIG_HOTPLUG_CPU
+ cpuhp_state_remove_instance_nocalls(CPUHP_PADATA_DEAD, &pinst->node);
cpuhp_state_remove_instance_nocalls(hp_online, &pinst->node);
#endif
+ WARN_ON(!list_empty(&pinst->pslist));
+
padata_stop(pinst);
- padata_free_pd(pinst->pd);
+ free_cpumask_var(pinst->rcpumask.cbcpu);
+ free_cpumask_var(pinst->rcpumask.pcpu);
free_cpumask_var(pinst->cpumask.pcpu);
free_cpumask_var(pinst->cpumask.cbcpu);
destroy_workqueue(pinst->serial_wq);
@@ -959,13 +883,14 @@ static struct kobj_type padata_attr_type = {
* @name: used to identify the instance
* @pcpumask: cpumask that will be used for padata parallelization
* @cbcpumask: cpumask that will be used for padata serialization
+ *
+ * Return: new instance on success, NULL on error
*/
static struct padata_instance *padata_alloc(const char *name,
const struct cpumask *pcpumask,
const struct cpumask *cbcpumask)
{
struct padata_instance *pinst;
- struct parallel_data *pd = NULL;
pinst = kzalloc(sizeof(struct padata_instance), GFP_KERNEL);
if (!pinst)
@@ -993,29 +918,40 @@ static struct padata_instance *padata_alloc(const char *name,
!padata_validate_cpumask(pinst, cbcpumask))
goto err_free_masks;
- pd = padata_alloc_pd(pinst, pcpumask, cbcpumask);
- if (!pd)
+ if (!alloc_cpumask_var(&pinst->rcpumask.pcpu, GFP_KERNEL))
goto err_free_masks;
+ if (!alloc_cpumask_var(&pinst->rcpumask.cbcpu, GFP_KERNEL))
+ goto err_free_rcpumask_pcpu;
- rcu_assign_pointer(pinst->pd, pd);
+ INIT_LIST_HEAD(&pinst->pslist);
cpumask_copy(pinst->cpumask.pcpu, pcpumask);
cpumask_copy(pinst->cpumask.cbcpu, cbcpumask);
+ cpumask_and(pinst->rcpumask.pcpu, pcpumask, cpu_online_mask);
+ cpumask_and(pinst->rcpumask.cbcpu, cbcpumask, cpu_online_mask);
+
+ if (padata_setup_cpumasks(pinst))
+ goto err_free_rcpumask_cbcpu;
pinst->flags = 0;
- BLOCKING_INIT_NOTIFIER_HEAD(&pinst->cpumask_change_notifier);
kobject_init(&pinst->kobj, &padata_attr_type);
mutex_init(&pinst->lock);
#ifdef CONFIG_HOTPLUG_CPU
cpuhp_state_add_instance_nocalls_cpuslocked(hp_online, &pinst->node);
+ cpuhp_state_add_instance_nocalls_cpuslocked(CPUHP_PADATA_DEAD,
+ &pinst->node);
#endif
put_online_cpus();
return pinst;
+err_free_rcpumask_cbcpu:
+ free_cpumask_var(pinst->rcpumask.cbcpu);
+err_free_rcpumask_pcpu:
+ free_cpumask_var(pinst->rcpumask.pcpu);
err_free_masks:
free_cpumask_var(pinst->cpumask.pcpu);
free_cpumask_var(pinst->cpumask.cbcpu);
@@ -1036,6 +972,8 @@ err:
* parallel workers.
*
* @name: used to identify the instance
+ *
+ * Return: new instance on success, NULL on error
*/
struct padata_instance *padata_alloc_possible(const char *name)
{
@@ -1046,7 +984,7 @@ EXPORT_SYMBOL(padata_alloc_possible);
/**
* padata_free - free a padata instance
*
- * @padata_inst: padata instance to free
+ * @pinst: padata instance to free
*/
void padata_free(struct padata_instance *pinst)
{
@@ -1054,6 +992,63 @@ void padata_free(struct padata_instance *pinst)
}
EXPORT_SYMBOL(padata_free);
+/**
+ * padata_alloc_shell - Allocate and initialize padata shell.
+ *
+ * @pinst: Parent padata_instance object.
+ *
+ * Return: new shell on success, NULL on error
+ */
+struct padata_shell *padata_alloc_shell(struct padata_instance *pinst)
+{
+ struct parallel_data *pd;
+ struct padata_shell *ps;
+
+ ps = kzalloc(sizeof(*ps), GFP_KERNEL);
+ if (!ps)
+ goto out;
+
+ ps->pinst = pinst;
+
+ get_online_cpus();
+ pd = padata_alloc_pd(ps);
+ put_online_cpus();
+
+ if (!pd)
+ goto out_free_ps;
+
+ mutex_lock(&pinst->lock);
+ RCU_INIT_POINTER(ps->pd, pd);
+ list_add(&ps->list, &pinst->pslist);
+ mutex_unlock(&pinst->lock);
+
+ return ps;
+
+out_free_ps:
+ kfree(ps);
+out:
+ return NULL;
+}
+EXPORT_SYMBOL(padata_alloc_shell);
+
+/**
+ * padata_free_shell - free a padata shell
+ *
+ * @ps: padata shell to free
+ */
+void padata_free_shell(struct padata_shell *ps)
+{
+ struct padata_instance *pinst = ps->pinst;
+
+ mutex_lock(&pinst->lock);
+ list_del(&ps->list);
+ padata_free_pd(rcu_dereference_protected(ps->pd, 1));
+ mutex_unlock(&pinst->lock);
+
+ kfree(ps);
+}
+EXPORT_SYMBOL(padata_free_shell);
+
#ifdef CONFIG_HOTPLUG_CPU
static __init int padata_driver_init(void)
@@ -1061,17 +1056,24 @@ static __init int padata_driver_init(void)
int ret;
ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "padata:online",
- padata_cpu_online,
- padata_cpu_prep_down);
+ padata_cpu_online, NULL);
if (ret < 0)
return ret;
hp_online = ret;
+
+ ret = cpuhp_setup_state_multi(CPUHP_PADATA_DEAD, "padata:dead",
+ NULL, padata_cpu_dead);
+ if (ret < 0) {
+ cpuhp_remove_multi_state(hp_online);
+ return ret;
+ }
return 0;
}
module_init(padata_driver_init);
static __exit void padata_driver_exit(void)
{
+ cpuhp_remove_multi_state(CPUHP_PADATA_DEAD);
cpuhp_remove_multi_state(hp_online);
}
module_exit(padata_driver_exit);
diff --git a/kernel/pid.c b/kernel/pid.c
index 2278e249141d..0f4ecb57214c 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -578,3 +578,93 @@ void __init pid_idr_init(void)
init_pid_ns.pid_cachep = KMEM_CACHE(pid,
SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT);
}
+
+static struct file *__pidfd_fget(struct task_struct *task, int fd)
+{
+ struct file *file;
+ int ret;
+
+ ret = mutex_lock_killable(&task->signal->cred_guard_mutex);
+ if (ret)
+ return ERR_PTR(ret);
+
+ if (ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS))
+ file = fget_task(task, fd);
+ else
+ file = ERR_PTR(-EPERM);
+
+ mutex_unlock(&task->signal->cred_guard_mutex);
+
+ return file ?: ERR_PTR(-EBADF);
+}
+
+static int pidfd_getfd(struct pid *pid, int fd)
+{
+ struct task_struct *task;
+ struct file *file;
+ int ret;
+
+ task = get_pid_task(pid, PIDTYPE_PID);
+ if (!task)
+ return -ESRCH;
+
+ file = __pidfd_fget(task, fd);
+ put_task_struct(task);
+ if (IS_ERR(file))
+ return PTR_ERR(file);
+
+ ret = security_file_receive(file);
+ if (ret) {
+ fput(file);
+ return ret;
+ }
+
+ ret = get_unused_fd_flags(O_CLOEXEC);
+ if (ret < 0)
+ fput(file);
+ else
+ fd_install(ret, file);
+
+ return ret;
+}
+
+/**
+ * sys_pidfd_getfd() - Get a file descriptor from another process
+ *
+ * @pidfd: the pidfd file descriptor of the process
+ * @fd: the file descriptor number to get
+ * @flags: flags on how to get the fd (reserved)
+ *
+ * This syscall gets a copy of a file descriptor from another process
+ * based on the pidfd, and file descriptor number. It requires that
+ * the calling process has the ability to ptrace the process represented
+ * by the pidfd. The process which is having its file descriptor copied
+ * is otherwise unaffected.
+ *
+ * Return: On success, a cloexec file descriptor is returned.
+ * On error, a negative errno number will be returned.
+ */
+SYSCALL_DEFINE3(pidfd_getfd, int, pidfd, int, fd,
+ unsigned int, flags)
+{
+ struct pid *pid;
+ struct fd f;
+ int ret;
+
+ /* flags is currently unused - make sure it's unset */
+ if (flags)
+ return -EINVAL;
+
+ f = fdget(pidfd);
+ if (!f.file)
+ return -EBADF;
+
+ pid = pidfd_pid(f.file);
+ if (IS_ERR(pid))
+ ret = PTR_ERR(pid);
+ else
+ ret = pidfd_getfd(pid, fd);
+
+ fdput(f);
+ return ret;
+}
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 1ef6f75d92f1..fada22dc4ab6 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -2770,8 +2770,6 @@ void register_console(struct console *newcon)
* for us.
*/
logbuf_lock_irqsave(flags);
- console_seq = syslog_seq;
- console_idx = syslog_idx;
/*
* We're about to replay the log buffer. Only do this to the
* just-registered console to avoid excessive message spam to
@@ -2783,6 +2781,8 @@ void register_console(struct console *newcon)
*/
exclusive_console = newcon;
exclusive_console_stop_seq = console_seq;
+ console_seq = syslog_seq;
+ console_idx = syslog_idx;
logbuf_unlock_irqrestore(flags);
}
console_unlock();
diff --git a/kernel/profile.c b/kernel/profile.c
index 4b144b02ca5d..6f69a4195d56 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -442,18 +442,18 @@ static ssize_t prof_cpu_mask_proc_write(struct file *file,
return err;
}
-static const struct file_operations prof_cpu_mask_proc_fops = {
- .open = prof_cpu_mask_proc_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
- .write = prof_cpu_mask_proc_write,
+static const struct proc_ops prof_cpu_mask_proc_ops = {
+ .proc_open = prof_cpu_mask_proc_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_release = single_release,
+ .proc_write = prof_cpu_mask_proc_write,
};
void create_prof_cpu_mask(void)
{
/* create /proc/irq/prof_cpu_mask */
- proc_create("irq/prof_cpu_mask", 0600, NULL, &prof_cpu_mask_proc_fops);
+ proc_create("irq/prof_cpu_mask", 0600, NULL, &prof_cpu_mask_proc_ops);
}
/*
@@ -517,10 +517,10 @@ static ssize_t write_profile(struct file *file, const char __user *buf,
return count;
}
-static const struct file_operations proc_profile_operations = {
- .read = read_profile,
- .write = write_profile,
- .llseek = default_llseek,
+static const struct proc_ops profile_proc_ops = {
+ .proc_read = read_profile,
+ .proc_write = write_profile,
+ .proc_lseek = default_llseek,
};
int __ref create_proc_profile(void)
@@ -548,7 +548,7 @@ int __ref create_proc_profile(void)
err = 0;
#endif
entry = proc_create("profile", S_IWUSR | S_IRUGO,
- NULL, &proc_profile_operations);
+ NULL, &profile_proc_ops);
if (!entry)
goto err_state_onl;
proc_set_size(entry, (1 + prof_len) * sizeof(atomic_t));
diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
index 7644eda17d62..1cc940fef17c 100644
--- a/kernel/rcu/Kconfig
+++ b/kernel/rcu/Kconfig
@@ -7,7 +7,7 @@ menu "RCU Subsystem"
config TREE_RCU
bool
- default y if !PREEMPTION && SMP
+ default y if SMP
help
This option selects the RCU implementation that is
designed for very large SMP system with hundreds or
@@ -17,6 +17,7 @@ config TREE_RCU
config PREEMPT_RCU
bool
default y if PREEMPTION
+ select TREE_RCU
help
This option selects the RCU implementation that is
designed for very large SMP systems with hundreds or
@@ -78,7 +79,7 @@ config TASKS_RCU
user-mode execution as quiescent states.
config RCU_STALL_COMMON
- def_bool ( TREE_RCU || PREEMPT_RCU )
+ def_bool TREE_RCU
help
This option enables RCU CPU stall code that is common between
the TINY and TREE variants of RCU. The purpose is to allow
@@ -86,13 +87,13 @@ config RCU_STALL_COMMON
making these warnings mandatory for the tree variants.
config RCU_NEED_SEGCBLIST
- def_bool ( TREE_RCU || PREEMPT_RCU || TREE_SRCU )
+ def_bool ( TREE_RCU || TREE_SRCU )
config RCU_FANOUT
int "Tree-based hierarchical RCU fanout value"
range 2 64 if 64BIT
range 2 32 if !64BIT
- depends on (TREE_RCU || PREEMPT_RCU) && RCU_EXPERT
+ depends on TREE_RCU && RCU_EXPERT
default 64 if 64BIT
default 32 if !64BIT
help
@@ -112,7 +113,7 @@ config RCU_FANOUT_LEAF
int "Tree-based hierarchical RCU leaf-level fanout value"
range 2 64 if 64BIT
range 2 32 if !64BIT
- depends on (TREE_RCU || PREEMPT_RCU) && RCU_EXPERT
+ depends on TREE_RCU && RCU_EXPERT
default 16
help
This option controls the leaf-level fanout of hierarchical
@@ -187,7 +188,7 @@ config RCU_BOOST_DELAY
config RCU_NOCB_CPU
bool "Offload RCU callback processing from boot-selected CPUs"
- depends on TREE_RCU || PREEMPT_RCU
+ depends on TREE_RCU
depends on RCU_EXPERT || NO_HZ_FULL
default n
help
@@ -200,8 +201,8 @@ config RCU_NOCB_CPU
specified at boot time by the rcu_nocbs parameter. For each
such CPU, a kthread ("rcuox/N") will be created to invoke
callbacks, where the "N" is the CPU being offloaded, and where
- the "p" for RCU-preempt (PREEMPT kernels) and "s" for RCU-sched
- (!PREEMPT kernels). Nothing prevents this kthread from running
+ the "p" for RCU-preempt (PREEMPTION kernels) and "s" for RCU-sched
+ (!PREEMPTION kernels). Nothing prevents this kthread from running
on the specified CPUs, but (1) the kthreads may be preempted
between each callback, and (2) affinity or cgroups can be used
to force the kthreads to run on whatever set of CPUs is desired.
diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile
index 020e8b6a644b..82d5fba48b2f 100644
--- a/kernel/rcu/Makefile
+++ b/kernel/rcu/Makefile
@@ -9,6 +9,5 @@ obj-$(CONFIG_TINY_SRCU) += srcutiny.o
obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
obj-$(CONFIG_RCU_PERF_TEST) += rcuperf.o
obj-$(CONFIG_TREE_RCU) += tree.o
-obj-$(CONFIG_PREEMPT_RCU) += tree.o
obj-$(CONFIG_TINY_RCU) += tiny.o
obj-$(CONFIG_RCU_NEED_SEGCBLIST) += rcu_segcblist.o
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index ab504fbc76ca..05f936ed167a 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -198,33 +198,6 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head)
}
#endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */
-void kfree(const void *);
-
-/*
- * Reclaim the specified callback, either by invoking it (non-lazy case)
- * or freeing it directly (lazy case). Return true if lazy, false otherwise.
- */
-static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head)
-{
- rcu_callback_t f;
- unsigned long offset = (unsigned long)head->func;
-
- rcu_lock_acquire(&rcu_callback_map);
- if (__is_kfree_rcu_offset(offset)) {
- trace_rcu_invoke_kfree_callback(rn, head, offset);
- kfree((void *)head - offset);
- rcu_lock_release(&rcu_callback_map);
- return true;
- } else {
- trace_rcu_invoke_callback(rn, head);
- f = head->func;
- WRITE_ONCE(head->func, (rcu_callback_t)0L);
- f(head);
- rcu_lock_release(&rcu_callback_map);
- return false;
- }
-}
-
#ifdef CONFIG_RCU_STALL_COMMON
extern int rcu_cpu_stall_ftrace_dump;
@@ -281,7 +254,7 @@ void rcu_test_sync_prims(void);
*/
extern void resched_cpu(int cpu);
-#if defined(SRCU) || !defined(TINY_RCU)
+#if defined(CONFIG_SRCU) || !defined(CONFIG_TINY_RCU)
#include <linux/rcu_node_tree.h>
@@ -418,7 +391,7 @@ do { \
#define raw_lockdep_assert_held_rcu_node(p) \
lockdep_assert_held(&ACCESS_PRIVATE(p, lock))
-#endif /* #if defined(SRCU) || !defined(TINY_RCU) */
+#endif /* #if defined(CONFIG_SRCU) || !defined(CONFIG_TINY_RCU) */
#ifdef CONFIG_SRCU
void srcu_init(void);
@@ -454,7 +427,7 @@ enum rcutorture_type {
INVALID_RCU_FLAVOR
};
-#if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU)
+#if defined(CONFIG_TREE_RCU)
void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
unsigned long *gp_seq);
void do_trace_rcu_torture_read(const char *rcutorturename,
diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
index cbc87b804db9..5f4fd3b8777c 100644
--- a/kernel/rcu/rcu_segcblist.c
+++ b/kernel/rcu/rcu_segcblist.c
@@ -20,14 +20,10 @@ void rcu_cblist_init(struct rcu_cblist *rclp)
rclp->head = NULL;
rclp->tail = &rclp->head;
rclp->len = 0;
- rclp->len_lazy = 0;
}
/*
* Enqueue an rcu_head structure onto the specified callback list.
- * This function assumes that the callback is non-lazy because it
- * is intended for use by no-CBs CPUs, which do not distinguish
- * between lazy and non-lazy RCU callbacks.
*/
void rcu_cblist_enqueue(struct rcu_cblist *rclp, struct rcu_head *rhp)
{
@@ -54,7 +50,6 @@ void rcu_cblist_flush_enqueue(struct rcu_cblist *drclp,
else
drclp->tail = &drclp->head;
drclp->len = srclp->len;
- drclp->len_lazy = srclp->len_lazy;
if (!rhp) {
rcu_cblist_init(srclp);
} else {
@@ -62,16 +57,12 @@ void rcu_cblist_flush_enqueue(struct rcu_cblist *drclp,
srclp->head = rhp;
srclp->tail = &rhp->next;
WRITE_ONCE(srclp->len, 1);
- srclp->len_lazy = 0;
}
}
/*
* Dequeue the oldest rcu_head structure from the specified callback
- * list. This function assumes that the callback is non-lazy, but
- * the caller can later invoke rcu_cblist_dequeued_lazy() if it
- * finds otherwise (and if it cares about laziness). This allows
- * different users to have different ways of determining laziness.
+ * list.
*/
struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp)
{
@@ -161,7 +152,6 @@ void rcu_segcblist_init(struct rcu_segcblist *rsclp)
for (i = 0; i < RCU_CBLIST_NSEGS; i++)
rsclp->tails[i] = &rsclp->head;
rcu_segcblist_set_len(rsclp, 0);
- rsclp->len_lazy = 0;
rsclp->enabled = 1;
}
@@ -173,7 +163,6 @@ void rcu_segcblist_disable(struct rcu_segcblist *rsclp)
{
WARN_ON_ONCE(!rcu_segcblist_empty(rsclp));
WARN_ON_ONCE(rcu_segcblist_n_cbs(rsclp));
- WARN_ON_ONCE(rcu_segcblist_n_lazy_cbs(rsclp));
rsclp->enabled = 0;
}
@@ -253,11 +242,9 @@ bool rcu_segcblist_nextgp(struct rcu_segcblist *rsclp, unsigned long *lp)
* absolutely not OK for it to ever miss posting a callback.
*/
void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp,
- struct rcu_head *rhp, bool lazy)
+ struct rcu_head *rhp)
{
rcu_segcblist_inc_len(rsclp);
- if (lazy)
- rsclp->len_lazy++;
smp_mb(); /* Ensure counts are updated before callback is enqueued. */
rhp->next = NULL;
WRITE_ONCE(*rsclp->tails[RCU_NEXT_TAIL], rhp);
@@ -275,15 +262,13 @@ void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp,
* period. You have been warned.
*/
bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp,
- struct rcu_head *rhp, bool lazy)
+ struct rcu_head *rhp)
{
int i;
if (rcu_segcblist_n_cbs(rsclp) == 0)
return false;
rcu_segcblist_inc_len(rsclp);
- if (lazy)
- rsclp->len_lazy++;
smp_mb(); /* Ensure counts are updated before callback is entrained. */
rhp->next = NULL;
for (i = RCU_NEXT_TAIL; i > RCU_DONE_TAIL; i--)
@@ -307,8 +292,6 @@ bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp,
void rcu_segcblist_extract_count(struct rcu_segcblist *rsclp,
struct rcu_cblist *rclp)
{
- rclp->len_lazy += rsclp->len_lazy;
- rsclp->len_lazy = 0;
rclp->len = rcu_segcblist_xchg_len(rsclp, 0);
}
@@ -361,9 +344,7 @@ void rcu_segcblist_extract_pend_cbs(struct rcu_segcblist *rsclp,
void rcu_segcblist_insert_count(struct rcu_segcblist *rsclp,
struct rcu_cblist *rclp)
{
- rsclp->len_lazy += rclp->len_lazy;
rcu_segcblist_add_len(rsclp, rclp->len);
- rclp->len_lazy = 0;
rclp->len = 0;
}
diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h
index 815c2fdd3fcc..5c293afc07b8 100644
--- a/kernel/rcu/rcu_segcblist.h
+++ b/kernel/rcu/rcu_segcblist.h
@@ -15,15 +15,6 @@ static inline long rcu_cblist_n_cbs(struct rcu_cblist *rclp)
return READ_ONCE(rclp->len);
}
-/*
- * Account for the fact that a previously dequeued callback turned out
- * to be marked as lazy.
- */
-static inline void rcu_cblist_dequeued_lazy(struct rcu_cblist *rclp)
-{
- rclp->len_lazy--;
-}
-
void rcu_cblist_init(struct rcu_cblist *rclp);
void rcu_cblist_enqueue(struct rcu_cblist *rclp, struct rcu_head *rhp);
void rcu_cblist_flush_enqueue(struct rcu_cblist *drclp,
@@ -59,18 +50,6 @@ static inline long rcu_segcblist_n_cbs(struct rcu_segcblist *rsclp)
#endif
}
-/* Return number of lazy callbacks in segmented callback list. */
-static inline long rcu_segcblist_n_lazy_cbs(struct rcu_segcblist *rsclp)
-{
- return rsclp->len_lazy;
-}
-
-/* Return number of lazy callbacks in segmented callback list. */
-static inline long rcu_segcblist_n_nonlazy_cbs(struct rcu_segcblist *rsclp)
-{
- return rcu_segcblist_n_cbs(rsclp) - rsclp->len_lazy;
-}
-
/*
* Is the specified rcu_segcblist enabled, for example, not corresponding
* to an offline CPU?
@@ -106,9 +85,9 @@ struct rcu_head *rcu_segcblist_first_cb(struct rcu_segcblist *rsclp);
struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp);
bool rcu_segcblist_nextgp(struct rcu_segcblist *rsclp, unsigned long *lp);
void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp,
- struct rcu_head *rhp, bool lazy);
+ struct rcu_head *rhp);
bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp,
- struct rcu_head *rhp, bool lazy);
+ struct rcu_head *rhp);
void rcu_segcblist_extract_count(struct rcu_segcblist *rsclp,
struct rcu_cblist *rclp);
void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp,
diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
index 5f884d560384..da94b89cd531 100644
--- a/kernel/rcu/rcuperf.c
+++ b/kernel/rcu/rcuperf.c
@@ -86,6 +86,7 @@ torture_param(bool, shutdown, RCUPERF_SHUTDOWN,
"Shutdown at end of performance tests.");
torture_param(int, verbose, 1, "Enable verbose debugging printk()s");
torture_param(int, writer_holdoff, 0, "Holdoff (us) between GPs, zero to disable");
+torture_param(int, kfree_rcu_test, 0, "Do we run a kfree_rcu() perf test?");
static char *perf_type = "rcu";
module_param(perf_type, charp, 0444);
@@ -105,8 +106,8 @@ static atomic_t n_rcu_perf_writer_finished;
static wait_queue_head_t shutdown_wq;
static u64 t_rcu_perf_writer_started;
static u64 t_rcu_perf_writer_finished;
-static unsigned long b_rcu_perf_writer_started;
-static unsigned long b_rcu_perf_writer_finished;
+static unsigned long b_rcu_gp_test_started;
+static unsigned long b_rcu_gp_test_finished;
static DEFINE_PER_CPU(atomic_t, n_async_inflight);
#define MAX_MEAS 10000
@@ -378,10 +379,10 @@ rcu_perf_writer(void *arg)
if (atomic_inc_return(&n_rcu_perf_writer_started) >= nrealwriters) {
t_rcu_perf_writer_started = t;
if (gp_exp) {
- b_rcu_perf_writer_started =
+ b_rcu_gp_test_started =
cur_ops->exp_completed() / 2;
} else {
- b_rcu_perf_writer_started = cur_ops->get_gp_seq();
+ b_rcu_gp_test_started = cur_ops->get_gp_seq();
}
}
@@ -429,10 +430,10 @@ retry:
PERFOUT_STRING("Test complete");
t_rcu_perf_writer_finished = t;
if (gp_exp) {
- b_rcu_perf_writer_finished =
+ b_rcu_gp_test_finished =
cur_ops->exp_completed() / 2;
} else {
- b_rcu_perf_writer_finished =
+ b_rcu_gp_test_finished =
cur_ops->get_gp_seq();
}
if (shutdown) {
@@ -515,8 +516,8 @@ rcu_perf_cleanup(void)
t_rcu_perf_writer_finished -
t_rcu_perf_writer_started,
ngps,
- rcuperf_seq_diff(b_rcu_perf_writer_finished,
- b_rcu_perf_writer_started));
+ rcuperf_seq_diff(b_rcu_gp_test_finished,
+ b_rcu_gp_test_started));
for (i = 0; i < nrealwriters; i++) {
if (!writer_durations)
break;
@@ -584,6 +585,159 @@ rcu_perf_shutdown(void *arg)
return -EINVAL;
}
+/*
+ * kfree_rcu() performance tests: Start a kfree_rcu() loop on all CPUs for number
+ * of iterations and measure total time and number of GP for all iterations to complete.
+ */
+
+torture_param(int, kfree_nthreads, -1, "Number of threads running loops of kfree_rcu().");
+torture_param(int, kfree_alloc_num, 8000, "Number of allocations and frees done in an iteration.");
+torture_param(int, kfree_loops, 10, "Number of loops doing kfree_alloc_num allocations and frees.");
+
+static struct task_struct **kfree_reader_tasks;
+static int kfree_nrealthreads;
+static atomic_t n_kfree_perf_thread_started;
+static atomic_t n_kfree_perf_thread_ended;
+
+struct kfree_obj {
+ char kfree_obj[8];
+ struct rcu_head rh;
+};
+
+static int
+kfree_perf_thread(void *arg)
+{
+ int i, loop = 0;
+ long me = (long)arg;
+ struct kfree_obj *alloc_ptr;
+ u64 start_time, end_time;
+
+ VERBOSE_PERFOUT_STRING("kfree_perf_thread task started");
+ set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids));
+ set_user_nice(current, MAX_NICE);
+
+ start_time = ktime_get_mono_fast_ns();
+
+ if (atomic_inc_return(&n_kfree_perf_thread_started) >= kfree_nrealthreads) {
+ if (gp_exp)
+ b_rcu_gp_test_started = cur_ops->exp_completed() / 2;
+ else
+ b_rcu_gp_test_started = cur_ops->get_gp_seq();
+ }
+
+ do {
+ for (i = 0; i < kfree_alloc_num; i++) {
+ alloc_ptr = kmalloc(sizeof(struct kfree_obj), GFP_KERNEL);
+ if (!alloc_ptr)
+ return -ENOMEM;
+
+ kfree_rcu(alloc_ptr, rh);
+ }
+
+ cond_resched();
+ } while (!torture_must_stop() && ++loop < kfree_loops);
+
+ if (atomic_inc_return(&n_kfree_perf_thread_ended) >= kfree_nrealthreads) {
+ end_time = ktime_get_mono_fast_ns();
+
+ if (gp_exp)
+ b_rcu_gp_test_finished = cur_ops->exp_completed() / 2;
+ else
+ b_rcu_gp_test_finished = cur_ops->get_gp_seq();
+
+ pr_alert("Total time taken by all kfree'ers: %llu ns, loops: %d, batches: %ld\n",
+ (unsigned long long)(end_time - start_time), kfree_loops,
+ rcuperf_seq_diff(b_rcu_gp_test_finished, b_rcu_gp_test_started));
+ if (shutdown) {
+ smp_mb(); /* Assign before wake. */
+ wake_up(&shutdown_wq);
+ }
+ }
+
+ torture_kthread_stopping("kfree_perf_thread");
+ return 0;
+}
+
+static void
+kfree_perf_cleanup(void)
+{
+ int i;
+
+ if (torture_cleanup_begin())
+ return;
+
+ if (kfree_reader_tasks) {
+ for (i = 0; i < kfree_nrealthreads; i++)
+ torture_stop_kthread(kfree_perf_thread,
+ kfree_reader_tasks[i]);
+ kfree(kfree_reader_tasks);
+ }
+
+ torture_cleanup_end();
+}
+
+/*
+ * shutdown kthread. Just waits to be awakened, then shuts down system.
+ */
+static int
+kfree_perf_shutdown(void *arg)
+{
+ do {
+ wait_event(shutdown_wq,
+ atomic_read(&n_kfree_perf_thread_ended) >=
+ kfree_nrealthreads);
+ } while (atomic_read(&n_kfree_perf_thread_ended) < kfree_nrealthreads);
+
+ smp_mb(); /* Wake before output. */
+
+ kfree_perf_cleanup();
+ kernel_power_off();
+ return -EINVAL;
+}
+
+static int __init
+kfree_perf_init(void)
+{
+ long i;
+ int firsterr = 0;
+
+ kfree_nrealthreads = compute_real(kfree_nthreads);
+ /* Start up the kthreads. */
+ if (shutdown) {
+ init_waitqueue_head(&shutdown_wq);
+ firsterr = torture_create_kthread(kfree_perf_shutdown, NULL,
+ shutdown_task);
+ if (firsterr)
+ goto unwind;
+ schedule_timeout_uninterruptible(1);
+ }
+
+ kfree_reader_tasks = kcalloc(kfree_nrealthreads, sizeof(kfree_reader_tasks[0]),
+ GFP_KERNEL);
+ if (kfree_reader_tasks == NULL) {
+ firsterr = -ENOMEM;
+ goto unwind;
+ }
+
+ for (i = 0; i < kfree_nrealthreads; i++) {
+ firsterr = torture_create_kthread(kfree_perf_thread, (void *)i,
+ kfree_reader_tasks[i]);
+ if (firsterr)
+ goto unwind;
+ }
+
+ while (atomic_read(&n_kfree_perf_thread_started) < kfree_nrealthreads)
+ schedule_timeout_uninterruptible(1);
+
+ torture_init_end();
+ return 0;
+
+unwind:
+ torture_init_end();
+ kfree_perf_cleanup();
+ return firsterr;
+}
+
static int __init
rcu_perf_init(void)
{
@@ -616,6 +770,9 @@ rcu_perf_init(void)
if (cur_ops->init)
cur_ops->init();
+ if (kfree_rcu_test)
+ return kfree_perf_init();
+
nrealwriters = compute_real(nwriters);
nrealreaders = compute_real(nreaders);
atomic_set(&n_rcu_perf_reader_started, 0);
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index dee043feb71f..1aeecc165b21 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -1661,43 +1661,52 @@ static void rcu_torture_fwd_prog_cb(struct rcu_head *rhp)
struct rcu_fwd_cb {
struct rcu_head rh;
struct rcu_fwd_cb *rfc_next;
+ struct rcu_fwd *rfc_rfp;
int rfc_gps;
};
-static DEFINE_SPINLOCK(rcu_fwd_lock);
-static struct rcu_fwd_cb *rcu_fwd_cb_head;
-static struct rcu_fwd_cb **rcu_fwd_cb_tail = &rcu_fwd_cb_head;
-static long n_launders_cb;
-static unsigned long rcu_fwd_startat;
-static bool rcu_fwd_emergency_stop;
+
#define MAX_FWD_CB_JIFFIES (8 * HZ) /* Maximum CB test duration. */
#define MIN_FWD_CB_LAUNDERS 3 /* This many CB invocations to count. */
#define MIN_FWD_CBS_LAUNDERED 100 /* Number of counted CBs. */
#define FWD_CBS_HIST_DIV 10 /* Histogram buckets/second. */
+#define N_LAUNDERS_HIST (2 * MAX_FWD_CB_JIFFIES / (HZ / FWD_CBS_HIST_DIV))
+
struct rcu_launder_hist {
long n_launders;
unsigned long launder_gp_seq;
};
-#define N_LAUNDERS_HIST (2 * MAX_FWD_CB_JIFFIES / (HZ / FWD_CBS_HIST_DIV))
-static struct rcu_launder_hist n_launders_hist[N_LAUNDERS_HIST];
-static unsigned long rcu_launder_gp_seq_start;
-static void rcu_torture_fwd_cb_hist(void)
+struct rcu_fwd {
+ spinlock_t rcu_fwd_lock;
+ struct rcu_fwd_cb *rcu_fwd_cb_head;
+ struct rcu_fwd_cb **rcu_fwd_cb_tail;
+ long n_launders_cb;
+ unsigned long rcu_fwd_startat;
+ struct rcu_launder_hist n_launders_hist[N_LAUNDERS_HIST];
+ unsigned long rcu_launder_gp_seq_start;
+};
+
+struct rcu_fwd *rcu_fwds;
+bool rcu_fwd_emergency_stop;
+
+static void rcu_torture_fwd_cb_hist(struct rcu_fwd *rfp)
{
unsigned long gps;
unsigned long gps_old;
int i;
int j;
- for (i = ARRAY_SIZE(n_launders_hist) - 1; i > 0; i--)
- if (n_launders_hist[i].n_launders > 0)
+ for (i = ARRAY_SIZE(rfp->n_launders_hist) - 1; i > 0; i--)
+ if (rfp->n_launders_hist[i].n_launders > 0)
break;
pr_alert("%s: Callback-invocation histogram (duration %lu jiffies):",
- __func__, jiffies - rcu_fwd_startat);
- gps_old = rcu_launder_gp_seq_start;
+ __func__, jiffies - rfp->rcu_fwd_startat);
+ gps_old = rfp->rcu_launder_gp_seq_start;
for (j = 0; j <= i; j++) {
- gps = n_launders_hist[j].launder_gp_seq;
+ gps = rfp->n_launders_hist[j].launder_gp_seq;
pr_cont(" %ds/%d: %ld:%ld",
- j + 1, FWD_CBS_HIST_DIV, n_launders_hist[j].n_launders,
+ j + 1, FWD_CBS_HIST_DIV,
+ rfp->n_launders_hist[j].n_launders,
rcutorture_seq_diff(gps, gps_old));
gps_old = gps;
}
@@ -1711,26 +1720,27 @@ static void rcu_torture_fwd_cb_cr(struct rcu_head *rhp)
int i;
struct rcu_fwd_cb *rfcp = container_of(rhp, struct rcu_fwd_cb, rh);
struct rcu_fwd_cb **rfcpp;
+ struct rcu_fwd *rfp = rfcp->rfc_rfp;
rfcp->rfc_next = NULL;
rfcp->rfc_gps++;
- spin_lock_irqsave(&rcu_fwd_lock, flags);
- rfcpp = rcu_fwd_cb_tail;
- rcu_fwd_cb_tail = &rfcp->rfc_next;
+ spin_lock_irqsave(&rfp->rcu_fwd_lock, flags);
+ rfcpp = rfp->rcu_fwd_cb_tail;
+ rfp->rcu_fwd_cb_tail = &rfcp->rfc_next;
WRITE_ONCE(*rfcpp, rfcp);
- WRITE_ONCE(n_launders_cb, n_launders_cb + 1);
- i = ((jiffies - rcu_fwd_startat) / (HZ / FWD_CBS_HIST_DIV));
- if (i >= ARRAY_SIZE(n_launders_hist))
- i = ARRAY_SIZE(n_launders_hist) - 1;
- n_launders_hist[i].n_launders++;
- n_launders_hist[i].launder_gp_seq = cur_ops->get_gp_seq();
- spin_unlock_irqrestore(&rcu_fwd_lock, flags);
+ WRITE_ONCE(rfp->n_launders_cb, rfp->n_launders_cb + 1);
+ i = ((jiffies - rfp->rcu_fwd_startat) / (HZ / FWD_CBS_HIST_DIV));
+ if (i >= ARRAY_SIZE(rfp->n_launders_hist))
+ i = ARRAY_SIZE(rfp->n_launders_hist) - 1;
+ rfp->n_launders_hist[i].n_launders++;
+ rfp->n_launders_hist[i].launder_gp_seq = cur_ops->get_gp_seq();
+ spin_unlock_irqrestore(&rfp->rcu_fwd_lock, flags);
}
// Give the scheduler a chance, even on nohz_full CPUs.
static void rcu_torture_fwd_prog_cond_resched(unsigned long iter)
{
- if (IS_ENABLED(CONFIG_PREEMPT) && IS_ENABLED(CONFIG_NO_HZ_FULL)) {
+ if (IS_ENABLED(CONFIG_PREEMPTION) && IS_ENABLED(CONFIG_NO_HZ_FULL)) {
// Real call_rcu() floods hit userspace, so emulate that.
if (need_resched() || (iter & 0xfff))
schedule();
@@ -1744,23 +1754,23 @@ static void rcu_torture_fwd_prog_cond_resched(unsigned long iter)
* Free all callbacks on the rcu_fwd_cb_head list, either because the
* test is over or because we hit an OOM event.
*/
-static unsigned long rcu_torture_fwd_prog_cbfree(void)
+static unsigned long rcu_torture_fwd_prog_cbfree(struct rcu_fwd *rfp)
{
unsigned long flags;
unsigned long freed = 0;
struct rcu_fwd_cb *rfcp;
for (;;) {
- spin_lock_irqsave(&rcu_fwd_lock, flags);
- rfcp = rcu_fwd_cb_head;
+ spin_lock_irqsave(&rfp->rcu_fwd_lock, flags);
+ rfcp = rfp->rcu_fwd_cb_head;
if (!rfcp) {
- spin_unlock_irqrestore(&rcu_fwd_lock, flags);
+ spin_unlock_irqrestore(&rfp->rcu_fwd_lock, flags);
break;
}
- rcu_fwd_cb_head = rfcp->rfc_next;
- if (!rcu_fwd_cb_head)
- rcu_fwd_cb_tail = &rcu_fwd_cb_head;
- spin_unlock_irqrestore(&rcu_fwd_lock, flags);
+ rfp->rcu_fwd_cb_head = rfcp->rfc_next;
+ if (!rfp->rcu_fwd_cb_head)
+ rfp->rcu_fwd_cb_tail = &rfp->rcu_fwd_cb_head;
+ spin_unlock_irqrestore(&rfp->rcu_fwd_lock, flags);
kfree(rfcp);
freed++;
rcu_torture_fwd_prog_cond_resched(freed);
@@ -1774,7 +1784,8 @@ static unsigned long rcu_torture_fwd_prog_cbfree(void)
}
/* Carry out need_resched()/cond_resched() forward-progress testing. */
-static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries)
+static void rcu_torture_fwd_prog_nr(struct rcu_fwd *rfp,
+ int *tested, int *tested_tries)
{
unsigned long cver;
unsigned long dur;
@@ -1804,8 +1815,8 @@ static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries)
sd = cur_ops->stall_dur() + 1;
sd4 = (sd + fwd_progress_div - 1) / fwd_progress_div;
dur = sd4 + torture_random(&trs) % (sd - sd4);
- WRITE_ONCE(rcu_fwd_startat, jiffies);
- stopat = rcu_fwd_startat + dur;
+ WRITE_ONCE(rfp->rcu_fwd_startat, jiffies);
+ stopat = rfp->rcu_fwd_startat + dur;
while (time_before(jiffies, stopat) &&
!shutdown_time_arrived() &&
!READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) {
@@ -1840,7 +1851,7 @@ static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries)
}
/* Carry out call_rcu() forward-progress testing. */
-static void rcu_torture_fwd_prog_cr(void)
+static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp)
{
unsigned long cver;
unsigned long flags;
@@ -1864,23 +1875,23 @@ static void rcu_torture_fwd_prog_cr(void)
/* Loop continuously posting RCU callbacks. */
WRITE_ONCE(rcu_fwd_cb_nodelay, true);
cur_ops->sync(); /* Later readers see above write. */
- WRITE_ONCE(rcu_fwd_startat, jiffies);
- stopat = rcu_fwd_startat + MAX_FWD_CB_JIFFIES;
+ WRITE_ONCE(rfp->rcu_fwd_startat, jiffies);
+ stopat = rfp->rcu_fwd_startat + MAX_FWD_CB_JIFFIES;
n_launders = 0;
- n_launders_cb = 0;
+ rfp->n_launders_cb = 0; // Hoist initialization for multi-kthread
n_launders_sa = 0;
n_max_cbs = 0;
n_max_gps = 0;
- for (i = 0; i < ARRAY_SIZE(n_launders_hist); i++)
- n_launders_hist[i].n_launders = 0;
+ for (i = 0; i < ARRAY_SIZE(rfp->n_launders_hist); i++)
+ rfp->n_launders_hist[i].n_launders = 0;
cver = READ_ONCE(rcu_torture_current_version);
gps = cur_ops->get_gp_seq();
- rcu_launder_gp_seq_start = gps;
+ rfp->rcu_launder_gp_seq_start = gps;
tick_dep_set_task(current, TICK_DEP_BIT_RCU);
while (time_before(jiffies, stopat) &&
!shutdown_time_arrived() &&
!READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) {
- rfcp = READ_ONCE(rcu_fwd_cb_head);
+ rfcp = READ_ONCE(rfp->rcu_fwd_cb_head);
rfcpn = NULL;
if (rfcp)
rfcpn = READ_ONCE(rfcp->rfc_next);
@@ -1888,7 +1899,7 @@ static void rcu_torture_fwd_prog_cr(void)
if (rfcp->rfc_gps >= MIN_FWD_CB_LAUNDERS &&
++n_max_gps >= MIN_FWD_CBS_LAUNDERED)
break;
- rcu_fwd_cb_head = rfcpn;
+ rfp->rcu_fwd_cb_head = rfcpn;
n_launders++;
n_launders_sa++;
} else {
@@ -1900,6 +1911,7 @@ static void rcu_torture_fwd_prog_cr(void)
n_max_cbs++;
n_launders_sa = 0;
rfcp->rfc_gps = 0;
+ rfcp->rfc_rfp = rfp;
}
cur_ops->call(&rfcp->rh, rcu_torture_fwd_cb_cr);
rcu_torture_fwd_prog_cond_resched(n_launders + n_max_cbs);
@@ -1910,22 +1922,22 @@ static void rcu_torture_fwd_prog_cr(void)
}
}
stoppedat = jiffies;
- n_launders_cb_snap = READ_ONCE(n_launders_cb);
+ n_launders_cb_snap = READ_ONCE(rfp->n_launders_cb);
cver = READ_ONCE(rcu_torture_current_version) - cver;
gps = rcutorture_seq_diff(cur_ops->get_gp_seq(), gps);
cur_ops->cb_barrier(); /* Wait for callbacks to be invoked. */
- (void)rcu_torture_fwd_prog_cbfree();
+ (void)rcu_torture_fwd_prog_cbfree(rfp);
if (!torture_must_stop() && !READ_ONCE(rcu_fwd_emergency_stop) &&
!shutdown_time_arrived()) {
WARN_ON(n_max_gps < MIN_FWD_CBS_LAUNDERED);
pr_alert("%s Duration %lu barrier: %lu pending %ld n_launders: %ld n_launders_sa: %ld n_max_gps: %ld n_max_cbs: %ld cver %ld gps %ld\n",
__func__,
- stoppedat - rcu_fwd_startat, jiffies - stoppedat,
+ stoppedat - rfp->rcu_fwd_startat, jiffies - stoppedat,
n_launders + n_max_cbs - n_launders_cb_snap,
n_launders, n_launders_sa,
n_max_gps, n_max_cbs, cver, gps);
- rcu_torture_fwd_cb_hist();
+ rcu_torture_fwd_cb_hist(rfp);
}
schedule_timeout_uninterruptible(HZ); /* Let CBs drain. */
tick_dep_clear_task(current, TICK_DEP_BIT_RCU);
@@ -1940,20 +1952,22 @@ static void rcu_torture_fwd_prog_cr(void)
static int rcutorture_oom_notify(struct notifier_block *self,
unsigned long notused, void *nfreed)
{
+ struct rcu_fwd *rfp = rcu_fwds;
+
WARN(1, "%s invoked upon OOM during forward-progress testing.\n",
__func__);
- rcu_torture_fwd_cb_hist();
- rcu_fwd_progress_check(1 + (jiffies - READ_ONCE(rcu_fwd_startat)) / 2);
+ rcu_torture_fwd_cb_hist(rfp);
+ rcu_fwd_progress_check(1 + (jiffies - READ_ONCE(rfp->rcu_fwd_startat)) / 2);
WRITE_ONCE(rcu_fwd_emergency_stop, true);
smp_mb(); /* Emergency stop before free and wait to avoid hangs. */
pr_info("%s: Freed %lu RCU callbacks.\n",
- __func__, rcu_torture_fwd_prog_cbfree());
+ __func__, rcu_torture_fwd_prog_cbfree(rfp));
rcu_barrier();
pr_info("%s: Freed %lu RCU callbacks.\n",
- __func__, rcu_torture_fwd_prog_cbfree());
+ __func__, rcu_torture_fwd_prog_cbfree(rfp));
rcu_barrier();
pr_info("%s: Freed %lu RCU callbacks.\n",
- __func__, rcu_torture_fwd_prog_cbfree());
+ __func__, rcu_torture_fwd_prog_cbfree(rfp));
smp_mb(); /* Frees before return to avoid redoing OOM. */
(*(unsigned long *)nfreed)++; /* Forward progress CBs freed! */
pr_info("%s returning after OOM processing.\n", __func__);
@@ -1967,6 +1981,7 @@ static struct notifier_block rcutorture_oom_nb = {
/* Carry out grace-period forward-progress testing. */
static int rcu_torture_fwd_prog(void *args)
{
+ struct rcu_fwd *rfp = args;
int tested = 0;
int tested_tries = 0;
@@ -1978,8 +1993,8 @@ static int rcu_torture_fwd_prog(void *args)
schedule_timeout_interruptible(fwd_progress_holdoff * HZ);
WRITE_ONCE(rcu_fwd_emergency_stop, false);
register_oom_notifier(&rcutorture_oom_nb);
- rcu_torture_fwd_prog_nr(&tested, &tested_tries);
- rcu_torture_fwd_prog_cr();
+ rcu_torture_fwd_prog_nr(rfp, &tested, &tested_tries);
+ rcu_torture_fwd_prog_cr(rfp);
unregister_oom_notifier(&rcutorture_oom_nb);
/* Avoid slow periods, better to test when busy. */
@@ -1995,6 +2010,8 @@ static int rcu_torture_fwd_prog(void *args)
/* If forward-progress checking is requested and feasible, spawn the thread. */
static int __init rcu_torture_fwd_prog_init(void)
{
+ struct rcu_fwd *rfp;
+
if (!fwd_progress)
return 0; /* Not requested, so don't do it. */
if (!cur_ops->stall_dur || cur_ops->stall_dur() <= 0 ||
@@ -2013,8 +2030,12 @@ static int __init rcu_torture_fwd_prog_init(void)
fwd_progress_holdoff = 1;
if (fwd_progress_div <= 0)
fwd_progress_div = 4;
- return torture_create_kthread(rcu_torture_fwd_prog,
- NULL, fwd_prog_task);
+ rfp = kzalloc(sizeof(*rfp), GFP_KERNEL);
+ if (!rfp)
+ return -ENOMEM;
+ spin_lock_init(&rfp->rcu_fwd_lock);
+ rfp->rcu_fwd_cb_tail = &rfp->rcu_fwd_cb_head;
+ return torture_create_kthread(rcu_torture_fwd_prog, rfp, fwd_prog_task);
}
/* Callback function for RCU barrier testing. */
diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
index 44d6606b8325..6208c1dae5c9 100644
--- a/kernel/rcu/srcutiny.c
+++ b/kernel/rcu/srcutiny.c
@@ -103,7 +103,7 @@ EXPORT_SYMBOL_GPL(__srcu_read_unlock);
/*
* Workqueue handler to drive one grace period and invoke any callbacks
- * that become ready as a result. Single-CPU and !PREEMPT operation
+ * that become ready as a result. Single-CPU and !PREEMPTION operation
* means that we get away with murder on synchronization. ;-)
*/
void srcu_drive_gp(struct work_struct *wp)
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 5dffade2d7cd..657e6a7d1c03 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -530,7 +530,7 @@ static void srcu_gp_end(struct srcu_struct *ssp)
idx = rcu_seq_state(ssp->srcu_gp_seq);
WARN_ON_ONCE(idx != SRCU_STATE_SCAN2);
cbdelay = srcu_get_delay(ssp);
- ssp->srcu_last_gp_end = ktime_get_mono_fast_ns();
+ WRITE_ONCE(ssp->srcu_last_gp_end, ktime_get_mono_fast_ns());
rcu_seq_end(&ssp->srcu_gp_seq);
gpseq = rcu_seq_current(&ssp->srcu_gp_seq);
if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, gpseq))
@@ -762,6 +762,7 @@ static bool srcu_might_be_idle(struct srcu_struct *ssp)
unsigned long flags;
struct srcu_data *sdp;
unsigned long t;
+ unsigned long tlast;
/* If the local srcu_data structure has callbacks, not idle. */
local_irq_save(flags);
@@ -780,9 +781,9 @@ static bool srcu_might_be_idle(struct srcu_struct *ssp)
/* First, see if enough time has passed since the last GP. */
t = ktime_get_mono_fast_ns();
+ tlast = READ_ONCE(ssp->srcu_last_gp_end);
if (exp_holdoff == 0 ||
- time_in_range_open(t, ssp->srcu_last_gp_end,
- ssp->srcu_last_gp_end + exp_holdoff))
+ time_in_range_open(t, tlast, tlast + exp_holdoff))
return false; /* Too soon after last GP. */
/* Next, check for probable idleness. */
@@ -853,7 +854,7 @@ static void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp,
local_irq_save(flags);
sdp = this_cpu_ptr(ssp->sda);
spin_lock_rcu_node(sdp);
- rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp, false);
+ rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp);
rcu_segcblist_advance(&sdp->srcu_cblist,
rcu_seq_current(&ssp->srcu_gp_seq));
s = rcu_seq_snap(&ssp->srcu_gp_seq);
@@ -1052,7 +1053,7 @@ void srcu_barrier(struct srcu_struct *ssp)
sdp->srcu_barrier_head.func = srcu_barrier_cb;
debug_rcu_head_queue(&sdp->srcu_barrier_head);
if (!rcu_segcblist_entrain(&sdp->srcu_cblist,
- &sdp->srcu_barrier_head, 0)) {
+ &sdp->srcu_barrier_head)) {
debug_rcu_head_unqueue(&sdp->srcu_barrier_head);
atomic_dec(&ssp->srcu_barrier_cpu_cnt);
}
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index 477b4eb44af5..dd572ce7c747 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -22,6 +22,7 @@
#include <linux/time.h>
#include <linux/cpu.h>
#include <linux/prefetch.h>
+#include <linux/slab.h>
#include "rcu.h"
@@ -73,6 +74,31 @@ void rcu_sched_clock_irq(int user)
}
}
+/*
+ * Reclaim the specified callback, either by invoking it for non-kfree cases or
+ * freeing it directly (for kfree). Return true if kfreeing, false otherwise.
+ */
+static inline bool rcu_reclaim_tiny(struct rcu_head *head)
+{
+ rcu_callback_t f;
+ unsigned long offset = (unsigned long)head->func;
+
+ rcu_lock_acquire(&rcu_callback_map);
+ if (__is_kfree_rcu_offset(offset)) {
+ trace_rcu_invoke_kfree_callback("", head, offset);
+ kfree((void *)head - offset);
+ rcu_lock_release(&rcu_callback_map);
+ return true;
+ }
+
+ trace_rcu_invoke_callback("", head);
+ f = head->func;
+ WRITE_ONCE(head->func, (rcu_callback_t)0L);
+ f(head);
+ rcu_lock_release(&rcu_callback_map);
+ return false;
+}
+
/* Invoke the RCU callbacks whose grace period has elapsed. */
static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused)
{
@@ -100,7 +126,7 @@ static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused
prefetch(next);
debug_rcu_head_unqueue(list);
local_bh_disable();
- __rcu_reclaim("", list);
+ rcu_reclaim_tiny(list);
local_bh_enable();
list = next;
}
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 1694a6b57ad8..d91c9156fab2 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -43,7 +43,6 @@
#include <uapi/linux/sched/types.h>
#include <linux/prefetch.h>
#include <linux/delay.h>
-#include <linux/stop_machine.h>
#include <linux/random.h>
#include <linux/trace_events.h>
#include <linux/suspend.h>
@@ -55,6 +54,7 @@
#include <linux/oom.h>
#include <linux/smpboot.h>
#include <linux/jiffies.h>
+#include <linux/slab.h>
#include <linux/sched/isolation.h>
#include <linux/sched/clock.h>
#include "../time/tick-internal.h"
@@ -84,7 +84,7 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = {
.dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE,
.dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR),
};
-struct rcu_state rcu_state = {
+static struct rcu_state rcu_state = {
.level = { &rcu_state.node[0] },
.gp_state = RCU_GP_IDLE,
.gp_seq = (0UL - 300UL) << RCU_SEQ_CTR_SHIFT,
@@ -188,7 +188,7 @@ EXPORT_SYMBOL_GPL(rcu_get_gp_kthreads_prio);
* held, but the bit corresponding to the current CPU will be stable
* in most contexts.
*/
-unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp)
+static unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp)
{
return READ_ONCE(rnp->qsmaskinitnext);
}
@@ -294,7 +294,7 @@ static void rcu_dynticks_eqs_online(void)
*
* No ordering, as we are sampling CPU-local information.
*/
-bool rcu_dynticks_curr_cpu_in_eqs(void)
+static bool rcu_dynticks_curr_cpu_in_eqs(void)
{
struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
@@ -305,7 +305,7 @@ bool rcu_dynticks_curr_cpu_in_eqs(void)
* Snapshot the ->dynticks counter with full ordering so as to allow
* stable comparison of this counter with past and future snapshots.
*/
-int rcu_dynticks_snap(struct rcu_data *rdp)
+static int rcu_dynticks_snap(struct rcu_data *rdp)
{
int snap = atomic_add_return(0, &rdp->dynticks);
@@ -529,16 +529,6 @@ static struct rcu_node *rcu_get_root(void)
}
/*
- * Convert a ->gp_state value to a character string.
- */
-static const char *gp_state_getname(short gs)
-{
- if (gs < 0 || gs >= ARRAY_SIZE(gp_state_names))
- return "???";
- return gp_state_names[gs];
-}
-
-/*
* Send along grace-period-related data for rcutorture diagnostics.
*/
void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
@@ -577,7 +567,7 @@ static void rcu_eqs_enter(bool user)
}
lockdep_assert_irqs_disabled();
- trace_rcu_dyntick(TPS("Start"), rdp->dynticks_nesting, 0, rdp->dynticks);
+ trace_rcu_dyntick(TPS("Start"), rdp->dynticks_nesting, 0, atomic_read(&rdp->dynticks));
WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
rdp = this_cpu_ptr(&rcu_data);
do_nocb_deferred_wakeup(rdp);
@@ -650,14 +640,15 @@ static __always_inline void rcu_nmi_exit_common(bool irq)
* leave it in non-RCU-idle state.
*/
if (rdp->dynticks_nmi_nesting != 1) {
- trace_rcu_dyntick(TPS("--="), rdp->dynticks_nmi_nesting, rdp->dynticks_nmi_nesting - 2, rdp->dynticks);
+ trace_rcu_dyntick(TPS("--="), rdp->dynticks_nmi_nesting, rdp->dynticks_nmi_nesting - 2,
+ atomic_read(&rdp->dynticks));
WRITE_ONCE(rdp->dynticks_nmi_nesting, /* No store tearing. */
rdp->dynticks_nmi_nesting - 2);
return;
}
/* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */
- trace_rcu_dyntick(TPS("Startirq"), rdp->dynticks_nmi_nesting, 0, rdp->dynticks);
+ trace_rcu_dyntick(TPS("Startirq"), rdp->dynticks_nmi_nesting, 0, atomic_read(&rdp->dynticks));
WRITE_ONCE(rdp->dynticks_nmi_nesting, 0); /* Avoid store tearing. */
if (irq)
@@ -744,7 +735,7 @@ static void rcu_eqs_exit(bool user)
rcu_dynticks_task_exit();
rcu_dynticks_eqs_exit();
rcu_cleanup_after_idle();
- trace_rcu_dyntick(TPS("End"), rdp->dynticks_nesting, 1, rdp->dynticks);
+ trace_rcu_dyntick(TPS("End"), rdp->dynticks_nesting, 1, atomic_read(&rdp->dynticks));
WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
WRITE_ONCE(rdp->dynticks_nesting, 1);
WARN_ON_ONCE(rdp->dynticks_nmi_nesting);
@@ -800,8 +791,8 @@ void rcu_user_exit(void)
*/
static __always_inline void rcu_nmi_enter_common(bool irq)
{
- struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
long incby = 2;
+ struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
/* Complain about underflow. */
WARN_ON_ONCE(rdp->dynticks_nmi_nesting < 0);
@@ -828,12 +819,17 @@ static __always_inline void rcu_nmi_enter_common(bool irq)
} else if (tick_nohz_full_cpu(rdp->cpu) &&
rdp->dynticks_nmi_nesting == DYNTICK_IRQ_NONIDLE &&
READ_ONCE(rdp->rcu_urgent_qs) && !rdp->rcu_forced_tick) {
- rdp->rcu_forced_tick = true;
- tick_dep_set_cpu(rdp->cpu, TICK_DEP_BIT_RCU);
+ raw_spin_lock_rcu_node(rdp->mynode);
+ // Recheck under lock.
+ if (rdp->rcu_urgent_qs && !rdp->rcu_forced_tick) {
+ rdp->rcu_forced_tick = true;
+ tick_dep_set_cpu(rdp->cpu, TICK_DEP_BIT_RCU);
+ }
+ raw_spin_unlock_rcu_node(rdp->mynode);
}
trace_rcu_dyntick(incby == 1 ? TPS("Endirq") : TPS("++="),
rdp->dynticks_nmi_nesting,
- rdp->dynticks_nmi_nesting + incby, rdp->dynticks);
+ rdp->dynticks_nmi_nesting + incby, atomic_read(&rdp->dynticks));
WRITE_ONCE(rdp->dynticks_nmi_nesting, /* Prevent store tearing. */
rdp->dynticks_nmi_nesting + incby);
barrier();
@@ -898,6 +894,7 @@ void rcu_irq_enter_irqson(void)
*/
static void rcu_disable_urgency_upon_qs(struct rcu_data *rdp)
{
+ raw_lockdep_assert_held_rcu_node(rdp->mynode);
WRITE_ONCE(rdp->rcu_urgent_qs, false);
WRITE_ONCE(rdp->rcu_need_heavy_qs, false);
if (tick_nohz_full_cpu(rdp->cpu) && rdp->rcu_forced_tick) {
@@ -1934,7 +1931,7 @@ rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
struct rcu_node *rnp_p;
raw_lockdep_assert_held_rcu_node(rnp);
- if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPTION)) ||
+ if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT_RCU)) ||
WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)) ||
rnp->qsmask != 0) {
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
@@ -2146,7 +2143,6 @@ static void rcu_do_batch(struct rcu_data *rdp)
/* If no callbacks are ready, just return. */
if (!rcu_segcblist_ready_cbs(&rdp->cblist)) {
trace_rcu_batch_start(rcu_state.name,
- rcu_segcblist_n_lazy_cbs(&rdp->cblist),
rcu_segcblist_n_cbs(&rdp->cblist), 0);
trace_rcu_batch_end(rcu_state.name, 0,
!rcu_segcblist_empty(&rdp->cblist),
@@ -2168,7 +2164,6 @@ static void rcu_do_batch(struct rcu_data *rdp)
if (unlikely(bl > 100))
tlimit = local_clock() + rcu_resched_ns;
trace_rcu_batch_start(rcu_state.name,
- rcu_segcblist_n_lazy_cbs(&rdp->cblist),
rcu_segcblist_n_cbs(&rdp->cblist), bl);
rcu_segcblist_extract_done_cbs(&rdp->cblist, &rcl);
if (offloaded)
@@ -2179,9 +2174,19 @@ static void rcu_do_batch(struct rcu_data *rdp)
tick_dep_set_task(current, TICK_DEP_BIT_RCU);
rhp = rcu_cblist_dequeue(&rcl);
for (; rhp; rhp = rcu_cblist_dequeue(&rcl)) {
+ rcu_callback_t f;
+
debug_rcu_head_unqueue(rhp);
- if (__rcu_reclaim(rcu_state.name, rhp))
- rcu_cblist_dequeued_lazy(&rcl);
+
+ rcu_lock_acquire(&rcu_callback_map);
+ trace_rcu_invoke_callback(rcu_state.name, rhp);
+
+ f = rhp->func;
+ WRITE_ONCE(rhp->func, (rcu_callback_t)0L);
+ f(rhp);
+
+ rcu_lock_release(&rcu_callback_map);
+
/*
* Stop only if limit reached and CPU has something to do.
* Note: The rcl structure counts down from zero.
@@ -2294,7 +2299,7 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp))
mask = 0;
raw_spin_lock_irqsave_rcu_node(rnp, flags);
if (rnp->qsmask == 0) {
- if (!IS_ENABLED(CONFIG_PREEMPTION) ||
+ if (!IS_ENABLED(CONFIG_PREEMPT_RCU) ||
rcu_preempt_blocked_readers_cgp(rnp)) {
/*
* No point in scanning bits because they
@@ -2308,14 +2313,11 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp))
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
continue;
}
- for_each_leaf_node_possible_cpu(rnp, cpu) {
- unsigned long bit = leaf_node_cpu_bit(rnp, cpu);
- if ((rnp->qsmask & bit) != 0) {
- rdp = per_cpu_ptr(&rcu_data, cpu);
- if (f(rdp)) {
- mask |= bit;
- rcu_disable_urgency_upon_qs(rdp);
- }
+ for_each_leaf_node_cpu_mask(rnp, cpu, rnp->qsmask) {
+ rdp = per_cpu_ptr(&rcu_data, cpu);
+ if (f(rdp)) {
+ mask |= rdp->grpmask;
+ rcu_disable_urgency_upon_qs(rdp);
}
}
if (mask != 0) {
@@ -2474,8 +2476,8 @@ static void rcu_cpu_kthread(unsigned int cpu)
char work, *workp = this_cpu_ptr(&rcu_data.rcu_cpu_has_work);
int spincnt;
+ trace_rcu_utilization(TPS("Start CPU kthread@rcu_run"));
for (spincnt = 0; spincnt < 10; spincnt++) {
- trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait"));
local_bh_disable();
*statusp = RCU_KTHREAD_RUNNING;
local_irq_disable();
@@ -2583,7 +2585,7 @@ static void rcu_leak_callback(struct rcu_head *rhp)
* is expected to specify a CPU.
*/
static void
-__call_rcu(struct rcu_head *head, rcu_callback_t func, bool lazy)
+__call_rcu(struct rcu_head *head, rcu_callback_t func)
{
unsigned long flags;
struct rcu_data *rdp;
@@ -2618,18 +2620,17 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, bool lazy)
if (rcu_segcblist_empty(&rdp->cblist))
rcu_segcblist_init(&rdp->cblist);
}
+
if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags))
return; // Enqueued onto ->nocb_bypass, so just leave.
/* If we get here, rcu_nocb_try_bypass() acquired ->nocb_lock. */
- rcu_segcblist_enqueue(&rdp->cblist, head, lazy);
+ rcu_segcblist_enqueue(&rdp->cblist, head);
if (__is_kfree_rcu_offset((unsigned long)func))
trace_rcu_kfree_callback(rcu_state.name, head,
(unsigned long)func,
- rcu_segcblist_n_lazy_cbs(&rdp->cblist),
rcu_segcblist_n_cbs(&rdp->cblist));
else
trace_rcu_callback(rcu_state.name, head,
- rcu_segcblist_n_lazy_cbs(&rdp->cblist),
rcu_segcblist_n_cbs(&rdp->cblist));
/* Go handle any RCU core processing required. */
@@ -2679,28 +2680,230 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, bool lazy)
*/
void call_rcu(struct rcu_head *head, rcu_callback_t func)
{
- __call_rcu(head, func, 0);
+ __call_rcu(head, func);
}
EXPORT_SYMBOL_GPL(call_rcu);
+
+/* Maximum number of jiffies to wait before draining a batch. */
+#define KFREE_DRAIN_JIFFIES (HZ / 50)
+#define KFREE_N_BATCHES 2
+
+/**
+ * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests
+ * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period
+ * @head_free: List of kfree_rcu() objects waiting for a grace period
+ * @krcp: Pointer to @kfree_rcu_cpu structure
+ */
+
+struct kfree_rcu_cpu_work {
+ struct rcu_work rcu_work;
+ struct rcu_head *head_free;
+ struct kfree_rcu_cpu *krcp;
+};
+
+/**
+ * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period
+ * @head: List of kfree_rcu() objects not yet waiting for a grace period
+ * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period
+ * @lock: Synchronize access to this structure
+ * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES
+ * @monitor_todo: Tracks whether a @monitor_work delayed work is pending
+ * @initialized: The @lock and @rcu_work fields have been initialized
+ *
+ * This is a per-CPU structure. The reason that it is not included in
+ * the rcu_data structure is to permit this code to be extracted from
+ * the RCU files. Such extraction could allow further optimization of
+ * the interactions with the slab allocators.
+ */
+struct kfree_rcu_cpu {
+ struct rcu_head *head;
+ struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES];
+ spinlock_t lock;
+ struct delayed_work monitor_work;
+ bool monitor_todo;
+ bool initialized;
+};
+
+static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc);
+
+/*
+ * This function is invoked in workqueue context after a grace period.
+ * It frees all the objects queued on ->head_free.
+ */
+static void kfree_rcu_work(struct work_struct *work)
+{
+ unsigned long flags;
+ struct rcu_head *head, *next;
+ struct kfree_rcu_cpu *krcp;
+ struct kfree_rcu_cpu_work *krwp;
+
+ krwp = container_of(to_rcu_work(work),
+ struct kfree_rcu_cpu_work, rcu_work);
+ krcp = krwp->krcp;
+ spin_lock_irqsave(&krcp->lock, flags);
+ head = krwp->head_free;
+ krwp->head_free = NULL;
+ spin_unlock_irqrestore(&krcp->lock, flags);
+
+ // List "head" is now private, so traverse locklessly.
+ for (; head; head = next) {
+ unsigned long offset = (unsigned long)head->func;
+
+ next = head->next;
+ // Potentially optimize with kfree_bulk in future.
+ debug_rcu_head_unqueue(head);
+ rcu_lock_acquire(&rcu_callback_map);
+ trace_rcu_invoke_kfree_callback(rcu_state.name, head, offset);
+
+ if (!WARN_ON_ONCE(!__is_kfree_rcu_offset(offset))) {
+ /* Could be optimized with kfree_bulk() in future. */
+ kfree((void *)head - offset);
+ }
+
+ rcu_lock_release(&rcu_callback_map);
+ cond_resched_tasks_rcu_qs();
+ }
+}
+
/*
- * Queue an RCU callback for lazy invocation after a grace period.
- * This will likely be later named something like "call_rcu_lazy()",
- * but this change will require some way of tagging the lazy RCU
- * callbacks in the list of pending callbacks. Until then, this
- * function may only be called from __kfree_rcu().
+ * Schedule the kfree batch RCU work to run in workqueue context after a GP.
+ *
+ * This function is invoked by kfree_rcu_monitor() when the KFREE_DRAIN_JIFFIES
+ * timeout has been reached.
+ */
+static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp)
+{
+ int i;
+ struct kfree_rcu_cpu_work *krwp = NULL;
+
+ lockdep_assert_held(&krcp->lock);
+ for (i = 0; i < KFREE_N_BATCHES; i++)
+ if (!krcp->krw_arr[i].head_free) {
+ krwp = &(krcp->krw_arr[i]);
+ break;
+ }
+
+ // If a previous RCU batch is in progress, we cannot immediately
+ // queue another one, so return false to tell caller to retry.
+ if (!krwp)
+ return false;
+
+ krwp->head_free = krcp->head;
+ krcp->head = NULL;
+ INIT_RCU_WORK(&krwp->rcu_work, kfree_rcu_work);
+ queue_rcu_work(system_wq, &krwp->rcu_work);
+ return true;
+}
+
+static inline void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krcp,
+ unsigned long flags)
+{
+ // Attempt to start a new batch.
+ krcp->monitor_todo = false;
+ if (queue_kfree_rcu_work(krcp)) {
+ // Success! Our job is done here.
+ spin_unlock_irqrestore(&krcp->lock, flags);
+ return;
+ }
+
+ // Previous RCU batch still in progress, try again later.
+ krcp->monitor_todo = true;
+ schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES);
+ spin_unlock_irqrestore(&krcp->lock, flags);
+}
+
+/*
+ * This function is invoked after the KFREE_DRAIN_JIFFIES timeout.
+ * It invokes kfree_rcu_drain_unlock() to attempt to start another batch.
+ */
+static void kfree_rcu_monitor(struct work_struct *work)
+{
+ unsigned long flags;
+ struct kfree_rcu_cpu *krcp = container_of(work, struct kfree_rcu_cpu,
+ monitor_work.work);
+
+ spin_lock_irqsave(&krcp->lock, flags);
+ if (krcp->monitor_todo)
+ kfree_rcu_drain_unlock(krcp, flags);
+ else
+ spin_unlock_irqrestore(&krcp->lock, flags);
+}
+
+/*
+ * Queue a request for lazy invocation of kfree() after a grace period.
+ *
+ * Each kfree_call_rcu() request is added to a batch. The batch will be drained
+ * every KFREE_DRAIN_JIFFIES number of jiffies. All the objects in the batch
+ * will be kfree'd in workqueue context. This allows us to:
+ *
+ * 1. Batch requests together to reduce the number of grace periods during
+ * heavy kfree_rcu() load.
+ *
+ * 2. It makes it possible to use kfree_bulk() on a large number of
+ * kfree_rcu() requests thus reducing cache misses and the per-object
+ * overhead of kfree().
*/
void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
{
- __call_rcu(head, func, 1);
+ unsigned long flags;
+ struct kfree_rcu_cpu *krcp;
+
+ local_irq_save(flags); // For safely calling this_cpu_ptr().
+ krcp = this_cpu_ptr(&krc);
+ if (krcp->initialized)
+ spin_lock(&krcp->lock);
+
+ // Queue the object but don't yet schedule the batch.
+ if (debug_rcu_head_queue(head)) {
+ // Probable double kfree_rcu(), just leak.
+ WARN_ONCE(1, "%s(): Double-freed call. rcu_head %p\n",
+ __func__, head);
+ goto unlock_return;
+ }
+ head->func = func;
+ head->next = krcp->head;
+ krcp->head = head;
+
+ // Set timer to drain after KFREE_DRAIN_JIFFIES.
+ if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
+ !krcp->monitor_todo) {
+ krcp->monitor_todo = true;
+ schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES);
+ }
+
+unlock_return:
+ if (krcp->initialized)
+ spin_unlock(&krcp->lock);
+ local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(kfree_call_rcu);
+void __init kfree_rcu_scheduler_running(void)
+{
+ int cpu;
+ unsigned long flags;
+
+ for_each_online_cpu(cpu) {
+ struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
+
+ spin_lock_irqsave(&krcp->lock, flags);
+ if (!krcp->head || krcp->monitor_todo) {
+ spin_unlock_irqrestore(&krcp->lock, flags);
+ continue;
+ }
+ krcp->monitor_todo = true;
+ schedule_delayed_work_on(cpu, &krcp->monitor_work,
+ KFREE_DRAIN_JIFFIES);
+ spin_unlock_irqrestore(&krcp->lock, flags);
+ }
+}
+
/*
* During early boot, any blocking grace-period wait automatically
- * implies a grace period. Later on, this is never the case for PREEMPT.
+ * implies a grace period. Later on, this is never the case for PREEMPTION.
*
- * Howevr, because a context switch is a grace period for !PREEMPT, any
+ * Howevr, because a context switch is a grace period for !PREEMPTION, any
* blocking grace-period wait automatically implies a grace period if
* there is only one CPU online at any point time during execution of
* either synchronize_rcu() or synchronize_rcu_expedited(). It is OK to
@@ -2896,7 +3099,7 @@ static void rcu_barrier_func(void *unused)
debug_rcu_head_queue(&rdp->barrier_head);
rcu_nocb_lock(rdp);
WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies));
- if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head, 0)) {
+ if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head)) {
atomic_inc(&rcu_state.barrier_cpu_count);
} else {
debug_rcu_head_unqueue(&rdp->barrier_head);
@@ -3557,12 +3760,29 @@ static void __init rcu_dump_rcu_node_tree(void)
struct workqueue_struct *rcu_gp_wq;
struct workqueue_struct *rcu_par_gp_wq;
+static void __init kfree_rcu_batch_init(void)
+{
+ int cpu;
+ int i;
+
+ for_each_possible_cpu(cpu) {
+ struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
+
+ spin_lock_init(&krcp->lock);
+ for (i = 0; i < KFREE_N_BATCHES; i++)
+ krcp->krw_arr[i].krcp = krcp;
+ INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
+ krcp->initialized = true;
+ }
+}
+
void __init rcu_init(void)
{
int cpu;
rcu_early_boot_tests();
+ kfree_rcu_batch_init();
rcu_bootup_announce();
rcu_init_geometry();
rcu_init_one();
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 055c31781d3a..0c87e4c161c2 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -16,7 +16,6 @@
#include <linux/cpumask.h>
#include <linux/seqlock.h>
#include <linux/swait.h>
-#include <linux/stop_machine.h>
#include <linux/rcu_node_tree.h>
#include "rcu_segcblist.h"
@@ -182,8 +181,8 @@ struct rcu_data {
bool rcu_need_heavy_qs; /* GP old, so heavy quiescent state! */
bool rcu_urgent_qs; /* GP old need light quiescent state. */
bool rcu_forced_tick; /* Forced tick to provide QS. */
+ bool rcu_forced_tick_exp; /* ... provide QS to expedited GP. */
#ifdef CONFIG_RCU_FAST_NO_HZ
- bool all_lazy; /* All CPU's CBs lazy at idle start? */
unsigned long last_accelerate; /* Last jiffy CBs were accelerated. */
unsigned long last_advance_all; /* Last jiffy CBs were all advanced. */
int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */
@@ -368,18 +367,6 @@ struct rcu_state {
#define RCU_GP_CLEANUP 7 /* Grace-period cleanup started. */
#define RCU_GP_CLEANED 8 /* Grace-period cleanup complete. */
-static const char * const gp_state_names[] = {
- "RCU_GP_IDLE",
- "RCU_GP_WAIT_GPS",
- "RCU_GP_DONE_GPS",
- "RCU_GP_ONOFF",
- "RCU_GP_INIT",
- "RCU_GP_WAIT_FQS",
- "RCU_GP_DOING_FQS",
- "RCU_GP_CLEANUP",
- "RCU_GP_CLEANED",
-};
-
/*
* In order to export the rcu_state name to the tracing tools, it
* needs to be added in the __tracepoint_string section.
@@ -403,8 +390,6 @@ static const char *tp_rcu_varname __used __tracepoint_string = rcu_name;
#define RCU_NAME rcu_name
#endif /* #else #ifdef CONFIG_TRACING */
-int rcu_dynticks_snap(struct rcu_data *rdp);
-
/* Forward declarations for tree_plugin.h */
static void rcu_bootup_announce(void);
static void rcu_qs(void);
@@ -415,7 +400,6 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp);
static int rcu_print_task_exp_stall(struct rcu_node *rnp);
static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
static void rcu_flavor_sched_clock_irq(int user);
-void call_rcu(struct rcu_head *head, rcu_callback_t func);
static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck);
static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index d632cd019597..dcbd75791f39 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -21,7 +21,7 @@ static void rcu_exp_gp_seq_start(void)
}
/*
- * Return then value that expedited-grace-period counter will have
+ * Return the value that the expedited-grace-period counter will have
* at the end of the current grace period.
*/
static __maybe_unused unsigned long rcu_exp_gp_seq_endval(void)
@@ -39,7 +39,9 @@ static void rcu_exp_gp_seq_end(void)
}
/*
- * Take a snapshot of the expedited-grace-period counter.
+ * Take a snapshot of the expedited-grace-period counter, which is the
+ * earliest value that will indicate that a full grace period has
+ * elapsed since the current time.
*/
static unsigned long rcu_exp_gp_seq_snap(void)
{
@@ -134,7 +136,7 @@ static void __maybe_unused sync_exp_reset_tree(void)
rcu_for_each_node_breadth_first(rnp) {
raw_spin_lock_irqsave_rcu_node(rnp, flags);
WARN_ON_ONCE(rnp->expmask);
- rnp->expmask = rnp->expmaskinit;
+ WRITE_ONCE(rnp->expmask, rnp->expmaskinit);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
}
@@ -143,31 +145,26 @@ static void __maybe_unused sync_exp_reset_tree(void)
* Return non-zero if there is no RCU expedited grace period in progress
* for the specified rcu_node structure, in other words, if all CPUs and
* tasks covered by the specified rcu_node structure have done their bit
- * for the current expedited grace period. Works only for preemptible
- * RCU -- other RCU implementation use other means.
- *
- * Caller must hold the specificed rcu_node structure's ->lock
+ * for the current expedited grace period.
*/
-static bool sync_rcu_preempt_exp_done(struct rcu_node *rnp)
+static bool sync_rcu_exp_done(struct rcu_node *rnp)
{
raw_lockdep_assert_held_rcu_node(rnp);
-
return rnp->exp_tasks == NULL &&
READ_ONCE(rnp->expmask) == 0;
}
/*
- * Like sync_rcu_preempt_exp_done(), but this function assumes the caller
- * doesn't hold the rcu_node's ->lock, and will acquire and release the lock
- * itself
+ * Like sync_rcu_exp_done(), but where the caller does not hold the
+ * rcu_node's ->lock.
*/
-static bool sync_rcu_preempt_exp_done_unlocked(struct rcu_node *rnp)
+static bool sync_rcu_exp_done_unlocked(struct rcu_node *rnp)
{
unsigned long flags;
bool ret;
raw_spin_lock_irqsave_rcu_node(rnp, flags);
- ret = sync_rcu_preempt_exp_done(rnp);
+ ret = sync_rcu_exp_done(rnp);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
return ret;
@@ -181,8 +178,6 @@ static bool sync_rcu_preempt_exp_done_unlocked(struct rcu_node *rnp)
* which the task was queued or to one of that rcu_node structure's ancestors,
* recursively up the tree. (Calm down, calm down, we do the recursion
* iteratively!)
- *
- * Caller must hold the specified rcu_node structure's ->lock.
*/
static void __rcu_report_exp_rnp(struct rcu_node *rnp,
bool wake, unsigned long flags)
@@ -190,8 +185,9 @@ static void __rcu_report_exp_rnp(struct rcu_node *rnp,
{
unsigned long mask;
+ raw_lockdep_assert_held_rcu_node(rnp);
for (;;) {
- if (!sync_rcu_preempt_exp_done(rnp)) {
+ if (!sync_rcu_exp_done(rnp)) {
if (!rnp->expmask)
rcu_initiate_boost(rnp, flags);
else
@@ -211,7 +207,7 @@ static void __rcu_report_exp_rnp(struct rcu_node *rnp,
rnp = rnp->parent;
raw_spin_lock_rcu_node(rnp); /* irqs already disabled */
WARN_ON_ONCE(!(rnp->expmask & mask));
- rnp->expmask &= ~mask;
+ WRITE_ONCE(rnp->expmask, rnp->expmask & ~mask);
}
}
@@ -234,14 +230,23 @@ static void __maybe_unused rcu_report_exp_rnp(struct rcu_node *rnp, bool wake)
static void rcu_report_exp_cpu_mult(struct rcu_node *rnp,
unsigned long mask, bool wake)
{
+ int cpu;
unsigned long flags;
+ struct rcu_data *rdp;
raw_spin_lock_irqsave_rcu_node(rnp, flags);
if (!(rnp->expmask & mask)) {
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
return;
}
- rnp->expmask &= ~mask;
+ WRITE_ONCE(rnp->expmask, rnp->expmask & ~mask);
+ for_each_leaf_node_cpu_mask(rnp, cpu, mask) {
+ rdp = per_cpu_ptr(&rcu_data, cpu);
+ if (!IS_ENABLED(CONFIG_NO_HZ_FULL) || !rdp->rcu_forced_tick_exp)
+ continue;
+ rdp->rcu_forced_tick_exp = false;
+ tick_dep_clear_cpu(cpu, TICK_DEP_BIT_RCU_EXP);
+ }
__rcu_report_exp_rnp(rnp, wake, flags); /* Releases rnp->lock. */
}
@@ -345,8 +350,8 @@ static void sync_rcu_exp_select_node_cpus(struct work_struct *wp)
/* Each pass checks a CPU for identity, offline, and idle. */
mask_ofl_test = 0;
for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) {
- unsigned long mask = leaf_node_cpu_bit(rnp, cpu);
struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+ unsigned long mask = rdp->grpmask;
int snap;
if (raw_smp_processor_id() == cpu ||
@@ -372,12 +377,10 @@ static void sync_rcu_exp_select_node_cpus(struct work_struct *wp)
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
/* IPI the remaining CPUs for expedited quiescent state. */
- for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) {
- unsigned long mask = leaf_node_cpu_bit(rnp, cpu);
+ for_each_leaf_node_cpu_mask(rnp, cpu, mask_ofl_ipi) {
struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+ unsigned long mask = rdp->grpmask;
- if (!(mask_ofl_ipi & mask))
- continue;
retry_ipi:
if (rcu_dynticks_in_eqs_since(rdp, rdp->exp_dynticks_snap)) {
mask_ofl_test |= mask;
@@ -389,10 +392,10 @@ retry_ipi:
}
ret = smp_call_function_single(cpu, rcu_exp_handler, NULL, 0);
put_cpu();
- if (!ret) {
- mask_ofl_ipi &= ~mask;
+ /* The CPU will report the QS in response to the IPI. */
+ if (!ret)
continue;
- }
+
/* Failed, raced with CPU hotplug operation. */
raw_spin_lock_irqsave_rcu_node(rnp, flags);
if ((rnp->qsmaskinitnext & mask) &&
@@ -403,13 +406,12 @@ retry_ipi:
schedule_timeout_uninterruptible(1);
goto retry_ipi;
}
- /* CPU really is offline, so we can ignore it. */
- if (!(rnp->expmask & mask))
- mask_ofl_ipi &= ~mask;
+ /* CPU really is offline, so we must report its QS. */
+ if (rnp->expmask & mask)
+ mask_ofl_test |= mask;
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
/* Report quiescent states for those that went offline. */
- mask_ofl_test |= mask_ofl_ipi;
if (mask_ofl_test)
rcu_report_exp_cpu_mult(rnp, mask_ofl_test, false);
}
@@ -456,29 +458,61 @@ static void sync_rcu_exp_select_cpus(void)
flush_work(&rnp->rew.rew_work);
}
-static void synchronize_sched_expedited_wait(void)
+/*
+ * Wait for the expedited grace period to elapse, within time limit.
+ * If the time limit is exceeded without the grace period elapsing,
+ * return false, otherwise return true.
+ */
+static bool synchronize_rcu_expedited_wait_once(long tlimit)
+{
+ int t;
+ struct rcu_node *rnp_root = rcu_get_root();
+
+ t = swait_event_timeout_exclusive(rcu_state.expedited_wq,
+ sync_rcu_exp_done_unlocked(rnp_root),
+ tlimit);
+ // Workqueues should not be signaled.
+ if (t > 0 || sync_rcu_exp_done_unlocked(rnp_root))
+ return true;
+ WARN_ON(t < 0); /* workqueues should not be signaled. */
+ return false;
+}
+
+/*
+ * Wait for the expedited grace period to elapse, issuing any needed
+ * RCU CPU stall warnings along the way.
+ */
+static void synchronize_rcu_expedited_wait(void)
{
int cpu;
unsigned long jiffies_stall;
unsigned long jiffies_start;
unsigned long mask;
int ndetected;
+ struct rcu_data *rdp;
struct rcu_node *rnp;
struct rcu_node *rnp_root = rcu_get_root();
- int ret;
trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("startwait"));
jiffies_stall = rcu_jiffies_till_stall_check();
jiffies_start = jiffies;
+ if (IS_ENABLED(CONFIG_NO_HZ_FULL)) {
+ if (synchronize_rcu_expedited_wait_once(1))
+ return;
+ rcu_for_each_leaf_node(rnp) {
+ for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) {
+ rdp = per_cpu_ptr(&rcu_data, cpu);
+ if (rdp->rcu_forced_tick_exp)
+ continue;
+ rdp->rcu_forced_tick_exp = true;
+ tick_dep_set_cpu(cpu, TICK_DEP_BIT_RCU_EXP);
+ }
+ }
+ }
for (;;) {
- ret = swait_event_timeout_exclusive(
- rcu_state.expedited_wq,
- sync_rcu_preempt_exp_done_unlocked(rnp_root),
- jiffies_stall);
- if (ret > 0 || sync_rcu_preempt_exp_done_unlocked(rnp_root))
+ if (synchronize_rcu_expedited_wait_once(jiffies_stall))
return;
- WARN_ON(ret < 0); /* workqueues should not be signaled. */
if (rcu_cpu_stall_suppress)
continue;
panic_on_rcu_stall();
@@ -491,7 +525,7 @@ static void synchronize_sched_expedited_wait(void)
struct rcu_data *rdp;
mask = leaf_node_cpu_bit(rnp, cpu);
- if (!(rnp->expmask & mask))
+ if (!(READ_ONCE(rnp->expmask) & mask))
continue;
ndetected++;
rdp = per_cpu_ptr(&rcu_data, cpu);
@@ -503,17 +537,18 @@ static void synchronize_sched_expedited_wait(void)
}
pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n",
jiffies - jiffies_start, rcu_state.expedited_sequence,
- rnp_root->expmask, ".T"[!!rnp_root->exp_tasks]);
+ READ_ONCE(rnp_root->expmask),
+ ".T"[!!rnp_root->exp_tasks]);
if (ndetected) {
pr_err("blocking rcu_node structures:");
rcu_for_each_node_breadth_first(rnp) {
if (rnp == rnp_root)
continue; /* printed unconditionally */
- if (sync_rcu_preempt_exp_done_unlocked(rnp))
+ if (sync_rcu_exp_done_unlocked(rnp))
continue;
pr_cont(" l=%u:%d-%d:%#lx/%c",
rnp->level, rnp->grplo, rnp->grphi,
- rnp->expmask,
+ READ_ONCE(rnp->expmask),
".T"[!!rnp->exp_tasks]);
}
pr_cont("\n");
@@ -521,7 +556,7 @@ static void synchronize_sched_expedited_wait(void)
rcu_for_each_leaf_node(rnp) {
for_each_leaf_node_possible_cpu(rnp, cpu) {
mask = leaf_node_cpu_bit(rnp, cpu);
- if (!(rnp->expmask & mask))
+ if (!(READ_ONCE(rnp->expmask) & mask))
continue;
dump_cpu_task(cpu);
}
@@ -540,15 +575,14 @@ static void rcu_exp_wait_wake(unsigned long s)
{
struct rcu_node *rnp;
- synchronize_sched_expedited_wait();
- rcu_exp_gp_seq_end();
- trace_rcu_exp_grace_period(rcu_state.name, s, TPS("end"));
+ synchronize_rcu_expedited_wait();
- /*
- * Switch over to wakeup mode, allowing the next GP, but -only- the
- * next GP, to proceed.
- */
+ // Switch over to wakeup mode, allowing the next GP to proceed.
+ // End the previous grace period only after acquiring the mutex
+ // to ensure that only one GP runs concurrently with wakeups.
mutex_lock(&rcu_state.exp_wake_mutex);
+ rcu_exp_gp_seq_end();
+ trace_rcu_exp_grace_period(rcu_state.name, s, TPS("end"));
rcu_for_each_node_breadth_first(rnp) {
if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s)) {
@@ -559,7 +593,7 @@ static void rcu_exp_wait_wake(unsigned long s)
spin_unlock(&rnp->exp_lock);
}
smp_mb(); /* All above changes before wakeup. */
- wake_up_all(&rnp->exp_wq[rcu_seq_ctr(rcu_state.expedited_sequence) & 0x3]);
+ wake_up_all(&rnp->exp_wq[rcu_seq_ctr(s) & 0x3]);
}
trace_rcu_exp_grace_period(rcu_state.name, s, TPS("endwake"));
mutex_unlock(&rcu_state.exp_wake_mutex);
@@ -610,7 +644,7 @@ static void rcu_exp_handler(void *unused)
* critical section. If also enabled or idle, immediately
* report the quiescent state, otherwise defer.
*/
- if (!t->rcu_read_lock_nesting) {
+ if (!rcu_preempt_depth()) {
if (!(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK)) ||
rcu_dynticks_curr_cpu_in_eqs()) {
rcu_report_exp_rdp(rdp);
@@ -634,7 +668,7 @@ static void rcu_exp_handler(void *unused)
* can have caused this quiescent state to already have been
* reported, so we really do need to check ->expmask.
*/
- if (t->rcu_read_lock_nesting > 0) {
+ if (rcu_preempt_depth() > 0) {
raw_spin_lock_irqsave_rcu_node(rnp, flags);
if (rnp->expmask & rdp->grpmask) {
rdp->exp_deferred_qs = true;
@@ -670,7 +704,7 @@ static void rcu_exp_handler(void *unused)
}
}
-/* PREEMPT=y, so no PREEMPT=n expedited grace period to clean up after. */
+/* PREEMPTION=y, so no PREEMPTION=n expedited grace period to clean up after. */
static void sync_sched_exp_online_cleanup(int cpu)
{
}
@@ -785,7 +819,7 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp)
* implementations, it is still unfriendly to real-time workloads, so is
* thus not recommended for any sort of common-case code. In fact, if
* you are using synchronize_rcu_expedited() in a loop, please restructure
- * your code to batch your updates, and then Use a single synchronize_rcu()
+ * your code to batch your updates, and then use a single synchronize_rcu()
* instead.
*
* This has the same semantics as (but is more brutal than) synchronize_rcu().
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index fa08d55f7040..c6ea81cd4189 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -220,7 +220,7 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
* blocked tasks.
*/
if (!rnp->gp_tasks && (blkd_state & RCU_GP_BLKD)) {
- rnp->gp_tasks = &t->rcu_node_entry;
+ WRITE_ONCE(rnp->gp_tasks, &t->rcu_node_entry);
WARN_ON_ONCE(rnp->completedqs == rnp->gp_seq);
}
if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD))
@@ -290,8 +290,8 @@ void rcu_note_context_switch(bool preempt)
trace_rcu_utilization(TPS("Start context switch"));
lockdep_assert_irqs_disabled();
- WARN_ON_ONCE(!preempt && t->rcu_read_lock_nesting > 0);
- if (t->rcu_read_lock_nesting > 0 &&
+ WARN_ON_ONCE(!preempt && rcu_preempt_depth() > 0);
+ if (rcu_preempt_depth() > 0 &&
!t->rcu_read_unlock_special.b.blocked) {
/* Possibly blocking in an RCU read-side critical section. */
@@ -340,7 +340,7 @@ EXPORT_SYMBOL_GPL(rcu_note_context_switch);
*/
static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
{
- return rnp->gp_tasks != NULL;
+ return READ_ONCE(rnp->gp_tasks) != NULL;
}
/* Bias and limit values for ->rcu_read_lock_nesting. */
@@ -348,6 +348,21 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
#define RCU_NEST_NMAX (-INT_MAX / 2)
#define RCU_NEST_PMAX (INT_MAX / 2)
+static void rcu_preempt_read_enter(void)
+{
+ current->rcu_read_lock_nesting++;
+}
+
+static void rcu_preempt_read_exit(void)
+{
+ current->rcu_read_lock_nesting--;
+}
+
+static void rcu_preempt_depth_set(int val)
+{
+ current->rcu_read_lock_nesting = val;
+}
+
/*
* Preemptible RCU implementation for rcu_read_lock().
* Just increment ->rcu_read_lock_nesting, shared state will be updated
@@ -355,9 +370,9 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
*/
void __rcu_read_lock(void)
{
- current->rcu_read_lock_nesting++;
+ rcu_preempt_read_enter();
if (IS_ENABLED(CONFIG_PROVE_LOCKING))
- WARN_ON_ONCE(current->rcu_read_lock_nesting > RCU_NEST_PMAX);
+ WARN_ON_ONCE(rcu_preempt_depth() > RCU_NEST_PMAX);
barrier(); /* critical section after entry code. */
}
EXPORT_SYMBOL_GPL(__rcu_read_lock);
@@ -373,19 +388,19 @@ void __rcu_read_unlock(void)
{
struct task_struct *t = current;
- if (t->rcu_read_lock_nesting != 1) {
- --t->rcu_read_lock_nesting;
+ if (rcu_preempt_depth() != 1) {
+ rcu_preempt_read_exit();
} else {
barrier(); /* critical section before exit code. */
- t->rcu_read_lock_nesting = -RCU_NEST_BIAS;
+ rcu_preempt_depth_set(-RCU_NEST_BIAS);
barrier(); /* assign before ->rcu_read_unlock_special load */
if (unlikely(READ_ONCE(t->rcu_read_unlock_special.s)))
rcu_read_unlock_special(t);
barrier(); /* ->rcu_read_unlock_special load before assign */
- t->rcu_read_lock_nesting = 0;
+ rcu_preempt_depth_set(0);
}
if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
- int rrln = t->rcu_read_lock_nesting;
+ int rrln = rcu_preempt_depth();
WARN_ON_ONCE(rrln < 0 && rrln > RCU_NEST_NMAX);
}
@@ -444,15 +459,9 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
local_irq_restore(flags);
return;
}
- t->rcu_read_unlock_special.b.deferred_qs = false;
- if (special.b.need_qs) {
+ t->rcu_read_unlock_special.s = 0;
+ if (special.b.need_qs)
rcu_qs();
- t->rcu_read_unlock_special.b.need_qs = false;
- if (!t->rcu_read_unlock_special.s && !rdp->exp_deferred_qs) {
- local_irq_restore(flags);
- return;
- }
- }
/*
* Respond to a request by an expedited grace period for a
@@ -460,17 +469,11 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
* tasks are handled when removing the task from the
* blocked-tasks list below.
*/
- if (rdp->exp_deferred_qs) {
+ if (rdp->exp_deferred_qs)
rcu_report_exp_rdp(rdp);
- if (!t->rcu_read_unlock_special.s) {
- local_irq_restore(flags);
- return;
- }
- }
/* Clean up if blocked during RCU read-side critical section. */
if (special.b.blocked) {
- t->rcu_read_unlock_special.b.blocked = false;
/*
* Remove this task from the list it blocked on. The task
@@ -485,7 +488,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
empty_norm = !rcu_preempt_blocked_readers_cgp(rnp);
WARN_ON_ONCE(rnp->completedqs == rnp->gp_seq &&
(!empty_norm || rnp->qsmask));
- empty_exp = sync_rcu_preempt_exp_done(rnp);
+ empty_exp = sync_rcu_exp_done(rnp);
smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
np = rcu_next_node_entry(t, rnp);
list_del_init(&t->rcu_node_entry);
@@ -493,7 +496,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
trace_rcu_unlock_preempted_task(TPS("rcu_preempt"),
rnp->gp_seq, t->pid);
if (&t->rcu_node_entry == rnp->gp_tasks)
- rnp->gp_tasks = np;
+ WRITE_ONCE(rnp->gp_tasks, np);
if (&t->rcu_node_entry == rnp->exp_tasks)
rnp->exp_tasks = np;
if (IS_ENABLED(CONFIG_RCU_BOOST)) {
@@ -509,7 +512,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
* Note that rcu_report_unblock_qs_rnp() releases rnp->lock,
* so we must take a snapshot of the expedited state.
*/
- empty_exp_now = sync_rcu_preempt_exp_done(rnp);
+ empty_exp_now = sync_rcu_exp_done(rnp);
if (!empty_norm && !rcu_preempt_blocked_readers_cgp(rnp)) {
trace_rcu_quiescent_state_report(TPS("preempt_rcu"),
rnp->gp_seq,
@@ -551,7 +554,7 @@ static bool rcu_preempt_need_deferred_qs(struct task_struct *t)
{
return (__this_cpu_read(rcu_data.exp_deferred_qs) ||
READ_ONCE(t->rcu_read_unlock_special.s)) &&
- t->rcu_read_lock_nesting <= 0;
+ rcu_preempt_depth() <= 0;
}
/*
@@ -564,16 +567,16 @@ static bool rcu_preempt_need_deferred_qs(struct task_struct *t)
static void rcu_preempt_deferred_qs(struct task_struct *t)
{
unsigned long flags;
- bool couldrecurse = t->rcu_read_lock_nesting >= 0;
+ bool couldrecurse = rcu_preempt_depth() >= 0;
if (!rcu_preempt_need_deferred_qs(t))
return;
if (couldrecurse)
- t->rcu_read_lock_nesting -= RCU_NEST_BIAS;
+ rcu_preempt_depth_set(rcu_preempt_depth() - RCU_NEST_BIAS);
local_irq_save(flags);
rcu_preempt_deferred_qs_irqrestore(t, flags);
if (couldrecurse)
- t->rcu_read_lock_nesting += RCU_NEST_BIAS;
+ rcu_preempt_depth_set(rcu_preempt_depth() + RCU_NEST_BIAS);
}
/*
@@ -610,9 +613,8 @@ static void rcu_read_unlock_special(struct task_struct *t)
struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
struct rcu_node *rnp = rdp->mynode;
- t->rcu_read_unlock_special.b.exp_hint = false;
exp = (t->rcu_blocked_node && t->rcu_blocked_node->exp_tasks) ||
- (rdp->grpmask & rnp->expmask) ||
+ (rdp->grpmask & READ_ONCE(rnp->expmask)) ||
tick_nohz_full_cpu(rdp->cpu);
// Need to defer quiescent state until everything is enabled.
if (irqs_were_disabled && use_softirq &&
@@ -640,7 +642,6 @@ static void rcu_read_unlock_special(struct task_struct *t)
local_irq_restore(flags);
return;
}
- WRITE_ONCE(t->rcu_read_unlock_special.b.exp_hint, false);
rcu_preempt_deferred_qs_irqrestore(t, flags);
}
@@ -648,8 +649,7 @@ static void rcu_read_unlock_special(struct task_struct *t)
* Check that the list of blocked tasks for the newly completed grace
* period is in fact empty. It is a serious bug to complete a grace
* period that still has RCU readers blocked! This function must be
- * invoked -before- updating this rnp's ->gp_seq, and the rnp's ->lock
- * must be held by the caller.
+ * invoked -before- updating this rnp's ->gp_seq.
*
* Also, if there are blocked tasks on the list, they automatically
* block the newly created grace period, so set up ->gp_tasks accordingly.
@@ -659,11 +659,12 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
struct task_struct *t;
RCU_LOCKDEP_WARN(preemptible(), "rcu_preempt_check_blocked_tasks() invoked with preemption enabled!!!\n");
+ raw_lockdep_assert_held_rcu_node(rnp);
if (WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)))
dump_blkd_tasks(rnp, 10);
if (rcu_preempt_has_tasks(rnp) &&
(rnp->qsmaskinit || rnp->wait_blkd_tasks)) {
- rnp->gp_tasks = rnp->blkd_tasks.next;
+ WRITE_ONCE(rnp->gp_tasks, rnp->blkd_tasks.next);
t = container_of(rnp->gp_tasks, struct task_struct,
rcu_node_entry);
trace_rcu_unlock_preempted_task(TPS("rcu_preempt-GPS"),
@@ -686,7 +687,7 @@ static void rcu_flavor_sched_clock_irq(int user)
if (user || rcu_is_cpu_rrupt_from_idle()) {
rcu_note_voluntary_context_switch(current);
}
- if (t->rcu_read_lock_nesting > 0 ||
+ if (rcu_preempt_depth() > 0 ||
(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK))) {
/* No QS, force context switch if deferred. */
if (rcu_preempt_need_deferred_qs(t)) {
@@ -696,13 +697,13 @@ static void rcu_flavor_sched_clock_irq(int user)
} else if (rcu_preempt_need_deferred_qs(t)) {
rcu_preempt_deferred_qs(t); /* Report deferred QS. */
return;
- } else if (!t->rcu_read_lock_nesting) {
+ } else if (!rcu_preempt_depth()) {
rcu_qs(); /* Report immediate QS. */
return;
}
/* If GP is oldish, ask for help from rcu_read_unlock_special(). */
- if (t->rcu_read_lock_nesting > 0 &&
+ if (rcu_preempt_depth() > 0 &&
__this_cpu_read(rcu_data.core_needs_qs) &&
__this_cpu_read(rcu_data.cpu_no_qs.b.norm) &&
!t->rcu_read_unlock_special.b.need_qs &&
@@ -723,11 +724,11 @@ void exit_rcu(void)
struct task_struct *t = current;
if (unlikely(!list_empty(&current->rcu_node_entry))) {
- t->rcu_read_lock_nesting = 1;
+ rcu_preempt_depth_set(1);
barrier();
WRITE_ONCE(t->rcu_read_unlock_special.b.blocked, true);
- } else if (unlikely(t->rcu_read_lock_nesting)) {
- t->rcu_read_lock_nesting = 1;
+ } else if (unlikely(rcu_preempt_depth())) {
+ rcu_preempt_depth_set(1);
} else {
return;
}
@@ -757,7 +758,8 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck)
pr_info("%s: %d:%d ->qsmask %#lx ->qsmaskinit %#lx ->qsmaskinitnext %#lx\n",
__func__, rnp1->grplo, rnp1->grphi, rnp1->qsmask, rnp1->qsmaskinit, rnp1->qsmaskinitnext);
pr_info("%s: ->gp_tasks %p ->boost_tasks %p ->exp_tasks %p\n",
- __func__, rnp->gp_tasks, rnp->boost_tasks, rnp->exp_tasks);
+ __func__, READ_ONCE(rnp->gp_tasks), rnp->boost_tasks,
+ rnp->exp_tasks);
pr_info("%s: ->blkd_tasks", __func__);
i = 0;
list_for_each(lhp, &rnp->blkd_tasks) {
@@ -788,7 +790,7 @@ static void __init rcu_bootup_announce(void)
}
/*
- * Note a quiescent state for PREEMPT=n. Because we do not need to know
+ * Note a quiescent state for PREEMPTION=n. Because we do not need to know
* how many quiescent states passed, just if there was at least one since
* the start of the grace period, this just sets a flag. The caller must
* have disabled preemption.
@@ -838,7 +840,7 @@ void rcu_all_qs(void)
EXPORT_SYMBOL_GPL(rcu_all_qs);
/*
- * Note a PREEMPT=n context switch. The caller must have disabled interrupts.
+ * Note a PREEMPTION=n context switch. The caller must have disabled interrupts.
*/
void rcu_note_context_switch(bool preempt)
{
@@ -1262,10 +1264,9 @@ static void rcu_prepare_for_idle(void)
/*
* This code is invoked when a CPU goes idle, at which point we want
* to have the CPU do everything required for RCU so that it can enter
- * the energy-efficient dyntick-idle mode. This is handled by a
- * state machine implemented by rcu_prepare_for_idle() below.
+ * the energy-efficient dyntick-idle mode.
*
- * The following three proprocessor symbols control this state machine:
+ * The following preprocessor symbol controls this:
*
* RCU_IDLE_GP_DELAY gives the number of jiffies that a CPU is permitted
* to sleep in dyntick-idle mode with RCU callbacks pending. This
@@ -1274,21 +1275,15 @@ static void rcu_prepare_for_idle(void)
* number, be warned: Setting RCU_IDLE_GP_DELAY too high can hang your
* system. And if you are -that- concerned about energy efficiency,
* just power the system down and be done with it!
- * RCU_IDLE_LAZY_GP_DELAY gives the number of jiffies that a CPU is
- * permitted to sleep in dyntick-idle mode with only lazy RCU
- * callbacks pending. Setting this too high can OOM your system.
*
- * The values below work well in practice. If future workloads require
+ * The value below works well in practice. If future workloads require
* adjustment, they can be converted into kernel config parameters, though
* making the state machine smarter might be a better option.
*/
#define RCU_IDLE_GP_DELAY 4 /* Roughly one grace period. */
-#define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */
static int rcu_idle_gp_delay = RCU_IDLE_GP_DELAY;
module_param(rcu_idle_gp_delay, int, 0644);
-static int rcu_idle_lazy_gp_delay = RCU_IDLE_LAZY_GP_DELAY;
-module_param(rcu_idle_lazy_gp_delay, int, 0644);
/*
* Try to advance callbacks on the current CPU, but only if it has been
@@ -1327,8 +1322,7 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void)
/*
* Allow the CPU to enter dyntick-idle mode unless it has callbacks ready
* to invoke. If the CPU has callbacks, try to advance them. Tell the
- * caller to set the timeout based on whether or not there are non-lazy
- * callbacks.
+ * caller about what to set the timeout.
*
* The caller must have disabled interrupts.
*/
@@ -1354,25 +1348,18 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt)
}
rdp->last_accelerate = jiffies;
- /* Request timer delay depending on laziness, and round. */
- rdp->all_lazy = !rcu_segcblist_n_nonlazy_cbs(&rdp->cblist);
- if (rdp->all_lazy) {
- dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies;
- } else {
- dj = round_up(rcu_idle_gp_delay + jiffies,
- rcu_idle_gp_delay) - jiffies;
- }
+ /* Request timer and round. */
+ dj = round_up(rcu_idle_gp_delay + jiffies, rcu_idle_gp_delay) - jiffies;
+
*nextevt = basemono + dj * TICK_NSEC;
return 0;
}
/*
- * Prepare a CPU for idle from an RCU perspective. The first major task
- * is to sense whether nohz mode has been enabled or disabled via sysfs.
- * The second major task is to check to see if a non-lazy callback has
- * arrived at a CPU that previously had only lazy callbacks. The third
- * major task is to accelerate (that is, assign grace-period numbers to)
- * any recently arrived callbacks.
+ * Prepare a CPU for idle from an RCU perspective. The first major task is to
+ * sense whether nohz mode has been enabled or disabled via sysfs. The second
+ * major task is to accelerate (that is, assign grace-period numbers to) any
+ * recently arrived callbacks.
*
* The caller must have disabled interrupts.
*/
@@ -1399,17 +1386,6 @@ static void rcu_prepare_for_idle(void)
return;
/*
- * If a non-lazy callback arrived at a CPU having only lazy
- * callbacks, invoke RCU core for the side-effect of recalculating
- * idle duration on re-entry to idle.
- */
- if (rdp->all_lazy && rcu_segcblist_n_nonlazy_cbs(&rdp->cblist)) {
- rdp->all_lazy = false;
- invoke_rcu_core();
- return;
- }
-
- /*
* If we have not yet accelerated this jiffy, accelerate all
* callbacks on this CPU.
*/
@@ -2321,6 +2297,8 @@ static void __init rcu_organize_nocb_kthreads(void)
{
int cpu;
bool firsttime = true;
+ bool gotnocbs = false;
+ bool gotnocbscbs = true;
int ls = rcu_nocb_gp_stride;
int nl = 0; /* Next GP kthread. */
struct rcu_data *rdp;
@@ -2343,21 +2321,31 @@ static void __init rcu_organize_nocb_kthreads(void)
rdp = per_cpu_ptr(&rcu_data, cpu);
if (rdp->cpu >= nl) {
/* New GP kthread, set up for CBs & next GP. */
+ gotnocbs = true;
nl = DIV_ROUND_UP(rdp->cpu + 1, ls) * ls;
rdp->nocb_gp_rdp = rdp;
rdp_gp = rdp;
- if (!firsttime && dump_tree)
- pr_cont("\n");
- firsttime = false;
- pr_alert("%s: No-CB GP kthread CPU %d:", __func__, cpu);
+ if (dump_tree) {
+ if (!firsttime)
+ pr_cont("%s\n", gotnocbscbs
+ ? "" : " (self only)");
+ gotnocbscbs = false;
+ firsttime = false;
+ pr_alert("%s: No-CB GP kthread CPU %d:",
+ __func__, cpu);
+ }
} else {
/* Another CB kthread, link to previous GP kthread. */
+ gotnocbscbs = true;
rdp->nocb_gp_rdp = rdp_gp;
rdp_prev->nocb_next_cb_rdp = rdp;
- pr_alert(" %d", cpu);
+ if (dump_tree)
+ pr_cont(" %d", cpu);
}
rdp_prev = rdp;
}
+ if (gotnocbs && dump_tree)
+ pr_cont("%s\n", gotnocbscbs ? "" : " (self only)");
}
/*
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index c0b8c458d8a6..55f9b84790d3 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -163,7 +163,7 @@ static void rcu_iw_handler(struct irq_work *iwp)
//
// Printing RCU CPU stall warnings
-#ifdef CONFIG_PREEMPTION
+#ifdef CONFIG_PREEMPT_RCU
/*
* Dump detailed information for all tasks blocking the current RCU
@@ -215,7 +215,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp)
return ndetected;
}
-#else /* #ifdef CONFIG_PREEMPTION */
+#else /* #ifdef CONFIG_PREEMPT_RCU */
/*
* Because preemptible RCU does not exist, we never have to check for
@@ -233,7 +233,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp)
{
return 0;
}
-#endif /* #else #ifdef CONFIG_PREEMPTION */
+#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
/*
* Dump stacks of all tasks running on stalled CPUs. First try using
@@ -263,11 +263,9 @@ static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
{
struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
- sprintf(cp, "last_accelerate: %04lx/%04lx, Nonlazy posted: %c%c%c",
+ sprintf(cp, "last_accelerate: %04lx/%04lx dyntick_enabled: %d",
rdp->last_accelerate & 0xffff, jiffies & 0xffff,
- ".l"[rdp->all_lazy],
- ".L"[!rcu_segcblist_n_nonlazy_cbs(&rdp->cblist)],
- ".D"[!!rdp->tick_nohz_enabled_snap]);
+ !!rdp->tick_nohz_enabled_snap);
}
#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */
@@ -279,6 +277,28 @@ static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
#endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */
+static const char * const gp_state_names[] = {
+ [RCU_GP_IDLE] = "RCU_GP_IDLE",
+ [RCU_GP_WAIT_GPS] = "RCU_GP_WAIT_GPS",
+ [RCU_GP_DONE_GPS] = "RCU_GP_DONE_GPS",
+ [RCU_GP_ONOFF] = "RCU_GP_ONOFF",
+ [RCU_GP_INIT] = "RCU_GP_INIT",
+ [RCU_GP_WAIT_FQS] = "RCU_GP_WAIT_FQS",
+ [RCU_GP_DOING_FQS] = "RCU_GP_DOING_FQS",
+ [RCU_GP_CLEANUP] = "RCU_GP_CLEANUP",
+ [RCU_GP_CLEANED] = "RCU_GP_CLEANED",
+};
+
+/*
+ * Convert a ->gp_state value to a character string.
+ */
+static const char *gp_state_getname(short gs)
+{
+ if (gs < 0 || gs >= ARRAY_SIZE(gp_state_names))
+ return "???";
+ return gp_state_names[gs];
+}
+
/*
* Print out diagnostic information for the specified stalled CPU.
*
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 1861103662db..6c4b862f57d6 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -40,6 +40,7 @@
#include <linux/rcupdate_wait.h>
#include <linux/sched/isolation.h>
#include <linux/kprobes.h>
+#include <linux/slab.h>
#define CREATE_TRACE_POINTS
@@ -51,9 +52,7 @@
#define MODULE_PARAM_PREFIX "rcupdate."
#ifndef CONFIG_TINY_RCU
-extern int rcu_expedited; /* from sysctl */
module_param(rcu_expedited, int, 0);
-extern int rcu_normal; /* from sysctl */
module_param(rcu_normal, int, 0);
static int rcu_normal_after_boot;
module_param(rcu_normal_after_boot, int, 0);
@@ -218,6 +217,7 @@ static int __init rcu_set_runtime_mode(void)
{
rcu_test_sync_prims();
rcu_scheduler_active = RCU_SCHEDULER_RUNNING;
+ kfree_rcu_scheduler_running();
rcu_test_sync_prims();
return 0;
}
@@ -435,7 +435,7 @@ struct debug_obj_descr rcuhead_debug_descr = {
EXPORT_SYMBOL_GPL(rcuhead_debug_descr);
#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
-#if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE)
+#if defined(CONFIG_TREE_RCU) || defined(CONFIG_RCU_TRACE)
void do_trace_rcu_torture_read(const char *rcutorturename, struct rcu_head *rhp,
unsigned long secs,
unsigned long c_old, unsigned long c)
@@ -853,14 +853,22 @@ static void test_callback(struct rcu_head *r)
DEFINE_STATIC_SRCU(early_srcu);
+struct early_boot_kfree_rcu {
+ struct rcu_head rh;
+};
+
static void early_boot_test_call_rcu(void)
{
static struct rcu_head head;
static struct rcu_head shead;
+ struct early_boot_kfree_rcu *rhp;
call_rcu(&head, test_callback);
if (IS_ENABLED(CONFIG_SRCU))
call_srcu(&early_srcu, &shead, test_callback);
+ rhp = kmalloc(sizeof(*rhp), GFP_KERNEL);
+ if (!WARN_ON_ONCE(!rhp))
+ kfree_rcu(rhp, rh);
}
void rcu_early_boot_tests(void)
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index 1152259a4ca0..12bca64dff73 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -370,7 +370,7 @@ u64 sched_clock_cpu(int cpu)
if (sched_clock_stable())
return sched_clock() + __sched_clock_offset;
- if (!static_branch_unlikely(&sched_clock_running))
+ if (!static_branch_likely(&sched_clock_running))
return sched_clock();
preempt_disable_notrace();
@@ -393,7 +393,7 @@ void sched_clock_tick(void)
if (sched_clock_stable())
return;
- if (!static_branch_unlikely(&sched_clock_running))
+ if (!static_branch_likely(&sched_clock_running))
return;
lockdep_assert_irqs_disabled();
@@ -460,7 +460,7 @@ void __init sched_clock_init(void)
u64 sched_clock_cpu(int cpu)
{
- if (!static_branch_unlikely(&sched_clock_running))
+ if (!static_branch_likely(&sched_clock_running))
return 0;
return sched_clock();
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 90e4b00ace89..fc1dfc007604 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -919,17 +919,17 @@ uclamp_eff_get(struct task_struct *p, enum uclamp_id clamp_id)
return uc_req;
}
-unsigned int uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id)
+unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id)
{
struct uclamp_se uc_eff;
/* Task currently refcounted: use back-annotated (effective) value */
if (p->uclamp[clamp_id].active)
- return p->uclamp[clamp_id].value;
+ return (unsigned long)p->uclamp[clamp_id].value;
uc_eff = uclamp_eff_get(p, clamp_id);
- return uc_eff.value;
+ return (unsigned long)uc_eff.value;
}
/*
@@ -1253,7 +1253,8 @@ static void __init init_uclamp(void)
mutex_init(&uclamp_mutex);
for_each_possible_cpu(cpu) {
- memset(&cpu_rq(cpu)->uclamp, 0, sizeof(struct uclamp_rq));
+ memset(&cpu_rq(cpu)->uclamp, 0,
+ sizeof(struct uclamp_rq)*UCLAMP_CNT);
cpu_rq(cpu)->uclamp_flags = 0;
}
@@ -4504,7 +4505,7 @@ static inline int rt_effective_prio(struct task_struct *p, int prio)
void set_user_nice(struct task_struct *p, long nice)
{
bool queued, running;
- int old_prio, delta;
+ int old_prio;
struct rq_flags rf;
struct rq *rq;
@@ -4538,19 +4539,18 @@ void set_user_nice(struct task_struct *p, long nice)
set_load_weight(p, true);
old_prio = p->prio;
p->prio = effective_prio(p);
- delta = p->prio - old_prio;
- if (queued) {
+ if (queued)
enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
- /*
- * If the task increased its priority or is running and
- * lowered its priority, then reschedule its CPU:
- */
- if (delta < 0 || (delta > 0 && task_running(rq, p)))
- resched_curr(rq);
- }
if (running)
set_next_task(rq, p);
+
+ /*
+ * If the task increased its priority or is running and
+ * lowered its priority, then reschedule its CPU:
+ */
+ p->sched_class->prio_changed(rq, p, old_prio);
+
out_unlock:
task_rq_unlock(rq, p, &rf);
}
@@ -7100,6 +7100,12 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
if (parent)
sched_online_group(tg, parent);
+
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+ /* Propagate the effective uclamp value for the new group */
+ cpu_util_update_eff(css);
+#endif
+
return 0;
}
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 9b8916fd00a2..7fbaee24c824 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -238,7 +238,7 @@ unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs,
*/
util = util_cfs + cpu_util_rt(rq);
if (type == FREQUENCY_UTIL)
- util = uclamp_util_with(rq, util, p);
+ util = uclamp_rq_util_with(rq, util, p);
dl_util = cpu_util_dl(rq);
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index b7abca987d94..1a2719e1350a 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -46,6 +46,8 @@ static int convert_prio(int prio)
* @cp: The cpupri context
* @p: The task
* @lowest_mask: A mask to fill in with selected CPUs (or NULL)
+ * @fitness_fn: A pointer to a function to do custom checks whether the CPU
+ * fits a specific criteria so that we only return those CPUs.
*
* Note: This function returns the recommended CPUs as calculated during the
* current invocation. By the time the call returns, the CPUs may have in
@@ -57,7 +59,8 @@ static int convert_prio(int prio)
* Return: (int)bool - CPUs were found
*/
int cpupri_find(struct cpupri *cp, struct task_struct *p,
- struct cpumask *lowest_mask)
+ struct cpumask *lowest_mask,
+ bool (*fitness_fn)(struct task_struct *p, int cpu))
{
int idx = 0;
int task_pri = convert_prio(p->prio);
@@ -98,6 +101,8 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
continue;
if (lowest_mask) {
+ int cpu;
+
cpumask_and(lowest_mask, p->cpus_ptr, vec->mask);
/*
@@ -108,7 +113,23 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
* condition, simply act as though we never hit this
* priority level and continue on.
*/
- if (cpumask_any(lowest_mask) >= nr_cpu_ids)
+ if (cpumask_empty(lowest_mask))
+ continue;
+
+ if (!fitness_fn)
+ return 1;
+
+ /* Ensure the capacity of the CPUs fit the task */
+ for_each_cpu(cpu, lowest_mask) {
+ if (!fitness_fn(p, cpu))
+ cpumask_clear_cpu(cpu, lowest_mask);
+ }
+
+ /*
+ * If no CPU at the current priority can fit the task
+ * continue looking
+ */
+ if (cpumask_empty(lowest_mask))
continue;
}
diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h
index 7dc20a3232e7..32dd520db11f 100644
--- a/kernel/sched/cpupri.h
+++ b/kernel/sched/cpupri.h
@@ -18,7 +18,9 @@ struct cpupri {
};
#ifdef CONFIG_SMP
-int cpupri_find(struct cpupri *cp, struct task_struct *p, struct cpumask *lowest_mask);
+int cpupri_find(struct cpupri *cp, struct task_struct *p,
+ struct cpumask *lowest_mask,
+ bool (*fitness_fn)(struct task_struct *p, int cpu));
void cpupri_set(struct cpupri *cp, int cpu, int pri);
int cpupri_init(struct cpupri *cp);
void cpupri_cleanup(struct cpupri *cp);
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index d43318a489f2..cff3e656566d 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -355,7 +355,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
* softirq as those do not count in task exec_runtime any more.
*/
static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
- struct rq *rq, int ticks)
+ int ticks)
{
u64 other, cputime = TICK_NSEC * ticks;
@@ -381,7 +381,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
account_system_index_time(p, cputime, CPUTIME_SOFTIRQ);
} else if (user_tick) {
account_user_time(p, cputime);
- } else if (p == rq->idle) {
+ } else if (p == this_rq()->idle) {
account_idle_time(cputime);
} else if (p->flags & PF_VCPU) { /* System time or guest time */
account_guest_time(p, cputime);
@@ -392,14 +392,12 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
static void irqtime_account_idle_ticks(int ticks)
{
- struct rq *rq = this_rq();
-
- irqtime_account_process_tick(current, 0, rq, ticks);
+ irqtime_account_process_tick(current, 0, ticks);
}
#else /* CONFIG_IRQ_TIME_ACCOUNTING */
static inline void irqtime_account_idle_ticks(int ticks) { }
static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick,
- struct rq *rq, int nr_ticks) { }
+ int nr_ticks) { }
#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
/*
@@ -473,13 +471,12 @@ void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
void account_process_tick(struct task_struct *p, int user_tick)
{
u64 cputime, steal;
- struct rq *rq = this_rq();
if (vtime_accounting_enabled_this_cpu())
return;
if (sched_clock_irqtime) {
- irqtime_account_process_tick(p, user_tick, rq, 1);
+ irqtime_account_process_tick(p, user_tick, 1);
return;
}
@@ -493,7 +490,7 @@ void account_process_tick(struct task_struct *p, int user_tick)
if (user_tick)
account_user_time(p, cputime);
- else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
+ else if ((p != this_rq()->idle) || (irq_count() != HARDIRQ_OFFSET))
account_system_time(p, HARDIRQ_OFFSET, cputime);
else
account_idle_time(cputime);
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index f7e4579e746c..879d3ccf3806 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -751,9 +751,16 @@ void sysrq_sched_debug_show(void)
int cpu;
sched_debug_header(NULL);
- for_each_online_cpu(cpu)
+ for_each_online_cpu(cpu) {
+ /*
+ * Need to reset softlockup watchdogs on all CPUs, because
+ * another CPU might be blocked waiting for us to process
+ * an IPI or stop_machine.
+ */
+ touch_nmi_watchdog();
+ touch_all_softlockup_watchdogs();
print_cpu(NULL, cpu);
-
+ }
}
/*
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ba749f579714..fe4e0d775375 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -801,7 +801,7 @@ void post_init_entity_util_avg(struct task_struct *p)
* For !fair tasks do:
*
update_cfs_rq_load_avg(now, cfs_rq);
- attach_entity_load_avg(cfs_rq, se, 0);
+ attach_entity_load_avg(cfs_rq, se);
switched_from_fair(rq, p);
*
* such that the next switched_to_fair() has the
@@ -3114,7 +3114,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
{
struct rq *rq = rq_of(cfs_rq);
- if (&rq->cfs == cfs_rq || (flags & SCHED_CPUFREQ_MIGRATION)) {
+ if (&rq->cfs == cfs_rq) {
/*
* There are a few boundary cases this might miss but it should
* get called often enough that that should (hopefully) not be
@@ -3366,16 +3366,17 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf
runnable_load_sum = (s64)se_runnable(se) * runnable_sum;
runnable_load_avg = div_s64(runnable_load_sum, LOAD_AVG_MAX);
- delta_sum = runnable_load_sum - se_weight(se) * se->avg.runnable_load_sum;
- delta_avg = runnable_load_avg - se->avg.runnable_load_avg;
-
- se->avg.runnable_load_sum = runnable_sum;
- se->avg.runnable_load_avg = runnable_load_avg;
if (se->on_rq) {
+ delta_sum = runnable_load_sum -
+ se_weight(se) * se->avg.runnable_load_sum;
+ delta_avg = runnable_load_avg - se->avg.runnable_load_avg;
add_positive(&cfs_rq->avg.runnable_load_avg, delta_avg);
add_positive(&cfs_rq->avg.runnable_load_sum, delta_sum);
}
+
+ se->avg.runnable_load_sum = runnable_sum;
+ se->avg.runnable_load_avg = runnable_load_avg;
}
static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum)
@@ -3520,7 +3521,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
* Must call update_cfs_rq_load_avg() before this, since we rely on
* cfs_rq->avg.last_update_time being current.
*/
-static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
+static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
@@ -3556,7 +3557,7 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);
- cfs_rq_util_change(cfs_rq, flags);
+ cfs_rq_util_change(cfs_rq, 0);
trace_pelt_cfs_tp(cfs_rq);
}
@@ -3614,7 +3615,7 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
*
* IOW we're enqueueing a task on a new CPU.
*/
- attach_entity_load_avg(cfs_rq, se, SCHED_CPUFREQ_MIGRATION);
+ attach_entity_load_avg(cfs_rq, se);
update_tg_load_avg(cfs_rq, 0);
} else if (decayed) {
@@ -3711,6 +3712,20 @@ static inline unsigned long task_util_est(struct task_struct *p)
return max(task_util(p), _task_util_est(p));
}
+#ifdef CONFIG_UCLAMP_TASK
+static inline unsigned long uclamp_task_util(struct task_struct *p)
+{
+ return clamp(task_util_est(p),
+ uclamp_eff_value(p, UCLAMP_MIN),
+ uclamp_eff_value(p, UCLAMP_MAX));
+}
+#else
+static inline unsigned long uclamp_task_util(struct task_struct *p)
+{
+ return task_util_est(p);
+}
+#endif
+
static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
struct task_struct *p)
{
@@ -3822,7 +3837,7 @@ done:
static inline int task_fits_capacity(struct task_struct *p, long capacity)
{
- return fits_capacity(task_util_est(p), capacity);
+ return fits_capacity(uclamp_task_util(p), capacity);
}
static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
@@ -3857,7 +3872,7 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
static inline void remove_entity_load_avg(struct sched_entity *se) {}
static inline void
-attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) {}
+attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
static inline void
detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
@@ -5196,6 +5211,20 @@ static inline void update_overutilized_status(struct rq *rq)
static inline void update_overutilized_status(struct rq *rq) { }
#endif
+/* Runqueue only has SCHED_IDLE tasks enqueued */
+static int sched_idle_rq(struct rq *rq)
+{
+ return unlikely(rq->nr_running == rq->cfs.idle_h_nr_running &&
+ rq->nr_running);
+}
+
+#ifdef CONFIG_SMP
+static int sched_idle_cpu(int cpu)
+{
+ return sched_idle_rq(cpu_rq(cpu));
+}
+#endif
+
/*
* The enqueue_task method is called before nr_running is
* increased. Here we update the fair scheduling stats and
@@ -5310,6 +5339,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
struct sched_entity *se = &p->se;
int task_sleep = flags & DEQUEUE_SLEEP;
int idle_h_nr_running = task_has_idle_policy(p);
+ bool was_sched_idle = sched_idle_rq(rq);
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
@@ -5356,6 +5386,10 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (!se)
sub_nr_running(rq, 1);
+ /* balance early to pull high priority tasks */
+ if (unlikely(!was_sched_idle && sched_idle_rq(rq)))
+ rq->next_balance = jiffies;
+
util_est_dequeue(&rq->cfs, p, task_sleep);
hrtick_update(rq);
}
@@ -5378,15 +5412,6 @@ static struct {
#endif /* CONFIG_NO_HZ_COMMON */
-/* CPU only has SCHED_IDLE tasks enqueued */
-static int sched_idle_cpu(int cpu)
-{
- struct rq *rq = cpu_rq(cpu);
-
- return unlikely(rq->nr_running == rq->cfs.idle_h_nr_running &&
- rq->nr_running);
-}
-
static unsigned long cpu_load(struct rq *rq)
{
return cfs_rq_load_avg(&rq->cfs);
@@ -5588,7 +5613,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
unsigned int min_exit_latency = UINT_MAX;
u64 latest_idle_timestamp = 0;
int least_loaded_cpu = this_cpu;
- int shallowest_idle_cpu = -1, si_cpu = -1;
+ int shallowest_idle_cpu = -1;
int i;
/* Check if we have any choice: */
@@ -5597,6 +5622,9 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
/* Traverse only the allowed CPUs */
for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) {
+ if (sched_idle_cpu(i))
+ return i;
+
if (available_idle_cpu(i)) {
struct rq *rq = cpu_rq(i);
struct cpuidle_state *idle = idle_get_state(rq);
@@ -5619,12 +5647,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
latest_idle_timestamp = rq->idle_stamp;
shallowest_idle_cpu = i;
}
- } else if (shallowest_idle_cpu == -1 && si_cpu == -1) {
- if (sched_idle_cpu(i)) {
- si_cpu = i;
- continue;
- }
-
+ } else if (shallowest_idle_cpu == -1) {
load = cpu_load(cpu_rq(i));
if (load < min_load) {
min_load = load;
@@ -5633,11 +5656,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
}
}
- if (shallowest_idle_cpu != -1)
- return shallowest_idle_cpu;
- if (si_cpu != -1)
- return si_cpu;
- return least_loaded_cpu;
+ return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
}
static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p,
@@ -5790,7 +5809,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
*/
static int select_idle_smt(struct task_struct *p, int target)
{
- int cpu, si_cpu = -1;
+ int cpu;
if (!static_branch_likely(&sched_smt_present))
return -1;
@@ -5798,13 +5817,11 @@ static int select_idle_smt(struct task_struct *p, int target)
for_each_cpu(cpu, cpu_smt_mask(target)) {
if (!cpumask_test_cpu(cpu, p->cpus_ptr))
continue;
- if (available_idle_cpu(cpu))
+ if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
return cpu;
- if (si_cpu == -1 && sched_idle_cpu(cpu))
- si_cpu = cpu;
}
- return si_cpu;
+ return -1;
}
#else /* CONFIG_SCHED_SMT */
@@ -5828,12 +5845,13 @@ static inline int select_idle_smt(struct task_struct *p, int target)
*/
static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
{
+ struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
struct sched_domain *this_sd;
u64 avg_cost, avg_idle;
u64 time, cost;
s64 delta;
int this = smp_processor_id();
- int cpu, nr = INT_MAX, si_cpu = -1;
+ int cpu, nr = INT_MAX;
this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
if (!this_sd)
@@ -5859,15 +5877,13 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
time = cpu_clock(this);
- for_each_cpu_wrap(cpu, sched_domain_span(sd), target) {
+ cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
+
+ for_each_cpu_wrap(cpu, cpus, target) {
if (!--nr)
- return si_cpu;
- if (!cpumask_test_cpu(cpu, p->cpus_ptr))
- continue;
- if (available_idle_cpu(cpu))
+ return -1;
+ if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
break;
- if (si_cpu == -1 && sched_idle_cpu(cpu))
- si_cpu = cpu;
}
time = cpu_clock(this) - time;
@@ -6268,9 +6284,18 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
if (!cpumask_test_cpu(cpu, p->cpus_ptr))
continue;
- /* Skip CPUs that will be overutilized. */
util = cpu_util_next(cpu, p, cpu);
cpu_cap = capacity_of(cpu);
+ spare_cap = cpu_cap - util;
+
+ /*
+ * Skip CPUs that cannot satisfy the capacity request.
+ * IOW, placing the task there would make the CPU
+ * overutilized. Take uclamp into account to see how
+ * much capacity we can get out of the CPU; this is
+ * aligned with schedutil_cpu_util().
+ */
+ util = uclamp_rq_util_with(cpu_rq(cpu), util, p);
if (!fits_capacity(util, cpu_cap))
continue;
@@ -6285,7 +6310,6 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
* Find the CPU with the maximum spare capacity in
* the performance domain
*/
- spare_cap = cpu_cap - util;
if (spare_cap > max_spare_cap) {
max_spare_cap = spare_cap;
max_spare_cap_cpu = cpu;
@@ -7780,29 +7804,11 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
*/
for_each_cpu(cpu, sched_group_span(sdg)) {
- struct sched_group_capacity *sgc;
- struct rq *rq = cpu_rq(cpu);
+ unsigned long cpu_cap = capacity_of(cpu);
- /*
- * build_sched_domains() -> init_sched_groups_capacity()
- * gets here before we've attached the domains to the
- * runqueues.
- *
- * Use capacity_of(), which is set irrespective of domains
- * in update_cpu_capacity().
- *
- * This avoids capacity from being 0 and
- * causing divide-by-zero issues on boot.
- */
- if (unlikely(!rq->sd)) {
- capacity += capacity_of(cpu);
- } else {
- sgc = rq->sd->groups->sgc;
- capacity += sgc->capacity;
- }
-
- min_capacity = min(capacity, min_capacity);
- max_capacity = max(capacity, max_capacity);
+ capacity += cpu_cap;
+ min_capacity = min(cpu_cap, min_capacity);
+ max_capacity = max(cpu_cap, max_capacity);
}
} else {
/*
@@ -8168,14 +8174,18 @@ static bool update_sd_pick_busiest(struct lb_env *env,
case group_has_spare:
/*
- * Select not overloaded group with lowest number of
- * idle cpus. We could also compare the spare capacity
- * which is more stable but it can end up that the
- * group has less spare capacity but finally more idle
+ * Select not overloaded group with lowest number of idle cpus
+ * and highest number of running tasks. We could also compare
+ * the spare capacity which is more stable but it can end up
+ * that the group has less spare capacity but finally more idle
* CPUs which means less opportunity to pull tasks.
*/
- if (sgs->idle_cpus >= busiest->idle_cpus)
+ if (sgs->idle_cpus > busiest->idle_cpus)
+ return false;
+ else if ((sgs->idle_cpus == busiest->idle_cpus) &&
+ (sgs->sum_nr_running <= busiest->sum_nr_running))
return false;
+
break;
}
@@ -9529,6 +9539,7 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
{
int continue_balancing = 1;
int cpu = rq->cpu;
+ int busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
unsigned long interval;
struct sched_domain *sd;
/* Earliest time when we have to do rebalance again */
@@ -9565,7 +9576,7 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
break;
}
- interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
+ interval = get_sd_balance_interval(sd, busy);
need_serialize = sd->flags & SD_SERIALIZE;
if (need_serialize) {
@@ -9581,9 +9592,10 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
* state even if we migrated tasks. Update it.
*/
idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
+ busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
}
sd->last_balance = jiffies;
- interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
+ interval = get_sd_balance_interval(sd, busy);
}
if (need_serialize)
spin_unlock(&balancing);
@@ -10333,6 +10345,9 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
if (!task_on_rq_queued(p))
return;
+ if (rq->cfs.nr_running == 1)
+ return;
+
/*
* Reschedule if we are currently running on this runqueue and
* our priority decreased, or if we are not currently running on
@@ -10423,7 +10438,7 @@ static void attach_entity_cfs_rq(struct sched_entity *se)
/* Synchronize entity with its cfs_rq */
update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
- attach_entity_load_avg(cfs_rq, se, 0);
+ attach_entity_load_avg(cfs_rq, se);
update_tg_load_avg(cfs_rq, false);
propagate_entity_cfs_rq(se);
}
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index ffa959e91227..b743bf38f08f 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -158,7 +158,7 @@ static void cpuidle_idle_call(void)
/*
* Suspend-to-idle ("s2idle") is a system state in which all user space
* has been frozen, all I/O devices have been suspended and the only
- * activity happens here and in iterrupts (if any). In that case bypass
+ * activity happens here and in interrupts (if any). In that case bypass
* the cpuidle governor and go stratight for the deepest idle state
* available. Possibly also suspend the local tick and the entire
* timekeeping to prevent timer interrupts from kicking us out of idle
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
index a96db50d40e0..bd006b79b360 100644
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c
@@ -129,8 +129,20 @@ accumulate_sum(u64 delta, struct sched_avg *sa,
* Step 2
*/
delta %= 1024;
- contrib = __accumulate_pelt_segments(periods,
- 1024 - sa->period_contrib, delta);
+ if (load) {
+ /*
+ * This relies on the:
+ *
+ * if (!load)
+ * runnable = running = 0;
+ *
+ * clause from ___update_load_sum(); this results in
+ * the below usage of @contrib to dissapear entirely,
+ * so no point in calculating it.
+ */
+ contrib = __accumulate_pelt_segments(periods,
+ 1024 - sa->period_contrib, delta);
+ }
}
sa->period_contrib = delta;
@@ -205,7 +217,9 @@ ___update_load_sum(u64 now, struct sched_avg *sa,
* This means that weight will be 0 but not running for a sched_entity
* but also for a cfs_rq if the latter becomes idle. As an example,
* this happens during idle_balance() which calls
- * update_blocked_averages()
+ * update_blocked_averages().
+ *
+ * Also see the comment in accumulate_sum().
*/
if (!load)
runnable = running = 0;
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index ce8f6748678a..ac4bd0ca11cc 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -1251,39 +1251,41 @@ static int psi_fop_release(struct inode *inode, struct file *file)
return single_release(inode, file);
}
-static const struct file_operations psi_io_fops = {
- .open = psi_io_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .write = psi_io_write,
- .poll = psi_fop_poll,
- .release = psi_fop_release,
+static const struct proc_ops psi_io_proc_ops = {
+ .proc_open = psi_io_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_write = psi_io_write,
+ .proc_poll = psi_fop_poll,
+ .proc_release = psi_fop_release,
};
-static const struct file_operations psi_memory_fops = {
- .open = psi_memory_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .write = psi_memory_write,
- .poll = psi_fop_poll,
- .release = psi_fop_release,
+static const struct proc_ops psi_memory_proc_ops = {
+ .proc_open = psi_memory_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_write = psi_memory_write,
+ .proc_poll = psi_fop_poll,
+ .proc_release = psi_fop_release,
};
-static const struct file_operations psi_cpu_fops = {
- .open = psi_cpu_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .write = psi_cpu_write,
- .poll = psi_fop_poll,
- .release = psi_fop_release,
+static const struct proc_ops psi_cpu_proc_ops = {
+ .proc_open = psi_cpu_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_write = psi_cpu_write,
+ .proc_poll = psi_fop_poll,
+ .proc_release = psi_fop_release,
};
static int __init psi_proc_init(void)
{
- proc_mkdir("pressure", NULL);
- proc_create("pressure/io", 0, NULL, &psi_io_fops);
- proc_create("pressure/memory", 0, NULL, &psi_memory_fops);
- proc_create("pressure/cpu", 0, NULL, &psi_cpu_fops);
+ if (psi_enable) {
+ proc_mkdir("pressure", NULL);
+ proc_create("pressure/io", 0, NULL, &psi_io_proc_ops);
+ proc_create("pressure/memory", 0, NULL, &psi_memory_proc_ops);
+ proc_create("pressure/cpu", 0, NULL, &psi_cpu_proc_ops);
+ }
return 0;
}
module_init(psi_proc_init);
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index e591d40fd645..4043abe45459 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -437,6 +437,45 @@ static inline int on_rt_rq(struct sched_rt_entity *rt_se)
return rt_se->on_rq;
}
+#ifdef CONFIG_UCLAMP_TASK
+/*
+ * Verify the fitness of task @p to run on @cpu taking into account the uclamp
+ * settings.
+ *
+ * This check is only important for heterogeneous systems where uclamp_min value
+ * is higher than the capacity of a @cpu. For non-heterogeneous system this
+ * function will always return true.
+ *
+ * The function will return true if the capacity of the @cpu is >= the
+ * uclamp_min and false otherwise.
+ *
+ * Note that uclamp_min will be clamped to uclamp_max if uclamp_min
+ * > uclamp_max.
+ */
+static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu)
+{
+ unsigned int min_cap;
+ unsigned int max_cap;
+ unsigned int cpu_cap;
+
+ /* Only heterogeneous systems can benefit from this check */
+ if (!static_branch_unlikely(&sched_asym_cpucapacity))
+ return true;
+
+ min_cap = uclamp_eff_value(p, UCLAMP_MIN);
+ max_cap = uclamp_eff_value(p, UCLAMP_MAX);
+
+ cpu_cap = capacity_orig_of(cpu);
+
+ return cpu_cap >= min(min_cap, max_cap);
+}
+#else
+static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu)
+{
+ return true;
+}
+#endif
+
#ifdef CONFIG_RT_GROUP_SCHED
static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
@@ -1391,6 +1430,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
{
struct task_struct *curr;
struct rq *rq;
+ bool test;
/* For anything but wake ups, just return the task_cpu */
if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
@@ -1422,10 +1462,16 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
*
* This test is optimistic, if we get it wrong the load-balancer
* will have to sort it out.
+ *
+ * We take into account the capacity of the CPU to ensure it fits the
+ * requirement of the task - which is only important on heterogeneous
+ * systems like big.LITTLE.
*/
- if (curr && unlikely(rt_task(curr)) &&
- (curr->nr_cpus_allowed < 2 ||
- curr->prio <= p->prio)) {
+ test = curr &&
+ unlikely(rt_task(curr)) &&
+ (curr->nr_cpus_allowed < 2 || curr->prio <= p->prio);
+
+ if (test || !rt_task_fits_capacity(p, cpu)) {
int target = find_lowest_rq(p);
/*
@@ -1449,15 +1495,15 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
* let's hope p can move out.
*/
if (rq->curr->nr_cpus_allowed == 1 ||
- !cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
+ !cpupri_find(&rq->rd->cpupri, rq->curr, NULL, NULL))
return;
/*
* p is migratable, so let's not schedule it and
* see if it is pushed or pulled somewhere else.
*/
- if (p->nr_cpus_allowed != 1
- && cpupri_find(&rq->rd->cpupri, p, NULL))
+ if (p->nr_cpus_allowed != 1 &&
+ cpupri_find(&rq->rd->cpupri, p, NULL, NULL))
return;
/*
@@ -1601,7 +1647,8 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
{
if (!task_running(rq, p) &&
- cpumask_test_cpu(cpu, p->cpus_ptr))
+ cpumask_test_cpu(cpu, p->cpus_ptr) &&
+ rt_task_fits_capacity(p, cpu))
return 1;
return 0;
@@ -1643,7 +1690,8 @@ static int find_lowest_rq(struct task_struct *task)
if (task->nr_cpus_allowed == 1)
return -1; /* No other targets possible */
- if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
+ if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask,
+ rt_task_fits_capacity))
return -1; /* No targets found */
/*
@@ -2147,12 +2195,14 @@ skip:
*/
static void task_woken_rt(struct rq *rq, struct task_struct *p)
{
- if (!task_running(rq, p) &&
- !test_tsk_need_resched(rq->curr) &&
- p->nr_cpus_allowed > 1 &&
- (dl_task(rq->curr) || rt_task(rq->curr)) &&
- (rq->curr->nr_cpus_allowed < 2 ||
- rq->curr->prio <= p->prio))
+ bool need_to_push = !task_running(rq, p) &&
+ !test_tsk_need_resched(rq->curr) &&
+ p->nr_cpus_allowed > 1 &&
+ (dl_task(rq->curr) || rt_task(rq->curr)) &&
+ (rq->curr->nr_cpus_allowed < 2 ||
+ rq->curr->prio <= p->prio);
+
+ if (need_to_push || !rt_task_fits_capacity(p, cpu_of(rq)))
push_rt_tasks(rq);
}
@@ -2224,7 +2274,10 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
*/
if (task_on_rq_queued(p) && rq->curr != p) {
#ifdef CONFIG_SMP
- if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
+ bool need_to_push = rq->rt.overloaded ||
+ !rt_task_fits_capacity(p, cpu_of(rq));
+
+ if (p->nr_cpus_allowed > 1 && need_to_push)
rt_queue_push_tasks(rq);
#endif /* CONFIG_SMP */
if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq)))
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 280a3c735935..1a88dc8ad11b 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2300,14 +2300,14 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
#endif /* CONFIG_CPU_FREQ */
#ifdef CONFIG_UCLAMP_TASK
-unsigned int uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id);
+unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id);
static __always_inline
-unsigned int uclamp_util_with(struct rq *rq, unsigned int util,
- struct task_struct *p)
+unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util,
+ struct task_struct *p)
{
- unsigned int min_util = READ_ONCE(rq->uclamp[UCLAMP_MIN].value);
- unsigned int max_util = READ_ONCE(rq->uclamp[UCLAMP_MAX].value);
+ unsigned long min_util = READ_ONCE(rq->uclamp[UCLAMP_MIN].value);
+ unsigned long max_util = READ_ONCE(rq->uclamp[UCLAMP_MAX].value);
if (p) {
min_util = max(min_util, uclamp_eff_value(p, UCLAMP_MIN));
@@ -2324,18 +2324,10 @@ unsigned int uclamp_util_with(struct rq *rq, unsigned int util,
return clamp(util, min_util, max_util);
}
-
-static inline unsigned int uclamp_util(struct rq *rq, unsigned int util)
-{
- return uclamp_util_with(rq, util, NULL);
-}
#else /* CONFIG_UCLAMP_TASK */
-static inline unsigned int uclamp_util_with(struct rq *rq, unsigned int util,
- struct task_struct *p)
-{
- return util;
-}
-static inline unsigned int uclamp_util(struct rq *rq, unsigned int util)
+static inline
+unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util,
+ struct task_struct *p)
{
return util;
}
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 6ec1e595b1d4..dfb64c08a407 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1880,6 +1880,42 @@ static struct sched_domain *build_sched_domain(struct sched_domain_topology_leve
}
/*
+ * Ensure topology masks are sane, i.e. there are no conflicts (overlaps) for
+ * any two given CPUs at this (non-NUMA) topology level.
+ */
+static bool topology_span_sane(struct sched_domain_topology_level *tl,
+ const struct cpumask *cpu_map, int cpu)
+{
+ int i;
+
+ /* NUMA levels are allowed to overlap */
+ if (tl->flags & SDTL_OVERLAP)
+ return true;
+
+ /*
+ * Non-NUMA levels cannot partially overlap - they must be either
+ * completely equal or completely disjoint. Otherwise we can end up
+ * breaking the sched_group lists - i.e. a later get_group() pass
+ * breaks the linking done for an earlier span.
+ */
+ for_each_cpu(i, cpu_map) {
+ if (i == cpu)
+ continue;
+ /*
+ * We should 'and' all those masks with 'cpu_map' to exactly
+ * match the topology we're about to build, but that can only
+ * remove CPUs, which only lessens our ability to detect
+ * overlaps
+ */
+ if (!cpumask_equal(tl->mask(cpu), tl->mask(i)) &&
+ cpumask_intersects(tl->mask(cpu), tl->mask(i)))
+ return false;
+ }
+
+ return true;
+}
+
+/*
* Find the sched_domain_topology_level where all CPU capacities are visible
* for all CPUs.
*/
@@ -1975,6 +2011,9 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
has_asym = true;
}
+ if (WARN_ON(!topology_span_sane(tl, cpu_map, i)))
+ goto error;
+
sd = build_sched_domain(tl, cpu_map, attr, sd, dflags, i);
if (tl == sched_domain_topology)
diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c
index 45eba18a2898..02ce292b9bc0 100644
--- a/kernel/sched/wait_bit.c
+++ b/kernel/sched/wait_bit.c
@@ -179,6 +179,7 @@ void init_wait_var_entry(struct wait_bit_queue_entry *wbq_entry, void *var, int
.bit_nr = -1,
},
.wq_entry = {
+ .flags = flags,
.private = current,
.func = var_wake_function,
.entry = LIST_HEAD_INIT(wbq_entry->wq_entry.entry),
diff --git a/kernel/signal.c b/kernel/signal.c
index bcd46f547db3..9ad8dea93dbb 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1383,7 +1383,7 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
* must see ->sighand == NULL.
*/
spin_lock_irqsave(&sighand->siglock, *flags);
- if (likely(sighand == tsk->sighand))
+ if (likely(sighand == rcu_access_pointer(tsk->sighand)))
break;
spin_unlock_irqrestore(&sighand->siglock, *flags);
}
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 1fe34a9fabc2..865bb0228ab6 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -442,7 +442,7 @@ static int __stop_cpus(const struct cpumask *cpumask,
* @cpumask were offline; otherwise, 0 if all executions of @fn
* returned 0, any non zero return value if any returned non zero.
*/
-int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
+static int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
{
int ret;
@@ -453,36 +453,6 @@ int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
return ret;
}
-/**
- * try_stop_cpus - try to stop multiple cpus
- * @cpumask: cpus to stop
- * @fn: function to execute
- * @arg: argument to @fn
- *
- * Identical to stop_cpus() except that it fails with -EAGAIN if
- * someone else is already using the facility.
- *
- * CONTEXT:
- * Might sleep.
- *
- * RETURNS:
- * -EAGAIN if someone else is already stopping cpus, -ENOENT if
- * @fn(@arg) was not executed at all because all cpus in @cpumask were
- * offline; otherwise, 0 if all executions of @fn returned 0, any non
- * zero return value if any returned non zero.
- */
-int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
-{
- int ret;
-
- /* static works are used, process one request at a time */
- if (!mutex_trylock(&stop_cpus_mutex))
- return -EAGAIN;
- ret = __stop_cpus(cpumask, fn, arg);
- mutex_unlock(&stop_cpus_mutex);
- return ret;
-}
-
static int cpu_stop_should_run(unsigned int cpu)
{
struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
diff --git a/kernel/sys.c b/kernel/sys.c
index a9331f101883..f9bc5c303e3f 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2261,6 +2261,8 @@ int __weak arch_prctl_spec_ctrl_set(struct task_struct *t, unsigned long which,
return -EINVAL;
}
+#define PR_IO_FLUSHER (PF_MEMALLOC_NOIO | PF_LESS_THROTTLE)
+
SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
unsigned long, arg4, unsigned long, arg5)
{
@@ -2488,6 +2490,29 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
return -EINVAL;
error = GET_TAGGED_ADDR_CTRL();
break;
+ case PR_SET_IO_FLUSHER:
+ if (!capable(CAP_SYS_RESOURCE))
+ return -EPERM;
+
+ if (arg3 || arg4 || arg5)
+ return -EINVAL;
+
+ if (arg2 == 1)
+ current->flags |= PR_IO_FLUSHER;
+ else if (!arg2)
+ current->flags &= ~PR_IO_FLUSHER;
+ else
+ return -EINVAL;
+ break;
+ case PR_GET_IO_FLUSHER:
+ if (!capable(CAP_SYS_RESOURCE))
+ return -EPERM;
+
+ if (arg2 || arg3 || arg4 || arg5)
+ return -EINVAL;
+
+ error = (current->flags & PR_IO_FLUSHER) == PR_IO_FLUSHER;
+ break;
default:
error = -EINVAL;
break;
diff --git a/kernel/sysctl-test.c b/kernel/sysctl-test.c
index 2a63241a8453..ccb78509f1a8 100644
--- a/kernel/sysctl-test.c
+++ b/kernel/sysctl-test.c
@@ -389,4 +389,6 @@ static struct kunit_suite sysctl_test_suite = {
.test_cases = sysctl_test_cases,
};
-kunit_test_suite(sysctl_test_suite);
+kunit_test_suites(&sysctl_test_suite);
+
+MODULE_LICENSE("GPL v2");
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 70665934d53e..d396aaaf19a3 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1268,7 +1268,7 @@ static struct ctl_table kern_table[] = {
.proc_handler = proc_do_static_key,
},
#endif
-#if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU)
+#if defined(CONFIG_TREE_RCU)
{
.procname = "panic_on_rcu_stall",
.data = &sysctl_panic_on_rcu_stall,
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index fff5f64981c6..428beb69426a 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -293,8 +293,15 @@ static void clocksource_watchdog(struct timer_list *unused)
next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask);
if (next_cpu >= nr_cpu_ids)
next_cpu = cpumask_first(cpu_online_mask);
- watchdog_timer.expires += WATCHDOG_INTERVAL;
- add_timer_on(&watchdog_timer, next_cpu);
+
+ /*
+ * Arm timer if not already pending: could race with concurrent
+ * pair clocksource_stop_watchdog() clocksource_start_watchdog().
+ */
+ if (!timer_pending(&watchdog_timer)) {
+ watchdog_timer.expires += WATCHDOG_INTERVAL;
+ add_timer_on(&watchdog_timer, next_cpu);
+ }
out:
spin_unlock(&watchdog_lock);
}
diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c
index 9e59c9ea92aa..ca4e6d57d68b 100644
--- a/kernel/time/itimer.c
+++ b/kernel/time/itimer.c
@@ -97,20 +97,20 @@ static int do_getitimer(int which, struct itimerspec64 *value)
return 0;
}
-static int put_itimerval(struct itimerval __user *o,
+static int put_itimerval(struct __kernel_old_itimerval __user *o,
const struct itimerspec64 *i)
{
- struct itimerval v;
+ struct __kernel_old_itimerval v;
v.it_interval.tv_sec = i->it_interval.tv_sec;
v.it_interval.tv_usec = i->it_interval.tv_nsec / NSEC_PER_USEC;
v.it_value.tv_sec = i->it_value.tv_sec;
v.it_value.tv_usec = i->it_value.tv_nsec / NSEC_PER_USEC;
- return copy_to_user(o, &v, sizeof(struct itimerval)) ? -EFAULT : 0;
+ return copy_to_user(o, &v, sizeof(struct __kernel_old_itimerval)) ? -EFAULT : 0;
}
-SYSCALL_DEFINE2(getitimer, int, which, struct itimerval __user *, value)
+SYSCALL_DEFINE2(getitimer, int, which, struct __kernel_old_itimerval __user *, value)
{
struct itimerspec64 get_buffer;
int error = do_getitimer(which, &get_buffer);
@@ -314,11 +314,11 @@ SYSCALL_DEFINE1(alarm, unsigned int, seconds)
#endif
-static int get_itimerval(struct itimerspec64 *o, const struct itimerval __user *i)
+static int get_itimerval(struct itimerspec64 *o, const struct __kernel_old_itimerval __user *i)
{
- struct itimerval v;
+ struct __kernel_old_itimerval v;
- if (copy_from_user(&v, i, sizeof(struct itimerval)))
+ if (copy_from_user(&v, i, sizeof(struct __kernel_old_itimerval)))
return -EFAULT;
/* Validate the timevals in value. */
@@ -333,8 +333,8 @@ static int get_itimerval(struct itimerspec64 *o, const struct itimerval __user *
return 0;
}
-SYSCALL_DEFINE3(setitimer, int, which, struct itimerval __user *, value,
- struct itimerval __user *, ovalue)
+SYSCALL_DEFINE3(setitimer, int, which, struct __kernel_old_itimerval __user *, value,
+ struct __kernel_old_itimerval __user *, ovalue)
{
struct itimerspec64 set_buffer, get_buffer;
int error;
diff --git a/kernel/time/time.c b/kernel/time/time.c
index 704ccd9451b0..cdd7386115ff 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -626,10 +626,12 @@ EXPORT_SYMBOL(__usecs_to_jiffies);
* The >> (NSEC_JIFFIE_SC - SEC_JIFFIE_SC) converts the scaled nsec
* value to a scaled second value.
*/
-static unsigned long
-__timespec64_to_jiffies(u64 sec, long nsec)
+
+unsigned long
+timespec64_to_jiffies(const struct timespec64 *value)
{
- nsec = nsec + TICK_NSEC - 1;
+ u64 sec = value->tv_sec;
+ long nsec = value->tv_nsec + TICK_NSEC - 1;
if (sec >= MAX_SEC_IN_JIFFIES){
sec = MAX_SEC_IN_JIFFIES;
@@ -640,18 +642,6 @@ __timespec64_to_jiffies(u64 sec, long nsec)
(NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC;
}
-
-static unsigned long
-__timespec_to_jiffies(unsigned long sec, long nsec)
-{
- return __timespec64_to_jiffies((u64)sec, nsec);
-}
-
-unsigned long
-timespec64_to_jiffies(const struct timespec64 *value)
-{
- return __timespec64_to_jiffies(value->tv_sec, value->tv_nsec);
-}
EXPORT_SYMBOL(timespec64_to_jiffies);
void
@@ -669,44 +659,6 @@ jiffies_to_timespec64(const unsigned long jiffies, struct timespec64 *value)
EXPORT_SYMBOL(jiffies_to_timespec64);
/*
- * We could use a similar algorithm to timespec_to_jiffies (with a
- * different multiplier for usec instead of nsec). But this has a
- * problem with rounding: we can't exactly add TICK_NSEC - 1 to the
- * usec value, since it's not necessarily integral.
- *
- * We could instead round in the intermediate scaled representation
- * (i.e. in units of 1/2^(large scale) jiffies) but that's also
- * perilous: the scaling introduces a small positive error, which
- * combined with a division-rounding-upward (i.e. adding 2^(scale) - 1
- * units to the intermediate before shifting) leads to accidental
- * overflow and overestimates.
- *
- * At the cost of one additional multiplication by a constant, just
- * use the timespec implementation.
- */
-unsigned long
-timeval_to_jiffies(const struct timeval *value)
-{
- return __timespec_to_jiffies(value->tv_sec,
- value->tv_usec * NSEC_PER_USEC);
-}
-EXPORT_SYMBOL(timeval_to_jiffies);
-
-void jiffies_to_timeval(const unsigned long jiffies, struct timeval *value)
-{
- /*
- * Convert jiffies to nanoseconds and separate with
- * one divide.
- */
- u32 rem;
-
- value->tv_sec = div_u64_rem((u64)jiffies * TICK_NSEC,
- NSEC_PER_SEC, &rem);
- value->tv_usec = rem / NSEC_PER_USEC;
-}
-EXPORT_SYMBOL(jiffies_to_timeval);
-
-/*
* Convert jiffies/jiffies_64 to clock_t and back.
*/
clock_t jiffies_to_clock_t(unsigned long x)
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 25a0fcfa7a5d..91e885194dbc 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -141,6 +141,15 @@ menuconfig FTRACE
if FTRACE
+config BOOTTIME_TRACING
+ bool "Boot-time Tracing support"
+ depends on BOOT_CONFIG && TRACING
+ default y
+ help
+ Enable developer to setup ftrace subsystem via supplemental
+ kernel cmdline at boot time for debugging (tracing) driver
+ initialization and boot process.
+
config FUNCTION_TRACER
bool "Kernel Function Tracer"
depends on HAVE_FUNCTION_TRACER
@@ -172,6 +181,77 @@ config FUNCTION_GRAPH_TRACER
the return value. This is done by setting the current return
address on the current task structure into a stack of calls.
+config DYNAMIC_FTRACE
+ bool "enable/disable function tracing dynamically"
+ depends on FUNCTION_TRACER
+ depends on HAVE_DYNAMIC_FTRACE
+ default y
+ help
+ This option will modify all the calls to function tracing
+ dynamically (will patch them out of the binary image and
+ replace them with a No-Op instruction) on boot up. During
+ compile time, a table is made of all the locations that ftrace
+ can function trace, and this table is linked into the kernel
+ image. When this is enabled, functions can be individually
+ enabled, and the functions not enabled will not affect
+ performance of the system.
+
+ See the files in /sys/kernel/debug/tracing:
+ available_filter_functions
+ set_ftrace_filter
+ set_ftrace_notrace
+
+ This way a CONFIG_FUNCTION_TRACER kernel is slightly larger, but
+ otherwise has native performance as long as no tracing is active.
+
+config DYNAMIC_FTRACE_WITH_REGS
+ def_bool y
+ depends on DYNAMIC_FTRACE
+ depends on HAVE_DYNAMIC_FTRACE_WITH_REGS
+
+config DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+ def_bool y
+ depends on DYNAMIC_FTRACE
+ depends on HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+
+config FUNCTION_PROFILER
+ bool "Kernel function profiler"
+ depends on FUNCTION_TRACER
+ default n
+ help
+ This option enables the kernel function profiler. A file is created
+ in debugfs called function_profile_enabled which defaults to zero.
+ When a 1 is echoed into this file profiling begins, and when a
+ zero is entered, profiling stops. A "functions" file is created in
+ the trace_stat directory; this file shows the list of functions that
+ have been hit and their counters.
+
+ If in doubt, say N.
+
+config STACK_TRACER
+ bool "Trace max stack"
+ depends on HAVE_FUNCTION_TRACER
+ select FUNCTION_TRACER
+ select STACKTRACE
+ select KALLSYMS
+ help
+ This special tracer records the maximum stack footprint of the
+ kernel and displays it in /sys/kernel/debug/tracing/stack_trace.
+
+ This tracer works by hooking into every function call that the
+ kernel executes, and keeping a maximum stack depth value and
+ stack-trace saved. If this is configured with DYNAMIC_FTRACE
+ then it will not have any overhead while the stack tracer
+ is disabled.
+
+ To enable the stack tracer on bootup, pass in 'stacktrace'
+ on the kernel command line.
+
+ The stack tracer can also be enabled or disabled via the
+ sysctl kernel.stack_tracer_enabled
+
+ Say N if unsure.
+
config TRACE_PREEMPT_TOGGLE
bool
help
@@ -282,6 +362,19 @@ config HWLAT_TRACER
file. Every time a latency is greater than tracing_thresh, it will
be recorded into the ring buffer.
+config MMIOTRACE
+ bool "Memory mapped IO tracing"
+ depends on HAVE_MMIOTRACE_SUPPORT && PCI
+ select GENERIC_TRACER
+ help
+ Mmiotrace traces Memory Mapped I/O access and is meant for
+ debugging and reverse engineering. It is called from the ioremap
+ implementation and works via page faults. Tracing is disabled by
+ default and can be enabled at run-time.
+
+ See Documentation/trace/mmiotrace.rst.
+ If you are not helping to develop drivers, say N.
+
config ENABLE_DEFAULT_TRACERS
bool "Trace process context switches and events"
depends on !GENERIC_TRACER
@@ -410,30 +503,6 @@ config BRANCH_TRACER
Say N if unsure.
-config STACK_TRACER
- bool "Trace max stack"
- depends on HAVE_FUNCTION_TRACER
- select FUNCTION_TRACER
- select STACKTRACE
- select KALLSYMS
- help
- This special tracer records the maximum stack footprint of the
- kernel and displays it in /sys/kernel/debug/tracing/stack_trace.
-
- This tracer works by hooking into every function call that the
- kernel executes, and keeping a maximum stack depth value and
- stack-trace saved. If this is configured with DYNAMIC_FTRACE
- then it will not have any overhead while the stack tracer
- is disabled.
-
- To enable the stack tracer on bootup, pass in 'stacktrace'
- on the kernel command line.
-
- The stack tracer can also be enabled or disabled via the
- sysctl kernel.stack_tracer_enabled
-
- Say N if unsure.
-
config BLK_DEV_IO_TRACE
bool "Support for tracing block IO actions"
depends on SYSFS
@@ -531,53 +600,6 @@ config DYNAMIC_EVENTS
config PROBE_EVENTS
def_bool n
-config DYNAMIC_FTRACE
- bool "enable/disable function tracing dynamically"
- depends on FUNCTION_TRACER
- depends on HAVE_DYNAMIC_FTRACE
- default y
- help
- This option will modify all the calls to function tracing
- dynamically (will patch them out of the binary image and
- replace them with a No-Op instruction) on boot up. During
- compile time, a table is made of all the locations that ftrace
- can function trace, and this table is linked into the kernel
- image. When this is enabled, functions can be individually
- enabled, and the functions not enabled will not affect
- performance of the system.
-
- See the files in /sys/kernel/debug/tracing:
- available_filter_functions
- set_ftrace_filter
- set_ftrace_notrace
-
- This way a CONFIG_FUNCTION_TRACER kernel is slightly larger, but
- otherwise has native performance as long as no tracing is active.
-
-config DYNAMIC_FTRACE_WITH_REGS
- def_bool y
- depends on DYNAMIC_FTRACE
- depends on HAVE_DYNAMIC_FTRACE_WITH_REGS
-
-config DYNAMIC_FTRACE_WITH_DIRECT_CALLS
- def_bool y
- depends on DYNAMIC_FTRACE
- depends on HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
-
-config FUNCTION_PROFILER
- bool "Kernel function profiler"
- depends on FUNCTION_TRACER
- default n
- help
- This option enables the kernel function profiler. A file is created
- in debugfs called function_profile_enabled which defaults to zero.
- When a 1 is echoed into this file profiling begins, and when a
- zero is entered, profiling stops. A "functions" file is created in
- the trace_stat directory; this file shows the list of functions that
- have been hit and their counters.
-
- If in doubt, say N.
-
config BPF_KPROBE_OVERRIDE
bool "Enable BPF programs to override a kprobed function"
depends on BPF_EVENTS
@@ -592,54 +614,6 @@ config FTRACE_MCOUNT_RECORD
depends on DYNAMIC_FTRACE
depends on HAVE_FTRACE_MCOUNT_RECORD
-config FTRACE_SELFTEST
- bool
-
-config FTRACE_STARTUP_TEST
- bool "Perform a startup test on ftrace"
- depends on GENERIC_TRACER
- select FTRACE_SELFTEST
- help
- This option performs a series of startup tests on ftrace. On bootup
- a series of tests are made to verify that the tracer is
- functioning properly. It will do tests on all the configured
- tracers of ftrace.
-
-config EVENT_TRACE_STARTUP_TEST
- bool "Run selftest on trace events"
- depends on FTRACE_STARTUP_TEST
- default y
- help
- This option performs a test on all trace events in the system.
- It basically just enables each event and runs some code that
- will trigger events (not necessarily the event it enables)
- This may take some time run as there are a lot of events.
-
-config EVENT_TRACE_TEST_SYSCALLS
- bool "Run selftest on syscall events"
- depends on EVENT_TRACE_STARTUP_TEST
- help
- This option will also enable testing every syscall event.
- It only enables the event and disables it and runs various loads
- with the event enabled. This adds a bit more time for kernel boot
- up since it runs this on every system call defined.
-
- TBD - enable a way to actually call the syscalls as we test their
- events
-
-config MMIOTRACE
- bool "Memory mapped IO tracing"
- depends on HAVE_MMIOTRACE_SUPPORT && PCI
- select GENERIC_TRACER
- help
- Mmiotrace traces Memory Mapped I/O access and is meant for
- debugging and reverse engineering. It is called from the ioremap
- implementation and works via page faults. Tracing is disabled by
- default and can be enabled at run-time.
-
- See Documentation/trace/mmiotrace.rst.
- If you are not helping to develop drivers, say N.
-
config TRACING_MAP
bool
depends on ARCH_HAVE_NMI_SAFE_CMPXCHG
@@ -680,16 +654,6 @@ config TRACE_EVENT_INJECT
If unsure, say N.
-config MMIOTRACE_TEST
- tristate "Test module for mmiotrace"
- depends on MMIOTRACE && m
- help
- This is a dumb module for testing mmiotrace. It is very dangerous
- as it will write garbage to IO memory starting at a given address.
- However, it should be safe to use on e.g. unused portion of VRAM.
-
- Say N, unless you absolutely know what you are doing.
-
config TRACEPOINT_BENCHMARK
bool "Add tracepoint that benchmarks tracepoints"
help
@@ -736,6 +700,81 @@ config RING_BUFFER_BENCHMARK
If unsure, say N.
+config TRACE_EVAL_MAP_FILE
+ bool "Show eval mappings for trace events"
+ depends on TRACING
+ help
+ The "print fmt" of the trace events will show the enum/sizeof names
+ instead of their values. This can cause problems for user space tools
+ that use this string to parse the raw data as user space does not know
+ how to convert the string to its value.
+
+ To fix this, there's a special macro in the kernel that can be used
+ to convert an enum/sizeof into its value. If this macro is used, then
+ the print fmt strings will be converted to their values.
+
+ If something does not get converted properly, this option can be
+ used to show what enums/sizeof the kernel tried to convert.
+
+ This option is for debugging the conversions. A file is created
+ in the tracing directory called "eval_map" that will show the
+ names matched with their values and what trace event system they
+ belong too.
+
+ Normally, the mapping of the strings to values will be freed after
+ boot up or module load. With this option, they will not be freed, as
+ they are needed for the "eval_map" file. Enabling this option will
+ increase the memory footprint of the running kernel.
+
+ If unsure, say N.
+
+config GCOV_PROFILE_FTRACE
+ bool "Enable GCOV profiling on ftrace subsystem"
+ depends on GCOV_KERNEL
+ help
+ Enable GCOV profiling on ftrace subsystem for checking
+ which functions/lines are tested.
+
+ If unsure, say N.
+
+ Note that on a kernel compiled with this config, ftrace will
+ run significantly slower.
+
+config FTRACE_SELFTEST
+ bool
+
+config FTRACE_STARTUP_TEST
+ bool "Perform a startup test on ftrace"
+ depends on GENERIC_TRACER
+ select FTRACE_SELFTEST
+ help
+ This option performs a series of startup tests on ftrace. On bootup
+ a series of tests are made to verify that the tracer is
+ functioning properly. It will do tests on all the configured
+ tracers of ftrace.
+
+config EVENT_TRACE_STARTUP_TEST
+ bool "Run selftest on trace events"
+ depends on FTRACE_STARTUP_TEST
+ default y
+ help
+ This option performs a test on all trace events in the system.
+ It basically just enables each event and runs some code that
+ will trigger events (not necessarily the event it enables)
+ This may take some time run as there are a lot of events.
+
+config EVENT_TRACE_TEST_SYSCALLS
+ bool "Run selftest on syscall events"
+ depends on EVENT_TRACE_STARTUP_TEST
+ help
+ This option will also enable testing every syscall event.
+ It only enables the event and disables it and runs various loads
+ with the event enabled. This adds a bit more time for kernel boot
+ up since it runs this on every system call defined.
+
+ TBD - enable a way to actually call the syscalls as we test their
+ events
+
config RING_BUFFER_STARTUP_TEST
bool "Ring buffer startup self test"
depends on RING_BUFFER
@@ -759,8 +798,18 @@ config RING_BUFFER_STARTUP_TEST
If unsure, say N
+config MMIOTRACE_TEST
+ tristate "Test module for mmiotrace"
+ depends on MMIOTRACE && m
+ help
+ This is a dumb module for testing mmiotrace. It is very dangerous
+ as it will write garbage to IO memory starting at a given address.
+ However, it should be safe to use on e.g. unused portion of VRAM.
+
+ Say N, unless you absolutely know what you are doing.
+
config PREEMPTIRQ_DELAY_TEST
- tristate "Preempt / IRQ disable delay thread to test latency tracers"
+ tristate "Test module to create a preempt / IRQ disable delay thread to test latency tracers"
depends on m
help
Select this option to build a test module that can help test latency
@@ -774,45 +823,30 @@ config PREEMPTIRQ_DELAY_TEST
If unsure, say N
-config TRACE_EVAL_MAP_FILE
- bool "Show eval mappings for trace events"
- depends on TRACING
- help
- The "print fmt" of the trace events will show the enum/sizeof names
- instead of their values. This can cause problems for user space tools
- that use this string to parse the raw data as user space does not know
- how to convert the string to its value.
-
- To fix this, there's a special macro in the kernel that can be used
- to convert an enum/sizeof into its value. If this macro is used, then
- the print fmt strings will be converted to their values.
-
- If something does not get converted properly, this option can be
- used to show what enums/sizeof the kernel tried to convert.
-
- This option is for debugging the conversions. A file is created
- in the tracing directory called "eval_map" that will show the
- names matched with their values and what trace event system they
- belong too.
+config SYNTH_EVENT_GEN_TEST
+ tristate "Test module for in-kernel synthetic event generation"
+ depends on HIST_TRIGGERS
+ help
+ This option creates a test module to check the base
+ functionality of in-kernel synthetic event definition and
+ generation.
- Normally, the mapping of the strings to values will be freed after
- boot up or module load. With this option, they will not be freed, as
- they are needed for the "eval_map" file. Enabling this option will
- increase the memory footprint of the running kernel.
+ To test, insert the module, and then check the trace buffer
+ for the generated sample events.
- If unsure, say N.
+ If unsure, say N.
-config GCOV_PROFILE_FTRACE
- bool "Enable GCOV profiling on ftrace subsystem"
- depends on GCOV_KERNEL
+config KPROBE_EVENT_GEN_TEST
+ tristate "Test module for in-kernel kprobe event generation"
+ depends on KPROBE_EVENTS
help
- Enable GCOV profiling on ftrace subsystem for checking
- which functions/lines are tested.
+ This option creates a test module to check the base
+ functionality of in-kernel kprobe event definition.
- If unsure, say N.
+ To test, insert the module, and then check the trace buffer
+ for the generated kprobe events.
- Note that on a kernel compiled with this config, ftrace will
- run significantly slower.
+ If unsure, say N.
endif # FTRACE
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 0e63db62225f..f9dcd19165fa 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -44,6 +44,8 @@ obj-$(CONFIG_TRACING) += trace_stat.o
obj-$(CONFIG_TRACING) += trace_printk.o
obj-$(CONFIG_TRACING_MAP) += tracing_map.o
obj-$(CONFIG_PREEMPTIRQ_DELAY_TEST) += preemptirq_delay_test.o
+obj-$(CONFIG_SYNTH_EVENT_GEN_TEST) += synth_event_gen_test.o
+obj-$(CONFIG_KPROBE_EVENT_GEN_TEST) += kprobe_event_gen_test.o
obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o
obj-$(CONFIG_PREEMPTIRQ_TRACEPOINTS) += trace_preemptirq.o
@@ -83,6 +85,7 @@ endif
obj-$(CONFIG_DYNAMIC_EVENTS) += trace_dynevent.o
obj-$(CONFIG_PROBE_EVENTS) += trace_probe.o
obj-$(CONFIG_UPROBE_EVENTS) += trace_uprobe.o
+obj-$(CONFIG_BOOTTIME_TRACING) += trace_boot.o
obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 475e29498bca..0735ae8545d8 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -68,14 +68,14 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action,
{
struct blk_io_trace *t;
struct ring_buffer_event *event = NULL;
- struct ring_buffer *buffer = NULL;
+ struct trace_buffer *buffer = NULL;
int pc = 0;
int cpu = smp_processor_id();
bool blk_tracer = blk_tracer_enabled;
ssize_t cgid_len = cgid ? sizeof(cgid) : 0;
if (blk_tracer) {
- buffer = blk_tr->trace_buffer.buffer;
+ buffer = blk_tr->array_buffer.buffer;
pc = preempt_count();
event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
sizeof(*t) + len + cgid_len,
@@ -215,7 +215,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
{
struct task_struct *tsk = current;
struct ring_buffer_event *event = NULL;
- struct ring_buffer *buffer = NULL;
+ struct trace_buffer *buffer = NULL;
struct blk_io_trace *t;
unsigned long flags = 0;
unsigned long *sequence;
@@ -248,7 +248,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
if (blk_tracer) {
tracing_record_cmdline(current);
- buffer = blk_tr->trace_buffer.buffer;
+ buffer = blk_tr->array_buffer.buffer;
pc = preempt_count();
event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
sizeof(*t) + pdu_len + cgid_len,
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index e5ef4ae9edb5..19e793aa441a 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -703,6 +703,7 @@ struct send_signal_irq_work {
struct irq_work irq_work;
struct task_struct *task;
u32 sig;
+ enum pid_type type;
};
static DEFINE_PER_CPU(struct send_signal_irq_work, send_signal_work);
@@ -712,10 +713,10 @@ static void do_bpf_send_signal(struct irq_work *entry)
struct send_signal_irq_work *work;
work = container_of(entry, struct send_signal_irq_work, irq_work);
- group_send_sig_info(work->sig, SEND_SIG_PRIV, work->task, PIDTYPE_TGID);
+ group_send_sig_info(work->sig, SEND_SIG_PRIV, work->task, work->type);
}
-BPF_CALL_1(bpf_send_signal, u32, sig)
+static int bpf_send_signal_common(u32 sig, enum pid_type type)
{
struct send_signal_irq_work *work = NULL;
@@ -748,11 +749,17 @@ BPF_CALL_1(bpf_send_signal, u32, sig)
*/
work->task = current;
work->sig = sig;
+ work->type = type;
irq_work_queue(&work->irq_work);
return 0;
}
- return group_send_sig_info(sig, SEND_SIG_PRIV, current, PIDTYPE_TGID);
+ return group_send_sig_info(sig, SEND_SIG_PRIV, current, type);
+}
+
+BPF_CALL_1(bpf_send_signal, u32, sig)
+{
+ return bpf_send_signal_common(sig, PIDTYPE_TGID);
}
static const struct bpf_func_proto bpf_send_signal_proto = {
@@ -762,6 +769,18 @@ static const struct bpf_func_proto bpf_send_signal_proto = {
.arg1_type = ARG_ANYTHING,
};
+BPF_CALL_1(bpf_send_signal_thread, u32, sig)
+{
+ return bpf_send_signal_common(sig, PIDTYPE_PID);
+}
+
+static const struct bpf_func_proto bpf_send_signal_thread_proto = {
+ .func = bpf_send_signal_thread,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_ANYTHING,
+};
+
static const struct bpf_func_proto *
tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
@@ -822,6 +841,8 @@ tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
#endif
case BPF_FUNC_send_signal:
return &bpf_send_signal_proto;
+ case BPF_FUNC_send_signal_thread:
+ return &bpf_send_signal_thread_proto;
default:
return NULL;
}
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 9bf1f2cd515e..3f7ee102868a 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -62,8 +62,6 @@
})
/* hash bits for specific function selection */
-#define FTRACE_HASH_BITS 7
-#define FTRACE_FUNC_HASHSIZE (1 << FTRACE_HASH_BITS)
#define FTRACE_HASH_DEFAULT_BITS 10
#define FTRACE_HASH_MAX_BITS 12
@@ -146,7 +144,7 @@ static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip,
{
struct trace_array *tr = op->private;
- if (tr && this_cpu_read(tr->trace_buffer.data->ftrace_ignore_pid))
+ if (tr && this_cpu_read(tr->array_buffer.data->ftrace_ignore_pid))
return;
op->saved_func(ip, parent_ip, op, regs);
@@ -1103,9 +1101,6 @@ struct ftrace_page {
#define ENTRY_SIZE sizeof(struct dyn_ftrace)
#define ENTRIES_PER_PAGE (PAGE_SIZE / ENTRY_SIZE)
-/* estimate from running different kernels */
-#define NR_TO_INIT 10000
-
static struct ftrace_page *ftrace_pages_start;
static struct ftrace_page *ftrace_pages;
@@ -5464,7 +5459,7 @@ static void __init set_ftrace_early_graph(char *buf, int enable)
struct ftrace_hash *hash;
hash = alloc_ftrace_hash(FTRACE_HASH_DEFAULT_BITS);
- if (WARN_ON(!hash))
+ if (MEM_FAIL(!hash, "Failed to allocate hash\n"))
return;
while (buf) {
@@ -5596,8 +5591,8 @@ static const struct file_operations ftrace_notrace_fops = {
static DEFINE_MUTEX(graph_lock);
-struct ftrace_hash *ftrace_graph_hash = EMPTY_HASH;
-struct ftrace_hash *ftrace_graph_notrace_hash = EMPTY_HASH;
+struct ftrace_hash __rcu *ftrace_graph_hash = EMPTY_HASH;
+struct ftrace_hash __rcu *ftrace_graph_notrace_hash = EMPTY_HASH;
enum graph_filter_type {
GRAPH_FILTER_NOTRACE = 0,
@@ -5872,8 +5867,15 @@ ftrace_graph_release(struct inode *inode, struct file *file)
mutex_unlock(&graph_lock);
- /* Wait till all users are no longer using the old hash */
- synchronize_rcu();
+ /*
+ * We need to do a hard force of sched synchronization.
+ * This is because we use preempt_disable() to do RCU, but
+ * the function tracers can be called where RCU is not watching
+ * (like before user_exit()). We can not rely on the RCU
+ * infrastructure to do the synchronization, thus we must do it
+ * ourselves.
+ */
+ schedule_on_each_cpu(ftrace_sync);
free_ftrace_hash(old_hash);
}
@@ -6596,7 +6598,7 @@ static void add_to_clear_hash_list(struct list_head *clear_list,
func = kmalloc(sizeof(*func), GFP_KERNEL);
if (!func) {
- WARN_ONCE(1, "alloc failure, ftrace filter could be stale\n");
+ MEM_FAIL(1, "alloc failure, ftrace filter could be stale\n");
return;
}
@@ -6922,7 +6924,7 @@ ftrace_filter_pid_sched_switch_probe(void *data, bool preempt,
pid_list = rcu_dereference_sched(tr->function_pids);
- this_cpu_write(tr->trace_buffer.data->ftrace_ignore_pid,
+ this_cpu_write(tr->array_buffer.data->ftrace_ignore_pid,
trace_ignore_this_task(pid_list, next));
}
@@ -6976,7 +6978,7 @@ static void clear_ftrace_pids(struct trace_array *tr)
unregister_trace_sched_switch(ftrace_filter_pid_sched_switch_probe, tr);
for_each_possible_cpu(cpu)
- per_cpu_ptr(tr->trace_buffer.data, cpu)->ftrace_ignore_pid = false;
+ per_cpu_ptr(tr->array_buffer.data, cpu)->ftrace_ignore_pid = false;
rcu_assign_pointer(tr->function_pids, NULL);
@@ -7031,9 +7033,10 @@ static void *fpid_next(struct seq_file *m, void *v, loff_t *pos)
struct trace_array *tr = m->private;
struct trace_pid_list *pid_list = rcu_dereference_sched(tr->function_pids);
- if (v == FTRACE_NO_PIDS)
+ if (v == FTRACE_NO_PIDS) {
+ (*pos)++;
return NULL;
-
+ }
return trace_pid_next(pid_list, v, pos);
}
@@ -7100,7 +7103,7 @@ static void ignore_task_cpu(void *data)
pid_list = rcu_dereference_protected(tr->function_pids,
mutex_is_locked(&ftrace_lock));
- this_cpu_write(tr->trace_buffer.data->ftrace_ignore_pid,
+ this_cpu_write(tr->array_buffer.data->ftrace_ignore_pid,
trace_ignore_this_task(pid_list, current));
}
diff --git a/kernel/trace/kprobe_event_gen_test.c b/kernel/trace/kprobe_event_gen_test.c
new file mode 100644
index 000000000000..18b0f1cbb947
--- /dev/null
+++ b/kernel/trace/kprobe_event_gen_test.c
@@ -0,0 +1,225 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Test module for in-kernel kprobe event creation and generation.
+ *
+ * Copyright (C) 2019 Tom Zanussi <zanussi@kernel.org>
+ */
+
+#include <linux/module.h>
+#include <linux/trace_events.h>
+
+/*
+ * This module is a simple test of basic functionality for in-kernel
+ * kprobe/kretprobe event creation. The first test uses
+ * kprobe_event_gen_cmd_start(), kprobe_event_add_fields() and
+ * kprobe_event_gen_cmd_end() to create a kprobe event, which is then
+ * enabled in order to generate trace output. The second creates a
+ * kretprobe event using kretprobe_event_gen_cmd_start() and
+ * kretprobe_event_gen_cmd_end(), and is also then enabled.
+ *
+ * To test, select CONFIG_KPROBE_EVENT_GEN_TEST and build the module.
+ * Then:
+ *
+ * # insmod kernel/trace/kprobe_event_gen_test.ko
+ * # cat /sys/kernel/debug/tracing/trace
+ *
+ * You should see many instances of the "gen_kprobe_test" and
+ * "gen_kretprobe_test" events in the trace buffer.
+ *
+ * To remove the events, remove the module:
+ *
+ * # rmmod kprobe_event_gen_test
+ *
+ */
+
+static struct trace_event_file *gen_kprobe_test;
+static struct trace_event_file *gen_kretprobe_test;
+
+/*
+ * Test to make sure we can create a kprobe event, then add more
+ * fields.
+ */
+static int __init test_gen_kprobe_cmd(void)
+{
+ struct dynevent_cmd cmd;
+ char *buf;
+ int ret;
+
+ /* Create a buffer to hold the generated command */
+ buf = kzalloc(MAX_DYNEVENT_CMD_LEN, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ /* Before generating the command, initialize the cmd object */
+ kprobe_event_cmd_init(&cmd, buf, MAX_DYNEVENT_CMD_LEN);
+
+ /*
+ * Define the gen_kprobe_test event with the first 2 kprobe
+ * fields.
+ */
+ ret = kprobe_event_gen_cmd_start(&cmd, "gen_kprobe_test",
+ "do_sys_open",
+ "dfd=%ax", "filename=%dx");
+ if (ret)
+ goto free;
+
+ /* Use kprobe_event_add_fields to add the rest of the fields */
+
+ ret = kprobe_event_add_fields(&cmd, "flags=%cx", "mode=+4($stack)");
+ if (ret)
+ goto free;
+
+ /*
+ * This actually creates the event.
+ */
+ ret = kprobe_event_gen_cmd_end(&cmd);
+ if (ret)
+ goto free;
+
+ /*
+ * Now get the gen_kprobe_test event file. We need to prevent
+ * the instance and event from disappearing from underneath
+ * us, which trace_get_event_file() does (though in this case
+ * we're using the top-level instance which never goes away).
+ */
+ gen_kprobe_test = trace_get_event_file(NULL, "kprobes",
+ "gen_kprobe_test");
+ if (IS_ERR(gen_kprobe_test)) {
+ ret = PTR_ERR(gen_kprobe_test);
+ goto delete;
+ }
+
+ /* Enable the event or you won't see anything */
+ ret = trace_array_set_clr_event(gen_kprobe_test->tr,
+ "kprobes", "gen_kprobe_test", true);
+ if (ret) {
+ trace_put_event_file(gen_kprobe_test);
+ goto delete;
+ }
+ out:
+ return ret;
+ delete:
+ /* We got an error after creating the event, delete it */
+ ret = kprobe_event_delete("gen_kprobe_test");
+ free:
+ kfree(buf);
+
+ goto out;
+}
+
+/*
+ * Test to make sure we can create a kretprobe event.
+ */
+static int __init test_gen_kretprobe_cmd(void)
+{
+ struct dynevent_cmd cmd;
+ char *buf;
+ int ret;
+
+ /* Create a buffer to hold the generated command */
+ buf = kzalloc(MAX_DYNEVENT_CMD_LEN, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ /* Before generating the command, initialize the cmd object */
+ kprobe_event_cmd_init(&cmd, buf, MAX_DYNEVENT_CMD_LEN);
+
+ /*
+ * Define the kretprobe event.
+ */
+ ret = kretprobe_event_gen_cmd_start(&cmd, "gen_kretprobe_test",
+ "do_sys_open",
+ "$retval");
+ if (ret)
+ goto free;
+
+ /*
+ * This actually creates the event.
+ */
+ ret = kretprobe_event_gen_cmd_end(&cmd);
+ if (ret)
+ goto free;
+
+ /*
+ * Now get the gen_kretprobe_test event file. We need to
+ * prevent the instance and event from disappearing from
+ * underneath us, which trace_get_event_file() does (though in
+ * this case we're using the top-level instance which never
+ * goes away).
+ */
+ gen_kretprobe_test = trace_get_event_file(NULL, "kprobes",
+ "gen_kretprobe_test");
+ if (IS_ERR(gen_kretprobe_test)) {
+ ret = PTR_ERR(gen_kretprobe_test);
+ goto delete;
+ }
+
+ /* Enable the event or you won't see anything */
+ ret = trace_array_set_clr_event(gen_kretprobe_test->tr,
+ "kprobes", "gen_kretprobe_test", true);
+ if (ret) {
+ trace_put_event_file(gen_kretprobe_test);
+ goto delete;
+ }
+ out:
+ return ret;
+ delete:
+ /* We got an error after creating the event, delete it */
+ ret = kprobe_event_delete("gen_kretprobe_test");
+ free:
+ kfree(buf);
+
+ goto out;
+}
+
+static int __init kprobe_event_gen_test_init(void)
+{
+ int ret;
+
+ ret = test_gen_kprobe_cmd();
+ if (ret)
+ return ret;
+
+ ret = test_gen_kretprobe_cmd();
+ if (ret) {
+ WARN_ON(trace_array_set_clr_event(gen_kretprobe_test->tr,
+ "kprobes",
+ "gen_kretprobe_test", false));
+ trace_put_event_file(gen_kretprobe_test);
+ WARN_ON(kprobe_event_delete("gen_kretprobe_test"));
+ }
+
+ return ret;
+}
+
+static void __exit kprobe_event_gen_test_exit(void)
+{
+ /* Disable the event or you can't remove it */
+ WARN_ON(trace_array_set_clr_event(gen_kprobe_test->tr,
+ "kprobes",
+ "gen_kprobe_test", false));
+
+ /* Now give the file and instance back */
+ trace_put_event_file(gen_kprobe_test);
+
+ /* Now unregister and free the event */
+ WARN_ON(kprobe_event_delete("gen_kprobe_test"));
+
+ /* Disable the event or you can't remove it */
+ WARN_ON(trace_array_set_clr_event(gen_kprobe_test->tr,
+ "kprobes",
+ "gen_kretprobe_test", false));
+
+ /* Now give the file and instance back */
+ trace_put_event_file(gen_kretprobe_test);
+
+ /* Now unregister and free the event */
+ WARN_ON(kprobe_event_delete("gen_kretprobe_test"));
+}
+
+module_init(kprobe_event_gen_test_init)
+module_exit(kprobe_event_gen_test_exit)
+
+MODULE_AUTHOR("Tom Zanussi");
+MODULE_DESCRIPTION("kprobe event generation test");
+MODULE_LICENSE("GPL v2");
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 3f655371eaf6..61f0e92ace99 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -300,8 +300,6 @@ u64 ring_buffer_event_time_stamp(struct ring_buffer_event *event)
/* Missed count stored at end */
#define RB_MISSED_STORED (1 << 30)
-#define RB_MISSED_FLAGS (RB_MISSED_EVENTS|RB_MISSED_STORED)
-
struct buffer_data_page {
u64 time_stamp; /* page time stamp */
local_t commit; /* write committed index */
@@ -443,7 +441,7 @@ enum {
struct ring_buffer_per_cpu {
int cpu;
atomic_t record_disabled;
- struct ring_buffer *buffer;
+ struct trace_buffer *buffer;
raw_spinlock_t reader_lock; /* serialize readers */
arch_spinlock_t lock;
struct lock_class_key lock_key;
@@ -482,7 +480,7 @@ struct ring_buffer_per_cpu {
struct rb_irq_work irq_work;
};
-struct ring_buffer {
+struct trace_buffer {
unsigned flags;
int cpus;
atomic_t record_disabled;
@@ -518,7 +516,7 @@ struct ring_buffer_iter {
*
* Returns the number of pages used by a per_cpu buffer of the ring buffer.
*/
-size_t ring_buffer_nr_pages(struct ring_buffer *buffer, int cpu)
+size_t ring_buffer_nr_pages(struct trace_buffer *buffer, int cpu)
{
return buffer->buffers[cpu]->nr_pages;
}
@@ -530,7 +528,7 @@ size_t ring_buffer_nr_pages(struct ring_buffer *buffer, int cpu)
*
* Returns the number of pages that have content in the ring buffer.
*/
-size_t ring_buffer_nr_dirty_pages(struct ring_buffer *buffer, int cpu)
+size_t ring_buffer_nr_dirty_pages(struct trace_buffer *buffer, int cpu)
{
size_t read;
size_t cnt;
@@ -573,7 +571,7 @@ static void rb_wake_up_waiters(struct irq_work *work)
* as data is added to any of the @buffer's cpu buffers. Otherwise
* it will wait for data to be added to a specific cpu buffer.
*/
-int ring_buffer_wait(struct ring_buffer *buffer, int cpu, int full)
+int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full)
{
struct ring_buffer_per_cpu *uninitialized_var(cpu_buffer);
DEFINE_WAIT(wait);
@@ -684,7 +682,7 @@ int ring_buffer_wait(struct ring_buffer *buffer, int cpu, int full)
* Returns EPOLLIN | EPOLLRDNORM if data exists in the buffers,
* zero otherwise.
*/
-__poll_t ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu,
+__poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu,
struct file *filp, poll_table *poll_table)
{
struct ring_buffer_per_cpu *cpu_buffer;
@@ -742,13 +740,13 @@ __poll_t ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu,
/* Up this if you want to test the TIME_EXTENTS and normalization */
#define DEBUG_SHIFT 0
-static inline u64 rb_time_stamp(struct ring_buffer *buffer)
+static inline u64 rb_time_stamp(struct trace_buffer *buffer)
{
/* shift to debug/test normalization and TIME_EXTENTS */
return buffer->clock() << DEBUG_SHIFT;
}
-u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu)
+u64 ring_buffer_time_stamp(struct trace_buffer *buffer, int cpu)
{
u64 time;
@@ -760,7 +758,7 @@ u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu)
}
EXPORT_SYMBOL_GPL(ring_buffer_time_stamp);
-void ring_buffer_normalize_time_stamp(struct ring_buffer *buffer,
+void ring_buffer_normalize_time_stamp(struct trace_buffer *buffer,
int cpu, u64 *ts)
{
/* Just stupid testing the normalize function and deltas */
@@ -1283,7 +1281,7 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
}
static struct ring_buffer_per_cpu *
-rb_allocate_cpu_buffer(struct ring_buffer *buffer, long nr_pages, int cpu)
+rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer;
struct buffer_page *bpage;
@@ -1368,16 +1366,17 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
* __ring_buffer_alloc - allocate a new ring_buffer
* @size: the size in bytes per cpu that is needed.
* @flags: attributes to set for the ring buffer.
+ * @key: ring buffer reader_lock_key.
*
* Currently the only flag that is available is the RB_FL_OVERWRITE
* flag. This flag means that the buffer will overwrite old data
* when the buffer wraps. If this flag is not set, the buffer will
* drop data when the tail hits the head.
*/
-struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
+struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
struct lock_class_key *key)
{
- struct ring_buffer *buffer;
+ struct trace_buffer *buffer;
long nr_pages;
int bsize;
int cpu;
@@ -1447,7 +1446,7 @@ EXPORT_SYMBOL_GPL(__ring_buffer_alloc);
* @buffer: the buffer to free.
*/
void
-ring_buffer_free(struct ring_buffer *buffer)
+ring_buffer_free(struct trace_buffer *buffer)
{
int cpu;
@@ -1463,18 +1462,18 @@ ring_buffer_free(struct ring_buffer *buffer)
}
EXPORT_SYMBOL_GPL(ring_buffer_free);
-void ring_buffer_set_clock(struct ring_buffer *buffer,
+void ring_buffer_set_clock(struct trace_buffer *buffer,
u64 (*clock)(void))
{
buffer->clock = clock;
}
-void ring_buffer_set_time_stamp_abs(struct ring_buffer *buffer, bool abs)
+void ring_buffer_set_time_stamp_abs(struct trace_buffer *buffer, bool abs)
{
buffer->time_stamp_abs = abs;
}
-bool ring_buffer_time_stamp_abs(struct ring_buffer *buffer)
+bool ring_buffer_time_stamp_abs(struct trace_buffer *buffer)
{
return buffer->time_stamp_abs;
}
@@ -1712,7 +1711,7 @@ static void update_pages_handler(struct work_struct *work)
*
* Returns 0 on success and < 0 on failure.
*/
-int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,
+int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
int cpu_id)
{
struct ring_buffer_per_cpu *cpu_buffer;
@@ -1891,7 +1890,7 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,
}
EXPORT_SYMBOL_GPL(ring_buffer_resize);
-void ring_buffer_change_overwrite(struct ring_buffer *buffer, int val)
+void ring_buffer_change_overwrite(struct trace_buffer *buffer, int val)
{
mutex_lock(&buffer->mutex);
if (val)
@@ -2206,7 +2205,7 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
{
struct buffer_page *tail_page = info->tail_page;
struct buffer_page *commit_page = cpu_buffer->commit_page;
- struct ring_buffer *buffer = cpu_buffer->buffer;
+ struct trace_buffer *buffer = cpu_buffer->buffer;
struct buffer_page *next_page;
int ret;
@@ -2330,11 +2329,11 @@ static inline bool rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
/**
* rb_update_event - update event type and data
+ * @cpu_buffer: The per cpu buffer of the @event
* @event: the event to update
- * @type: the type of event
- * @length: the size of the event field in the ring buffer
+ * @info: The info to update the @event with (contains length and delta)
*
- * Update the type and data fields of the event. The length
+ * Update the type and data fields of the @event. The length
* is the actual size that is written to the ring buffer,
* and with this, we can determine what to place into the
* data field.
@@ -2609,7 +2608,7 @@ static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
}
static __always_inline void
-rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
+rb_wakeups(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
{
size_t nr_pages;
size_t dirty;
@@ -2733,7 +2732,7 @@ trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer)
* Call this function before calling another ring_buffer_lock_reserve() and
* call ring_buffer_nest_end() after the nested ring_buffer_unlock_commit().
*/
-void ring_buffer_nest_start(struct ring_buffer *buffer)
+void ring_buffer_nest_start(struct trace_buffer *buffer)
{
struct ring_buffer_per_cpu *cpu_buffer;
int cpu;
@@ -2753,7 +2752,7 @@ void ring_buffer_nest_start(struct ring_buffer *buffer)
* Must be called after ring_buffer_nest_start() and after the
* ring_buffer_unlock_commit().
*/
-void ring_buffer_nest_end(struct ring_buffer *buffer)
+void ring_buffer_nest_end(struct trace_buffer *buffer)
{
struct ring_buffer_per_cpu *cpu_buffer;
int cpu;
@@ -2775,7 +2774,7 @@ void ring_buffer_nest_end(struct ring_buffer *buffer)
*
* Must be paired with ring_buffer_lock_reserve.
*/
-int ring_buffer_unlock_commit(struct ring_buffer *buffer,
+int ring_buffer_unlock_commit(struct trace_buffer *buffer,
struct ring_buffer_event *event)
{
struct ring_buffer_per_cpu *cpu_buffer;
@@ -2868,7 +2867,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
}
static __always_inline struct ring_buffer_event *
-rb_reserve_next_event(struct ring_buffer *buffer,
+rb_reserve_next_event(struct trace_buffer *buffer,
struct ring_buffer_per_cpu *cpu_buffer,
unsigned long length)
{
@@ -2961,7 +2960,7 @@ rb_reserve_next_event(struct ring_buffer *buffer,
* If NULL is returned, then nothing has been allocated or locked.
*/
struct ring_buffer_event *
-ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
+ring_buffer_lock_reserve(struct trace_buffer *buffer, unsigned long length)
{
struct ring_buffer_per_cpu *cpu_buffer;
struct ring_buffer_event *event;
@@ -3062,7 +3061,7 @@ rb_decrement_entry(struct ring_buffer_per_cpu *cpu_buffer,
* If this function is called, do not call ring_buffer_unlock_commit on
* the event.
*/
-void ring_buffer_discard_commit(struct ring_buffer *buffer,
+void ring_buffer_discard_commit(struct trace_buffer *buffer,
struct ring_buffer_event *event)
{
struct ring_buffer_per_cpu *cpu_buffer;
@@ -3113,7 +3112,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_discard_commit);
* Note, like ring_buffer_lock_reserve, the length is the length of the data
* and not the length of the event which would hold the header.
*/
-int ring_buffer_write(struct ring_buffer *buffer,
+int ring_buffer_write(struct trace_buffer *buffer,
unsigned long length,
void *data)
{
@@ -3193,7 +3192,7 @@ static bool rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer)
*
* The caller should call synchronize_rcu() after this.
*/
-void ring_buffer_record_disable(struct ring_buffer *buffer)
+void ring_buffer_record_disable(struct trace_buffer *buffer)
{
atomic_inc(&buffer->record_disabled);
}
@@ -3206,7 +3205,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_disable);
* Note, multiple disables will need the same number of enables
* to truly enable the writing (much like preempt_disable).
*/
-void ring_buffer_record_enable(struct ring_buffer *buffer)
+void ring_buffer_record_enable(struct trace_buffer *buffer)
{
atomic_dec(&buffer->record_disabled);
}
@@ -3223,7 +3222,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_enable);
* it works like an on/off switch, where as the disable() version
* must be paired with a enable().
*/
-void ring_buffer_record_off(struct ring_buffer *buffer)
+void ring_buffer_record_off(struct trace_buffer *buffer)
{
unsigned int rd;
unsigned int new_rd;
@@ -3246,7 +3245,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_off);
* it works like an on/off switch, where as the enable() version
* must be paired with a disable().
*/
-void ring_buffer_record_on(struct ring_buffer *buffer)
+void ring_buffer_record_on(struct trace_buffer *buffer)
{
unsigned int rd;
unsigned int new_rd;
@@ -3264,7 +3263,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_on);
*
* Returns true if the ring buffer is in a state that it accepts writes.
*/
-bool ring_buffer_record_is_on(struct ring_buffer *buffer)
+bool ring_buffer_record_is_on(struct trace_buffer *buffer)
{
return !atomic_read(&buffer->record_disabled);
}
@@ -3280,7 +3279,7 @@ bool ring_buffer_record_is_on(struct ring_buffer *buffer)
* ring_buffer_record_disable(), as that is a temporary disabling of
* the ring buffer.
*/
-bool ring_buffer_record_is_set_on(struct ring_buffer *buffer)
+bool ring_buffer_record_is_set_on(struct trace_buffer *buffer)
{
return !(atomic_read(&buffer->record_disabled) & RB_BUFFER_OFF);
}
@@ -3295,7 +3294,7 @@ bool ring_buffer_record_is_set_on(struct ring_buffer *buffer)
*
* The caller should call synchronize_rcu() after this.
*/
-void ring_buffer_record_disable_cpu(struct ring_buffer *buffer, int cpu)
+void ring_buffer_record_disable_cpu(struct trace_buffer *buffer, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer;
@@ -3315,7 +3314,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_disable_cpu);
* Note, multiple disables will need the same number of enables
* to truly enable the writing (much like preempt_disable).
*/
-void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu)
+void ring_buffer_record_enable_cpu(struct trace_buffer *buffer, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer;
@@ -3345,7 +3344,7 @@ rb_num_of_entries(struct ring_buffer_per_cpu *cpu_buffer)
* @buffer: The ring buffer
* @cpu: The per CPU buffer to read from.
*/
-u64 ring_buffer_oldest_event_ts(struct ring_buffer *buffer, int cpu)
+u64 ring_buffer_oldest_event_ts(struct trace_buffer *buffer, int cpu)
{
unsigned long flags;
struct ring_buffer_per_cpu *cpu_buffer;
@@ -3378,7 +3377,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_oldest_event_ts);
* @buffer: The ring buffer
* @cpu: The per CPU buffer to read from.
*/
-unsigned long ring_buffer_bytes_cpu(struct ring_buffer *buffer, int cpu)
+unsigned long ring_buffer_bytes_cpu(struct trace_buffer *buffer, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer;
unsigned long ret;
@@ -3398,7 +3397,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_bytes_cpu);
* @buffer: The ring buffer
* @cpu: The per CPU buffer to get the entries from.
*/
-unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu)
+unsigned long ring_buffer_entries_cpu(struct trace_buffer *buffer, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer;
@@ -3417,7 +3416,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu);
* @buffer: The ring buffer
* @cpu: The per CPU buffer to get the number of overruns from
*/
-unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu)
+unsigned long ring_buffer_overrun_cpu(struct trace_buffer *buffer, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer;
unsigned long ret;
@@ -3440,7 +3439,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu);
* @cpu: The per CPU buffer to get the number of overruns from
*/
unsigned long
-ring_buffer_commit_overrun_cpu(struct ring_buffer *buffer, int cpu)
+ring_buffer_commit_overrun_cpu(struct trace_buffer *buffer, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer;
unsigned long ret;
@@ -3462,7 +3461,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_commit_overrun_cpu);
* @cpu: The per CPU buffer to get the number of overruns from
*/
unsigned long
-ring_buffer_dropped_events_cpu(struct ring_buffer *buffer, int cpu)
+ring_buffer_dropped_events_cpu(struct trace_buffer *buffer, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer;
unsigned long ret;
@@ -3483,7 +3482,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_dropped_events_cpu);
* @cpu: The per CPU buffer to get the number of events read
*/
unsigned long
-ring_buffer_read_events_cpu(struct ring_buffer *buffer, int cpu)
+ring_buffer_read_events_cpu(struct trace_buffer *buffer, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer;
@@ -3502,7 +3501,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_read_events_cpu);
* Returns the total number of entries in the ring buffer
* (all CPU entries)
*/
-unsigned long ring_buffer_entries(struct ring_buffer *buffer)
+unsigned long ring_buffer_entries(struct trace_buffer *buffer)
{
struct ring_buffer_per_cpu *cpu_buffer;
unsigned long entries = 0;
@@ -3525,7 +3524,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_entries);
* Returns the total number of overruns in the ring buffer
* (all CPU entries)
*/
-unsigned long ring_buffer_overruns(struct ring_buffer *buffer)
+unsigned long ring_buffer_overruns(struct trace_buffer *buffer)
{
struct ring_buffer_per_cpu *cpu_buffer;
unsigned long overruns = 0;
@@ -3949,7 +3948,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_peek);
static struct ring_buffer_event *
rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
{
- struct ring_buffer *buffer;
+ struct trace_buffer *buffer;
struct ring_buffer_per_cpu *cpu_buffer;
struct ring_buffer_event *event;
int nr_loops = 0;
@@ -4077,7 +4076,7 @@ rb_reader_unlock(struct ring_buffer_per_cpu *cpu_buffer, bool locked)
* not consume the data.
*/
struct ring_buffer_event *
-ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts,
+ring_buffer_peek(struct trace_buffer *buffer, int cpu, u64 *ts,
unsigned long *lost_events)
{
struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
@@ -4141,7 +4140,7 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
* and eventually empty the ring buffer if the producer is slower.
*/
struct ring_buffer_event *
-ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts,
+ring_buffer_consume(struct trace_buffer *buffer, int cpu, u64 *ts,
unsigned long *lost_events)
{
struct ring_buffer_per_cpu *cpu_buffer;
@@ -4201,7 +4200,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_consume);
* This overall must be paired with ring_buffer_read_finish.
*/
struct ring_buffer_iter *
-ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu, gfp_t flags)
+ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags)
{
struct ring_buffer_per_cpu *cpu_buffer;
struct ring_buffer_iter *iter;
@@ -4331,8 +4330,9 @@ EXPORT_SYMBOL_GPL(ring_buffer_read);
/**
* ring_buffer_size - return the size of the ring buffer (in bytes)
* @buffer: The ring buffer.
+ * @cpu: The CPU to get ring buffer size from.
*/
-unsigned long ring_buffer_size(struct ring_buffer *buffer, int cpu)
+unsigned long ring_buffer_size(struct trace_buffer *buffer, int cpu)
{
/*
* Earlier, this method returned
@@ -4398,7 +4398,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
* @buffer: The ring buffer to reset a per cpu buffer of
* @cpu: The CPU buffer to be reset
*/
-void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
+void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
unsigned long flags;
@@ -4435,7 +4435,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
* ring_buffer_reset - reset a ring buffer
* @buffer: The ring buffer to reset all cpu buffers
*/
-void ring_buffer_reset(struct ring_buffer *buffer)
+void ring_buffer_reset(struct trace_buffer *buffer)
{
int cpu;
@@ -4448,7 +4448,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_reset);
* rind_buffer_empty - is the ring buffer empty?
* @buffer: The ring buffer to test
*/
-bool ring_buffer_empty(struct ring_buffer *buffer)
+bool ring_buffer_empty(struct trace_buffer *buffer)
{
struct ring_buffer_per_cpu *cpu_buffer;
unsigned long flags;
@@ -4478,7 +4478,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_empty);
* @buffer: The ring buffer
* @cpu: The CPU buffer to test
*/
-bool ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu)
+bool ring_buffer_empty_cpu(struct trace_buffer *buffer, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer;
unsigned long flags;
@@ -4504,14 +4504,15 @@ EXPORT_SYMBOL_GPL(ring_buffer_empty_cpu);
* ring_buffer_swap_cpu - swap a CPU buffer between two ring buffers
* @buffer_a: One buffer to swap with
* @buffer_b: The other buffer to swap with
+ * @cpu: the CPU of the buffers to swap
*
* This function is useful for tracers that want to take a "snapshot"
* of a CPU buffer and has another back up buffer lying around.
* it is expected that the tracer handles the cpu buffer not being
* used at the moment.
*/
-int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
- struct ring_buffer *buffer_b, int cpu)
+int ring_buffer_swap_cpu(struct trace_buffer *buffer_a,
+ struct trace_buffer *buffer_b, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer_a;
struct ring_buffer_per_cpu *cpu_buffer_b;
@@ -4590,7 +4591,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu);
* Returns:
* The page allocated, or ERR_PTR
*/
-void *ring_buffer_alloc_read_page(struct ring_buffer *buffer, int cpu)
+void *ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer;
struct buffer_data_page *bpage = NULL;
@@ -4637,7 +4638,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_alloc_read_page);
*
* Free a page allocated from ring_buffer_alloc_read_page.
*/
-void ring_buffer_free_read_page(struct ring_buffer *buffer, int cpu, void *data)
+void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu, void *data)
{
struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
struct buffer_data_page *bpage = data;
@@ -4697,7 +4698,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_free_read_page);
* >=0 if data has been transferred, returns the offset of consumed data.
* <0 if no data has been transferred.
*/
-int ring_buffer_read_page(struct ring_buffer *buffer,
+int ring_buffer_read_page(struct trace_buffer *buffer,
void **data_page, size_t len, int cpu, int full)
{
struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
@@ -4868,12 +4869,12 @@ EXPORT_SYMBOL_GPL(ring_buffer_read_page);
*/
int trace_rb_cpu_prepare(unsigned int cpu, struct hlist_node *node)
{
- struct ring_buffer *buffer;
+ struct trace_buffer *buffer;
long nr_pages_same;
int cpu_i;
unsigned long nr_pages;
- buffer = container_of(node, struct ring_buffer, node);
+ buffer = container_of(node, struct trace_buffer, node);
if (cpumask_test_cpu(cpu, buffer->cpumask))
return 0;
@@ -4923,7 +4924,7 @@ int trace_rb_cpu_prepare(unsigned int cpu, struct hlist_node *node)
static struct task_struct *rb_threads[NR_CPUS] __initdata;
struct rb_test_data {
- struct ring_buffer *buffer;
+ struct trace_buffer *buffer;
unsigned long events;
unsigned long bytes_written;
unsigned long bytes_alloc;
@@ -5065,7 +5066,7 @@ static __init int rb_hammer_test(void *arg)
static __init int test_ringbuffer(void)
{
struct task_struct *rb_hammer;
- struct ring_buffer *buffer;
+ struct trace_buffer *buffer;
int cpu;
int ret = 0;
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index 32149e46551c..8df0aa810950 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -29,7 +29,7 @@ static int reader_finish;
static DECLARE_COMPLETION(read_start);
static DECLARE_COMPLETION(read_done);
-static struct ring_buffer *buffer;
+static struct trace_buffer *buffer;
static struct task_struct *producer;
static struct task_struct *consumer;
static unsigned long read;
diff --git a/kernel/trace/synth_event_gen_test.c b/kernel/trace/synth_event_gen_test.c
new file mode 100644
index 000000000000..4aefe003cb7c
--- /dev/null
+++ b/kernel/trace/synth_event_gen_test.c
@@ -0,0 +1,523 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Test module for in-kernel sythetic event creation and generation.
+ *
+ * Copyright (C) 2019 Tom Zanussi <zanussi@kernel.org>
+ */
+
+#include <linux/module.h>
+#include <linux/trace_events.h>
+
+/*
+ * This module is a simple test of basic functionality for in-kernel
+ * synthetic event creation and generation, the first and second tests
+ * using synth_event_gen_cmd_start() and synth_event_add_field(), the
+ * third uses synth_event_create() to do it all at once with a static
+ * field array.
+ *
+ * Following that are a few examples using the created events to test
+ * various ways of tracing a synthetic event.
+ *
+ * To test, select CONFIG_SYNTH_EVENT_GEN_TEST and build the module.
+ * Then:
+ *
+ * # insmod kernel/trace/synth_event_gen_test.ko
+ * # cat /sys/kernel/debug/tracing/trace
+ *
+ * You should see several events in the trace buffer -
+ * "create_synth_test", "empty_synth_test", and several instances of
+ * "gen_synth_test".
+ *
+ * To remove the events, remove the module:
+ *
+ * # rmmod synth_event_gen_test
+ *
+ */
+
+static struct trace_event_file *create_synth_test;
+static struct trace_event_file *empty_synth_test;
+static struct trace_event_file *gen_synth_test;
+
+/*
+ * Test to make sure we can create a synthetic event, then add more
+ * fields.
+ */
+static int __init test_gen_synth_cmd(void)
+{
+ struct dynevent_cmd cmd;
+ u64 vals[7];
+ char *buf;
+ int ret;
+
+ /* Create a buffer to hold the generated command */
+ buf = kzalloc(MAX_DYNEVENT_CMD_LEN, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ /* Before generating the command, initialize the cmd object */
+ synth_event_cmd_init(&cmd, buf, MAX_DYNEVENT_CMD_LEN);
+
+ /*
+ * Create the empty gen_synth_test synthetic event with the
+ * first 4 fields.
+ */
+ ret = synth_event_gen_cmd_start(&cmd, "gen_synth_test", THIS_MODULE,
+ "pid_t", "next_pid_field",
+ "char[16]", "next_comm_field",
+ "u64", "ts_ns",
+ "u64", "ts_ms");
+ if (ret)
+ goto free;
+
+ /* Use synth_event_add_field to add the rest of the fields */
+
+ ret = synth_event_add_field(&cmd, "unsigned int", "cpu");
+ if (ret)
+ goto free;
+
+ ret = synth_event_add_field(&cmd, "char[64]", "my_string_field");
+ if (ret)
+ goto free;
+
+ ret = synth_event_add_field(&cmd, "int", "my_int_field");
+ if (ret)
+ goto free;
+
+ ret = synth_event_gen_cmd_end(&cmd);
+ if (ret)
+ goto free;
+
+ /*
+ * Now get the gen_synth_test event file. We need to prevent
+ * the instance and event from disappearing from underneath
+ * us, which trace_get_event_file() does (though in this case
+ * we're using the top-level instance which never goes away).
+ */
+ gen_synth_test = trace_get_event_file(NULL, "synthetic",
+ "gen_synth_test");
+ if (IS_ERR(gen_synth_test)) {
+ ret = PTR_ERR(gen_synth_test);
+ goto delete;
+ }
+
+ /* Enable the event or you won't see anything */
+ ret = trace_array_set_clr_event(gen_synth_test->tr,
+ "synthetic", "gen_synth_test", true);
+ if (ret) {
+ trace_put_event_file(gen_synth_test);
+ goto delete;
+ }
+
+ /* Create some bogus values just for testing */
+
+ vals[0] = 777; /* next_pid_field */
+ vals[1] = (u64)"hula hoops"; /* next_comm_field */
+ vals[2] = 1000000; /* ts_ns */
+ vals[3] = 1000; /* ts_ms */
+ vals[4] = smp_processor_id(); /* cpu */
+ vals[5] = (u64)"thneed"; /* my_string_field */
+ vals[6] = 598; /* my_int_field */
+
+ /* Now generate a gen_synth_test event */
+ ret = synth_event_trace_array(gen_synth_test, vals, ARRAY_SIZE(vals));
+ out:
+ return ret;
+ delete:
+ /* We got an error after creating the event, delete it */
+ synth_event_delete("gen_synth_test");
+ free:
+ kfree(buf);
+
+ goto out;
+}
+
+/*
+ * Test to make sure we can create an initially empty synthetic event,
+ * then add all the fields.
+ */
+static int __init test_empty_synth_event(void)
+{
+ struct dynevent_cmd cmd;
+ u64 vals[7];
+ char *buf;
+ int ret;
+
+ /* Create a buffer to hold the generated command */
+ buf = kzalloc(MAX_DYNEVENT_CMD_LEN, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ /* Before generating the command, initialize the cmd object */
+ synth_event_cmd_init(&cmd, buf, MAX_DYNEVENT_CMD_LEN);
+
+ /*
+ * Create the empty_synth_test synthetic event with no fields.
+ */
+ ret = synth_event_gen_cmd_start(&cmd, "empty_synth_test", THIS_MODULE);
+ if (ret)
+ goto free;
+
+ /* Use synth_event_add_field to add all of the fields */
+
+ ret = synth_event_add_field(&cmd, "pid_t", "next_pid_field");
+ if (ret)
+ goto free;
+
+ ret = synth_event_add_field(&cmd, "char[16]", "next_comm_field");
+ if (ret)
+ goto free;
+
+ ret = synth_event_add_field(&cmd, "u64", "ts_ns");
+ if (ret)
+ goto free;
+
+ ret = synth_event_add_field(&cmd, "u64", "ts_ms");
+ if (ret)
+ goto free;
+
+ ret = synth_event_add_field(&cmd, "unsigned int", "cpu");
+ if (ret)
+ goto free;
+
+ ret = synth_event_add_field(&cmd, "char[64]", "my_string_field");
+ if (ret)
+ goto free;
+
+ ret = synth_event_add_field(&cmd, "int", "my_int_field");
+ if (ret)
+ goto free;
+
+ /* All fields have been added, close and register the synth event */
+
+ ret = synth_event_gen_cmd_end(&cmd);
+ if (ret)
+ goto free;
+
+ /*
+ * Now get the empty_synth_test event file. We need to
+ * prevent the instance and event from disappearing from
+ * underneath us, which trace_get_event_file() does (though in
+ * this case we're using the top-level instance which never
+ * goes away).
+ */
+ empty_synth_test = trace_get_event_file(NULL, "synthetic",
+ "empty_synth_test");
+ if (IS_ERR(empty_synth_test)) {
+ ret = PTR_ERR(empty_synth_test);
+ goto delete;
+ }
+
+ /* Enable the event or you won't see anything */
+ ret = trace_array_set_clr_event(empty_synth_test->tr,
+ "synthetic", "empty_synth_test", true);
+ if (ret) {
+ trace_put_event_file(empty_synth_test);
+ goto delete;
+ }
+
+ /* Create some bogus values just for testing */
+
+ vals[0] = 777; /* next_pid_field */
+ vals[1] = (u64)"tiddlywinks"; /* next_comm_field */
+ vals[2] = 1000000; /* ts_ns */
+ vals[3] = 1000; /* ts_ms */
+ vals[4] = smp_processor_id(); /* cpu */
+ vals[5] = (u64)"thneed_2.0"; /* my_string_field */
+ vals[6] = 399; /* my_int_field */
+
+ /* Now trace an empty_synth_test event */
+ ret = synth_event_trace_array(empty_synth_test, vals, ARRAY_SIZE(vals));
+ out:
+ return ret;
+ delete:
+ /* We got an error after creating the event, delete it */
+ synth_event_delete("empty_synth_test");
+ free:
+ kfree(buf);
+
+ goto out;
+}
+
+static struct synth_field_desc create_synth_test_fields[] = {
+ { .type = "pid_t", .name = "next_pid_field" },
+ { .type = "char[16]", .name = "next_comm_field" },
+ { .type = "u64", .name = "ts_ns" },
+ { .type = "u64", .name = "ts_ms" },
+ { .type = "unsigned int", .name = "cpu" },
+ { .type = "char[64]", .name = "my_string_field" },
+ { .type = "int", .name = "my_int_field" },
+};
+
+/*
+ * Test synthetic event creation all at once from array of field
+ * descriptors.
+ */
+static int __init test_create_synth_event(void)
+{
+ u64 vals[7];
+ int ret;
+
+ /* Create the create_synth_test event with the fields above */
+ ret = synth_event_create("create_synth_test",
+ create_synth_test_fields,
+ ARRAY_SIZE(create_synth_test_fields),
+ THIS_MODULE);
+ if (ret)
+ goto out;
+
+ /*
+ * Now get the create_synth_test event file. We need to
+ * prevent the instance and event from disappearing from
+ * underneath us, which trace_get_event_file() does (though in
+ * this case we're using the top-level instance which never
+ * goes away).
+ */
+ create_synth_test = trace_get_event_file(NULL, "synthetic",
+ "create_synth_test");
+ if (IS_ERR(create_synth_test)) {
+ ret = PTR_ERR(create_synth_test);
+ goto delete;
+ }
+
+ /* Enable the event or you won't see anything */
+ ret = trace_array_set_clr_event(create_synth_test->tr,
+ "synthetic", "create_synth_test", true);
+ if (ret) {
+ trace_put_event_file(create_synth_test);
+ goto delete;
+ }
+
+ /* Create some bogus values just for testing */
+
+ vals[0] = 777; /* next_pid_field */
+ vals[1] = (u64)"tiddlywinks"; /* next_comm_field */
+ vals[2] = 1000000; /* ts_ns */
+ vals[3] = 1000; /* ts_ms */
+ vals[4] = smp_processor_id(); /* cpu */
+ vals[5] = (u64)"thneed"; /* my_string_field */
+ vals[6] = 398; /* my_int_field */
+
+ /* Now generate a create_synth_test event */
+ ret = synth_event_trace_array(create_synth_test, vals, ARRAY_SIZE(vals));
+ out:
+ return ret;
+ delete:
+ /* We got an error after creating the event, delete it */
+ ret = synth_event_delete("create_synth_test");
+
+ goto out;
+}
+
+/*
+ * Test tracing a synthetic event by reserving trace buffer space,
+ * then filling in fields one after another.
+ */
+static int __init test_add_next_synth_val(void)
+{
+ struct synth_event_trace_state trace_state;
+ int ret;
+
+ /* Start by reserving space in the trace buffer */
+ ret = synth_event_trace_start(gen_synth_test, &trace_state);
+ if (ret)
+ return ret;
+
+ /* Write some bogus values into the trace buffer, one after another */
+
+ /* next_pid_field */
+ ret = synth_event_add_next_val(777, &trace_state);
+ if (ret)
+ goto out;
+
+ /* next_comm_field */
+ ret = synth_event_add_next_val((u64)"slinky", &trace_state);
+ if (ret)
+ goto out;
+
+ /* ts_ns */
+ ret = synth_event_add_next_val(1000000, &trace_state);
+ if (ret)
+ goto out;
+
+ /* ts_ms */
+ ret = synth_event_add_next_val(1000, &trace_state);
+ if (ret)
+ goto out;
+
+ /* cpu */
+ ret = synth_event_add_next_val(smp_processor_id(), &trace_state);
+ if (ret)
+ goto out;
+
+ /* my_string_field */
+ ret = synth_event_add_next_val((u64)"thneed_2.01", &trace_state);
+ if (ret)
+ goto out;
+
+ /* my_int_field */
+ ret = synth_event_add_next_val(395, &trace_state);
+ out:
+ /* Finally, commit the event */
+ ret = synth_event_trace_end(&trace_state);
+
+ return ret;
+}
+
+/*
+ * Test tracing a synthetic event by reserving trace buffer space,
+ * then filling in fields using field names, which can be done in any
+ * order.
+ */
+static int __init test_add_synth_val(void)
+{
+ struct synth_event_trace_state trace_state;
+ int ret;
+
+ /* Start by reserving space in the trace buffer */
+ ret = synth_event_trace_start(gen_synth_test, &trace_state);
+ if (ret)
+ return ret;
+
+ /* Write some bogus values into the trace buffer, using field names */
+
+ ret = synth_event_add_val("ts_ns", 1000000, &trace_state);
+ if (ret)
+ goto out;
+
+ ret = synth_event_add_val("ts_ms", 1000, &trace_state);
+ if (ret)
+ goto out;
+
+ ret = synth_event_add_val("cpu", smp_processor_id(), &trace_state);
+ if (ret)
+ goto out;
+
+ ret = synth_event_add_val("next_pid_field", 777, &trace_state);
+ if (ret)
+ goto out;
+
+ ret = synth_event_add_val("next_comm_field", (u64)"silly putty",
+ &trace_state);
+ if (ret)
+ goto out;
+
+ ret = synth_event_add_val("my_string_field", (u64)"thneed_9",
+ &trace_state);
+ if (ret)
+ goto out;
+
+ ret = synth_event_add_val("my_int_field", 3999, &trace_state);
+ out:
+ /* Finally, commit the event */
+ ret = synth_event_trace_end(&trace_state);
+
+ return ret;
+}
+
+/*
+ * Test tracing a synthetic event all at once from array of values.
+ */
+static int __init test_trace_synth_event(void)
+{
+ int ret;
+
+ /* Trace some bogus values just for testing */
+ ret = synth_event_trace(create_synth_test, 7, /* number of values */
+ 444, /* next_pid_field */
+ (u64)"clackers", /* next_comm_field */
+ 1000000, /* ts_ns */
+ 1000, /* ts_ms */
+ smp_processor_id(), /* cpu */
+ (u64)"Thneed", /* my_string_field */
+ 999); /* my_int_field */
+ return ret;
+}
+
+static int __init synth_event_gen_test_init(void)
+{
+ int ret;
+
+ ret = test_gen_synth_cmd();
+ if (ret)
+ return ret;
+
+ ret = test_empty_synth_event();
+ if (ret) {
+ WARN_ON(trace_array_set_clr_event(gen_synth_test->tr,
+ "synthetic",
+ "gen_synth_test", false));
+ trace_put_event_file(gen_synth_test);
+ WARN_ON(synth_event_delete("gen_synth_test"));
+ goto out;
+ }
+
+ ret = test_create_synth_event();
+ if (ret) {
+ WARN_ON(trace_array_set_clr_event(gen_synth_test->tr,
+ "synthetic",
+ "gen_synth_test", false));
+ trace_put_event_file(gen_synth_test);
+ WARN_ON(synth_event_delete("gen_synth_test"));
+
+ WARN_ON(trace_array_set_clr_event(empty_synth_test->tr,
+ "synthetic",
+ "empty_synth_test", false));
+ trace_put_event_file(empty_synth_test);
+ WARN_ON(synth_event_delete("empty_synth_test"));
+ goto out;
+ }
+
+ ret = test_add_next_synth_val();
+ WARN_ON(ret);
+
+ ret = test_add_synth_val();
+ WARN_ON(ret);
+
+ ret = test_trace_synth_event();
+ WARN_ON(ret);
+ out:
+ return ret;
+}
+
+static void __exit synth_event_gen_test_exit(void)
+{
+ /* Disable the event or you can't remove it */
+ WARN_ON(trace_array_set_clr_event(gen_synth_test->tr,
+ "synthetic",
+ "gen_synth_test", false));
+
+ /* Now give the file and instance back */
+ trace_put_event_file(gen_synth_test);
+
+ /* Now unregister and free the synthetic event */
+ WARN_ON(synth_event_delete("gen_synth_test"));
+
+ /* Disable the event or you can't remove it */
+ WARN_ON(trace_array_set_clr_event(empty_synth_test->tr,
+ "synthetic",
+ "empty_synth_test", false));
+
+ /* Now give the file and instance back */
+ trace_put_event_file(empty_synth_test);
+
+ /* Now unregister and free the synthetic event */
+ WARN_ON(synth_event_delete("empty_synth_test"));
+
+ /* Disable the event or you can't remove it */
+ WARN_ON(trace_array_set_clr_event(create_synth_test->tr,
+ "synthetic",
+ "create_synth_test", false));
+
+ /* Now give the file and instance back */
+ trace_put_event_file(create_synth_test);
+
+ /* Now unregister and free the synthetic event */
+ WARN_ON(synth_event_delete("create_synth_test"));
+}
+
+module_init(synth_event_gen_test_init)
+module_exit(synth_event_gen_test_exit)
+
+MODULE_AUTHOR("Tom Zanussi");
+MODULE_DESCRIPTION("synthetic event generation test");
+MODULE_LICENSE("GPL v2");
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 5b6ee4aadc26..c797a15a1fc7 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -162,8 +162,8 @@ union trace_eval_map_item {
static union trace_eval_map_item *trace_eval_maps;
#endif /* CONFIG_TRACE_EVAL_MAP_FILE */
-static int tracing_set_tracer(struct trace_array *tr, const char *buf);
-static void ftrace_trace_userstack(struct ring_buffer *buffer,
+int tracing_set_tracer(struct trace_array *tr, const char *buf);
+static void ftrace_trace_userstack(struct trace_buffer *buffer,
unsigned long flags, int pc);
#define MAX_TRACER_SIZE 100
@@ -338,7 +338,7 @@ int tracing_check_open_get_tr(struct trace_array *tr)
}
int call_filter_check_discard(struct trace_event_call *call, void *rec,
- struct ring_buffer *buffer,
+ struct trace_buffer *buffer,
struct ring_buffer_event *event)
{
if (unlikely(call->flags & TRACE_EVENT_FL_FILTERED) &&
@@ -603,7 +603,7 @@ int trace_pid_write(struct trace_pid_list *filtered_pids,
return read;
}
-static u64 buffer_ftrace_now(struct trace_buffer *buf, int cpu)
+static u64 buffer_ftrace_now(struct array_buffer *buf, int cpu)
{
u64 ts;
@@ -619,7 +619,7 @@ static u64 buffer_ftrace_now(struct trace_buffer *buf, int cpu)
u64 ftrace_now(int cpu)
{
- return buffer_ftrace_now(&global_trace.trace_buffer, cpu);
+ return buffer_ftrace_now(&global_trace.array_buffer, cpu);
}
/**
@@ -747,22 +747,22 @@ static inline void trace_access_lock_init(void)
#endif
#ifdef CONFIG_STACKTRACE
-static void __ftrace_trace_stack(struct ring_buffer *buffer,
+static void __ftrace_trace_stack(struct trace_buffer *buffer,
unsigned long flags,
int skip, int pc, struct pt_regs *regs);
static inline void ftrace_trace_stack(struct trace_array *tr,
- struct ring_buffer *buffer,
+ struct trace_buffer *buffer,
unsigned long flags,
int skip, int pc, struct pt_regs *regs);
#else
-static inline void __ftrace_trace_stack(struct ring_buffer *buffer,
+static inline void __ftrace_trace_stack(struct trace_buffer *buffer,
unsigned long flags,
int skip, int pc, struct pt_regs *regs)
{
}
static inline void ftrace_trace_stack(struct trace_array *tr,
- struct ring_buffer *buffer,
+ struct trace_buffer *buffer,
unsigned long flags,
int skip, int pc, struct pt_regs *regs)
{
@@ -780,7 +780,7 @@ trace_event_setup(struct ring_buffer_event *event,
}
static __always_inline struct ring_buffer_event *
-__trace_buffer_lock_reserve(struct ring_buffer *buffer,
+__trace_buffer_lock_reserve(struct trace_buffer *buffer,
int type,
unsigned long len,
unsigned long flags, int pc)
@@ -796,8 +796,8 @@ __trace_buffer_lock_reserve(struct ring_buffer *buffer,
void tracer_tracing_on(struct trace_array *tr)
{
- if (tr->trace_buffer.buffer)
- ring_buffer_record_on(tr->trace_buffer.buffer);
+ if (tr->array_buffer.buffer)
+ ring_buffer_record_on(tr->array_buffer.buffer);
/*
* This flag is looked at when buffers haven't been allocated
* yet, or by some tracers (like irqsoff), that just want to
@@ -825,7 +825,7 @@ EXPORT_SYMBOL_GPL(tracing_on);
static __always_inline void
-__buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event)
+__buffer_unlock_commit(struct trace_buffer *buffer, struct ring_buffer_event *event)
{
__this_cpu_write(trace_taskinfo_save, true);
@@ -848,7 +848,7 @@ __buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *eve
int __trace_puts(unsigned long ip, const char *str, int size)
{
struct ring_buffer_event *event;
- struct ring_buffer *buffer;
+ struct trace_buffer *buffer;
struct print_entry *entry;
unsigned long irq_flags;
int alloc;
@@ -865,11 +865,14 @@ int __trace_puts(unsigned long ip, const char *str, int size)
alloc = sizeof(*entry) + size + 2; /* possible \n added */
local_save_flags(irq_flags);
- buffer = global_trace.trace_buffer.buffer;
+ buffer = global_trace.array_buffer.buffer;
+ ring_buffer_nest_start(buffer);
event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc,
irq_flags, pc);
- if (!event)
- return 0;
+ if (!event) {
+ size = 0;
+ goto out;
+ }
entry = ring_buffer_event_data(event);
entry->ip = ip;
@@ -885,7 +888,8 @@ int __trace_puts(unsigned long ip, const char *str, int size)
__buffer_unlock_commit(buffer, event);
ftrace_trace_stack(&global_trace, buffer, irq_flags, 4, pc, NULL);
-
+ out:
+ ring_buffer_nest_end(buffer);
return size;
}
EXPORT_SYMBOL_GPL(__trace_puts);
@@ -898,10 +902,11 @@ EXPORT_SYMBOL_GPL(__trace_puts);
int __trace_bputs(unsigned long ip, const char *str)
{
struct ring_buffer_event *event;
- struct ring_buffer *buffer;
+ struct trace_buffer *buffer;
struct bputs_entry *entry;
unsigned long irq_flags;
int size = sizeof(struct bputs_entry);
+ int ret = 0;
int pc;
if (!(global_trace.trace_flags & TRACE_ITER_PRINTK))
@@ -913,11 +918,13 @@ int __trace_bputs(unsigned long ip, const char *str)
return 0;
local_save_flags(irq_flags);
- buffer = global_trace.trace_buffer.buffer;
+ buffer = global_trace.array_buffer.buffer;
+
+ ring_buffer_nest_start(buffer);
event = __trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size,
irq_flags, pc);
if (!event)
- return 0;
+ goto out;
entry = ring_buffer_event_data(event);
entry->ip = ip;
@@ -926,7 +933,10 @@ int __trace_bputs(unsigned long ip, const char *str)
__buffer_unlock_commit(buffer, event);
ftrace_trace_stack(&global_trace, buffer, irq_flags, 4, pc, NULL);
- return 1;
+ ret = 1;
+ out:
+ ring_buffer_nest_end(buffer);
+ return ret;
}
EXPORT_SYMBOL_GPL(__trace_bputs);
@@ -1036,9 +1046,9 @@ void *tracing_cond_snapshot_data(struct trace_array *tr)
}
EXPORT_SYMBOL_GPL(tracing_cond_snapshot_data);
-static int resize_buffer_duplicate_size(struct trace_buffer *trace_buf,
- struct trace_buffer *size_buf, int cpu_id);
-static void set_buffer_entries(struct trace_buffer *buf, unsigned long val);
+static int resize_buffer_duplicate_size(struct array_buffer *trace_buf,
+ struct array_buffer *size_buf, int cpu_id);
+static void set_buffer_entries(struct array_buffer *buf, unsigned long val);
int tracing_alloc_snapshot_instance(struct trace_array *tr)
{
@@ -1048,7 +1058,7 @@ int tracing_alloc_snapshot_instance(struct trace_array *tr)
/* allocate spare buffer */
ret = resize_buffer_duplicate_size(&tr->max_buffer,
- &tr->trace_buffer, RING_BUFFER_ALL_CPUS);
+ &tr->array_buffer, RING_BUFFER_ALL_CPUS);
if (ret < 0)
return ret;
@@ -1251,8 +1261,8 @@ EXPORT_SYMBOL_GPL(tracing_snapshot_cond_disable);
void tracer_tracing_off(struct trace_array *tr)
{
- if (tr->trace_buffer.buffer)
- ring_buffer_record_off(tr->trace_buffer.buffer);
+ if (tr->array_buffer.buffer)
+ ring_buffer_record_off(tr->array_buffer.buffer);
/*
* This flag is looked at when buffers haven't been allocated
* yet, or by some tracers (like irqsoff), that just want to
@@ -1294,8 +1304,8 @@ void disable_trace_on_warning(void)
*/
bool tracer_tracing_is_on(struct trace_array *tr)
{
- if (tr->trace_buffer.buffer)
- return ring_buffer_record_is_on(tr->trace_buffer.buffer);
+ if (tr->array_buffer.buffer)
+ return ring_buffer_record_is_on(tr->array_buffer.buffer);
return !tr->buffer_disabled;
}
@@ -1590,8 +1600,8 @@ void latency_fsnotify(struct trace_array *tr)
static void
__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
{
- struct trace_buffer *trace_buf = &tr->trace_buffer;
- struct trace_buffer *max_buf = &tr->max_buffer;
+ struct array_buffer *trace_buf = &tr->array_buffer;
+ struct array_buffer *max_buf = &tr->max_buffer;
struct trace_array_cpu *data = per_cpu_ptr(trace_buf->data, cpu);
struct trace_array_cpu *max_data = per_cpu_ptr(max_buf->data, cpu);
@@ -1649,8 +1659,8 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu,
arch_spin_lock(&tr->max_lock);
- /* Inherit the recordable setting from trace_buffer */
- if (ring_buffer_record_is_set_on(tr->trace_buffer.buffer))
+ /* Inherit the recordable setting from array_buffer */
+ if (ring_buffer_record_is_set_on(tr->array_buffer.buffer))
ring_buffer_record_on(tr->max_buffer.buffer);
else
ring_buffer_record_off(tr->max_buffer.buffer);
@@ -1659,7 +1669,7 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu,
if (tr->cond_snapshot && !tr->cond_snapshot->update(tr, cond_data))
goto out_unlock;
#endif
- swap(tr->trace_buffer.buffer, tr->max_buffer.buffer);
+ swap(tr->array_buffer.buffer, tr->max_buffer.buffer);
__update_max_tr(tr, tsk, cpu);
@@ -1692,7 +1702,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
arch_spin_lock(&tr->max_lock);
- ret = ring_buffer_swap_cpu(tr->max_buffer.buffer, tr->trace_buffer.buffer, cpu);
+ ret = ring_buffer_swap_cpu(tr->max_buffer.buffer, tr->array_buffer.buffer, cpu);
if (ret == -EBUSY) {
/*
@@ -1718,7 +1728,7 @@ static int wait_on_pipe(struct trace_iterator *iter, int full)
if (trace_buffer_iter(iter, iter->cpu_file))
return 0;
- return ring_buffer_wait(iter->trace_buffer->buffer, iter->cpu_file,
+ return ring_buffer_wait(iter->array_buffer->buffer, iter->cpu_file,
full);
}
@@ -1769,7 +1779,7 @@ static int run_tracer_selftest(struct tracer *type)
* internal tracing to verify that everything is in order.
* If we fail, we do not register this tracer.
*/
- tracing_reset_online_cpus(&tr->trace_buffer);
+ tracing_reset_online_cpus(&tr->array_buffer);
tr->current_trace = type;
@@ -1795,7 +1805,7 @@ static int run_tracer_selftest(struct tracer *type)
return -1;
}
/* Only reset on passing, to avoid touching corrupted buffers */
- tracing_reset_online_cpus(&tr->trace_buffer);
+ tracing_reset_online_cpus(&tr->array_buffer);
#ifdef CONFIG_TRACER_MAX_TRACE
if (type->use_max_tr) {
@@ -1962,9 +1972,9 @@ int __init register_tracer(struct tracer *type)
return ret;
}
-static void tracing_reset_cpu(struct trace_buffer *buf, int cpu)
+static void tracing_reset_cpu(struct array_buffer *buf, int cpu)
{
- struct ring_buffer *buffer = buf->buffer;
+ struct trace_buffer *buffer = buf->buffer;
if (!buffer)
return;
@@ -1978,9 +1988,9 @@ static void tracing_reset_cpu(struct trace_buffer *buf, int cpu)
ring_buffer_record_enable(buffer);
}
-void tracing_reset_online_cpus(struct trace_buffer *buf)
+void tracing_reset_online_cpus(struct array_buffer *buf)
{
- struct ring_buffer *buffer = buf->buffer;
+ struct trace_buffer *buffer = buf->buffer;
int cpu;
if (!buffer)
@@ -2008,7 +2018,7 @@ void tracing_reset_all_online_cpus(void)
if (!tr->clear_trace)
continue;
tr->clear_trace = false;
- tracing_reset_online_cpus(&tr->trace_buffer);
+ tracing_reset_online_cpus(&tr->array_buffer);
#ifdef CONFIG_TRACER_MAX_TRACE
tracing_reset_online_cpus(&tr->max_buffer);
#endif
@@ -2098,7 +2108,7 @@ int is_tracing_stopped(void)
*/
void tracing_start(void)
{
- struct ring_buffer *buffer;
+ struct trace_buffer *buffer;
unsigned long flags;
if (tracing_disabled)
@@ -2117,7 +2127,7 @@ void tracing_start(void)
/* Prevent the buffers from switching */
arch_spin_lock(&global_trace.max_lock);
- buffer = global_trace.trace_buffer.buffer;
+ buffer = global_trace.array_buffer.buffer;
if (buffer)
ring_buffer_record_enable(buffer);
@@ -2135,7 +2145,7 @@ void tracing_start(void)
static void tracing_start_tr(struct trace_array *tr)
{
- struct ring_buffer *buffer;
+ struct trace_buffer *buffer;
unsigned long flags;
if (tracing_disabled)
@@ -2156,7 +2166,7 @@ static void tracing_start_tr(struct trace_array *tr)
goto out;
}
- buffer = tr->trace_buffer.buffer;
+ buffer = tr->array_buffer.buffer;
if (buffer)
ring_buffer_record_enable(buffer);
@@ -2172,7 +2182,7 @@ static void tracing_start_tr(struct trace_array *tr)
*/
void tracing_stop(void)
{
- struct ring_buffer *buffer;
+ struct trace_buffer *buffer;
unsigned long flags;
raw_spin_lock_irqsave(&global_trace.start_lock, flags);
@@ -2182,7 +2192,7 @@ void tracing_stop(void)
/* Prevent the buffers from switching */
arch_spin_lock(&global_trace.max_lock);
- buffer = global_trace.trace_buffer.buffer;
+ buffer = global_trace.array_buffer.buffer;
if (buffer)
ring_buffer_record_disable(buffer);
@@ -2200,7 +2210,7 @@ void tracing_stop(void)
static void tracing_stop_tr(struct trace_array *tr)
{
- struct ring_buffer *buffer;
+ struct trace_buffer *buffer;
unsigned long flags;
/* If global, we need to also stop the max tracer */
@@ -2211,7 +2221,7 @@ static void tracing_stop_tr(struct trace_array *tr)
if (tr->stop_count++)
goto out;
- buffer = tr->trace_buffer.buffer;
+ buffer = tr->array_buffer.buffer;
if (buffer)
ring_buffer_record_disable(buffer);
@@ -2442,7 +2452,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned short type,
EXPORT_SYMBOL_GPL(tracing_generic_entry_update);
struct ring_buffer_event *
-trace_buffer_lock_reserve(struct ring_buffer *buffer,
+trace_buffer_lock_reserve(struct trace_buffer *buffer,
int type,
unsigned long len,
unsigned long flags, int pc)
@@ -2561,10 +2571,10 @@ void trace_buffered_event_disable(void)
preempt_enable();
}
-static struct ring_buffer *temp_buffer;
+static struct trace_buffer *temp_buffer;
struct ring_buffer_event *
-trace_event_buffer_lock_reserve(struct ring_buffer **current_rb,
+trace_event_buffer_lock_reserve(struct trace_buffer **current_rb,
struct trace_event_file *trace_file,
int type, unsigned long len,
unsigned long flags, int pc)
@@ -2572,7 +2582,7 @@ trace_event_buffer_lock_reserve(struct ring_buffer **current_rb,
struct ring_buffer_event *entry;
int val;
- *current_rb = trace_file->tr->trace_buffer.buffer;
+ *current_rb = trace_file->tr->array_buffer.buffer;
if (!ring_buffer_time_stamp_abs(*current_rb) && (trace_file->flags &
(EVENT_FILE_FL_SOFT_DISABLED | EVENT_FILE_FL_FILTERED)) &&
@@ -2610,6 +2620,7 @@ static DEFINE_MUTEX(tracepoint_printk_mutex);
static void output_printk(struct trace_event_buffer *fbuffer)
{
struct trace_event_call *event_call;
+ struct trace_event_file *file;
struct trace_event *event;
unsigned long flags;
struct trace_iterator *iter = tracepoint_print_iter;
@@ -2623,6 +2634,12 @@ static void output_printk(struct trace_event_buffer *fbuffer)
!event_call->event.funcs->trace)
return;
+ file = fbuffer->trace_file;
+ if (test_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags) ||
+ (unlikely(file->flags & EVENT_FILE_FL_FILTERED) &&
+ !filter_match_preds(file->filter, fbuffer->entry)))
+ return;
+
event = &fbuffer->trace_file->event_call->event;
spin_lock_irqsave(&tracepoint_iter_lock, flags);
@@ -2673,9 +2690,9 @@ void trace_event_buffer_commit(struct trace_event_buffer *fbuffer)
if (static_key_false(&tracepoint_printk_key.key))
output_printk(fbuffer);
- event_trigger_unlock_commit(fbuffer->trace_file, fbuffer->buffer,
+ event_trigger_unlock_commit_regs(fbuffer->trace_file, fbuffer->buffer,
fbuffer->event, fbuffer->entry,
- fbuffer->flags, fbuffer->pc);
+ fbuffer->flags, fbuffer->pc, fbuffer->regs);
}
EXPORT_SYMBOL_GPL(trace_event_buffer_commit);
@@ -2689,7 +2706,7 @@ EXPORT_SYMBOL_GPL(trace_event_buffer_commit);
# define STACK_SKIP 3
void trace_buffer_unlock_commit_regs(struct trace_array *tr,
- struct ring_buffer *buffer,
+ struct trace_buffer *buffer,
struct ring_buffer_event *event,
unsigned long flags, int pc,
struct pt_regs *regs)
@@ -2710,7 +2727,7 @@ void trace_buffer_unlock_commit_regs(struct trace_array *tr,
* Similar to trace_buffer_unlock_commit_regs() but do not dump stack.
*/
void
-trace_buffer_unlock_commit_nostack(struct ring_buffer *buffer,
+trace_buffer_unlock_commit_nostack(struct trace_buffer *buffer,
struct ring_buffer_event *event)
{
__buffer_unlock_commit(buffer, event);
@@ -2845,7 +2862,7 @@ trace_function(struct trace_array *tr,
int pc)
{
struct trace_event_call *call = &event_function;
- struct ring_buffer *buffer = tr->trace_buffer.buffer;
+ struct trace_buffer *buffer = tr->array_buffer.buffer;
struct ring_buffer_event *event;
struct ftrace_entry *entry;
@@ -2883,7 +2900,7 @@ struct ftrace_stacks {
static DEFINE_PER_CPU(struct ftrace_stacks, ftrace_stacks);
static DEFINE_PER_CPU(int, ftrace_stack_reserve);
-static void __ftrace_trace_stack(struct ring_buffer *buffer,
+static void __ftrace_trace_stack(struct trace_buffer *buffer,
unsigned long flags,
int skip, int pc, struct pt_regs *regs)
{
@@ -2958,7 +2975,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
}
static inline void ftrace_trace_stack(struct trace_array *tr,
- struct ring_buffer *buffer,
+ struct trace_buffer *buffer,
unsigned long flags,
int skip, int pc, struct pt_regs *regs)
{
@@ -2971,7 +2988,7 @@ static inline void ftrace_trace_stack(struct trace_array *tr,
void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
int pc)
{
- struct ring_buffer *buffer = tr->trace_buffer.buffer;
+ struct trace_buffer *buffer = tr->array_buffer.buffer;
if (rcu_is_watching()) {
__ftrace_trace_stack(buffer, flags, skip, pc, NULL);
@@ -3009,7 +3026,7 @@ void trace_dump_stack(int skip)
/* Skip 1 to skip this function. */
skip++;
#endif
- __ftrace_trace_stack(global_trace.trace_buffer.buffer,
+ __ftrace_trace_stack(global_trace.array_buffer.buffer,
flags, skip, preempt_count(), NULL);
}
EXPORT_SYMBOL_GPL(trace_dump_stack);
@@ -3018,7 +3035,7 @@ EXPORT_SYMBOL_GPL(trace_dump_stack);
static DEFINE_PER_CPU(int, user_stack_count);
static void
-ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
+ftrace_trace_userstack(struct trace_buffer *buffer, unsigned long flags, int pc)
{
struct trace_event_call *call = &event_user_stack;
struct ring_buffer_event *event;
@@ -3063,7 +3080,7 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
preempt_enable();
}
#else /* CONFIG_USER_STACKTRACE_SUPPORT */
-static void ftrace_trace_userstack(struct ring_buffer *buffer,
+static void ftrace_trace_userstack(struct trace_buffer *buffer,
unsigned long flags, int pc)
{
}
@@ -3109,7 +3126,7 @@ static int alloc_percpu_trace_buffer(void)
struct trace_buffer_struct *buffers;
buffers = alloc_percpu(struct trace_buffer_struct);
- if (WARN(!buffers, "Could not allocate percpu trace_printk buffer"))
+ if (MEM_FAIL(!buffers, "Could not allocate percpu trace_printk buffer"))
return -ENOMEM;
trace_percpu_buffer = buffers;
@@ -3154,7 +3171,7 @@ void trace_printk_init_buffers(void)
* directly here. If the global_trace.buffer is already
* allocated here, then this was called by module code.
*/
- if (global_trace.trace_buffer.buffer)
+ if (global_trace.array_buffer.buffer)
tracing_start_cmdline_record();
}
EXPORT_SYMBOL_GPL(trace_printk_init_buffers);
@@ -3188,7 +3205,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
{
struct trace_event_call *call = &event_bprint;
struct ring_buffer_event *event;
- struct ring_buffer *buffer;
+ struct trace_buffer *buffer;
struct trace_array *tr = &global_trace;
struct bprint_entry *entry;
unsigned long flags;
@@ -3213,11 +3230,12 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
len = vbin_printf((u32 *)tbuffer, TRACE_BUF_SIZE/sizeof(int), fmt, args);
if (len > TRACE_BUF_SIZE/sizeof(int) || len < 0)
- goto out;
+ goto out_put;
local_save_flags(flags);
size = sizeof(*entry) + sizeof(u32) * len;
- buffer = tr->trace_buffer.buffer;
+ buffer = tr->array_buffer.buffer;
+ ring_buffer_nest_start(buffer);
event = __trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size,
flags, pc);
if (!event)
@@ -3233,6 +3251,8 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
}
out:
+ ring_buffer_nest_end(buffer);
+out_put:
put_trace_buf();
out_nobuffer:
@@ -3245,7 +3265,7 @@ EXPORT_SYMBOL_GPL(trace_vbprintk);
__printf(3, 0)
static int
-__trace_array_vprintk(struct ring_buffer *buffer,
+__trace_array_vprintk(struct trace_buffer *buffer,
unsigned long ip, const char *fmt, va_list args)
{
struct trace_event_call *call = &event_print;
@@ -3275,6 +3295,7 @@ __trace_array_vprintk(struct ring_buffer *buffer,
local_save_flags(flags);
size = sizeof(*entry) + len + 1;
+ ring_buffer_nest_start(buffer);
event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
flags, pc);
if (!event)
@@ -3289,6 +3310,7 @@ __trace_array_vprintk(struct ring_buffer *buffer,
}
out:
+ ring_buffer_nest_end(buffer);
put_trace_buf();
out_nobuffer:
@@ -3302,7 +3324,7 @@ __printf(3, 0)
int trace_array_vprintk(struct trace_array *tr,
unsigned long ip, const char *fmt, va_list args)
{
- return __trace_array_vprintk(tr->trace_buffer.buffer, ip, fmt, args);
+ return __trace_array_vprintk(tr->array_buffer.buffer, ip, fmt, args);
}
__printf(3, 0)
@@ -3326,7 +3348,7 @@ int trace_array_printk(struct trace_array *tr,
EXPORT_SYMBOL_GPL(trace_array_printk);
__printf(3, 4)
-int trace_array_printk_buf(struct ring_buffer *buffer,
+int trace_array_printk_buf(struct trace_buffer *buffer,
unsigned long ip, const char *fmt, ...)
{
int ret;
@@ -3367,7 +3389,7 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts,
if (buf_iter)
event = ring_buffer_iter_peek(buf_iter, ts);
else
- event = ring_buffer_peek(iter->trace_buffer->buffer, cpu, ts,
+ event = ring_buffer_peek(iter->array_buffer->buffer, cpu, ts,
lost_events);
if (event) {
@@ -3382,7 +3404,7 @@ static struct trace_entry *
__find_next_entry(struct trace_iterator *iter, int *ent_cpu,
unsigned long *missing_events, u64 *ent_ts)
{
- struct ring_buffer *buffer = iter->trace_buffer->buffer;
+ struct trace_buffer *buffer = iter->array_buffer->buffer;
struct trace_entry *ent, *next = NULL;
unsigned long lost_events = 0, next_lost = 0;
int cpu_file = iter->cpu_file;
@@ -3459,7 +3481,7 @@ void *trace_find_next_entry_inc(struct trace_iterator *iter)
static void trace_consume(struct trace_iterator *iter)
{
- ring_buffer_consume(iter->trace_buffer->buffer, iter->cpu, &iter->ts,
+ ring_buffer_consume(iter->array_buffer->buffer, iter->cpu, &iter->ts,
&iter->lost_events);
}
@@ -3497,7 +3519,7 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu)
unsigned long entries = 0;
u64 ts;
- per_cpu_ptr(iter->trace_buffer->data, cpu)->skipped_entries = 0;
+ per_cpu_ptr(iter->array_buffer->data, cpu)->skipped_entries = 0;
buf_iter = trace_buffer_iter(iter, cpu);
if (!buf_iter)
@@ -3511,13 +3533,13 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu)
* by the timestamp being before the start of the buffer.
*/
while ((event = ring_buffer_iter_peek(buf_iter, &ts))) {
- if (ts >= iter->trace_buffer->time_start)
+ if (ts >= iter->array_buffer->time_start)
break;
entries++;
ring_buffer_read(buf_iter, NULL);
}
- per_cpu_ptr(iter->trace_buffer->data, cpu)->skipped_entries = entries;
+ per_cpu_ptr(iter->array_buffer->data, cpu)->skipped_entries = entries;
}
/*
@@ -3602,7 +3624,7 @@ static void s_stop(struct seq_file *m, void *p)
}
static void
-get_total_entries_cpu(struct trace_buffer *buf, unsigned long *total,
+get_total_entries_cpu(struct array_buffer *buf, unsigned long *total,
unsigned long *entries, int cpu)
{
unsigned long count;
@@ -3624,7 +3646,7 @@ get_total_entries_cpu(struct trace_buffer *buf, unsigned long *total,
}
static void
-get_total_entries(struct trace_buffer *buf,
+get_total_entries(struct array_buffer *buf,
unsigned long *total, unsigned long *entries)
{
unsigned long t, e;
@@ -3647,7 +3669,7 @@ unsigned long trace_total_entries_cpu(struct trace_array *tr, int cpu)
if (!tr)
tr = &global_trace;
- get_total_entries_cpu(&tr->trace_buffer, &total, &entries, cpu);
+ get_total_entries_cpu(&tr->array_buffer, &total, &entries, cpu);
return entries;
}
@@ -3659,7 +3681,7 @@ unsigned long trace_total_entries(struct trace_array *tr)
if (!tr)
tr = &global_trace;
- get_total_entries(&tr->trace_buffer, &total, &entries);
+ get_total_entries(&tr->array_buffer, &total, &entries);
return entries;
}
@@ -3676,7 +3698,7 @@ static void print_lat_help_header(struct seq_file *m)
"# \\ / ||||| \\ | / \n");
}
-static void print_event_info(struct trace_buffer *buf, struct seq_file *m)
+static void print_event_info(struct array_buffer *buf, struct seq_file *m)
{
unsigned long total;
unsigned long entries;
@@ -3687,7 +3709,7 @@ static void print_event_info(struct trace_buffer *buf, struct seq_file *m)
seq_puts(m, "#\n");
}
-static void print_func_help_header(struct trace_buffer *buf, struct seq_file *m,
+static void print_func_help_header(struct array_buffer *buf, struct seq_file *m,
unsigned int flags)
{
bool tgid = flags & TRACE_ITER_RECORD_TGID;
@@ -3698,7 +3720,7 @@ static void print_func_help_header(struct trace_buffer *buf, struct seq_file *m,
seq_printf(m, "# | | %s | | |\n", tgid ? " | " : "");
}
-static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m,
+static void print_func_help_header_irq(struct array_buffer *buf, struct seq_file *m,
unsigned int flags)
{
bool tgid = flags & TRACE_ITER_RECORD_TGID;
@@ -3720,7 +3742,7 @@ void
print_trace_header(struct seq_file *m, struct trace_iterator *iter)
{
unsigned long sym_flags = (global_trace.trace_flags & TRACE_ITER_SYM_MASK);
- struct trace_buffer *buf = iter->trace_buffer;
+ struct array_buffer *buf = iter->array_buffer;
struct trace_array_cpu *data = per_cpu_ptr(buf->data, buf->cpu);
struct tracer *type = iter->trace;
unsigned long entries;
@@ -3795,7 +3817,7 @@ static void test_cpu_buff_start(struct trace_iterator *iter)
cpumask_test_cpu(iter->cpu, iter->started))
return;
- if (per_cpu_ptr(iter->trace_buffer->data, iter->cpu)->skipped_entries)
+ if (per_cpu_ptr(iter->array_buffer->data, iter->cpu)->skipped_entries)
return;
if (cpumask_available(iter->started))
@@ -3929,7 +3951,7 @@ int trace_empty(struct trace_iterator *iter)
if (!ring_buffer_iter_empty(buf_iter))
return 0;
} else {
- if (!ring_buffer_empty_cpu(iter->trace_buffer->buffer, cpu))
+ if (!ring_buffer_empty_cpu(iter->array_buffer->buffer, cpu))
return 0;
}
return 1;
@@ -3941,7 +3963,7 @@ int trace_empty(struct trace_iterator *iter)
if (!ring_buffer_iter_empty(buf_iter))
return 0;
} else {
- if (!ring_buffer_empty_cpu(iter->trace_buffer->buffer, cpu))
+ if (!ring_buffer_empty_cpu(iter->array_buffer->buffer, cpu))
return 0;
}
}
@@ -4031,10 +4053,10 @@ void trace_default_header(struct seq_file *m)
} else {
if (!(trace_flags & TRACE_ITER_VERBOSE)) {
if (trace_flags & TRACE_ITER_IRQ_INFO)
- print_func_help_header_irq(iter->trace_buffer,
+ print_func_help_header_irq(iter->array_buffer,
m, trace_flags);
else
- print_func_help_header(iter->trace_buffer, m,
+ print_func_help_header(iter->array_buffer, m,
trace_flags);
}
}
@@ -4192,21 +4214,21 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
#ifdef CONFIG_TRACER_MAX_TRACE
/* Currently only the top directory has a snapshot */
if (tr->current_trace->print_max || snapshot)
- iter->trace_buffer = &tr->max_buffer;
+ iter->array_buffer = &tr->max_buffer;
else
#endif
- iter->trace_buffer = &tr->trace_buffer;
+ iter->array_buffer = &tr->array_buffer;
iter->snapshot = snapshot;
iter->pos = -1;
iter->cpu_file = tracing_get_cpu(inode);
mutex_init(&iter->mutex);
/* Notify the tracer early; before we stop tracing. */
- if (iter->trace && iter->trace->open)
+ if (iter->trace->open)
iter->trace->open(iter);
/* Annotate start of buffers if we had overruns */
- if (ring_buffer_overruns(iter->trace_buffer->buffer))
+ if (ring_buffer_overruns(iter->array_buffer->buffer))
iter->iter_flags |= TRACE_FILE_ANNOTATE;
/* Output in nanoseconds only if we are using a clock in nanoseconds. */
@@ -4220,7 +4242,7 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
if (iter->cpu_file == RING_BUFFER_ALL_CPUS) {
for_each_tracing_cpu(cpu) {
iter->buffer_iter[cpu] =
- ring_buffer_read_prepare(iter->trace_buffer->buffer,
+ ring_buffer_read_prepare(iter->array_buffer->buffer,
cpu, GFP_KERNEL);
}
ring_buffer_read_prepare_sync();
@@ -4231,7 +4253,7 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
} else {
cpu = iter->cpu_file;
iter->buffer_iter[cpu] =
- ring_buffer_read_prepare(iter->trace_buffer->buffer,
+ ring_buffer_read_prepare(iter->array_buffer->buffer,
cpu, GFP_KERNEL);
ring_buffer_read_prepare_sync();
ring_buffer_read_start(iter->buffer_iter[cpu]);
@@ -4357,7 +4379,7 @@ static int tracing_open(struct inode *inode, struct file *file)
/* If this file was open for write, then erase contents */
if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) {
int cpu = tracing_get_cpu(inode);
- struct trace_buffer *trace_buf = &tr->trace_buffer;
+ struct array_buffer *trace_buf = &tr->array_buffer;
#ifdef CONFIG_TRACER_MAX_TRACE
if (tr->current_trace->print_max)
@@ -4554,20 +4576,13 @@ out_err:
return count;
}
-static ssize_t
-tracing_cpumask_write(struct file *filp, const char __user *ubuf,
- size_t count, loff_t *ppos)
+int tracing_set_cpumask(struct trace_array *tr,
+ cpumask_var_t tracing_cpumask_new)
{
- struct trace_array *tr = file_inode(filp)->i_private;
- cpumask_var_t tracing_cpumask_new;
- int err, cpu;
-
- if (!alloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL))
- return -ENOMEM;
+ int cpu;
- err = cpumask_parse_user(ubuf, count, tracing_cpumask_new);
- if (err)
- goto err_unlock;
+ if (!tr)
+ return -EINVAL;
local_irq_disable();
arch_spin_lock(&tr->max_lock);
@@ -4578,24 +4593,47 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
*/
if (cpumask_test_cpu(cpu, tr->tracing_cpumask) &&
!cpumask_test_cpu(cpu, tracing_cpumask_new)) {
- atomic_inc(&per_cpu_ptr(tr->trace_buffer.data, cpu)->disabled);
- ring_buffer_record_disable_cpu(tr->trace_buffer.buffer, cpu);
+ atomic_inc(&per_cpu_ptr(tr->array_buffer.data, cpu)->disabled);
+ ring_buffer_record_disable_cpu(tr->array_buffer.buffer, cpu);
}
if (!cpumask_test_cpu(cpu, tr->tracing_cpumask) &&
cpumask_test_cpu(cpu, tracing_cpumask_new)) {
- atomic_dec(&per_cpu_ptr(tr->trace_buffer.data, cpu)->disabled);
- ring_buffer_record_enable_cpu(tr->trace_buffer.buffer, cpu);
+ atomic_dec(&per_cpu_ptr(tr->array_buffer.data, cpu)->disabled);
+ ring_buffer_record_enable_cpu(tr->array_buffer.buffer, cpu);
}
}
arch_spin_unlock(&tr->max_lock);
local_irq_enable();
cpumask_copy(tr->tracing_cpumask, tracing_cpumask_new);
+
+ return 0;
+}
+
+static ssize_t
+tracing_cpumask_write(struct file *filp, const char __user *ubuf,
+ size_t count, loff_t *ppos)
+{
+ struct trace_array *tr = file_inode(filp)->i_private;
+ cpumask_var_t tracing_cpumask_new;
+ int err;
+
+ if (!alloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL))
+ return -ENOMEM;
+
+ err = cpumask_parse_user(ubuf, count, tracing_cpumask_new);
+ if (err)
+ goto err_free;
+
+ err = tracing_set_cpumask(tr, tracing_cpumask_new);
+ if (err)
+ goto err_free;
+
free_cpumask_var(tracing_cpumask_new);
return count;
-err_unlock:
+err_free:
free_cpumask_var(tracing_cpumask_new);
return err;
@@ -4726,7 +4764,7 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)
ftrace_pid_follow_fork(tr, enabled);
if (mask == TRACE_ITER_OVERWRITE) {
- ring_buffer_change_overwrite(tr->trace_buffer.buffer, enabled);
+ ring_buffer_change_overwrite(tr->array_buffer.buffer, enabled);
#ifdef CONFIG_TRACER_MAX_TRACE
ring_buffer_change_overwrite(tr->max_buffer.buffer, enabled);
#endif
@@ -4740,7 +4778,7 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)
return 0;
}
-static int trace_set_options(struct trace_array *tr, char *option)
+int trace_set_options(struct trace_array *tr, char *option)
{
char *cmp;
int neg = 0;
@@ -5361,14 +5399,12 @@ static void *eval_map_next(struct seq_file *m, void *v, loff_t *pos)
* Paranoid! If ptr points to end, we don't want to increment past it.
* This really should never happen.
*/
+ (*pos)++;
ptr = update_eval_map(ptr);
if (WARN_ON_ONCE(!ptr))
return NULL;
ptr++;
-
- (*pos)++;
-
ptr = update_eval_map(ptr);
return ptr;
@@ -5534,11 +5570,11 @@ tracing_set_trace_read(struct file *filp, char __user *ubuf,
int tracer_init(struct tracer *t, struct trace_array *tr)
{
- tracing_reset_online_cpus(&tr->trace_buffer);
+ tracing_reset_online_cpus(&tr->array_buffer);
return t->init(tr);
}
-static void set_buffer_entries(struct trace_buffer *buf, unsigned long val)
+static void set_buffer_entries(struct array_buffer *buf, unsigned long val)
{
int cpu;
@@ -5548,8 +5584,8 @@ static void set_buffer_entries(struct trace_buffer *buf, unsigned long val)
#ifdef CONFIG_TRACER_MAX_TRACE
/* resize @tr's buffer to the size of @size_tr's entries */
-static int resize_buffer_duplicate_size(struct trace_buffer *trace_buf,
- struct trace_buffer *size_buf, int cpu_id)
+static int resize_buffer_duplicate_size(struct array_buffer *trace_buf,
+ struct array_buffer *size_buf, int cpu_id)
{
int cpu, ret = 0;
@@ -5587,10 +5623,10 @@ static int __tracing_resize_ring_buffer(struct trace_array *tr,
ring_buffer_expanded = true;
/* May be called before buffers are initialized */
- if (!tr->trace_buffer.buffer)
+ if (!tr->array_buffer.buffer)
return 0;
- ret = ring_buffer_resize(tr->trace_buffer.buffer, size, cpu);
+ ret = ring_buffer_resize(tr->array_buffer.buffer, size, cpu);
if (ret < 0)
return ret;
@@ -5601,8 +5637,8 @@ static int __tracing_resize_ring_buffer(struct trace_array *tr,
ret = ring_buffer_resize(tr->max_buffer.buffer, size, cpu);
if (ret < 0) {
- int r = resize_buffer_duplicate_size(&tr->trace_buffer,
- &tr->trace_buffer, cpu);
+ int r = resize_buffer_duplicate_size(&tr->array_buffer,
+ &tr->array_buffer, cpu);
if (r < 0) {
/*
* AARGH! We are left with different
@@ -5633,15 +5669,15 @@ static int __tracing_resize_ring_buffer(struct trace_array *tr,
#endif /* CONFIG_TRACER_MAX_TRACE */
if (cpu == RING_BUFFER_ALL_CPUS)
- set_buffer_entries(&tr->trace_buffer, size);
+ set_buffer_entries(&tr->array_buffer, size);
else
- per_cpu_ptr(tr->trace_buffer.data, cpu)->entries = size;
+ per_cpu_ptr(tr->array_buffer.data, cpu)->entries = size;
return ret;
}
-static ssize_t tracing_resize_ring_buffer(struct trace_array *tr,
- unsigned long size, int cpu_id)
+ssize_t tracing_resize_ring_buffer(struct trace_array *tr,
+ unsigned long size, int cpu_id)
{
int ret = size;
@@ -5720,7 +5756,7 @@ static void add_tracer_options(struct trace_array *tr, struct tracer *t)
create_trace_option_files(tr, t);
}
-static int tracing_set_tracer(struct trace_array *tr, const char *buf)
+int tracing_set_tracer(struct trace_array *tr, const char *buf)
{
struct tracer *t;
#ifdef CONFIG_TRACER_MAX_TRACE
@@ -5979,7 +6015,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
iter->iter_flags |= TRACE_FILE_TIME_IN_NS;
iter->tr = tr;
- iter->trace_buffer = &tr->trace_buffer;
+ iter->array_buffer = &tr->array_buffer;
iter->cpu_file = tracing_get_cpu(inode);
mutex_init(&iter->mutex);
filp->private_data = iter;
@@ -6039,7 +6075,7 @@ trace_poll(struct trace_iterator *iter, struct file *filp, poll_table *poll_tabl
*/
return EPOLLIN | EPOLLRDNORM;
else
- return ring_buffer_poll_wait(iter->trace_buffer->buffer, iter->cpu_file,
+ return ring_buffer_poll_wait(iter->array_buffer->buffer, iter->cpu_file,
filp, poll_table);
}
@@ -6356,8 +6392,8 @@ tracing_entries_read(struct file *filp, char __user *ubuf,
for_each_tracing_cpu(cpu) {
/* fill in the size from first enabled cpu */
if (size == 0)
- size = per_cpu_ptr(tr->trace_buffer.data, cpu)->entries;
- if (size != per_cpu_ptr(tr->trace_buffer.data, cpu)->entries) {
+ size = per_cpu_ptr(tr->array_buffer.data, cpu)->entries;
+ if (size != per_cpu_ptr(tr->array_buffer.data, cpu)->entries) {
buf_size_same = 0;
break;
}
@@ -6373,7 +6409,7 @@ tracing_entries_read(struct file *filp, char __user *ubuf,
} else
r = sprintf(buf, "X\n");
} else
- r = sprintf(buf, "%lu\n", per_cpu_ptr(tr->trace_buffer.data, cpu)->entries >> 10);
+ r = sprintf(buf, "%lu\n", per_cpu_ptr(tr->array_buffer.data, cpu)->entries >> 10);
mutex_unlock(&trace_types_lock);
@@ -6420,7 +6456,7 @@ tracing_total_entries_read(struct file *filp, char __user *ubuf,
mutex_lock(&trace_types_lock);
for_each_tracing_cpu(cpu) {
- size += per_cpu_ptr(tr->trace_buffer.data, cpu)->entries >> 10;
+ size += per_cpu_ptr(tr->array_buffer.data, cpu)->entries >> 10;
if (!ring_buffer_expanded)
expanded_size += trace_buf_size >> 10;
}
@@ -6470,7 +6506,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
struct trace_array *tr = filp->private_data;
struct ring_buffer_event *event;
enum event_trigger_type tt = ETT_NONE;
- struct ring_buffer *buffer;
+ struct trace_buffer *buffer;
struct print_entry *entry;
unsigned long irq_flags;
ssize_t written;
@@ -6499,7 +6535,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
if (cnt < FAULTED_SIZE)
size += FAULTED_SIZE - cnt;
- buffer = tr->trace_buffer.buffer;
+ buffer = tr->array_buffer.buffer;
event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
irq_flags, preempt_count());
if (unlikely(!event))
@@ -6550,7 +6586,7 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf,
{
struct trace_array *tr = filp->private_data;
struct ring_buffer_event *event;
- struct ring_buffer *buffer;
+ struct trace_buffer *buffer;
struct raw_data_entry *entry;
unsigned long irq_flags;
ssize_t written;
@@ -6579,7 +6615,7 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf,
if (cnt < FAULT_SIZE_ID)
size += FAULT_SIZE_ID - cnt;
- buffer = tr->trace_buffer.buffer;
+ buffer = tr->array_buffer.buffer;
event = __trace_buffer_lock_reserve(buffer, TRACE_RAW_DATA, size,
irq_flags, preempt_count());
if (!event)
@@ -6634,13 +6670,13 @@ int tracing_set_clock(struct trace_array *tr, const char *clockstr)
tr->clock_id = i;
- ring_buffer_set_clock(tr->trace_buffer.buffer, trace_clocks[i].func);
+ ring_buffer_set_clock(tr->array_buffer.buffer, trace_clocks[i].func);
/*
* New clock may not be consistent with the previous clock.
* Reset the buffer so that it doesn't have incomparable timestamps.
*/
- tracing_reset_online_cpus(&tr->trace_buffer);
+ tracing_reset_online_cpus(&tr->array_buffer);
#ifdef CONFIG_TRACER_MAX_TRACE
if (tr->max_buffer.buffer)
@@ -6703,7 +6739,7 @@ static int tracing_time_stamp_mode_show(struct seq_file *m, void *v)
mutex_lock(&trace_types_lock);
- if (ring_buffer_time_stamp_abs(tr->trace_buffer.buffer))
+ if (ring_buffer_time_stamp_abs(tr->array_buffer.buffer))
seq_puts(m, "delta [absolute]\n");
else
seq_puts(m, "[delta] absolute\n");
@@ -6748,7 +6784,7 @@ int tracing_set_time_stamp_abs(struct trace_array *tr, bool abs)
goto out;
}
- ring_buffer_set_time_stamp_abs(tr->trace_buffer.buffer, abs);
+ ring_buffer_set_time_stamp_abs(tr->array_buffer.buffer, abs);
#ifdef CONFIG_TRACER_MAX_TRACE
if (tr->max_buffer.buffer)
@@ -6797,7 +6833,7 @@ static int tracing_snapshot_open(struct inode *inode, struct file *file)
ret = 0;
iter->tr = tr;
- iter->trace_buffer = &tr->max_buffer;
+ iter->array_buffer = &tr->max_buffer;
iter->cpu_file = tracing_get_cpu(inode);
m->private = iter;
file->private_data = m;
@@ -6860,7 +6896,7 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
#endif
if (tr->allocated_snapshot)
ret = resize_buffer_duplicate_size(&tr->max_buffer,
- &tr->trace_buffer, iter->cpu_file);
+ &tr->array_buffer, iter->cpu_file);
else
ret = tracing_alloc_snapshot_instance(tr);
if (ret < 0)
@@ -6935,7 +6971,7 @@ static int snapshot_raw_open(struct inode *inode, struct file *filp)
}
info->iter.snapshot = true;
- info->iter.trace_buffer = &info->iter.tr->max_buffer;
+ info->iter.array_buffer = &info->iter.tr->max_buffer;
return ret;
}
@@ -7310,7 +7346,7 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp)
info->iter.tr = tr;
info->iter.cpu_file = tracing_get_cpu(inode);
info->iter.trace = tr->current_trace;
- info->iter.trace_buffer = &tr->trace_buffer;
+ info->iter.array_buffer = &tr->array_buffer;
info->spare = NULL;
/* Force reading ring buffer for first read */
info->read = (unsigned int)-1;
@@ -7355,7 +7391,7 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
#endif
if (!info->spare) {
- info->spare = ring_buffer_alloc_read_page(iter->trace_buffer->buffer,
+ info->spare = ring_buffer_alloc_read_page(iter->array_buffer->buffer,
iter->cpu_file);
if (IS_ERR(info->spare)) {
ret = PTR_ERR(info->spare);
@@ -7373,7 +7409,7 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
again:
trace_access_lock(iter->cpu_file);
- ret = ring_buffer_read_page(iter->trace_buffer->buffer,
+ ret = ring_buffer_read_page(iter->array_buffer->buffer,
&info->spare,
count,
iter->cpu_file, 0);
@@ -7423,7 +7459,7 @@ static int tracing_buffers_release(struct inode *inode, struct file *file)
__trace_array_put(iter->tr);
if (info->spare)
- ring_buffer_free_read_page(iter->trace_buffer->buffer,
+ ring_buffer_free_read_page(iter->array_buffer->buffer,
info->spare_cpu, info->spare);
kfree(info);
@@ -7433,7 +7469,7 @@ static int tracing_buffers_release(struct inode *inode, struct file *file)
}
struct buffer_ref {
- struct ring_buffer *buffer;
+ struct trace_buffer *buffer;
void *page;
int cpu;
refcount_t refcount;
@@ -7528,7 +7564,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
again:
trace_access_lock(iter->cpu_file);
- entries = ring_buffer_entries_cpu(iter->trace_buffer->buffer, iter->cpu_file);
+ entries = ring_buffer_entries_cpu(iter->array_buffer->buffer, iter->cpu_file);
for (i = 0; i < spd.nr_pages_max && len && entries; i++, len -= PAGE_SIZE) {
struct page *page;
@@ -7541,7 +7577,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
}
refcount_set(&ref->refcount, 1);
- ref->buffer = iter->trace_buffer->buffer;
+ ref->buffer = iter->array_buffer->buffer;
ref->page = ring_buffer_alloc_read_page(ref->buffer, iter->cpu_file);
if (IS_ERR(ref->page)) {
ret = PTR_ERR(ref->page);
@@ -7569,7 +7605,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
spd.nr_pages++;
*ppos += PAGE_SIZE;
- entries = ring_buffer_entries_cpu(iter->trace_buffer->buffer, iter->cpu_file);
+ entries = ring_buffer_entries_cpu(iter->array_buffer->buffer, iter->cpu_file);
}
trace_access_unlock(iter->cpu_file);
@@ -7613,7 +7649,7 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
{
struct inode *inode = file_inode(filp);
struct trace_array *tr = inode->i_private;
- struct trace_buffer *trace_buf = &tr->trace_buffer;
+ struct array_buffer *trace_buf = &tr->array_buffer;
int cpu = tracing_get_cpu(inode);
struct trace_seq *s;
unsigned long cnt;
@@ -7894,7 +7930,7 @@ static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu)
tr->percpu_dir = tracefs_create_dir("per_cpu", d_tracer);
- WARN_ONCE(!tr->percpu_dir,
+ MEM_FAIL(!tr->percpu_dir,
"Could not create tracefs directory 'per_cpu/%d'\n", cpu);
return tr->percpu_dir;
@@ -8215,7 +8251,7 @@ create_trace_option_files(struct trace_array *tr, struct tracer *tracer)
for (cnt = 0; opts[cnt].name; cnt++) {
create_trace_option_file(tr, &topts[cnt], flags,
&opts[cnt]);
- WARN_ONCE(topts[cnt].entry == NULL,
+ MEM_FAIL(topts[cnt].entry == NULL,
"Failed to create trace option: %s",
opts[cnt].name);
}
@@ -8272,7 +8308,7 @@ rb_simple_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
struct trace_array *tr = filp->private_data;
- struct ring_buffer *buffer = tr->trace_buffer.buffer;
+ struct trace_buffer *buffer = tr->array_buffer.buffer;
unsigned long val;
int ret;
@@ -8362,7 +8398,7 @@ static void
init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer);
static int
-allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size)
+allocate_trace_buffer(struct trace_array *tr, struct array_buffer *buf, int size)
{
enum ring_buffer_flags rb_flags;
@@ -8382,8 +8418,8 @@ allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size
}
/* Allocate the first page for all buffers */
- set_buffer_entries(&tr->trace_buffer,
- ring_buffer_size(tr->trace_buffer.buffer, 0));
+ set_buffer_entries(&tr->array_buffer,
+ ring_buffer_size(tr->array_buffer.buffer, 0));
return 0;
}
@@ -8392,18 +8428,18 @@ static int allocate_trace_buffers(struct trace_array *tr, int size)
{
int ret;
- ret = allocate_trace_buffer(tr, &tr->trace_buffer, size);
+ ret = allocate_trace_buffer(tr, &tr->array_buffer, size);
if (ret)
return ret;
#ifdef CONFIG_TRACER_MAX_TRACE
ret = allocate_trace_buffer(tr, &tr->max_buffer,
allocate_snapshot ? size : 1);
- if (WARN_ON(ret)) {
- ring_buffer_free(tr->trace_buffer.buffer);
- tr->trace_buffer.buffer = NULL;
- free_percpu(tr->trace_buffer.data);
- tr->trace_buffer.data = NULL;
+ if (MEM_FAIL(ret, "Failed to allocate trace buffer\n")) {
+ ring_buffer_free(tr->array_buffer.buffer);
+ tr->array_buffer.buffer = NULL;
+ free_percpu(tr->array_buffer.data);
+ tr->array_buffer.data = NULL;
return -ENOMEM;
}
tr->allocated_snapshot = allocate_snapshot;
@@ -8417,7 +8453,7 @@ static int allocate_trace_buffers(struct trace_array *tr, int size)
return 0;
}
-static void free_trace_buffer(struct trace_buffer *buf)
+static void free_trace_buffer(struct array_buffer *buf)
{
if (buf->buffer) {
ring_buffer_free(buf->buffer);
@@ -8432,7 +8468,7 @@ static void free_trace_buffers(struct trace_array *tr)
if (!tr)
return;
- free_trace_buffer(&tr->trace_buffer);
+ free_trace_buffer(&tr->array_buffer);
#ifdef CONFIG_TRACER_MAX_TRACE
free_trace_buffer(&tr->max_buffer);
@@ -8463,6 +8499,34 @@ static void update_tracer_options(struct trace_array *tr)
mutex_unlock(&trace_types_lock);
}
+/* Must have trace_types_lock held */
+struct trace_array *trace_array_find(const char *instance)
+{
+ struct trace_array *tr, *found = NULL;
+
+ list_for_each_entry(tr, &ftrace_trace_arrays, list) {
+ if (tr->name && strcmp(tr->name, instance) == 0) {
+ found = tr;
+ break;
+ }
+ }
+
+ return found;
+}
+
+struct trace_array *trace_array_find_get(const char *instance)
+{
+ struct trace_array *tr;
+
+ mutex_lock(&trace_types_lock);
+ tr = trace_array_find(instance);
+ if (tr)
+ tr->ref++;
+ mutex_unlock(&trace_types_lock);
+
+ return tr;
+}
+
static struct trace_array *trace_array_create(const char *name)
{
struct trace_array *tr;
@@ -8504,7 +8568,7 @@ static struct trace_array *trace_array_create(const char *name)
ret = event_trace_add_tracer(tr->dir, tr);
if (ret) {
- tracefs_remove_recursive(tr->dir);
+ tracefs_remove(tr->dir);
goto out_free_tr;
}
@@ -8539,10 +8603,8 @@ static int instance_mkdir(const char *name)
mutex_lock(&trace_types_lock);
ret = -EEXIST;
- list_for_each_entry(tr, &ftrace_trace_arrays, list) {
- if (tr->name && strcmp(tr->name, name) == 0)
- goto out_unlock;
- }
+ if (trace_array_find(name))
+ goto out_unlock;
tr = trace_array_create(name);
@@ -8564,6 +8626,10 @@ out_unlock:
* NOTE: This function increments the reference counter associated with the
* trace array returned. This makes sure it cannot be freed while in use.
* Use trace_array_put() once the trace array is no longer needed.
+ * If the trace_array is to be freed, trace_array_destroy() needs to
+ * be called after the trace_array_put(), or simply let user space delete
+ * it from the tracefs instances directory. But until the
+ * trace_array_put() is called, user space can not delete it.
*
*/
struct trace_array *trace_array_get_by_name(const char *name)
@@ -8613,7 +8679,7 @@ static int __remove_instance(struct trace_array *tr)
event_trace_del_tracer(tr);
ftrace_clear_pids(tr);
ftrace_destroy_function_files(tr);
- tracefs_remove_recursive(tr->dir);
+ tracefs_remove(tr->dir);
free_trace_buffers(tr);
for (i = 0; i < tr->nr_topts; i++) {
@@ -8666,12 +8732,9 @@ static int instance_rmdir(const char *name)
mutex_lock(&trace_types_lock);
ret = -ENODEV;
- list_for_each_entry(tr, &ftrace_trace_arrays, list) {
- if (tr->name && strcmp(tr->name, name) == 0) {
- ret = __remove_instance(tr);
- break;
- }
- }
+ tr = trace_array_find(name);
+ if (tr)
+ ret = __remove_instance(tr);
mutex_unlock(&trace_types_lock);
mutex_unlock(&event_mutex);
@@ -8684,7 +8747,7 @@ static __init void create_trace_instances(struct dentry *d_tracer)
trace_instance_dir = tracefs_create_instance_dir("instances", d_tracer,
instance_mkdir,
instance_rmdir);
- if (WARN_ON(!trace_instance_dir))
+ if (MEM_FAIL(!trace_instance_dir, "Failed to create instances directory\n"))
return;
}
@@ -8754,7 +8817,7 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
#endif
if (ftrace_create_function_files(tr, d_tracer))
- WARN(1, "Could not allocate function filter files");
+ MEM_FAIL(1, "Could not allocate function filter files");
#ifdef CONFIG_TRACER_SNAPSHOT
trace_create_file("snapshot", 0644, d_tracer,
@@ -9036,13 +9099,13 @@ void trace_init_global_iter(struct trace_iterator *iter)
iter->tr = &global_trace;
iter->trace = iter->tr->current_trace;
iter->cpu_file = RING_BUFFER_ALL_CPUS;
- iter->trace_buffer = &global_trace.trace_buffer;
+ iter->array_buffer = &global_trace.array_buffer;
if (iter->trace && iter->trace->open)
iter->trace->open(iter);
/* Annotate start of buffers if we had overruns */
- if (ring_buffer_overruns(iter->trace_buffer->buffer))
+ if (ring_buffer_overruns(iter->array_buffer->buffer))
iter->iter_flags |= TRACE_FILE_ANNOTATE;
/* Output in nanoseconds only if we are using a clock in nanoseconds. */
@@ -9083,7 +9146,7 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
trace_init_global_iter(&iter);
for_each_tracing_cpu(cpu) {
- atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);
+ atomic_inc(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled);
}
old_userobj = tr->trace_flags & TRACE_ITER_SYM_USEROBJ;
@@ -9151,7 +9214,7 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
tr->trace_flags |= old_userobj;
for_each_tracing_cpu(cpu) {
- atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);
+ atomic_dec(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled);
}
atomic_dec(&dump_running);
printk_nmi_direct_exit();
@@ -9306,8 +9369,7 @@ __init static int tracer_alloc_buffers(void)
/* TODO: make the number of buffers hot pluggable with CPUS */
if (allocate_trace_buffers(&global_trace, ring_buf_size) < 0) {
- printk(KERN_ERR "tracer: failed to allocate ring buffer!\n");
- WARN_ON(1);
+ MEM_FAIL(1, "tracer: failed to allocate ring buffer!\n");
goto out_free_savedcmd;
}
@@ -9380,7 +9442,8 @@ void __init early_trace_init(void)
if (tracepoint_printk) {
tracepoint_print_iter =
kmalloc(sizeof(*tracepoint_print_iter), GFP_KERNEL);
- if (WARN_ON(!tracepoint_print_iter))
+ if (MEM_FAIL(!tracepoint_print_iter,
+ "Failed to allocate trace iterator\n"))
tracepoint_printk = 0;
else
static_key_enable(&tracepoint_printk_key.key);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 63bf60f79398..99372dd7d168 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -52,6 +52,9 @@ enum trace_type {
#undef __field
#define __field(type, item) type item;
+#undef __field_fn
+#define __field_fn(type, item) type item;
+
#undef __field_struct
#define __field_struct(type, item) __field(type, item)
@@ -71,29 +74,37 @@ enum trace_type {
#define F_STRUCT(args...) args
#undef FTRACE_ENTRY
-#define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter) \
+#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \
struct struct_name { \
struct trace_entry ent; \
tstruct \
}
#undef FTRACE_ENTRY_DUP
-#define FTRACE_ENTRY_DUP(name, name_struct, id, tstruct, printk, filter)
+#define FTRACE_ENTRY_DUP(name, name_struct, id, tstruct, printk)
#undef FTRACE_ENTRY_REG
-#define FTRACE_ENTRY_REG(name, struct_name, id, tstruct, print, \
- filter, regfn) \
- FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \
- filter)
+#define FTRACE_ENTRY_REG(name, struct_name, id, tstruct, print, regfn) \
+ FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print))
#undef FTRACE_ENTRY_PACKED
-#define FTRACE_ENTRY_PACKED(name, struct_name, id, tstruct, print, \
- filter) \
- FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \
- filter) __packed
+#define FTRACE_ENTRY_PACKED(name, struct_name, id, tstruct, print) \
+ FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print)) __packed
#include "trace_entries.h"
+/* Use this for memory failure errors */
+#define MEM_FAIL(condition, fmt, ...) ({ \
+ static bool __section(.data.once) __warned; \
+ int __ret_warn_once = !!(condition); \
+ \
+ if (unlikely(__ret_warn_once && !__warned)) { \
+ __warned = true; \
+ pr_err("ERROR: " fmt, ##__VA_ARGS__); \
+ } \
+ unlikely(__ret_warn_once); \
+})
+
/*
* syscalls are special, and need special handling, this is why
* they are not included in trace_entries.h
@@ -176,9 +187,9 @@ struct trace_array_cpu {
struct tracer;
struct trace_option_dentry;
-struct trace_buffer {
+struct array_buffer {
struct trace_array *tr;
- struct ring_buffer *buffer;
+ struct trace_buffer *buffer;
struct trace_array_cpu __percpu *data;
u64 time_start;
int cpu;
@@ -249,7 +260,7 @@ struct cond_snapshot {
struct trace_array {
struct list_head list;
char *name;
- struct trace_buffer trace_buffer;
+ struct array_buffer array_buffer;
#ifdef CONFIG_TRACER_MAX_TRACE
/*
* The max_buffer is used to snapshot the trace when a maximum
@@ -257,12 +268,12 @@ struct trace_array {
* Some tracers will use this to store a maximum trace while
* it continues examining live traces.
*
- * The buffers for the max_buffer are set up the same as the trace_buffer
+ * The buffers for the max_buffer are set up the same as the array_buffer
* When a snapshot is taken, the buffer of the max_buffer is swapped
- * with the buffer of the trace_buffer and the buffers are reset for
- * the trace_buffer so the tracing can continue.
+ * with the buffer of the array_buffer and the buffers are reset for
+ * the array_buffer so the tracing can continue.
*/
- struct trace_buffer max_buffer;
+ struct array_buffer max_buffer;
bool allocated_snapshot;
#endif
#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)
@@ -346,6 +357,8 @@ extern struct mutex trace_types_lock;
extern int trace_array_get(struct trace_array *tr);
extern int tracing_check_open_get_tr(struct trace_array *tr);
+extern struct trace_array *trace_array_find(const char *instance);
+extern struct trace_array *trace_array_find_get(const char *instance);
extern int tracing_set_time_stamp_abs(struct trace_array *tr, bool abs);
extern int tracing_set_clock(struct trace_array *tr, const char *clockstr);
@@ -685,7 +698,7 @@ trace_buffer_iter(struct trace_iterator *iter, int cpu)
int tracer_init(struct tracer *t, struct trace_array *tr);
int tracing_is_enabled(void);
-void tracing_reset_online_cpus(struct trace_buffer *buf);
+void tracing_reset_online_cpus(struct array_buffer *buf);
void tracing_reset_current(int cpu);
void tracing_reset_all_online_cpus(void);
int tracing_open_generic(struct inode *inode, struct file *filp);
@@ -705,7 +718,7 @@ struct dentry *tracing_init_dentry(void);
struct ring_buffer_event;
struct ring_buffer_event *
-trace_buffer_lock_reserve(struct ring_buffer *buffer,
+trace_buffer_lock_reserve(struct trace_buffer *buffer,
int type,
unsigned long len,
unsigned long flags,
@@ -717,7 +730,7 @@ struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
int *ent_cpu, u64 *ent_ts);
-void trace_buffer_unlock_commit_nostack(struct ring_buffer *buffer,
+void trace_buffer_unlock_commit_nostack(struct trace_buffer *buffer,
struct ring_buffer_event *event);
int trace_empty(struct trace_iterator *iter);
@@ -873,7 +886,7 @@ trace_vprintk(unsigned long ip, const char *fmt, va_list args);
extern int
trace_array_vprintk(struct trace_array *tr,
unsigned long ip, const char *fmt, va_list args);
-int trace_array_printk_buf(struct ring_buffer *buffer,
+int trace_array_printk_buf(struct trace_buffer *buffer,
unsigned long ip, const char *fmt, ...);
void trace_printk_seq(struct trace_seq *s);
enum print_line_t print_trace_line(struct trace_iterator *iter);
@@ -950,22 +963,31 @@ extern void __trace_graph_return(struct trace_array *tr,
unsigned long flags, int pc);
#ifdef CONFIG_DYNAMIC_FTRACE
-extern struct ftrace_hash *ftrace_graph_hash;
-extern struct ftrace_hash *ftrace_graph_notrace_hash;
+extern struct ftrace_hash __rcu *ftrace_graph_hash;
+extern struct ftrace_hash __rcu *ftrace_graph_notrace_hash;
static inline int ftrace_graph_addr(struct ftrace_graph_ent *trace)
{
unsigned long addr = trace->func;
int ret = 0;
+ struct ftrace_hash *hash;
preempt_disable_notrace();
- if (ftrace_hash_empty(ftrace_graph_hash)) {
+ /*
+ * Have to open code "rcu_dereference_sched()" because the
+ * function graph tracer can be called when RCU is not
+ * "watching".
+ * Protected with schedule_on_each_cpu(ftrace_sync)
+ */
+ hash = rcu_dereference_protected(ftrace_graph_hash, !preemptible());
+
+ if (ftrace_hash_empty(hash)) {
ret = 1;
goto out;
}
- if (ftrace_lookup_ip(ftrace_graph_hash, addr)) {
+ if (ftrace_lookup_ip(hash, addr)) {
/*
* This needs to be cleared on the return functions
@@ -1001,10 +1023,20 @@ static inline void ftrace_graph_addr_finish(struct ftrace_graph_ret *trace)
static inline int ftrace_graph_notrace_addr(unsigned long addr)
{
int ret = 0;
+ struct ftrace_hash *notrace_hash;
preempt_disable_notrace();
- if (ftrace_lookup_ip(ftrace_graph_notrace_hash, addr))
+ /*
+ * Have to open code "rcu_dereference_sched()" because the
+ * function graph tracer can be called when RCU is not
+ * "watching".
+ * Protected with schedule_on_each_cpu(ftrace_sync)
+ */
+ notrace_hash = rcu_dereference_protected(ftrace_graph_notrace_hash,
+ !preemptible());
+
+ if (ftrace_lookup_ip(notrace_hash, addr))
ret = 1;
preempt_enable_notrace();
@@ -1057,7 +1089,7 @@ struct ftrace_func_command {
extern bool ftrace_filter_param __initdata;
static inline int ftrace_trace_task(struct trace_array *tr)
{
- return !this_cpu_read(tr->trace_buffer.data->ftrace_ignore_pid);
+ return !this_cpu_read(tr->array_buffer.data->ftrace_ignore_pid);
}
extern int ftrace_is_dead(void);
int ftrace_create_function_files(struct trace_array *tr,
@@ -1145,6 +1177,11 @@ int unregister_ftrace_command(struct ftrace_func_command *cmd);
void ftrace_create_filter_files(struct ftrace_ops *ops,
struct dentry *parent);
void ftrace_destroy_filter_files(struct ftrace_ops *ops);
+
+extern int ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf,
+ int len, int reset);
+extern int ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf,
+ int len, int reset);
#else
struct ftrace_func_command;
@@ -1367,17 +1404,17 @@ struct trace_subsystem_dir {
};
extern int call_filter_check_discard(struct trace_event_call *call, void *rec,
- struct ring_buffer *buffer,
+ struct trace_buffer *buffer,
struct ring_buffer_event *event);
void trace_buffer_unlock_commit_regs(struct trace_array *tr,
- struct ring_buffer *buffer,
+ struct trace_buffer *buffer,
struct ring_buffer_event *event,
unsigned long flags, int pc,
struct pt_regs *regs);
static inline void trace_buffer_unlock_commit(struct trace_array *tr,
- struct ring_buffer *buffer,
+ struct trace_buffer *buffer,
struct ring_buffer_event *event,
unsigned long flags, int pc)
{
@@ -1390,7 +1427,7 @@ void trace_buffered_event_disable(void);
void trace_buffered_event_enable(void);
static inline void
-__trace_event_discard_commit(struct ring_buffer *buffer,
+__trace_event_discard_commit(struct trace_buffer *buffer,
struct ring_buffer_event *event)
{
if (this_cpu_read(trace_buffered_event) == event) {
@@ -1416,7 +1453,7 @@ __trace_event_discard_commit(struct ring_buffer *buffer,
*/
static inline bool
__event_trigger_test_discard(struct trace_event_file *file,
- struct ring_buffer *buffer,
+ struct trace_buffer *buffer,
struct ring_buffer_event *event,
void *entry,
enum event_trigger_type *tt)
@@ -1451,7 +1488,7 @@ __event_trigger_test_discard(struct trace_event_file *file,
*/
static inline void
event_trigger_unlock_commit(struct trace_event_file *file,
- struct ring_buffer *buffer,
+ struct trace_buffer *buffer,
struct ring_buffer_event *event,
void *entry, unsigned long irq_flags, int pc)
{
@@ -1482,7 +1519,7 @@ event_trigger_unlock_commit(struct trace_event_file *file,
*/
static inline void
event_trigger_unlock_commit_regs(struct trace_event_file *file,
- struct ring_buffer *buffer,
+ struct trace_buffer *buffer,
struct ring_buffer_event *event,
void *entry, unsigned long irq_flags, int pc,
struct pt_regs *regs)
@@ -1893,6 +1930,15 @@ void trace_printk_start_comm(void);
int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set);
int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled);
+/* Used from boot time tracer */
+extern int trace_set_options(struct trace_array *tr, char *option);
+extern int tracing_set_tracer(struct trace_array *tr, const char *buf);
+extern ssize_t tracing_resize_ring_buffer(struct trace_array *tr,
+ unsigned long size, int cpu_id);
+extern int tracing_set_cpumask(struct trace_array *tr,
+ cpumask_var_t tracing_cpumask_new);
+
+
#define MAX_EVENT_NAME_LEN 64
extern int trace_run_command(const char *buf, int (*createfn)(int, char**));
@@ -1917,17 +1963,15 @@ extern void tracing_log_err(struct trace_array *tr,
#define internal_trace_puts(str) __trace_puts(_THIS_IP_, str, strlen(str))
#undef FTRACE_ENTRY
-#define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \
+#define FTRACE_ENTRY(call, struct_name, id, tstruct, print) \
extern struct trace_event_call \
__aligned(4) event_##call;
#undef FTRACE_ENTRY_DUP
-#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print, filter) \
- FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print), \
- filter)
+#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print) \
+ FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print))
#undef FTRACE_ENTRY_PACKED
-#define FTRACE_ENTRY_PACKED(call, struct_name, id, tstruct, print, filter) \
- FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print), \
- filter)
+#define FTRACE_ENTRY_PACKED(call, struct_name, id, tstruct, print) \
+ FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print))
#include "trace_entries.h"
@@ -1952,6 +1996,9 @@ static inline const char *get_syscall_name(int syscall)
#ifdef CONFIG_EVENT_TRACING
void trace_event_init(void);
void trace_event_eval_update(struct trace_eval_map **map, int len);
+/* Used from boot time tracer */
+extern int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set);
+extern int trigger_process_regex(struct trace_event_file *file, char *buff);
#else
static inline void __init trace_event_init(void) { }
static inline void trace_event_eval_update(struct trace_eval_map **map, int len) { }
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
new file mode 100644
index 000000000000..06d7feb5255f
--- /dev/null
+++ b/kernel/trace/trace_boot.c
@@ -0,0 +1,334 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * trace_boot.c
+ * Tracing kernel boot-time
+ */
+
+#define pr_fmt(fmt) "trace_boot: " fmt
+
+#include <linux/bootconfig.h>
+#include <linux/cpumask.h>
+#include <linux/ftrace.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/mutex.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/trace.h>
+#include <linux/trace_events.h>
+
+#include "trace.h"
+
+#define MAX_BUF_LEN 256
+
+static void __init
+trace_boot_set_instance_options(struct trace_array *tr, struct xbc_node *node)
+{
+ struct xbc_node *anode;
+ const char *p;
+ char buf[MAX_BUF_LEN];
+ unsigned long v = 0;
+
+ /* Common ftrace options */
+ xbc_node_for_each_array_value(node, "options", anode, p) {
+ if (strlcpy(buf, p, ARRAY_SIZE(buf)) >= ARRAY_SIZE(buf)) {
+ pr_err("String is too long: %s\n", p);
+ continue;
+ }
+
+ if (trace_set_options(tr, buf) < 0)
+ pr_err("Failed to set option: %s\n", buf);
+ }
+
+ p = xbc_node_find_value(node, "trace_clock", NULL);
+ if (p && *p != '\0') {
+ if (tracing_set_clock(tr, p) < 0)
+ pr_err("Failed to set trace clock: %s\n", p);
+ }
+
+ p = xbc_node_find_value(node, "buffer_size", NULL);
+ if (p && *p != '\0') {
+ v = memparse(p, NULL);
+ if (v < PAGE_SIZE)
+ pr_err("Buffer size is too small: %s\n", p);
+ if (tracing_resize_ring_buffer(tr, v, RING_BUFFER_ALL_CPUS) < 0)
+ pr_err("Failed to resize trace buffer to %s\n", p);
+ }
+
+ p = xbc_node_find_value(node, "cpumask", NULL);
+ if (p && *p != '\0') {
+ cpumask_var_t new_mask;
+
+ if (alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
+ if (cpumask_parse(p, new_mask) < 0 ||
+ tracing_set_cpumask(tr, new_mask) < 0)
+ pr_err("Failed to set new CPU mask %s\n", p);
+ free_cpumask_var(new_mask);
+ }
+ }
+}
+
+#ifdef CONFIG_EVENT_TRACING
+static void __init
+trace_boot_enable_events(struct trace_array *tr, struct xbc_node *node)
+{
+ struct xbc_node *anode;
+ char buf[MAX_BUF_LEN];
+ const char *p;
+
+ xbc_node_for_each_array_value(node, "events", anode, p) {
+ if (strlcpy(buf, p, ARRAY_SIZE(buf)) >= ARRAY_SIZE(buf)) {
+ pr_err("String is too long: %s\n", p);
+ continue;
+ }
+
+ if (ftrace_set_clr_event(tr, buf, 1) < 0)
+ pr_err("Failed to enable event: %s\n", p);
+ }
+}
+
+#ifdef CONFIG_KPROBE_EVENTS
+static int __init
+trace_boot_add_kprobe_event(struct xbc_node *node, const char *event)
+{
+ struct dynevent_cmd cmd;
+ struct xbc_node *anode;
+ char buf[MAX_BUF_LEN];
+ const char *val;
+ int ret;
+
+ kprobe_event_cmd_init(&cmd, buf, MAX_BUF_LEN);
+
+ ret = kprobe_event_gen_cmd_start(&cmd, event, NULL);
+ if (ret)
+ return ret;
+
+ xbc_node_for_each_array_value(node, "probes", anode, val) {
+ ret = kprobe_event_add_field(&cmd, val);
+ if (ret)
+ return ret;
+ }
+
+ ret = kprobe_event_gen_cmd_end(&cmd);
+ if (ret)
+ pr_err("Failed to add probe: %s\n", buf);
+
+ return ret;
+}
+#else
+static inline int __init
+trace_boot_add_kprobe_event(struct xbc_node *node, const char *event)
+{
+ pr_err("Kprobe event is not supported.\n");
+ return -ENOTSUPP;
+}
+#endif
+
+#ifdef CONFIG_HIST_TRIGGERS
+static int __init
+trace_boot_add_synth_event(struct xbc_node *node, const char *event)
+{
+ struct dynevent_cmd cmd;
+ struct xbc_node *anode;
+ char buf[MAX_BUF_LEN];
+ const char *p;
+ int ret;
+
+ synth_event_cmd_init(&cmd, buf, MAX_BUF_LEN);
+
+ ret = synth_event_gen_cmd_start(&cmd, event, NULL);
+ if (ret)
+ return ret;
+
+ xbc_node_for_each_array_value(node, "fields", anode, p) {
+ ret = synth_event_add_field_str(&cmd, p);
+ if (ret)
+ return ret;
+ }
+
+ ret = synth_event_gen_cmd_end(&cmd);
+ if (ret < 0)
+ pr_err("Failed to add synthetic event: %s\n", buf);
+
+ return ret;
+}
+#else
+static inline int __init
+trace_boot_add_synth_event(struct xbc_node *node, const char *event)
+{
+ pr_err("Synthetic event is not supported.\n");
+ return -ENOTSUPP;
+}
+#endif
+
+static void __init
+trace_boot_init_one_event(struct trace_array *tr, struct xbc_node *gnode,
+ struct xbc_node *enode)
+{
+ struct trace_event_file *file;
+ struct xbc_node *anode;
+ char buf[MAX_BUF_LEN];
+ const char *p, *group, *event;
+
+ group = xbc_node_get_data(gnode);
+ event = xbc_node_get_data(enode);
+
+ if (!strcmp(group, "kprobes"))
+ if (trace_boot_add_kprobe_event(enode, event) < 0)
+ return;
+ if (!strcmp(group, "synthetic"))
+ if (trace_boot_add_synth_event(enode, event) < 0)
+ return;
+
+ mutex_lock(&event_mutex);
+ file = find_event_file(tr, group, event);
+ if (!file) {
+ pr_err("Failed to find event: %s:%s\n", group, event);
+ goto out;
+ }
+
+ p = xbc_node_find_value(enode, "filter", NULL);
+ if (p && *p != '\0') {
+ if (strlcpy(buf, p, ARRAY_SIZE(buf)) >= ARRAY_SIZE(buf))
+ pr_err("filter string is too long: %s\n", p);
+ else if (apply_event_filter(file, buf) < 0)
+ pr_err("Failed to apply filter: %s\n", buf);
+ }
+
+ xbc_node_for_each_array_value(enode, "actions", anode, p) {
+ if (strlcpy(buf, p, ARRAY_SIZE(buf)) >= ARRAY_SIZE(buf))
+ pr_err("action string is too long: %s\n", p);
+ else if (trigger_process_regex(file, buf) < 0)
+ pr_err("Failed to apply an action: %s\n", buf);
+ }
+
+ if (xbc_node_find_value(enode, "enable", NULL)) {
+ if (trace_event_enable_disable(file, 1, 0) < 0)
+ pr_err("Failed to enable event node: %s:%s\n",
+ group, event);
+ }
+out:
+ mutex_unlock(&event_mutex);
+}
+
+static void __init
+trace_boot_init_events(struct trace_array *tr, struct xbc_node *node)
+{
+ struct xbc_node *gnode, *enode;
+
+ node = xbc_node_find_child(node, "event");
+ if (!node)
+ return;
+ /* per-event key starts with "event.GROUP.EVENT" */
+ xbc_node_for_each_child(node, gnode)
+ xbc_node_for_each_child(gnode, enode)
+ trace_boot_init_one_event(tr, gnode, enode);
+}
+#else
+#define trace_boot_enable_events(tr, node) do {} while (0)
+#define trace_boot_init_events(tr, node) do {} while (0)
+#endif
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+static void __init
+trace_boot_set_ftrace_filter(struct trace_array *tr, struct xbc_node *node)
+{
+ struct xbc_node *anode;
+ const char *p;
+ char *q;
+
+ xbc_node_for_each_array_value(node, "ftrace.filters", anode, p) {
+ q = kstrdup(p, GFP_KERNEL);
+ if (!q)
+ return;
+ if (ftrace_set_filter(tr->ops, q, strlen(q), 0) < 0)
+ pr_err("Failed to add %s to ftrace filter\n", p);
+ else
+ ftrace_filter_param = true;
+ kfree(q);
+ }
+ xbc_node_for_each_array_value(node, "ftrace.notraces", anode, p) {
+ q = kstrdup(p, GFP_KERNEL);
+ if (!q)
+ return;
+ if (ftrace_set_notrace(tr->ops, q, strlen(q), 0) < 0)
+ pr_err("Failed to add %s to ftrace filter\n", p);
+ else
+ ftrace_filter_param = true;
+ kfree(q);
+ }
+}
+#else
+#define trace_boot_set_ftrace_filter(tr, node) do {} while (0)
+#endif
+
+static void __init
+trace_boot_enable_tracer(struct trace_array *tr, struct xbc_node *node)
+{
+ const char *p;
+
+ trace_boot_set_ftrace_filter(tr, node);
+
+ p = xbc_node_find_value(node, "tracer", NULL);
+ if (p && *p != '\0') {
+ if (tracing_set_tracer(tr, p) < 0)
+ pr_err("Failed to set given tracer: %s\n", p);
+ }
+}
+
+static void __init
+trace_boot_init_one_instance(struct trace_array *tr, struct xbc_node *node)
+{
+ trace_boot_set_instance_options(tr, node);
+ trace_boot_init_events(tr, node);
+ trace_boot_enable_events(tr, node);
+ trace_boot_enable_tracer(tr, node);
+}
+
+static void __init
+trace_boot_init_instances(struct xbc_node *node)
+{
+ struct xbc_node *inode;
+ struct trace_array *tr;
+ const char *p;
+
+ node = xbc_node_find_child(node, "instance");
+ if (!node)
+ return;
+
+ xbc_node_for_each_child(node, inode) {
+ p = xbc_node_get_data(inode);
+ if (!p || *p == '\0')
+ continue;
+
+ tr = trace_array_get_by_name(p);
+ if (!tr) {
+ pr_err("Failed to get trace instance %s\n", p);
+ continue;
+ }
+ trace_boot_init_one_instance(tr, inode);
+ trace_array_put(tr);
+ }
+}
+
+static int __init trace_boot_init(void)
+{
+ struct xbc_node *trace_node;
+ struct trace_array *tr;
+
+ trace_node = xbc_find_node("ftrace");
+ if (!trace_node)
+ return 0;
+
+ tr = top_trace_array();
+ if (!tr)
+ return 0;
+
+ /* Global trace array is also one instance */
+ trace_boot_init_one_instance(tr, trace_node);
+ trace_boot_init_instances(trace_node);
+
+ return 0;
+}
+
+fs_initcall(trace_boot_init);
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 88e158d27965..eff099123aa2 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -32,10 +32,10 @@ probe_likely_condition(struct ftrace_likely_data *f, int val, int expect)
{
struct trace_event_call *call = &event_branch;
struct trace_array *tr = branch_tracer;
+ struct trace_buffer *buffer;
struct trace_array_cpu *data;
struct ring_buffer_event *event;
struct trace_branch *entry;
- struct ring_buffer *buffer;
unsigned long flags;
int pc;
const char *p;
@@ -55,12 +55,12 @@ probe_likely_condition(struct ftrace_likely_data *f, int val, int expect)
raw_local_irq_save(flags);
current->trace_recursion |= TRACE_BRANCH_BIT;
- data = this_cpu_ptr(tr->trace_buffer.data);
+ data = this_cpu_ptr(tr->array_buffer.data);
if (atomic_read(&data->disabled))
goto out;
pc = preempt_count();
- buffer = tr->trace_buffer.buffer;
+ buffer = tr->array_buffer.buffer;
event = trace_buffer_lock_reserve(buffer, TRACE_BRANCH,
sizeof(*entry), flags, pc);
if (!event)
diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c
index 89779eb84a07..9f2e8520b748 100644
--- a/kernel/trace/trace_dynevent.c
+++ b/kernel/trace/trace_dynevent.c
@@ -223,3 +223,215 @@ static __init int init_dynamic_event(void)
return 0;
}
fs_initcall(init_dynamic_event);
+
+/**
+ * dynevent_arg_add - Add an arg to a dynevent_cmd
+ * @cmd: A pointer to the dynevent_cmd struct representing the new event cmd
+ * @arg: The argument to append to the current cmd
+ * @check_arg: An (optional) pointer to a function checking arg sanity
+ *
+ * Append an argument to a dynevent_cmd. The argument string will be
+ * appended to the current cmd string, followed by a separator, if
+ * applicable. Before the argument is added, the @check_arg function,
+ * if present, will be used to check the sanity of the current arg
+ * string.
+ *
+ * The cmd string and separator should be set using the
+ * dynevent_arg_init() before any arguments are added using this
+ * function.
+ *
+ * Return: 0 if successful, error otherwise.
+ */
+int dynevent_arg_add(struct dynevent_cmd *cmd,
+ struct dynevent_arg *arg,
+ dynevent_check_arg_fn_t check_arg)
+{
+ int ret = 0;
+
+ if (check_arg) {
+ ret = check_arg(arg);
+ if (ret)
+ return ret;
+ }
+
+ ret = seq_buf_printf(&cmd->seq, " %s%c", arg->str, arg->separator);
+ if (ret) {
+ pr_err("String is too long: %s%c\n", arg->str, arg->separator);
+ return -E2BIG;
+ }
+
+ return ret;
+}
+
+/**
+ * dynevent_arg_pair_add - Add an arg pair to a dynevent_cmd
+ * @cmd: A pointer to the dynevent_cmd struct representing the new event cmd
+ * @arg_pair: The argument pair to append to the current cmd
+ * @check_arg: An (optional) pointer to a function checking arg sanity
+ *
+ * Append an argument pair to a dynevent_cmd. An argument pair
+ * consists of a left-hand-side argument and a right-hand-side
+ * argument separated by an operator, which can be whitespace, all
+ * followed by a separator, if applicable. This can be used to add
+ * arguments of the form 'type variable_name;' or 'x+y'.
+ *
+ * The lhs argument string will be appended to the current cmd string,
+ * followed by an operator, if applicable, followd by the rhs string,
+ * followed finally by a separator, if applicable. Before the
+ * argument is added, the @check_arg function, if present, will be
+ * used to check the sanity of the current arg strings.
+ *
+ * The cmd strings, operator, and separator should be set using the
+ * dynevent_arg_pair_init() before any arguments are added using this
+ * function.
+ *
+ * Return: 0 if successful, error otherwise.
+ */
+int dynevent_arg_pair_add(struct dynevent_cmd *cmd,
+ struct dynevent_arg_pair *arg_pair,
+ dynevent_check_arg_fn_t check_arg)
+{
+ int ret = 0;
+
+ if (check_arg) {
+ ret = check_arg(arg_pair);
+ if (ret)
+ return ret;
+ }
+
+ ret = seq_buf_printf(&cmd->seq, " %s%c%s%c", arg_pair->lhs,
+ arg_pair->operator, arg_pair->rhs,
+ arg_pair->separator);
+ if (ret) {
+ pr_err("field string is too long: %s%c%s%c\n", arg_pair->lhs,
+ arg_pair->operator, arg_pair->rhs,
+ arg_pair->separator);
+ return -E2BIG;
+ }
+
+ return ret;
+}
+
+/**
+ * dynevent_str_add - Add a string to a dynevent_cmd
+ * @cmd: A pointer to the dynevent_cmd struct representing the new event cmd
+ * @str: The string to append to the current cmd
+ *
+ * Append a string to a dynevent_cmd. The string will be appended to
+ * the current cmd string as-is, with nothing prepended or appended.
+ *
+ * Return: 0 if successful, error otherwise.
+ */
+int dynevent_str_add(struct dynevent_cmd *cmd, const char *str)
+{
+ int ret = 0;
+
+ ret = seq_buf_puts(&cmd->seq, str);
+ if (ret) {
+ pr_err("String is too long: %s\n", str);
+ return -E2BIG;
+ }
+
+ return ret;
+}
+
+/**
+ * dynevent_cmd_init - Initialize a dynevent_cmd object
+ * @cmd: A pointer to the dynevent_cmd struct representing the cmd
+ * @buf: A pointer to the buffer to generate the command into
+ * @maxlen: The length of the buffer the command will be generated into
+ * @type: The type of the cmd, checked against further operations
+ * @run_command: The type-specific function that will actually run the command
+ *
+ * Initialize a dynevent_cmd. A dynevent_cmd is used to build up and
+ * run dynamic event creation commands, such as commands for creating
+ * synthetic and kprobe events. Before calling any of the functions
+ * used to build the command, a dynevent_cmd object should be
+ * instantiated and initialized using this function.
+ *
+ * The initialization sets things up by saving a pointer to the
+ * user-supplied buffer and its length via the @buf and @maxlen
+ * params, and by saving the cmd-specific @type and @run_command
+ * params which are used to check subsequent dynevent_cmd operations
+ * and actually run the command when complete.
+ */
+void dynevent_cmd_init(struct dynevent_cmd *cmd, char *buf, int maxlen,
+ enum dynevent_type type,
+ dynevent_create_fn_t run_command)
+{
+ memset(cmd, '\0', sizeof(*cmd));
+
+ seq_buf_init(&cmd->seq, buf, maxlen);
+ cmd->type = type;
+ cmd->run_command = run_command;
+}
+
+/**
+ * dynevent_arg_init - Initialize a dynevent_arg object
+ * @arg: A pointer to the dynevent_arg struct representing the arg
+ * @separator: An (optional) separator, appended after adding the arg
+ *
+ * Initialize a dynevent_arg object. A dynevent_arg represents an
+ * object used to append single arguments to the current command
+ * string. After the arg string is successfully appended to the
+ * command string, the optional @separator is appended. If no
+ * separator was specified when initializing the arg, a space will be
+ * appended.
+ */
+void dynevent_arg_init(struct dynevent_arg *arg,
+ char separator)
+{
+ memset(arg, '\0', sizeof(*arg));
+
+ if (!separator)
+ separator = ' ';
+ arg->separator = separator;
+}
+
+/**
+ * dynevent_arg_pair_init - Initialize a dynevent_arg_pair object
+ * @arg_pair: A pointer to the dynevent_arg_pair struct representing the arg
+ * @operator: An (optional) operator, appended after adding the first arg
+ * @separator: An (optional) separator, appended after adding the second arg
+ *
+ * Initialize a dynevent_arg_pair object. A dynevent_arg_pair
+ * represents an object used to append argument pairs such as 'type
+ * variable_name;' or 'x+y' to the current command string. An
+ * argument pair consists of a left-hand-side argument and a
+ * right-hand-side argument separated by an operator, which can be
+ * whitespace, all followed by a separator, if applicable. After the
+ * first arg string is successfully appended to the command string,
+ * the optional @operator is appended, followed by the second arg and
+ * and optional @separator. If no separator was specified when
+ * initializing the arg, a space will be appended.
+ */
+void dynevent_arg_pair_init(struct dynevent_arg_pair *arg_pair,
+ char operator, char separator)
+{
+ memset(arg_pair, '\0', sizeof(*arg_pair));
+
+ if (!operator)
+ operator = ' ';
+ arg_pair->operator = operator;
+
+ if (!separator)
+ separator = ' ';
+ arg_pair->separator = separator;
+}
+
+/**
+ * dynevent_create - Create the dynamic event contained in dynevent_cmd
+ * @cmd: The dynevent_cmd object containing the dynamic event creation command
+ *
+ * Once a dynevent_cmd object has been successfully built up via the
+ * dynevent_cmd_init(), dynevent_arg_add() and dynevent_arg_pair_add()
+ * functions, this function runs the final command to actually create
+ * the event.
+ *
+ * Return: 0 if the event was successfully created, error otherwise.
+ */
+int dynevent_create(struct dynevent_cmd *cmd)
+{
+ return cmd->run_command(cmd);
+}
+EXPORT_SYMBOL_GPL(dynevent_create);
diff --git a/kernel/trace/trace_dynevent.h b/kernel/trace/trace_dynevent.h
index 46898138d2df..d6857a254ede 100644
--- a/kernel/trace/trace_dynevent.h
+++ b/kernel/trace/trace_dynevent.h
@@ -117,4 +117,36 @@ int dyn_event_release(int argc, char **argv, struct dyn_event_operations *type);
#define for_each_dyn_event_safe(pos, n) \
list_for_each_entry_safe(pos, n, &dyn_event_list, list)
+extern void dynevent_cmd_init(struct dynevent_cmd *cmd, char *buf, int maxlen,
+ enum dynevent_type type,
+ dynevent_create_fn_t run_command);
+
+typedef int (*dynevent_check_arg_fn_t)(void *data);
+
+struct dynevent_arg {
+ const char *str;
+ char separator; /* e.g. ';', ',', or nothing */
+};
+
+extern void dynevent_arg_init(struct dynevent_arg *arg,
+ char separator);
+extern int dynevent_arg_add(struct dynevent_cmd *cmd,
+ struct dynevent_arg *arg,
+ dynevent_check_arg_fn_t check_arg);
+
+struct dynevent_arg_pair {
+ const char *lhs;
+ const char *rhs;
+ char operator; /* e.g. '=' or nothing */
+ char separator; /* e.g. ';', ',', or nothing */
+};
+
+extern void dynevent_arg_pair_init(struct dynevent_arg_pair *arg_pair,
+ char operator, char separator);
+
+extern int dynevent_arg_pair_add(struct dynevent_cmd *cmd,
+ struct dynevent_arg_pair *arg_pair,
+ dynevent_check_arg_fn_t check_arg);
+extern int dynevent_str_add(struct dynevent_cmd *cmd, const char *str);
+
#endif
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index fc8e97328e54..f22746f3c132 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -61,15 +61,13 @@ FTRACE_ENTRY_REG(function, ftrace_entry,
TRACE_FN,
F_STRUCT(
- __field( unsigned long, ip )
- __field( unsigned long, parent_ip )
+ __field_fn( unsigned long, ip )
+ __field_fn( unsigned long, parent_ip )
),
F_printk(" %ps <-- %ps",
(void *)__entry->ip, (void *)__entry->parent_ip),
- FILTER_TRACE_FN,
-
perf_ftrace_event_register
);
@@ -84,9 +82,7 @@ FTRACE_ENTRY_PACKED(funcgraph_entry, ftrace_graph_ent_entry,
__field_desc( int, graph_ent, depth )
),
- F_printk("--> %ps (%d)", (void *)__entry->func, __entry->depth),
-
- FILTER_OTHER
+ F_printk("--> %ps (%d)", (void *)__entry->func, __entry->depth)
);
/* Function return entry */
@@ -97,18 +93,16 @@ FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry,
F_STRUCT(
__field_struct( struct ftrace_graph_ret, ret )
__field_desc( unsigned long, ret, func )
+ __field_desc( unsigned long, ret, overrun )
__field_desc( unsigned long long, ret, calltime)
__field_desc( unsigned long long, ret, rettime )
- __field_desc( unsigned long, ret, overrun )
__field_desc( int, ret, depth )
),
F_printk("<-- %ps (%d) (start: %llx end: %llx) over: %d",
(void *)__entry->func, __entry->depth,
__entry->calltime, __entry->rettime,
- __entry->depth),
-
- FILTER_OTHER
+ __entry->depth)
);
/*
@@ -137,9 +131,7 @@ FTRACE_ENTRY(context_switch, ctx_switch_entry,
F_printk("%u:%u:%u ==> %u:%u:%u [%03u]",
__entry->prev_pid, __entry->prev_prio, __entry->prev_state,
__entry->next_pid, __entry->next_prio, __entry->next_state,
- __entry->next_cpu),
-
- FILTER_OTHER
+ __entry->next_cpu)
);
/*
@@ -157,9 +149,7 @@ FTRACE_ENTRY_DUP(wakeup, ctx_switch_entry,
F_printk("%u:%u:%u ==+ %u:%u:%u [%03u]",
__entry->prev_pid, __entry->prev_prio, __entry->prev_state,
__entry->next_pid, __entry->next_prio, __entry->next_state,
- __entry->next_cpu),
-
- FILTER_OTHER
+ __entry->next_cpu)
);
/*
@@ -174,7 +164,7 @@ FTRACE_ENTRY(kernel_stack, stack_entry,
F_STRUCT(
__field( int, size )
- __dynamic_array(unsigned long, caller )
+ __array( unsigned long, caller, FTRACE_STACK_ENTRIES )
),
F_printk("\t=> %ps\n\t=> %ps\n\t=> %ps\n"
@@ -183,9 +173,7 @@ FTRACE_ENTRY(kernel_stack, stack_entry,
(void *)__entry->caller[0], (void *)__entry->caller[1],
(void *)__entry->caller[2], (void *)__entry->caller[3],
(void *)__entry->caller[4], (void *)__entry->caller[5],
- (void *)__entry->caller[6], (void *)__entry->caller[7]),
-
- FILTER_OTHER
+ (void *)__entry->caller[6], (void *)__entry->caller[7])
);
FTRACE_ENTRY(user_stack, userstack_entry,
@@ -203,9 +191,7 @@ FTRACE_ENTRY(user_stack, userstack_entry,
(void *)__entry->caller[0], (void *)__entry->caller[1],
(void *)__entry->caller[2], (void *)__entry->caller[3],
(void *)__entry->caller[4], (void *)__entry->caller[5],
- (void *)__entry->caller[6], (void *)__entry->caller[7]),
-
- FILTER_OTHER
+ (void *)__entry->caller[6], (void *)__entry->caller[7])
);
/*
@@ -222,9 +208,7 @@ FTRACE_ENTRY(bprint, bprint_entry,
),
F_printk("%ps: %s",
- (void *)__entry->ip, __entry->fmt),
-
- FILTER_OTHER
+ (void *)__entry->ip, __entry->fmt)
);
FTRACE_ENTRY_REG(print, print_entry,
@@ -239,8 +223,6 @@ FTRACE_ENTRY_REG(print, print_entry,
F_printk("%ps: %s",
(void *)__entry->ip, __entry->buf),
- FILTER_OTHER,
-
ftrace_event_register
);
@@ -254,9 +236,7 @@ FTRACE_ENTRY(raw_data, raw_data_entry,
),
F_printk("id:%04x %08x",
- __entry->id, (int)__entry->buf[0]),
-
- FILTER_OTHER
+ __entry->id, (int)__entry->buf[0])
);
FTRACE_ENTRY(bputs, bputs_entry,
@@ -269,9 +249,7 @@ FTRACE_ENTRY(bputs, bputs_entry,
),
F_printk("%ps: %s",
- (void *)__entry->ip, __entry->str),
-
- FILTER_OTHER
+ (void *)__entry->ip, __entry->str)
);
FTRACE_ENTRY(mmiotrace_rw, trace_mmiotrace_rw,
@@ -283,16 +261,14 @@ FTRACE_ENTRY(mmiotrace_rw, trace_mmiotrace_rw,
__field_desc( resource_size_t, rw, phys )
__field_desc( unsigned long, rw, value )
__field_desc( unsigned long, rw, pc )
- __field_desc( int, rw, map_id )
+ __field_desc( int, rw, map_id )
__field_desc( unsigned char, rw, opcode )
__field_desc( unsigned char, rw, width )
),
F_printk("%lx %lx %lx %d %x %x",
(unsigned long)__entry->phys, __entry->value, __entry->pc,
- __entry->map_id, __entry->opcode, __entry->width),
-
- FILTER_OTHER
+ __entry->map_id, __entry->opcode, __entry->width)
);
FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map,
@@ -304,15 +280,13 @@ FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map,
__field_desc( resource_size_t, map, phys )
__field_desc( unsigned long, map, virt )
__field_desc( unsigned long, map, len )
- __field_desc( int, map, map_id )
+ __field_desc( int, map, map_id )
__field_desc( unsigned char, map, opcode )
),
F_printk("%lx %lx %lx %d %x",
(unsigned long)__entry->phys, __entry->virt, __entry->len,
- __entry->map_id, __entry->opcode),
-
- FILTER_OTHER
+ __entry->map_id, __entry->opcode)
);
@@ -334,9 +308,7 @@ FTRACE_ENTRY(branch, trace_branch,
F_printk("%u:%s:%s (%u)%s",
__entry->line,
__entry->func, __entry->file, __entry->correct,
- __entry->constant ? " CONSTANT" : ""),
-
- FILTER_OTHER
+ __entry->constant ? " CONSTANT" : "")
);
@@ -362,7 +334,5 @@ FTRACE_ENTRY(hwlat, hwlat_entry,
__entry->duration,
__entry->outer_duration,
__entry->nmi_total_ts,
- __entry->nmi_count),
-
- FILTER_OTHER
+ __entry->nmi_count)
);
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index a5b614cc3887..f38234ecea18 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -24,6 +24,7 @@
#include <linux/delay.h>
#include <trace/events/sched.h>
+#include <trace/syscall.h>
#include <asm/setup.h>
@@ -237,7 +238,7 @@ bool trace_event_ignore_this_pid(struct trace_event_file *trace_file)
if (!pid_list)
return false;
- data = this_cpu_ptr(tr->trace_buffer.data);
+ data = this_cpu_ptr(tr->array_buffer.data);
return data->ignore_pid;
}
@@ -272,6 +273,7 @@ void *trace_event_buffer_reserve(struct trace_event_buffer *fbuffer,
if (!fbuffer->event)
return NULL;
+ fbuffer->regs = NULL;
fbuffer->entry = ring_buffer_event_data(fbuffer->event);
return fbuffer->entry;
}
@@ -546,7 +548,7 @@ event_filter_pid_sched_switch_probe_pre(void *data, bool preempt,
pid_list = rcu_dereference_sched(tr->filtered_pids);
- this_cpu_write(tr->trace_buffer.data->ignore_pid,
+ this_cpu_write(tr->array_buffer.data->ignore_pid,
trace_ignore_this_task(pid_list, prev) &&
trace_ignore_this_task(pid_list, next));
}
@@ -560,7 +562,7 @@ event_filter_pid_sched_switch_probe_post(void *data, bool preempt,
pid_list = rcu_dereference_sched(tr->filtered_pids);
- this_cpu_write(tr->trace_buffer.data->ignore_pid,
+ this_cpu_write(tr->array_buffer.data->ignore_pid,
trace_ignore_this_task(pid_list, next));
}
@@ -571,12 +573,12 @@ event_filter_pid_sched_wakeup_probe_pre(void *data, struct task_struct *task)
struct trace_pid_list *pid_list;
/* Nothing to do if we are already tracing */
- if (!this_cpu_read(tr->trace_buffer.data->ignore_pid))
+ if (!this_cpu_read(tr->array_buffer.data->ignore_pid))
return;
pid_list = rcu_dereference_sched(tr->filtered_pids);
- this_cpu_write(tr->trace_buffer.data->ignore_pid,
+ this_cpu_write(tr->array_buffer.data->ignore_pid,
trace_ignore_this_task(pid_list, task));
}
@@ -587,13 +589,13 @@ event_filter_pid_sched_wakeup_probe_post(void *data, struct task_struct *task)
struct trace_pid_list *pid_list;
/* Nothing to do if we are not tracing */
- if (this_cpu_read(tr->trace_buffer.data->ignore_pid))
+ if (this_cpu_read(tr->array_buffer.data->ignore_pid))
return;
pid_list = rcu_dereference_sched(tr->filtered_pids);
/* Set tracing if current is enabled */
- this_cpu_write(tr->trace_buffer.data->ignore_pid,
+ this_cpu_write(tr->array_buffer.data->ignore_pid,
trace_ignore_this_task(pid_list, current));
}
@@ -625,7 +627,7 @@ static void __ftrace_clear_event_pids(struct trace_array *tr)
}
for_each_possible_cpu(cpu)
- per_cpu_ptr(tr->trace_buffer.data, cpu)->ignore_pid = false;
+ per_cpu_ptr(tr->array_buffer.data, cpu)->ignore_pid = false;
rcu_assign_pointer(tr->filtered_pids, NULL);
@@ -697,7 +699,7 @@ static void remove_subsystem(struct trace_subsystem_dir *dir)
return;
if (!--dir->nr_events) {
- tracefs_remove_recursive(dir->entry);
+ tracefs_remove(dir->entry);
list_del(&dir->list);
__put_system_dir(dir);
}
@@ -716,7 +718,7 @@ static void remove_event_file_dir(struct trace_event_file *file)
}
spin_unlock(&dir->d_lock);
- tracefs_remove_recursive(dir);
+ tracefs_remove(dir);
}
list_del(&file->list);
@@ -1594,7 +1596,7 @@ static void ignore_task_cpu(void *data)
pid_list = rcu_dereference_protected(tr->filtered_pids,
mutex_is_locked(&event_mutex));
- this_cpu_write(tr->trace_buffer.data->ignore_pid,
+ this_cpu_write(tr->array_buffer.data->ignore_pid,
trace_ignore_this_task(pid_list, current));
}
@@ -2017,7 +2019,24 @@ event_create_dir(struct dentry *parent, struct trace_event_file *file)
*/
head = trace_get_fields(call);
if (list_empty(head)) {
- ret = call->class->define_fields(call);
+ struct trace_event_fields *field = call->class->fields_array;
+ unsigned int offset = sizeof(struct trace_entry);
+
+ for (; field->type; field++) {
+ if (field->type == TRACE_FUNCTION_TYPE) {
+ ret = field->define_fields(call);
+ break;
+ }
+
+ offset = ALIGN(offset, field->align);
+ ret = trace_define_field(call, field->type, field->name,
+ offset, field->size,
+ field->is_signed, field->filter_type);
+ if (ret)
+ break;
+
+ offset += field->size;
+ }
if (ret < 0) {
pr_warn("Could not initialize trace point events/%s\n",
name);
@@ -2535,6 +2554,91 @@ find_event_file(struct trace_array *tr, const char *system, const char *event)
return file;
}
+/**
+ * trace_get_event_file - Find and return a trace event file
+ * @instance: The name of the trace instance containing the event
+ * @system: The name of the system containing the event
+ * @event: The name of the event
+ *
+ * Return a trace event file given the trace instance name, trace
+ * system, and trace event name. If the instance name is NULL, it
+ * refers to the top-level trace array.
+ *
+ * This function will look it up and return it if found, after calling
+ * trace_array_get() to prevent the instance from going away, and
+ * increment the event's module refcount to prevent it from being
+ * removed.
+ *
+ * To release the file, call trace_put_event_file(), which will call
+ * trace_array_put() and decrement the event's module refcount.
+ *
+ * Return: The trace event on success, ERR_PTR otherwise.
+ */
+struct trace_event_file *trace_get_event_file(const char *instance,
+ const char *system,
+ const char *event)
+{
+ struct trace_array *tr = top_trace_array();
+ struct trace_event_file *file = NULL;
+ int ret = -EINVAL;
+
+ if (instance) {
+ tr = trace_array_find_get(instance);
+ if (!tr)
+ return ERR_PTR(-ENOENT);
+ } else {
+ ret = trace_array_get(tr);
+ if (ret)
+ return ERR_PTR(ret);
+ }
+
+ mutex_lock(&event_mutex);
+
+ file = find_event_file(tr, system, event);
+ if (!file) {
+ trace_array_put(tr);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* Don't let event modules unload while in use */
+ ret = try_module_get(file->event_call->mod);
+ if (!ret) {
+ trace_array_put(tr);
+ ret = -EBUSY;
+ goto out;
+ }
+
+ ret = 0;
+ out:
+ mutex_unlock(&event_mutex);
+
+ if (ret)
+ file = ERR_PTR(ret);
+
+ return file;
+}
+EXPORT_SYMBOL_GPL(trace_get_event_file);
+
+/**
+ * trace_put_event_file - Release a file from trace_get_event_file()
+ * @file: The trace event file
+ *
+ * If a file was retrieved using trace_get_event_file(), this should
+ * be called when it's no longer needed. It will cancel the previous
+ * trace_array_get() called by that function, and decrement the
+ * event's module refcount.
+ */
+void trace_put_event_file(struct trace_event_file *file)
+{
+ mutex_lock(&event_mutex);
+ module_put(file->event_call->mod);
+ mutex_unlock(&event_mutex);
+
+ trace_array_put(file->tr);
+}
+EXPORT_SYMBOL_GPL(trace_put_event_file);
+
#ifdef CONFIG_DYNAMIC_FTRACE
/* Avoid typos */
@@ -3064,7 +3168,7 @@ int event_trace_del_tracer(struct trace_array *tr)
down_write(&trace_event_sem);
__trace_remove_event_dirs(tr);
- tracefs_remove_recursive(tr->event_dir);
+ tracefs_remove(tr->event_dir);
up_write(&trace_event_sem);
tr->event_dir = NULL;
@@ -3391,8 +3495,8 @@ static void __init
function_test_events_call(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct pt_regs *pt_regs)
{
+ struct trace_buffer *buffer;
struct ring_buffer_event *event;
- struct ring_buffer *buffer;
struct ftrace_entry *entry;
unsigned long flags;
long disabled;
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 6ac35b9e195d..e7ce7cdac62f 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -66,7 +66,12 @@
C(INVALID_SUBSYS_EVENT, "Invalid subsystem or event name"), \
C(INVALID_REF_KEY, "Using variable references in keys not supported"), \
C(VAR_NOT_FOUND, "Couldn't find variable"), \
- C(FIELD_NOT_FOUND, "Couldn't find field"),
+ C(FIELD_NOT_FOUND, "Couldn't find field"), \
+ C(EMPTY_ASSIGNMENT, "Empty assignment"), \
+ C(INVALID_SORT_MODIFIER,"Invalid sort modifier"), \
+ C(EMPTY_SORT_FIELD, "Empty sort field"), \
+ C(TOO_MANY_SORT_FIELDS, "Too many sort fields (Max = 2)"), \
+ C(INVALID_SORT_FIELD, "Sort field must be a key or a val"),
#undef C
#define C(a, b) HIST_ERR_##a
@@ -375,7 +380,7 @@ struct hist_trigger_data {
unsigned int n_save_var_str;
};
-static int synth_event_create(int argc, const char **argv);
+static int create_synth_event(int argc, const char **argv);
static int synth_event_show(struct seq_file *m, struct dyn_event *ev);
static int synth_event_release(struct dyn_event *ev);
static bool synth_event_is_busy(struct dyn_event *ev);
@@ -383,7 +388,7 @@ static bool synth_event_match(const char *system, const char *event,
int argc, const char **argv, struct dyn_event *ev);
static struct dyn_event_operations synth_event_ops = {
- .create = synth_event_create,
+ .create = create_synth_event,
.show = synth_event_show,
.is_busy = synth_event_is_busy,
.free = synth_event_release,
@@ -394,6 +399,7 @@ struct synth_field {
char *type;
char *name;
size_t size;
+ unsigned int offset;
bool is_signed;
bool is_string;
};
@@ -408,6 +414,7 @@ struct synth_event {
struct trace_event_class class;
struct trace_event_call call;
struct tracepoint *tp;
+ struct module *mod;
};
static bool is_synth_event(struct dyn_event *ev)
@@ -470,11 +477,12 @@ struct action_data {
* When a histogram trigger is hit, the values of any
* references to variables, including variables being passed
* as parameters to synthetic events, are collected into a
- * var_ref_vals array. This var_ref_idx is the index of the
- * first param in the array to be passed to the synthetic
- * event invocation.
+ * var_ref_vals array. This var_ref_idx array is an array of
+ * indices into the var_ref_vals array, one for each synthetic
+ * event param, and is passed to the synthetic event
+ * invocation.
*/
- unsigned int var_ref_idx;
+ unsigned int var_ref_idx[TRACING_MAP_VARS_MAX];
struct synth_event *synth_event;
bool use_trace_keyword;
char *synth_event_name;
@@ -608,7 +616,8 @@ static void last_cmd_set(struct trace_event_file *file, char *str)
if (!str)
return;
- strncpy(last_cmd, str, MAX_FILTER_STR_VAL - 1);
+ strcpy(last_cmd, "hist:");
+ strncat(last_cmd, str, MAX_FILTER_STR_VAL - 1 - sizeof("hist:"));
if (file) {
call = file->event_call;
@@ -662,6 +671,8 @@ static int synth_event_define_fields(struct trace_event_call *call)
if (ret)
break;
+ event->fields[i]->offset = n_u64;
+
if (event->fields[i]->is_string) {
offset += STR_VAR_LEN_MAX;
n_u64 += STR_VAR_LEN_MAX / sizeof(u64);
@@ -834,7 +845,7 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter,
fmt = synth_field_fmt(se->fields[i]->type);
/* parameter types */
- if (tr->trace_flags & TRACE_ITER_VERBOSE)
+ if (tr && tr->trace_flags & TRACE_ITER_VERBOSE)
trace_seq_printf(s, "%s ", fmt);
snprintf(print_fmt, sizeof(print_fmt), "%%s=%s%%s", fmt);
@@ -875,14 +886,14 @@ static struct trace_event_functions synth_event_funcs = {
static notrace void trace_event_raw_event_synth(void *__data,
u64 *var_ref_vals,
- unsigned int var_ref_idx)
+ unsigned int *var_ref_idx)
{
struct trace_event_file *trace_file = __data;
struct synth_trace_event *entry;
struct trace_event_buffer fbuffer;
- struct ring_buffer *buffer;
+ struct trace_buffer *buffer;
struct synth_event *event;
- unsigned int i, n_u64;
+ unsigned int i, n_u64, val_idx;
int fields_size = 0;
event = trace_file->event_call->data;
@@ -896,7 +907,7 @@ static notrace void trace_event_raw_event_synth(void *__data,
* Avoid ring buffer recursion detection, as this event
* is being performed within another event.
*/
- buffer = trace_file->tr->trace_buffer.buffer;
+ buffer = trace_file->tr->array_buffer.buffer;
ring_buffer_nest_start(buffer);
entry = trace_event_buffer_reserve(&fbuffer, trace_file,
@@ -905,15 +916,16 @@ static notrace void trace_event_raw_event_synth(void *__data,
goto out;
for (i = 0, n_u64 = 0; i < event->n_fields; i++) {
+ val_idx = var_ref_idx[i];
if (event->fields[i]->is_string) {
- char *str_val = (char *)(long)var_ref_vals[var_ref_idx + i];
+ char *str_val = (char *)(long)var_ref_vals[val_idx];
char *str_field = (char *)&entry->fields[n_u64];
strscpy(str_field, str_val, STR_VAR_LEN_MAX);
n_u64 += STR_VAR_LEN_MAX / sizeof(u64);
} else {
struct synth_field *field = event->fields[i];
- u64 val = var_ref_vals[var_ref_idx + i];
+ u64 val = var_ref_vals[val_idx];
switch (field->size) {
case 1:
@@ -1113,10 +1125,10 @@ static struct tracepoint *alloc_synth_tracepoint(char *name)
}
typedef void (*synth_probe_func_t) (void *__data, u64 *var_ref_vals,
- unsigned int var_ref_idx);
+ unsigned int *var_ref_idx);
static inline void trace_synth(struct synth_event *event, u64 *var_ref_vals,
- unsigned int var_ref_idx)
+ unsigned int *var_ref_idx)
{
struct tracepoint *tp = event->tp;
@@ -1155,6 +1167,12 @@ static struct synth_event *find_synth_event(const char *name)
return NULL;
}
+static struct trace_event_fields synth_event_fields_array[] = {
+ { .type = TRACE_FUNCTION_TYPE,
+ .define_fields = synth_event_define_fields },
+ {}
+};
+
static int register_synth_event(struct synth_event *event)
{
struct trace_event_call *call = &event->call;
@@ -1176,7 +1194,7 @@ static int register_synth_event(struct synth_event *event)
INIT_LIST_HEAD(&call->class->fields);
call->event.funcs = &synth_event_funcs;
- call->class->define_fields = synth_event_define_fields;
+ call->class->fields_array = synth_event_fields_array;
ret = register_trace_event(&call->event);
if (!ret) {
@@ -1287,6 +1305,273 @@ struct hist_var_data {
struct hist_trigger_data *hist_data;
};
+static int synth_event_check_arg_fn(void *data)
+{
+ struct dynevent_arg_pair *arg_pair = data;
+ int size;
+
+ size = synth_field_size((char *)arg_pair->lhs);
+
+ return size ? 0 : -EINVAL;
+}
+
+/**
+ * synth_event_add_field - Add a new field to a synthetic event cmd
+ * @cmd: A pointer to the dynevent_cmd struct representing the new event
+ * @type: The type of the new field to add
+ * @name: The name of the new field to add
+ *
+ * Add a new field to a synthetic event cmd object. Field ordering is in
+ * the same order the fields are added.
+ *
+ * See synth_field_size() for available types. If field_name contains
+ * [n] the field is considered to be an array.
+ *
+ * Return: 0 if successful, error otherwise.
+ */
+int synth_event_add_field(struct dynevent_cmd *cmd, const char *type,
+ const char *name)
+{
+ struct dynevent_arg_pair arg_pair;
+ int ret;
+
+ if (cmd->type != DYNEVENT_TYPE_SYNTH)
+ return -EINVAL;
+
+ if (!type || !name)
+ return -EINVAL;
+
+ dynevent_arg_pair_init(&arg_pair, 0, ';');
+
+ arg_pair.lhs = type;
+ arg_pair.rhs = name;
+
+ ret = dynevent_arg_pair_add(cmd, &arg_pair, synth_event_check_arg_fn);
+ if (ret)
+ return ret;
+
+ if (++cmd->n_fields > SYNTH_FIELDS_MAX)
+ ret = -EINVAL;
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(synth_event_add_field);
+
+/**
+ * synth_event_add_field_str - Add a new field to a synthetic event cmd
+ * @cmd: A pointer to the dynevent_cmd struct representing the new event
+ * @type_name: The type and name of the new field to add, as a single string
+ *
+ * Add a new field to a synthetic event cmd object, as a single
+ * string. The @type_name string is expected to be of the form 'type
+ * name', which will be appended by ';'. No sanity checking is done -
+ * what's passed in is assumed to already be well-formed. Field
+ * ordering is in the same order the fields are added.
+ *
+ * See synth_field_size() for available types. If field_name contains
+ * [n] the field is considered to be an array.
+ *
+ * Return: 0 if successful, error otherwise.
+ */
+int synth_event_add_field_str(struct dynevent_cmd *cmd, const char *type_name)
+{
+ struct dynevent_arg arg;
+ int ret;
+
+ if (cmd->type != DYNEVENT_TYPE_SYNTH)
+ return -EINVAL;
+
+ if (!type_name)
+ return -EINVAL;
+
+ dynevent_arg_init(&arg, ';');
+
+ arg.str = type_name;
+
+ ret = dynevent_arg_add(cmd, &arg, NULL);
+ if (ret)
+ return ret;
+
+ if (++cmd->n_fields > SYNTH_FIELDS_MAX)
+ ret = -EINVAL;
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(synth_event_add_field_str);
+
+/**
+ * synth_event_add_fields - Add multiple fields to a synthetic event cmd
+ * @cmd: A pointer to the dynevent_cmd struct representing the new event
+ * @fields: An array of type/name field descriptions
+ * @n_fields: The number of field descriptions contained in the fields array
+ *
+ * Add a new set of fields to a synthetic event cmd object. The event
+ * fields that will be defined for the event should be passed in as an
+ * array of struct synth_field_desc, and the number of elements in the
+ * array passed in as n_fields. Field ordering will retain the
+ * ordering given in the fields array.
+ *
+ * See synth_field_size() for available types. If field_name contains
+ * [n] the field is considered to be an array.
+ *
+ * Return: 0 if successful, error otherwise.
+ */
+int synth_event_add_fields(struct dynevent_cmd *cmd,
+ struct synth_field_desc *fields,
+ unsigned int n_fields)
+{
+ unsigned int i;
+ int ret = 0;
+
+ for (i = 0; i < n_fields; i++) {
+ if (fields[i].type == NULL || fields[i].name == NULL) {
+ ret = -EINVAL;
+ break;
+ }
+
+ ret = synth_event_add_field(cmd, fields[i].type, fields[i].name);
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(synth_event_add_fields);
+
+/**
+ * __synth_event_gen_cmd_start - Start a synthetic event command from arg list
+ * @cmd: A pointer to the dynevent_cmd struct representing the new event
+ * @name: The name of the synthetic event
+ * @mod: The module creating the event, NULL if not created from a module
+ * @args: Variable number of arg (pairs), one pair for each field
+ *
+ * NOTE: Users normally won't want to call this function directly, but
+ * rather use the synth_event_gen_cmd_start() wrapper, which
+ * automatically adds a NULL to the end of the arg list. If this
+ * function is used directly, make sure the last arg in the variable
+ * arg list is NULL.
+ *
+ * Generate a synthetic event command to be executed by
+ * synth_event_gen_cmd_end(). This function can be used to generate
+ * the complete command or only the first part of it; in the latter
+ * case, synth_event_add_field(), synth_event_add_field_str(), or
+ * synth_event_add_fields() can be used to add more fields following
+ * this.
+ *
+ * There should be an even number variable args, each pair consisting
+ * of a type followed by a field name.
+ *
+ * See synth_field_size() for available types. If field_name contains
+ * [n] the field is considered to be an array.
+ *
+ * Return: 0 if successful, error otherwise.
+ */
+int __synth_event_gen_cmd_start(struct dynevent_cmd *cmd, const char *name,
+ struct module *mod, ...)
+{
+ struct dynevent_arg arg;
+ va_list args;
+ int ret;
+
+ cmd->event_name = name;
+ cmd->private_data = mod;
+
+ if (cmd->type != DYNEVENT_TYPE_SYNTH)
+ return -EINVAL;
+
+ dynevent_arg_init(&arg, 0);
+ arg.str = name;
+ ret = dynevent_arg_add(cmd, &arg, NULL);
+ if (ret)
+ return ret;
+
+ va_start(args, mod);
+ for (;;) {
+ const char *type, *name;
+
+ type = va_arg(args, const char *);
+ if (!type)
+ break;
+ name = va_arg(args, const char *);
+ if (!name)
+ break;
+
+ if (++cmd->n_fields > SYNTH_FIELDS_MAX) {
+ ret = -EINVAL;
+ break;
+ }
+
+ ret = synth_event_add_field(cmd, type, name);
+ if (ret)
+ break;
+ }
+ va_end(args);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(__synth_event_gen_cmd_start);
+
+/**
+ * synth_event_gen_cmd_array_start - Start synthetic event command from an array
+ * @cmd: A pointer to the dynevent_cmd struct representing the new event
+ * @name: The name of the synthetic event
+ * @fields: An array of type/name field descriptions
+ * @n_fields: The number of field descriptions contained in the fields array
+ *
+ * Generate a synthetic event command to be executed by
+ * synth_event_gen_cmd_end(). This function can be used to generate
+ * the complete command or only the first part of it; in the latter
+ * case, synth_event_add_field(), synth_event_add_field_str(), or
+ * synth_event_add_fields() can be used to add more fields following
+ * this.
+ *
+ * The event fields that will be defined for the event should be
+ * passed in as an array of struct synth_field_desc, and the number of
+ * elements in the array passed in as n_fields. Field ordering will
+ * retain the ordering given in the fields array.
+ *
+ * See synth_field_size() for available types. If field_name contains
+ * [n] the field is considered to be an array.
+ *
+ * Return: 0 if successful, error otherwise.
+ */
+int synth_event_gen_cmd_array_start(struct dynevent_cmd *cmd, const char *name,
+ struct module *mod,
+ struct synth_field_desc *fields,
+ unsigned int n_fields)
+{
+ struct dynevent_arg arg;
+ unsigned int i;
+ int ret = 0;
+
+ cmd->event_name = name;
+ cmd->private_data = mod;
+
+ if (cmd->type != DYNEVENT_TYPE_SYNTH)
+ return -EINVAL;
+
+ if (n_fields > SYNTH_FIELDS_MAX)
+ return -EINVAL;
+
+ dynevent_arg_init(&arg, 0);
+ arg.str = name;
+ ret = dynevent_arg_add(cmd, &arg, NULL);
+ if (ret)
+ return ret;
+
+ for (i = 0; i < n_fields; i++) {
+ if (fields[i].type == NULL || fields[i].name == NULL)
+ return -EINVAL;
+
+ ret = synth_event_add_field(cmd, fields[i].type, fields[i].name);
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(synth_event_gen_cmd_array_start);
+
static int __create_synth_event(int argc, const char *name, const char **argv)
{
struct synth_field *field, *fields[SYNTH_FIELDS_MAX];
@@ -1355,29 +1640,123 @@ static int __create_synth_event(int argc, const char *name, const char **argv)
goto out;
}
+/**
+ * synth_event_create - Create a new synthetic event
+ * @name: The name of the new sythetic event
+ * @fields: An array of type/name field descriptions
+ * @n_fields: The number of field descriptions contained in the fields array
+ * @mod: The module creating the event, NULL if not created from a module
+ *
+ * Create a new synthetic event with the given name under the
+ * trace/events/synthetic/ directory. The event fields that will be
+ * defined for the event should be passed in as an array of struct
+ * synth_field_desc, and the number elements in the array passed in as
+ * n_fields. Field ordering will retain the ordering given in the
+ * fields array.
+ *
+ * If the new synthetic event is being created from a module, the mod
+ * param must be non-NULL. This will ensure that the trace buffer
+ * won't contain unreadable events.
+ *
+ * The new synth event should be deleted using synth_event_delete()
+ * function. The new synthetic event can be generated from modules or
+ * other kernel code using trace_synth_event() and related functions.
+ *
+ * Return: 0 if successful, error otherwise.
+ */
+int synth_event_create(const char *name, struct synth_field_desc *fields,
+ unsigned int n_fields, struct module *mod)
+{
+ struct dynevent_cmd cmd;
+ char *buf;
+ int ret;
+
+ buf = kzalloc(MAX_DYNEVENT_CMD_LEN, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ synth_event_cmd_init(&cmd, buf, MAX_DYNEVENT_CMD_LEN);
+
+ ret = synth_event_gen_cmd_array_start(&cmd, name, mod,
+ fields, n_fields);
+ if (ret)
+ goto out;
+
+ ret = synth_event_gen_cmd_end(&cmd);
+ out:
+ kfree(buf);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(synth_event_create);
+
+static int destroy_synth_event(struct synth_event *se)
+{
+ int ret;
+
+ if (se->ref)
+ ret = -EBUSY;
+ else {
+ ret = unregister_synth_event(se);
+ if (!ret) {
+ dyn_event_remove(&se->devent);
+ free_synth_event(se);
+ }
+ }
+
+ return ret;
+}
+
+/**
+ * synth_event_delete - Delete a synthetic event
+ * @event_name: The name of the new sythetic event
+ *
+ * Delete a synthetic event that was created with synth_event_create().
+ *
+ * Return: 0 if successful, error otherwise.
+ */
+int synth_event_delete(const char *event_name)
+{
+ struct synth_event *se = NULL;
+ struct module *mod = NULL;
+ int ret = -ENOENT;
+
+ mutex_lock(&event_mutex);
+ se = find_synth_event(event_name);
+ if (se) {
+ mod = se->mod;
+ ret = destroy_synth_event(se);
+ }
+ mutex_unlock(&event_mutex);
+
+ if (mod) {
+ mutex_lock(&trace_types_lock);
+ /*
+ * It is safest to reset the ring buffer if the module
+ * being unloaded registered any events that were
+ * used. The only worry is if a new module gets
+ * loaded, and takes on the same id as the events of
+ * this module. When printing out the buffer, traced
+ * events left over from this module may be passed to
+ * the new module events and unexpected results may
+ * occur.
+ */
+ tracing_reset_all_online_cpus();
+ mutex_unlock(&trace_types_lock);
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(synth_event_delete);
+
static int create_or_delete_synth_event(int argc, char **argv)
{
const char *name = argv[0];
- struct synth_event *event = NULL;
int ret;
/* trace_run_command() ensures argc != 0 */
if (name[0] == '!') {
- mutex_lock(&event_mutex);
- event = find_synth_event(name + 1);
- if (event) {
- if (event->ref)
- ret = -EBUSY;
- else {
- ret = unregister_synth_event(event);
- if (!ret) {
- dyn_event_remove(&event->devent);
- free_synth_event(event);
- }
- }
- } else
- ret = -ENOENT;
- mutex_unlock(&event_mutex);
+ ret = synth_event_delete(name + 1);
return ret;
}
@@ -1385,7 +1764,474 @@ static int create_or_delete_synth_event(int argc, char **argv)
return ret == -ECANCELED ? -EINVAL : ret;
}
-static int synth_event_create(int argc, const char **argv)
+static int synth_event_run_command(struct dynevent_cmd *cmd)
+{
+ struct synth_event *se;
+ int ret;
+
+ ret = trace_run_command(cmd->seq.buffer, create_or_delete_synth_event);
+ if (ret)
+ return ret;
+
+ se = find_synth_event(cmd->event_name);
+ if (WARN_ON(!se))
+ return -ENOENT;
+
+ se->mod = cmd->private_data;
+
+ return ret;
+}
+
+/**
+ * synth_event_cmd_init - Initialize a synthetic event command object
+ * @cmd: A pointer to the dynevent_cmd struct representing the new event
+ * @buf: A pointer to the buffer used to build the command
+ * @maxlen: The length of the buffer passed in @buf
+ *
+ * Initialize a synthetic event command object. Use this before
+ * calling any of the other dyenvent_cmd functions.
+ */
+void synth_event_cmd_init(struct dynevent_cmd *cmd, char *buf, int maxlen)
+{
+ dynevent_cmd_init(cmd, buf, maxlen, DYNEVENT_TYPE_SYNTH,
+ synth_event_run_command);
+}
+EXPORT_SYMBOL_GPL(synth_event_cmd_init);
+
+/**
+ * synth_event_trace - Trace a synthetic event
+ * @file: The trace_event_file representing the synthetic event
+ * @n_vals: The number of values in vals
+ * @args: Variable number of args containing the event values
+ *
+ * Trace a synthetic event using the values passed in the variable
+ * argument list.
+ *
+ * The argument list should be a list 'n_vals' u64 values. The number
+ * of vals must match the number of field in the synthetic event, and
+ * must be in the same order as the synthetic event fields.
+ *
+ * All vals should be cast to u64, and string vals are just pointers
+ * to strings, cast to u64. Strings will be copied into space
+ * reserved in the event for the string, using these pointers.
+ *
+ * Return: 0 on success, err otherwise.
+ */
+int synth_event_trace(struct trace_event_file *file, unsigned int n_vals, ...)
+{
+ struct trace_event_buffer fbuffer;
+ struct synth_trace_event *entry;
+ struct trace_buffer *buffer;
+ struct synth_event *event;
+ unsigned int i, n_u64;
+ int fields_size = 0;
+ va_list args;
+ int ret = 0;
+
+ /*
+ * Normal event generation doesn't get called at all unless
+ * the ENABLED bit is set (which attaches the probe thus
+ * allowing this code to be called, etc). Because this is
+ * called directly by the user, we don't have that but we
+ * still need to honor not logging when disabled.
+ */
+ if (!(file->flags & EVENT_FILE_FL_ENABLED))
+ return 0;
+
+ event = file->event_call->data;
+
+ if (n_vals != event->n_fields)
+ return -EINVAL;
+
+ if (trace_trigger_soft_disabled(file))
+ return -EINVAL;
+
+ fields_size = event->n_u64 * sizeof(u64);
+
+ /*
+ * Avoid ring buffer recursion detection, as this event
+ * is being performed within another event.
+ */
+ buffer = file->tr->array_buffer.buffer;
+ ring_buffer_nest_start(buffer);
+
+ entry = trace_event_buffer_reserve(&fbuffer, file,
+ sizeof(*entry) + fields_size);
+ if (!entry) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ va_start(args, n_vals);
+ for (i = 0, n_u64 = 0; i < event->n_fields; i++) {
+ u64 val;
+
+ val = va_arg(args, u64);
+
+ if (event->fields[i]->is_string) {
+ char *str_val = (char *)(long)val;
+ char *str_field = (char *)&entry->fields[n_u64];
+
+ strscpy(str_field, str_val, STR_VAR_LEN_MAX);
+ n_u64 += STR_VAR_LEN_MAX / sizeof(u64);
+ } else {
+ entry->fields[n_u64] = val;
+ n_u64++;
+ }
+ }
+ va_end(args);
+
+ trace_event_buffer_commit(