bpf: Define new BPF_MAP_TYPE_USER_RINGBUF map type
author: David Vernet <void@manifault.com>
Tue, 20 Sep 2022 00:00:57 +0000 (19:00 -0500)
committer: Andrii Nakryiko <andrii@kernel.org>
Wed, 21 Sep 2022 23:24:17 +0000 (16:24 -0700)
We want to support a ringbuf map type where samples are published from
user-space, to be consumed by BPF programs. BPF currently supports a
kernel -> user-space circular ring buffer via the BPF_MAP_TYPE_RINGBUF
map type.  We'll need to define a new map type for user-space -> kernel,
as none of the helpers exported for BPF_MAP_TYPE_RINGBUF will apply
to a user-space producer ring buffer, and we'll want to add one or
more helper functions that would not apply for a kernel-producer
ring buffer.

This patch therefore adds a new BPF_MAP_TYPE_USER_RINGBUF map type
definition. The map type is useless in its current form, as there is no
way to access or use it for anything until we add one or more BPF helpers. A
follow-on patch will therefore add a new helper function that allows BPF
programs to run callbacks on samples that are published to the ring
buffer.

Signed-off-by: David Vernet <void@manifault.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20220920000100.477320-2-void@manifault.com
include/linux/bpf_types.h
include/uapi/linux/bpf.h
kernel/bpf/ringbuf.c
kernel/bpf/verifier.c
tools/bpf/bpftool/Documentation/bpftool-map.rst
tools/bpf/bpftool/map.c
tools/include/uapi/linux/bpf.h
tools/lib/bpf/libbpf.c

index 2b9112b8017124b1184a4bf7d6b966148b4b25ae..2c6a4f2562a7f8e661aebf932a80e8fa679e2195 100644 (file)
@@ -126,6 +126,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_STRUCT_OPS, bpf_struct_ops_map_ops)
 #endif
 BPF_MAP_TYPE(BPF_MAP_TYPE_RINGBUF, ringbuf_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_BLOOM_FILTER, bloom_filter_map_ops)
+BPF_MAP_TYPE(BPF_MAP_TYPE_USER_RINGBUF, user_ringbuf_map_ops)
 
 BPF_LINK_TYPE(BPF_LINK_TYPE_RAW_TRACEPOINT, raw_tracepoint)
 BPF_LINK_TYPE(BPF_LINK_TYPE_TRACING, tracing)
index 3df78c56c1bf68b2a3213626f4bd852957937291..e18c85324db64708cb6a4157858855ab047df344 100644 (file)
@@ -928,6 +928,7 @@ enum bpf_map_type {
        BPF_MAP_TYPE_INODE_STORAGE,
        BPF_MAP_TYPE_TASK_STORAGE,
        BPF_MAP_TYPE_BLOOM_FILTER,
+       BPF_MAP_TYPE_USER_RINGBUF,
 };
 
 /* Note that tracing related programs such as
index b483aea35f41dbf34187fed2cfc3c8d60123875d..754e915748fb8b7cd3a57227d0faed8af0449537 100644 (file)
@@ -38,10 +38,27 @@ struct bpf_ringbuf {
        struct page **pages;
        int nr_pages;
        spinlock_t spinlock ____cacheline_aligned_in_smp;
-       /* Consumer and producer counters are put into separate pages to allow
-        * mapping consumer page as r/w, but restrict producer page to r/o.
-        * This protects producer position from being modified by user-space
-        * application and ruining in-kernel position tracking.
+       /* Consumer and producer counters are put into separate pages to
+        * allow each position to be mapped with different permissions.
+        * This prevents a user-space application from modifying the
+        * position and ruining in-kernel tracking. The permissions of the
+        * pages depend on who is producing samples: user-space or the
+        * kernel.
+        *
+        * Kernel-producer
+        * ---------------
+        * The producer position and data pages are mapped as r/o in
+        * userspace. For this approach, bits in the header of samples are
+        * used to signal to user-space, and to other producers, whether a
+        * sample is currently being written.
+        *
+        * User-space producer
+        * -------------------
+        * Only the page containing the consumer position is mapped r/o in
+        * user-space. User-space producers also use bits of the header to
+        * communicate to the kernel, but the kernel must carefully check and
+        * validate each sample to ensure that they're correctly formatted, and
+        * fully contained within the ring buffer.
         */
        unsigned long consumer_pos __aligned(PAGE_SIZE);
        unsigned long producer_pos __aligned(PAGE_SIZE);
@@ -224,7 +241,7 @@ static int ringbuf_map_get_next_key(struct bpf_map *map, void *key,
        return -ENOTSUPP;
 }
 
-static int ringbuf_map_mmap(struct bpf_map *map, struct vm_area_struct *vma)
+static int ringbuf_map_mmap_kern(struct bpf_map *map, struct vm_area_struct *vma)
 {
        struct bpf_ringbuf_map *rb_map;
 
@@ -242,6 +259,26 @@ static int ringbuf_map_mmap(struct bpf_map *map, struct vm_area_struct *vma)
                                   vma->vm_pgoff + RINGBUF_PGOFF);
 }
 
+static int ringbuf_map_mmap_user(struct bpf_map *map, struct vm_area_struct *vma)
+{
+       struct bpf_ringbuf_map *rb_map;
+
+       rb_map = container_of(map, struct bpf_ringbuf_map, map);
+
+       if (vma->vm_flags & VM_WRITE) {
+               if (vma->vm_pgoff == 0)
+                       /* Disallow writable mappings to the consumer pointer,
+                        * and allow writable mappings to both the producer
+                        * position, and the ring buffer data itself.
+                        */
+                       return -EPERM;
+       } else {
+               vma->vm_flags &= ~VM_MAYWRITE;
+       }
+       /* remap_vmalloc_range() checks size and offset constraints */
+       return remap_vmalloc_range(vma, rb_map->rb, vma->vm_pgoff + RINGBUF_PGOFF);
+}
+
 static unsigned long ringbuf_avail_data_sz(struct bpf_ringbuf *rb)
 {
        unsigned long cons_pos, prod_pos;
@@ -269,7 +306,7 @@ const struct bpf_map_ops ringbuf_map_ops = {
        .map_meta_equal = bpf_map_meta_equal,
        .map_alloc = ringbuf_map_alloc,
        .map_free = ringbuf_map_free,
-       .map_mmap = ringbuf_map_mmap,
+       .map_mmap = ringbuf_map_mmap_kern,
        .map_poll = ringbuf_map_poll,
        .map_lookup_elem = ringbuf_map_lookup_elem,
        .map_update_elem = ringbuf_map_update_elem,
@@ -278,6 +315,19 @@ const struct bpf_map_ops ringbuf_map_ops = {
        .map_btf_id = &ringbuf_map_btf_ids[0],
 };
 
+BTF_ID_LIST_SINGLE(user_ringbuf_map_btf_ids, struct, bpf_ringbuf_map)
+const struct bpf_map_ops user_ringbuf_map_ops = {
+       .map_meta_equal = bpf_map_meta_equal,
+       .map_alloc = ringbuf_map_alloc,
+       .map_free = ringbuf_map_free,
+       .map_mmap = ringbuf_map_mmap_user,
+       .map_lookup_elem = ringbuf_map_lookup_elem,
+       .map_update_elem = ringbuf_map_update_elem,
+       .map_delete_elem = ringbuf_map_delete_elem,
+       .map_get_next_key = ringbuf_map_get_next_key,
+       .map_btf_id = &user_ringbuf_map_btf_ids[0],
+};
+
 /* Given pointer to ring buffer record metadata and struct bpf_ringbuf itself,
  * calculate offset from record metadata to ring buffer in pages, rounded
  * down. This page offset is stored as part of record metadata and allows to
index 8c6fbcd0afaf2d0c42a3b9e0fce1a6eecf570f78..83710b60e7087ece273f3d486989e198bb095d84 100644 (file)
@@ -6240,6 +6240,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
                    func_id != BPF_FUNC_ringbuf_discard_dynptr)
                        goto error;
                break;
+       case BPF_MAP_TYPE_USER_RINGBUF:
+               goto error;
        case BPF_MAP_TYPE_STACK_TRACE:
                if (func_id != BPF_FUNC_get_stackid)
                        goto error;
@@ -12635,6 +12637,7 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
                case BPF_MAP_TYPE_ARRAY_OF_MAPS:
                case BPF_MAP_TYPE_HASH_OF_MAPS:
                case BPF_MAP_TYPE_RINGBUF:
+               case BPF_MAP_TYPE_USER_RINGBUF:
                case BPF_MAP_TYPE_INODE_STORAGE:
                case BPF_MAP_TYPE_SK_STORAGE:
                case BPF_MAP_TYPE_TASK_STORAGE:
index 7c188a598444c463c2ea6d06711788b9810f0337..7f3b67a8b48f3c6ca968b3b92232fc25eee887d8 100644 (file)
@@ -55,7 +55,7 @@ MAP COMMANDS
 |              | **devmap** | **devmap_hash** | **sockmap** | **cpumap** | **xskmap** | **sockhash**
 |              | **cgroup_storage** | **reuseport_sockarray** | **percpu_cgroup_storage**
 |              | **queue** | **stack** | **sk_storage** | **struct_ops** | **ringbuf** | **inode_storage**
-|              | **task_storage** | **bloom_filter** }
+|              | **task_storage** | **bloom_filter** | **user_ringbuf** }
 
 DESCRIPTION
 ===========
index 38b6bc9c26c3bb0d95ec6ceb4c791f54998039c5..9a6ca9f311338ddffa25278b1c8d87838068253e 100644 (file)
@@ -1459,7 +1459,7 @@ static int do_help(int argc, char **argv)
                "                 devmap | devmap_hash | sockmap | cpumap | xskmap | sockhash |\n"
                "                 cgroup_storage | reuseport_sockarray | percpu_cgroup_storage |\n"
                "                 queue | stack | sk_storage | struct_ops | ringbuf | inode_storage |\n"
-               "                 task_storage | bloom_filter }\n"
+               "                 task_storage | bloom_filter | user_ringbuf }\n"
                "       " HELP_SPEC_OPTIONS " |\n"
                "                    {-f|--bpffs} | {-n|--nomount} }\n"
                "",
index 3df78c56c1bf68b2a3213626f4bd852957937291..e18c85324db64708cb6a4157858855ab047df344 100644 (file)
@@ -928,6 +928,7 @@ enum bpf_map_type {
        BPF_MAP_TYPE_INODE_STORAGE,
        BPF_MAP_TYPE_TASK_STORAGE,
        BPF_MAP_TYPE_BLOOM_FILTER,
+       BPF_MAP_TYPE_USER_RINGBUF,
 };
 
 /* Note that tracing related programs such as
index 2ca30ccc774c4aff7078c728084a74fe524f4501..d480da05b6defa62afde3ce620d31677742670b9 100644 (file)
@@ -163,6 +163,7 @@ static const char * const map_type_name[] = {
        [BPF_MAP_TYPE_INODE_STORAGE]            = "inode_storage",
        [BPF_MAP_TYPE_TASK_STORAGE]             = "task_storage",
        [BPF_MAP_TYPE_BLOOM_FILTER]             = "bloom_filter",
+       [BPF_MAP_TYPE_USER_RINGBUF]             = "user_ringbuf",
 };
 
 static const char * const prog_type_name[] = {