bpf: Support bpf_list_head in map values

author Kumar Kartikeya Dwivedi <memxor@gmail.com>

Mon, 14 Nov 2022 19:15:25 +0000 (00:45 +0530)

committer Alexei Starovoitov <ast@kernel.org>

Tue, 15 Nov 2022 05:52:45 +0000 (21:52 -0800)
author Kumar Kartikeya Dwivedi <memxor@gmail.com>
Mon, 14 Nov 2022 19:15:25 +0000 (00:45 +0530)
committer Alexei Starovoitov <ast@kernel.org>
Tue, 15 Nov 2022 05:52:45 +0000 (21:52 -0800)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h

index f08eb2d27de0cf706076c02ada10b238257a1856..05f98e9e5c482eb0a2321ac5dac9fe1e31679992 100644 (file)
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -175,6 +175,7 @@ enum btf_field_type {
         BPF_KPTR_UNREF = (1 << 2),
         BPF_KPTR_REF   = (1 << 3),
         BPF_KPTR       = BPF_KPTR_UNREF | BPF_KPTR_REF,
+       BPF_LIST_HEAD  = (1 << 4),
  };
  
  struct btf_field_kptr {
@@ -184,11 +185,18 @@ struct btf_field_kptr {
         u32 btf_id;
  };
  
+/* Metadata recorded for a BPF_LIST_HEAD field; filled in by
+ * btf_parse_list_head().
+ */
+struct btf_field_list_head {
+       struct btf *btf;        /* BTF the value type id below refers to */
+       u32 value_btf_id;       /* id of the struct type the list contains */
+       u32 node_offset;        /* byte offset of the bpf_list_node in that struct */
+};
+
  struct btf_field {
         u32 offset;
         enum btf_field_type type;
         union {
                 struct btf_field_kptr kptr;
+               struct btf_field_list_head list_head;
         };
  };
  
@@ -266,6 +274,8 @@ static inline const char *btf_field_type_name(enum btf_field_type type)
         case BPF_KPTR_UNREF:
         case BPF_KPTR_REF:
                 return "kptr";
+       case BPF_LIST_HEAD:
+               return "bpf_list_head";
         default:
                 WARN_ON_ONCE(1);
                 return "unknown";
@@ -282,6 +292,8 @@ static inline u32 btf_field_type_size(enum btf_field_type type)
         case BPF_KPTR_UNREF:
         case BPF_KPTR_REF:
                 return sizeof(u64);
+       case BPF_LIST_HEAD:
+               return sizeof(struct bpf_list_head);
         default:
                 WARN_ON_ONCE(1);
                 return 0;
@@ -298,6 +310,8 @@ static inline u32 btf_field_type_align(enum btf_field_type type)
         case BPF_KPTR_UNREF:
         case BPF_KPTR_REF:
                 return __alignof__(u64);
+       case BPF_LIST_HEAD:
+               return __alignof__(struct bpf_list_head);
         default:
                 WARN_ON_ONCE(1);
                 return 0;
@@ -403,6 +417,9 @@ static inline void zero_map_value(struct bpf_map *map, void *dst)
  void copy_map_value_locked(struct bpf_map *map, void *dst, void *src,
                            bool lock_src);
  void bpf_timer_cancel_and_free(void *timer);
+void bpf_list_head_free(const struct btf_field *field, void *list_head,
+                       struct bpf_spin_lock *spin_lock);
+
  int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size);
  
  struct bpf_offload_dev;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h

index fb4c911d2a034f49060ecbedb1e3ff55b0c7e383..6580448e9f774b1407521b9540bc911dd86decf4 100644 (file)
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -6888,6 +6888,16 @@ struct bpf_dynptr {
         __u64 :64;
  } __attribute__((aligned(8)));
  
+/* Opaque 16-byte placeholder (two unnamed 64-bit bit-fields), 8-byte
+ * aligned. kernel/bpf/helpers.c asserts at build time that a struct
+ * list_head fits in it.
+ */
+struct bpf_list_head {
+       __u64 :64;
+       __u64 :64;
+} __attribute__((aligned(8)));
+
+/* Opaque 16-byte placeholder (two unnamed 64-bit bit-fields), 8-byte
+ * aligned. The member named in a list head's "contains:" tag must be
+ * of this struct type.
+ */
+struct bpf_list_node {
+       __u64 :64;
+       __u64 :64;
+} __attribute__((aligned(8)));
+
  struct bpf_sysctl {
         __u32   write;          /* Sysctl is being read (= 0) or written (= 1).
                                  * Allows 1,2,4-byte read, but no write.
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c

index 12361d7b24987b6a491763a07a5de2e3e98839d1..c0d73d71c5394e68d037d7f85d89de5886eb3edf 100644 (file)
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -3205,9 +3205,15 @@ enum {
  struct btf_field_info {
         enum btf_field_type type;
         u32 off;
-       struct {
-               u32 type_id;
-       } kptr;
+       union {
+               struct {
+                       u32 type_id;
+               } kptr;
+               struct {
+                       const char *node_name;
+                       u32 value_btf_id;
+               } list_head;
+       };
  };
  
  static int btf_find_struct(const struct btf *btf, const struct btf_type *t,
@@ -3261,6 +3267,63 @@ static int btf_find_kptr(const struct btf *btf, const struct btf_type *t,
         return BTF_FIELD_FOUND;
  }
  
+/* Find the value of a decl tag attached to one component of a type.
+ *
+ * Scans the types of @btf (ids 1 .. btf_nr_types() - 1) for a decl tag
+ * that targets @pt with component index @comp_idx and whose name starts
+ * with @tag_key.
+ *
+ * Returns a pointer to the text following @tag_key in the first matching
+ * tag's name, or NULL if there is no such tag. The result points into
+ * the string returned by __btf_name_by_offset(); nothing is allocated.
+ */
+static const char *btf_find_decl_tag_value(const struct btf *btf,
+                                          const struct btf_type *pt,
+                                          int comp_idx, const char *tag_key)
+{
+       /* The key length does not change across iterations, compute it once. */
+       int len = strlen(tag_key);
+       int i;
+
+       for (i = 1; i < btf_nr_types(btf); i++) {
+               const struct btf_type *t = btf_type_by_id(btf, i);
+               const char *name;
+
+               if (!btf_type_is_decl_tag(t))
+                       continue;
+               /* The tag must point at exactly this type and component. */
+               if (pt != btf_type_by_id(btf, t->type) ||
+                   btf_type_decl_tag(t)->component_idx != comp_idx)
+                       continue;
+               name = __btf_name_by_offset(btf, t->name_off);
+               if (strncmp(name, tag_key, len))
+                       continue;
+               return name + len;
+       }
+       return NULL;
+}
+
+/* Recognise a bpf_list_head field and record what it contains.
+ *
+ * @pt is the type the decl tag is attached to and @comp_idx the tagged
+ * component; @t is the type of the field itself, @off its offset and
+ * @sz the size it is expected to have.
+ *
+ * The field needs a decl tag whose name is
+ * "contains:<struct name>:<node member name>".
+ *
+ * Returns BTF_FIELD_IGNORE if @t is not a struct of size @sz,
+ * BTF_FIELD_FOUND with @info filled in on success, -EINVAL if the tag is
+ * missing or malformed, -ENOMEM if the temporary name copy cannot be
+ * allocated, or the negative result of btf_find_by_name_kind() if the
+ * named struct is not found.
+ */
+static int btf_find_list_head(const struct btf *btf, const struct btf_type *pt,
+                             const struct btf_type *t, int comp_idx,
+                             u32 off, int sz, struct btf_field_info *info)
+{
+       const char *value_type;
+       const char *list_node;
+       s32 id;
+
+       if (!__btf_type_is_struct(t))
+               return BTF_FIELD_IGNORE;
+       if (t->size != sz)
+               return BTF_FIELD_IGNORE;
+       value_type = btf_find_decl_tag_value(btf, pt, comp_idx, "contains:");
+       if (!value_type)
+               return -EINVAL;
+       /* list_node points at the ':' that separates the struct name from
+        * the node member name, inside the tag string itself.
+        */
+       list_node = strstr(value_type, ":");
+       if (!list_node)
+               return -EINVAL;
+       /* Make a NUL-terminated copy of just the struct name for the
+        * lookup; from here on value_type is that copy and is freed
+        * right after use.
+        */
+       value_type = kstrndup(value_type, list_node - value_type, GFP_KERNEL | __GFP_NOWARN);
+       if (!value_type)
+               return -ENOMEM;
+       id = btf_find_by_name_kind(btf, value_type, BTF_KIND_STRUCT);
+       kfree(value_type);
+       if (id < 0)
+               return id;
+       /* Step over the ':' to reach the node member name. */
+       list_node++;
+       if (str_is_empty(list_node))
+               return -EINVAL;
+       info->type = BPF_LIST_HEAD;
+       info->off = off;
+       info->list_head.value_btf_id = id;
+       /* node_name is not a copy: it still points into the tag string. */
+       info->list_head.node_name = list_node;
+       return BTF_FIELD_FOUND;
+}
+
  static int btf_get_field_type(const char *name, u32 field_mask, u32 *seen_mask,
                               int *align, int *sz)
  {
@@ -3284,6 +3347,12 @@ static int btf_get_field_type(const char *name, u32 field_mask, u32 *seen_mask,
                         goto end;
                 }
         }
+       if (field_mask & BPF_LIST_HEAD) {
+               if (!strcmp(name, "bpf_list_head")) {
+                       type = BPF_LIST_HEAD;
+                       goto end;
+               }
+       }
         /* Only return BPF_KPTR when all other types with matchable names fail */
         if (field_mask & BPF_KPTR) {
                 type = BPF_KPTR_REF;
@@ -3339,6 +3408,12 @@ static int btf_find_struct_field(const struct btf *btf,
                         if (ret < 0)
                                 return ret;
                         break;
+               case BPF_LIST_HEAD:
+                       ret = btf_find_list_head(btf, t, member_type, i, off, sz,
+                                                idx < info_cnt ? &info[idx] : &tmp);
+                       if (ret < 0)
+                               return ret;
+                       break;
                 default:
                         return -EFAULT;
                 }
@@ -3393,6 +3468,12 @@ static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t,
                         if (ret < 0)
                                 return ret;
                         break;
+               case BPF_LIST_HEAD:
+                       ret = btf_find_list_head(btf, var, var_type, -1, off, sz,
+                                                idx < info_cnt ? &info[idx] : &tmp);
+                       if (ret < 0)
+                               return ret;
+                       break;
                 default:
                         return -EFAULT;
                 }
@@ -3491,11 +3572,52 @@ end_btf:
         return ret;
  }
  
+/* Resolve the list node named by a bpf_list_head's "contains:" tag.
+ *
+ * Looks through the members of the struct with id
+ * info->list_head.value_btf_id for the one called
+ * info->list_head.node_name. That member must be a struct named
+ * "bpf_list_node", start on a byte boundary and be aligned for struct
+ * bpf_list_node. On success the BTF, the value type id and the node's
+ * byte offset are stored in field->list_head.
+ *
+ * Returns 0 on success, -ENOENT if no member has that name, and -EINVAL
+ * if the name occurs twice or the member has the wrong type, bit offset
+ * or alignment.
+ */
+static int btf_parse_list_head(const struct btf *btf, struct btf_field *field,
+                              struct btf_field_info *info)
+{
+       const struct btf_type *t, *n = NULL;
+       const struct btf_member *member;
+       u32 offset;
+       int i;
+
+       t = btf_type_by_id(btf, info->list_head.value_btf_id);
+       /* We've already checked that value_btf_id is a struct type. We
+        * just need to figure out the offset of the list_node, and
+        * verify its type.
+        */
+       for_each_member(i, t, member) {
+               if (strcmp(info->list_head.node_name, __btf_name_by_offset(btf, member->name_off)))
+                       continue;
+               /* Invalid BTF, two members with same name */
+               if (n)
+                       return -EINVAL;
+               n = btf_type_by_id(btf, member->type);
+               if (!__btf_type_is_struct(n))
+                       return -EINVAL;
+               if (strcmp("bpf_list_node", __btf_name_by_offset(btf, n->name_off)))
+                       return -EINVAL;
+               /* How member->offset is encoded depends on the kind_flag of
+                * the struct that contains the member (t), not on the
+                * member's own type (n).
+                */
+               offset = __btf_member_bit_offset(t, member);
+               if (offset % 8)
+                       return -EINVAL;
+               /* Convert from bits to bytes. */
+               offset /= 8;
+               if (offset % __alignof__(struct bpf_list_node))
+                       return -EINVAL;
+
+               field->list_head.btf = (struct btf *)btf;
+               field->list_head.value_btf_id = info->list_head.value_btf_id;
+               field->list_head.node_offset = offset;
+       }
+       if (!n)
+               return -ENOENT;
+       return 0;
+}
+
  struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type *t,
                                     u32 field_mask, u32 value_size)
  {
         struct btf_field_info info_arr[BTF_FIELDS_MAX];
         struct btf_record *rec;
+       u32 next_off = 0;
         int ret, i, cnt;
  
         ret = btf_find_field(btf, t, field_mask, info_arr, ARRAY_SIZE(info_arr));
@@ -3517,6 +3639,11 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type
                         ret = -EFAULT;
                         goto end;
                 }
+               if (info_arr[i].off < next_off) {
+                       ret = -EEXIST;
+                       goto end;
+               }
+               next_off = info_arr[i].off + btf_field_type_size(info_arr[i].type);
  
                 rec->field_mask |= info_arr[i].type;
                 rec->fields[i].offset = info_arr[i].off;
@@ -3539,12 +3666,24 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type
                         if (ret < 0)
                                 goto end;
                         break;
+               case BPF_LIST_HEAD:
+                       ret = btf_parse_list_head(btf, &rec->fields[i], &info_arr[i]);
+                       if (ret < 0)
+                               goto end;
+                       break;
                 default:
                         ret = -EFAULT;
                         goto end;
                 }
                 rec->cnt++;
         }
+
+       /* bpf_list_head requires bpf_spin_lock */
+       if (btf_record_has_field(rec, BPF_LIST_HEAD) && rec->spin_lock_off < 0) {
+               ret = -EINVAL;
+               goto end;
+       }
+
         return rec;
  end:
         btf_record_free(rec);
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c

index 283f55bbeb70cea987afbc677c729ee6c4d161d7..7bc71995f17c3268cba35144bbd90453f8427ca2 100644 (file)
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1706,6 +1706,38 @@ bpf_base_func_proto(enum bpf_func_id func_id)
         }
  }
  
+/* Empty a bpf_list_head and free every object that was linked on it.
+ *
+ * @field:     field description; field->list_head.node_offset is the
+ *             distance from the start of an object to its list node.
+ * @list_head: address of the list head, accessed as a struct list_head.
+ * @spin_lock: lock held while the list is detached from the head.
+ */
+void bpf_list_head_free(const struct btf_field *field, void *list_head,
+                       struct bpf_spin_lock *spin_lock)
+{
+       struct list_head *head = list_head, *orig_head = list_head;
+
+       /* The head is accessed as a struct list_head, so the placeholder
+        * type must be at least as large and as strictly aligned.
+        */
+       BUILD_BUG_ON(sizeof(struct list_head) > sizeof(struct bpf_list_head));
+       BUILD_BUG_ON(__alignof__(struct list_head) > __alignof__(struct bpf_list_head));
+
+       /* Do the actual list draining outside the lock to not hold the lock for
+        * too long, and also prevent deadlocks if tracing programs end up
+        * executing on entry/exit of functions called inside the critical
+        * section, and end up doing map ops that call bpf_list_head_free for
+        * the same map value again.
+        */
+       __bpf_spin_lock_irqsave(spin_lock);
+       /* Nothing to drain: head stays equal to orig_head, so the loop
+        * below is skipped. A NULL next presumably means the head was
+        * never initialised (zeroed map value) - TODO confirm.
+        */
+       if (!head->next || list_empty(head))
+               goto unlock;
+       /* Remember the first node before the head is reset. */
+       head = head->next;
+unlock:
+       INIT_LIST_HEAD(orig_head);
+       __bpf_spin_unlock_irqrestore(spin_lock);
+
+       /* Only the head was reset; the walk over the detached nodes stops
+        * when a node's next pointer leads back to orig_head.
+        */
+       while (head != orig_head) {
+               void *obj = head;
+
+               /* Step back from the embedded node to the object start. */
+               obj -= field->list_head.node_offset;
+               /* Read the next pointer before the object is freed. */
+               head = head->next;
+               /* TODO: Rework later */
+               kfree(obj);
+       }
+}
+
  BTF_SET8_START(tracing_btf_ids)
  #ifdef CONFIG_KEXEC_CORE
  BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c

index 85532d30112491f167e16fd66cf91a2f9a4c5d95..fdbae52f463f20efea76106dd7bbe202bc7a8caf 100644 (file)
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -536,6 +536,9 @@ void btf_record_free(struct btf_record *rec)
                                 module_put(rec->fields[i].kptr.module);
                         btf_put(rec->fields[i].kptr.btf);
                         break;
+               case BPF_LIST_HEAD:
+                       /* Nothing to release for bpf_list_head */
+                       break;
                 default:
                         WARN_ON_ONCE(1);
                         continue;
@@ -578,6 +581,9 @@ struct btf_record *btf_record_dup(const struct btf_record *rec)
                                 goto free;
                         }
                         break;
+               case BPF_LIST_HEAD:
+                       /* Nothing to acquire for bpf_list_head */
+                       break;
                 default:
                         ret = -EFAULT;
                         WARN_ON_ONCE(1);
@@ -637,6 +643,11 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
                 case BPF_KPTR_REF:
                         field->kptr.dtor((void *)xchg((unsigned long *)field_ptr, 0));
                         break;
+               case BPF_LIST_HEAD:
+                       if (WARN_ON_ONCE(rec->spin_lock_off < 0))
+                               continue;
+                       bpf_list_head_free(field, field_ptr, obj + rec->spin_lock_off);
+                       break;
                 default:
                         WARN_ON_ONCE(1);
                         continue;
@@ -965,7 +976,8 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
         if (!value_type || value_size != map->value_size)
                 return -EINVAL;
  
-       map->record = btf_parse_fields(btf, value_type, BPF_SPIN_LOCK | BPF_TIMER | BPF_KPTR,
+       map->record = btf_parse_fields(btf, value_type,
+                                      BPF_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD,
                                        map->value_size);
         if (!IS_ERR_OR_NULL(map->record)) {
                 int i;
@@ -1012,6 +1024,14 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
                                         goto free_map_tab;
                                 }
                                 break;
+                       case BPF_LIST_HEAD:
+                               if (map->map_type != BPF_MAP_TYPE_HASH &&
+                                   map->map_type != BPF_MAP_TYPE_LRU_HASH &&
+                                   map->map_type != BPF_MAP_TYPE_ARRAY) {
+                                       ret = -EOPNOTSUPP;
+                                       goto free_map_tab;
+                               }
+                               break;
                         default:
                                 /* Fail if map_type checks are missing for a field type */
                                 ret = -EOPNOTSUPP;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c

index 07c0259dfc1a64c14bc26996230753527bedf7a3..a50018e2d4a0db548c02a94f2293bbbfc994c5f1 100644 (file)
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -12814,6 +12814,13 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
  {
         enum bpf_prog_type prog_type = resolve_prog_type(prog);
  
+       if (btf_record_has_field(map->record, BPF_LIST_HEAD)) {
+               if (is_tracing_prog_type(prog_type)) {
+                       verbose(env, "tracing progs cannot use bpf_list_head yet\n");
+                       return -EINVAL;
+               }
+       }
+
         if (btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
                 if (prog_type == BPF_PROG_TYPE_SOCKET_FILTER) {
                         verbose(env, "socket filter progs cannot use bpf_spin_lock yet\n");
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h

index fb4c911d2a034f49060ecbedb1e3ff55b0c7e383..6580448e9f774b1407521b9540bc911dd86decf4 100644 (file)
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -6888,6 +6888,16 @@ struct bpf_dynptr {
         __u64 :64;
  } __attribute__((aligned(8)));
  
+/* Opaque 16-byte placeholder (two unnamed 64-bit bit-fields), 8-byte
+ * aligned. kernel/bpf/helpers.c asserts at build time that a struct
+ * list_head fits in it.
+ */
+struct bpf_list_head {
+       __u64 :64;
+       __u64 :64;
+} __attribute__((aligned(8)));
+
+/* Opaque 16-byte placeholder (two unnamed 64-bit bit-fields), 8-byte
+ * aligned. The member named in a list head's "contains:" tag must be
+ * of this struct type.
+ */
+struct bpf_list_node {
+       __u64 :64;
+       __u64 :64;
+} __attribute__((aligned(8)));
+
  struct bpf_sysctl {
         __u32   write;          /* Sysctl is being read (= 0) or written (= 1).
                                  * Allows 1,2,4-byte read, but no write.
author	Kumar Kartikeya Dwivedi <memxor@gmail.com>
	Mon, 14 Nov 2022 19:15:25 +0000 (00:45 +0530)
committer	Alexei Starovoitov <ast@kernel.org>
	Tue, 15 Nov 2022 05:52:45 +0000 (21:52 -0800)
include/linux/bpf.h		patch \| blob \| blame \| history
include/uapi/linux/bpf.h		patch \| blob \| blame \| history
kernel/bpf/btf.c		patch \| blob \| blame \| history
kernel/bpf/helpers.c		patch \| blob \| blame \| history
kernel/bpf/syscall.c		patch \| blob \| blame \| history
kernel/bpf/verifier.c		patch \| blob \| blame \| history
tools/include/uapi/linux/bpf.h		patch \| blob \| blame \| history