// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
 */
#include <linux/bpf.h>
#include <linux/bpf-cgroup.h>
#include <linux/bpf_trace.h>
#include <linux/bpf_lirc.h>
#include <linux/bpf_verifier.h>
#include <linux/bsearch.h>
#include <linux/btf.h>
#include <linux/syscalls.h>
#include <linux/slab.h>
#include <linux/sched/signal.h>
#include <linux/vmalloc.h>
#include <linux/mmzone.h>
#include <linux/anon_inodes.h>
#include <linux/fdtable.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/license.h>
#include <linux/filter.h>
#include <linux/kernel.h>
#include <linux/idr.h>
#include <linux/cred.h>
#include <linux/timekeeping.h>
#include <linux/ctype.h>
#include <linux/nospec.h>
#include <linux/audit.h>
#include <uapi/linux/btf.h>
#include <linux/pgtable.h>
#include <linux/bpf_lsm.h>
#include <linux/poll.h>
#include <linux/sort.h>
#include <linux/bpf-netns.h>
#include <linux/rcupdate_trace.h>
#include <linux/memcontrol.h>
#include <linux/trace_events.h>
#include <net/netfilter/nf_bpf_link.h>

#include <net/tcx.h>

#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
			  (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \
			  (map)->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS)
#define IS_FD_PROG_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY)
#define IS_FD_HASH(map) ((map)->map_type == BPF_MAP_TYPE_HASH_OF_MAPS)
#define IS_FD_MAP(map) (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map) || \
			IS_FD_HASH(map))

#define BPF_OBJ_FLAG_MASK   (BPF_F_RDONLY | BPF_F_WRONLY)

DEFINE_PER_CPU(int, bpf_prog_active);
static DEFINE_IDR(prog_idr);
static DEFINE_SPINLOCK(prog_idr_lock);
static DEFINE_IDR(map_idr);
static DEFINE_SPINLOCK(map_idr_lock);
static DEFINE_IDR(link_idr);
static DEFINE_SPINLOCK(link_idr_lock);

int sysctl_unprivileged_bpf_disabled __read_mostly =
	IS_BUILTIN(CONFIG_BPF_UNPRIV_DEFAULT_OFF) ? 2 : 0;

static const struct bpf_map_ops * const bpf_map_types[] = {
#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type)
#define BPF_MAP_TYPE(_id, _ops) \
	[_id] = &_ops,
#define BPF_LINK_TYPE(_id, _name)
#include <linux/bpf_types.h>
#undef BPF_PROG_TYPE
#undef BPF_MAP_TYPE
#undef BPF_LINK_TYPE
};

/*
 * If we're handed a bigger struct than we know of, ensure all the unknown bits
 * are 0 - i.e. new user-space does not rely on any kernel feature extensions
 * we don't know about yet.
 *
 * There is a ToCToU between this function call and the following
 * copy_from_user() call. However, this is not a concern since this function is
 * meant to be a future-proofing of bits.
 */
int bpf_check_uarg_tail_zero(bpfptr_t uaddr,
			     size_t expected_size,
			     size_t actual_size)
{
	int res;

	if (unlikely(actual_size > PAGE_SIZE))	/* silly large */
		return -E2BIG;

	if (actual_size <= expected_size)
		return 0;

	if (uaddr.is_kernel)
		res = memchr_inv(uaddr.kernel + expected_size, 0,
				 actual_size - expected_size) == NULL;
	else
		res = check_zeroed_user(uaddr.user + expected_size,
					actual_size - expected_size);
	if (res < 0)
		return res;
	return res ? 0 : -E2BIG;
}

const struct bpf_map_ops bpf_map_offload_ops = {
	.map_meta_equal = bpf_map_meta_equal,
	.map_alloc = bpf_map_offload_map_alloc,
	.map_free = bpf_map_offload_map_free,
	.map_check_btf = map_check_no_btf,
	.map_mem_usage = bpf_map_offload_map_mem_usage,
};

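/* Track outstanding syscall-side writers of a map: element update and
 * delete commands as well as writable mmap()s bump this count, so e.g.
 * map freezing can tell whether a write may still be in flight.
 */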
static void bpf_map_write_active_inc(struct bpf_map *map)
{
	atomic64_inc(&map->writecnt);
}

static void bpf_map_write_active_dec(struct bpf_map *map)
{
	atomic64_dec(&map->writecnt);
}

bool bpf_map_write_active(const struct bpf_map *map)
{
	return atomic64_read(&map->writecnt) != 0;
}

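/* Size of the value buffer exchanged with user space: per-CPU maps
 * round the value up to 8 bytes and replicate it for each possible
 * CPU, while fd-based maps (prog/perf/cgroup arrays and map-in-map
 * types) exchange a u32 fd instead of the value itself.
 */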
static u32 bpf_map_value_size(const struct bpf_map *map)
{
	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
	    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
	    map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY ||
	    map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
		return round_up(map->value_size, 8) * num_possible_cpus();
	else if (IS_FD_MAP(map))
		return sizeof(u32);
	else
		return map->value_size;
}

static void maybe_wait_bpf_programs(struct bpf_map *map)
{
	/* Wait for any running BPF programs to complete so that
	 * userspace, when we return to it, knows that all programs
	 * that could be running use the new map value.
	 */
	if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS ||
	    map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS)
		synchronize_rcu();
}

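/* Dispatch a syscall-side update to the helper matching the map type.
 * Per-CPU and fd-based maps need type-specific copy/translation steps;
 * everything else goes through ops->map_update_elem() under RCU, with
 * instrumentation disabled so tracing BPF programs cannot run in the
 * middle of the update.
 */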
static int bpf_map_update_value(struct bpf_map *map, struct file *map_file,
				void *key, void *value, __u64 flags)
{
	int err;

	/* Need to create a kthread, thus must support schedule */
	if (bpf_map_is_offloaded(map)) {
		return bpf_map_offload_update_elem(map, key, value, flags);
	} else if (map->map_type == BPF_MAP_TYPE_CPUMAP ||
		   map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
		return map->ops->map_update_elem(map, key, value, flags);
	} else if (map->map_type == BPF_MAP_TYPE_SOCKHASH ||
		   map->map_type == BPF_MAP_TYPE_SOCKMAP) {
		return sock_map_update_elem_sys(map, key, value, flags);
	} else if (IS_FD_PROG_ARRAY(map)) {
		return bpf_fd_array_map_update_elem(map, map_file, key, value,
						    flags);
	}

	bpf_disable_instrumentation();
	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
	    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
		err = bpf_percpu_hash_update(map, key, value, flags);
	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
		err = bpf_percpu_array_update(map, key, value, flags);
	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
		err = bpf_percpu_cgroup_storage_update(map, key, value,
						       flags);
	} else if (IS_FD_ARRAY(map)) {
		rcu_read_lock();
		err = bpf_fd_array_map_update_elem(map, map_file, key, value,
						   flags);
		rcu_read_unlock();
	} else if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) {
		rcu_read_lock();
		err = bpf_fd_htab_map_update_elem(map, map_file, key, value,
						  flags);
		rcu_read_unlock();
	} else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
		/* rcu_read_lock() is not needed */
		err = bpf_fd_reuseport_array_update_elem(map, key, value,
							 flags);
	} else if (map->map_type == BPF_MAP_TYPE_QUEUE ||
		   map->map_type == BPF_MAP_TYPE_STACK ||
		   map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) {
		err = map->ops->map_push_elem(map, value, flags);
	} else {
		rcu_read_lock();
		err = map->ops->map_update_elem(map, key, value, flags);
		rcu_read_unlock();
	}
	bpf_enable_instrumentation();
	maybe_wait_bpf_programs(map);

	return err;
}

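/* Syscall-side counterpart of bpf_map_update_value(): copy an
 * element's value into a kernel buffer, using the type-specific copy
 * helpers for per-CPU, fd-based and queue/stack maps, or a plain
 * lookup plus copy_map_value() for everything else.
 */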
static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value,
			      __u64 flags)
{
	void *ptr;
	int err;

	if (bpf_map_is_offloaded(map))
		return bpf_map_offload_lookup_elem(map, key, value);

	bpf_disable_instrumentation();
	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
	    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
		err = bpf_percpu_hash_copy(map, key, value);
	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
		err = bpf_percpu_array_copy(map, key, value);
	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
		err = bpf_percpu_cgroup_storage_copy(map, key, value);
	} else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) {
		err = bpf_stackmap_copy(map, key, value);
	} else if (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map)) {
		err = bpf_fd_array_map_lookup_elem(map, key, value);
	} else if (IS_FD_HASH(map)) {
		err = bpf_fd_htab_map_lookup_elem(map, key, value);
	} else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
		err = bpf_fd_reuseport_array_lookup_elem(map, key, value);
	} else if (map->map_type == BPF_MAP_TYPE_QUEUE ||
		   map->map_type == BPF_MAP_TYPE_STACK ||
		   map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) {
		err = map->ops->map_peek_elem(map, value);
	} else if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
		/* struct_ops map requires directly updating "value" */
		err = bpf_struct_ops_map_sys_lookup_elem(map, key, value);
	} else {
		rcu_read_lock();
		if (map->ops->map_lookup_elem_sys_only)
			ptr = map->ops->map_lookup_elem_sys_only(map, key);
		else
			ptr = map->ops->map_lookup_elem(map, key);
		if (IS_ERR(ptr)) {
			err = PTR_ERR(ptr);
		} else if (!ptr) {
			err = -ENOENT;
		} else {
			err = 0;
			if (flags & BPF_F_LOCK)
				/* lock 'ptr' and copy everything but lock */
				copy_map_value_locked(map, value, ptr, true);
			else
				copy_map_value(map, value, ptr);
			/* mask lock and timer, since value wasn't zero inited */
			check_and_init_map_value(map, value);
		}
		rcu_read_unlock();
	}

	bpf_enable_instrumentation();
	maybe_wait_bpf_programs(map);

	return err;
}

/* Please do not use this function outside of the map creation path
 * (e.g. in the map update path) without taking care of setting the
 * active memory cgroup (see bpf_map_kmalloc_node() for an example).
 */
static void *__bpf_map_area_alloc(u64 size, int numa_node, bool mmapable)
{
	/* We really just want to fail instead of triggering OOM killer
	 * under memory pressure, therefore we set __GFP_NORETRY to kmalloc,
	 * which is used for lower order allocation requests.
	 *
	 * It has been observed that higher order allocation requests done by
	 * vmalloc with __GFP_NORETRY being set might fail due to not trying
	 * to reclaim memory from the page cache, thus we set
	 * __GFP_RETRY_MAYFAIL to avoid such situations.
	 */

	gfp_t gfp = bpf_memcg_flags(__GFP_NOWARN | __GFP_ZERO);
	unsigned int flags = 0;
	unsigned long align = 1;
	void *area;

	if (size >= SIZE_MAX)
		return NULL;

	/* kmalloc()'ed memory can't be mmap()'ed */
	if (mmapable) {
		BUG_ON(!PAGE_ALIGNED(size));
		align = SHMLBA;
		flags = VM_USERMAP;
	} else if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
		area = kmalloc_node(size, gfp | GFP_USER | __GFP_NORETRY,
				    numa_node);
		if (area != NULL)
			return area;
	}

	return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
			gfp | GFP_KERNEL | __GFP_RETRY_MAYFAIL, PAGE_KERNEL,
			flags, numa_node, __builtin_return_address(0));
}

void *bpf_map_area_alloc(u64 size, int numa_node)
{
	return __bpf_map_area_alloc(size, numa_node, false);
}

void *bpf_map_area_mmapable_alloc(u64 size, int numa_node)
{
	return __bpf_map_area_alloc(size, numa_node, true);
}

void bpf_map_area_free(void *area)
{
	kvfree(area);
}

static u32 bpf_map_flags_retain_permanent(u32 flags)
{
	/* Some map creation flags are not tied to the map object but
	 * rather to the map fd instead, so they have no meaning upon
	 * map object inspection since multiple file descriptors with
	 * different (access) properties can exist here. Thus, given
	 * this has zero meaning for the map itself, let's clear these
	 * from here.
	 */
	return flags & ~(BPF_F_RDONLY | BPF_F_WRONLY);
}

void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr)
{
	map->map_type = attr->map_type;
	map->key_size = attr->key_size;
	map->value_size = attr->value_size;
	map->max_entries = attr->max_entries;
	map->map_flags = bpf_map_flags_retain_permanent(attr->map_flags);
	map->numa_node = bpf_map_attr_numa_node(attr);
	map->map_extra = attr->map_extra;
}

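/* Allocate the user-visible map ID. IDs are handed out cyclically so
 * they are not reused immediately, and they are how user space
 * enumerates maps (e.g. via BPF_MAP_GET_FD_BY_ID).
 */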
static int bpf_map_alloc_id(struct bpf_map *map)
{
	int id;

	idr_preload(GFP_KERNEL);
	spin_lock_bh(&map_idr_lock);
	id = idr_alloc_cyclic(&map_idr, map, 1, INT_MAX, GFP_ATOMIC);
	if (id > 0)
		map->id = id;
	spin_unlock_bh(&map_idr_lock);
	idr_preload_end();

	if (WARN_ON_ONCE(!id))
		return -ENOSPC;

	return id > 0 ? 0 : id;
}

void bpf_map_free_id(struct bpf_map *map)
{
	unsigned long flags;

	/* Offloaded maps are removed from the IDR store when their device
	 * disappears - even if someone holds an fd to them they are unusable,
	 * the memory is gone, all ops will fail; they are simply waiting for
	 * refcnt to drop to be freed.
	 */
	if (!map->id)
		return;

	spin_lock_irqsave(&map_idr_lock, flags);

	idr_remove(&map_idr, map->id);
	map->id = 0;

	spin_unlock_irqrestore(&map_idr_lock, flags);
}

#ifdef CONFIG_MEMCG_KMEM
static void bpf_map_save_memcg(struct bpf_map *map)
{
	/* Currently if a map is created by a process belonging to the root
	 * memory cgroup, get_obj_cgroup_from_current() will return NULL.
	 * So we have to check map->objcg for being NULL each time it's
	 * being used.
	 */
	if (memcg_bpf_enabled())
		map->objcg = get_obj_cgroup_from_current();
}

static void bpf_map_release_memcg(struct bpf_map *map)
{
	if (map->objcg)
		obj_cgroup_put(map->objcg);
}

static struct mem_cgroup *bpf_map_get_memcg(const struct bpf_map *map)
{
	if (map->objcg)
		return get_mem_cgroup_from_objcg(map->objcg);

	return root_mem_cgroup;
}

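/* Allocation helpers that charge map memory to the memory cgroup the
 * map was created in: they temporarily switch the active memcg around
 * the allocation and pass __GFP_ACCOUNT.
 */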
void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags,
			   int node)
{
	struct mem_cgroup *memcg, *old_memcg;
	void *ptr;

	memcg = bpf_map_get_memcg(map);
	old_memcg = set_active_memcg(memcg);
	ptr = kmalloc_node(size, flags | __GFP_ACCOUNT, node);
	set_active_memcg(old_memcg);
	mem_cgroup_put(memcg);

	return ptr;
}

void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags)
{
	struct mem_cgroup *memcg, *old_memcg;
	void *ptr;

	memcg = bpf_map_get_memcg(map);
	old_memcg = set_active_memcg(memcg);
	ptr = kzalloc(size, flags | __GFP_ACCOUNT);
	set_active_memcg(old_memcg);
	mem_cgroup_put(memcg);

	return ptr;
}

void *bpf_map_kvcalloc(struct bpf_map *map, size_t n, size_t size,
		       gfp_t flags)
{
	struct mem_cgroup *memcg, *old_memcg;
	void *ptr;

	memcg = bpf_map_get_memcg(map);
	old_memcg = set_active_memcg(memcg);
	ptr = kvcalloc(n, size, flags | __GFP_ACCOUNT);
	set_active_memcg(old_memcg);
	mem_cgroup_put(memcg);

	return ptr;
}

void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size,
				    size_t align, gfp_t flags)
{
	struct mem_cgroup *memcg, *old_memcg;
	void __percpu *ptr;

	memcg = bpf_map_get_memcg(map);
	old_memcg = set_active_memcg(memcg);
	ptr = __alloc_percpu_gfp(size, align, flags | __GFP_ACCOUNT);
	set_active_memcg(old_memcg);
	mem_cgroup_put(memcg);

	return ptr;
}

#else
static void bpf_map_save_memcg(struct bpf_map *map)
{
}

static void bpf_map_release_memcg(struct bpf_map *map)
{
}
#endif

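/* rec->fields is kept sorted by offset, which is what allows
 * btf_record_find() below to bsearch() for the field at a given
 * offset.
 */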
static int btf_field_cmp(const void *a, const void *b)
{
	const struct btf_field *f1 = a, *f2 = b;

	if (f1->offset < f2->offset)
		return -1;
	else if (f1->offset > f2->offset)
		return 1;
	return 0;
}

struct btf_field *btf_record_find(const struct btf_record *rec, u32 offset,
				  u32 field_mask)
{
	struct btf_field *field;

	if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & field_mask))
		return NULL;
	field = bsearch(&offset, rec->fields, rec->cnt, sizeof(rec->fields[0]), btf_field_cmp);
	if (!field || !(field->type & field_mask))
		return NULL;
	return field;
}

void btf_record_free(struct btf_record *rec)
{
	int i;

	if (IS_ERR_OR_NULL(rec))
		return;
	for (i = 0; i < rec->cnt; i++) {
		switch (rec->fields[i].type) {
		case BPF_KPTR_UNREF:
		case BPF_KPTR_REF:
			if (rec->fields[i].kptr.module)
				module_put(rec->fields[i].kptr.module);
			btf_put(rec->fields[i].kptr.btf);
			break;
		case BPF_LIST_HEAD:
		case BPF_LIST_NODE:
		case BPF_RB_ROOT:
		case BPF_RB_NODE:
		case BPF_SPIN_LOCK:
		case BPF_TIMER:
		case BPF_REFCOUNT:
			/* Nothing to release */
			break;
		default:
			WARN_ON_ONCE(1);
			continue;
		}
	}
	kfree(rec);
}

void bpf_map_free_record(struct bpf_map *map)
{
	btf_record_free(map->record);
	map->record = NULL;
}

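/* Duplicate a btf_record, taking a reference on each kptr field's BTF
 * and module so that the copy owns its own references. Returns an
 * ERR_PTR() on failure.
 */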
struct btf_record *btf_record_dup(const struct btf_record *rec)
{
	const struct btf_field *fields;
	struct btf_record *new_rec;
	int ret, size, i;

	if (IS_ERR_OR_NULL(rec))
		return NULL;
	size = offsetof(struct btf_record, fields[rec->cnt]);
	new_rec = kmemdup(rec, size, GFP_KERNEL | __GFP_NOWARN);
	if (!new_rec)
		return ERR_PTR(-ENOMEM);
	/* Do a deep copy of the btf_record */
	fields = rec->fields;
	new_rec->cnt = 0;
	for (i = 0; i < rec->cnt; i++) {
		switch (fields[i].type) {
		case BPF_KPTR_UNREF:
		case BPF_KPTR_REF:
			btf_get(fields[i].kptr.btf);
			if (fields[i].kptr.module && !try_module_get(fields[i].kptr.module)) {
				ret = -ENXIO;
				goto free;
			}
			break;
		case BPF_LIST_HEAD:
		case BPF_LIST_NODE:
		case BPF_RB_ROOT:
		case BPF_RB_NODE:
		case BPF_SPIN_LOCK:
		case BPF_TIMER:
		case BPF_REFCOUNT:
			/* Nothing to acquire */
			break;
		default:
			ret = -EFAULT;
			WARN_ON_ONCE(1);
			goto free;
		}
		new_rec->cnt++;
	}
	return new_rec;
free:
	btf_record_free(new_rec);
	return ERR_PTR(ret);
}

bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *rec_b)
{
	bool a_has_fields = !IS_ERR_OR_NULL(rec_a), b_has_fields = !IS_ERR_OR_NULL(rec_b);
	int size;

	if (!a_has_fields && !b_has_fields)
		return true;
	if (a_has_fields != b_has_fields)
		return false;
	if (rec_a->cnt != rec_b->cnt)
		return false;
	size = offsetof(struct btf_record, fields[rec_a->cnt]);
	/* btf_parse_fields uses kzalloc to allocate a btf_record, so unused
	 * members are zeroed out. So memcmp is safe to do without worrying
	 * about padding/unused fields.
	 *
	 * While spin_lock, timer, and kptr have no relation to map BTF,
	 * list_head metadata is specific to map BTF, the btf and value_rec
	 * members in particular. btf is the map BTF, while value_rec points to
	 * btf_record in that map BTF.
	 *
	 * So while by default, we don't rely on the map BTF (which the records
	 * were parsed from) matching for both records, which is not backwards
	 * compatible, in case list_head is part of it, we implicitly rely on
	 * that by way of depending on memcmp succeeding for it.
	 */
	return !memcmp(rec_a, rec_b, size);
}

void bpf_obj_free_timer(const struct btf_record *rec, void *obj)
{
	if (WARN_ON_ONCE(!btf_record_has_field(rec, BPF_TIMER)))
		return;
	bpf_timer_cancel_and_free(obj + rec->timer_off);
}

extern void __bpf_obj_drop_impl(void *p, const struct btf_record *rec);

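/* Release every special field embedded in an object: cancel timers,
 * drop referenced kptrs via their destructor or __bpf_obj_drop_impl(),
 * and drain list/rbtree collections using the record's spin lock.
 */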
void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
{
	const struct btf_field *fields;
	int i;

	if (IS_ERR_OR_NULL(rec))
		return;
	fields = rec->fields;
	for (i = 0; i < rec->cnt; i++) {
		struct btf_struct_meta *pointee_struct_meta;
		const struct btf_field *field = &fields[i];
		void *field_ptr = obj + field->offset;
		void *xchgd_field;

		switch (fields[i].type) {
		case BPF_SPIN_LOCK:
			break;
		case BPF_TIMER:
			bpf_timer_cancel_and_free(field_ptr);
			break;
		case BPF_KPTR_UNREF:
			WRITE_ONCE(*(u64 *)field_ptr, 0);
			break;
		case BPF_KPTR_REF:
			xchgd_field = (void *)xchg((unsigned long *)field_ptr, 0);
			if (!xchgd_field)
				break;

			if (!btf_is_kernel(field->kptr.btf)) {
				pointee_struct_meta = btf_find_struct_meta(field->kptr.btf,
									   field->kptr.btf_id);
				migrate_disable();
				__bpf_obj_drop_impl(xchgd_field, pointee_struct_meta ?
								 pointee_struct_meta->record :
								 NULL);
				migrate_enable();
			} else {
				field->kptr.dtor(xchgd_field);
			}
			break;
		case BPF_LIST_HEAD:
			if (WARN_ON_ONCE(rec->spin_lock_off < 0))
				continue;
			bpf_list_head_free(field, field_ptr, obj + rec->spin_lock_off);
			break;
		case BPF_RB_ROOT:
			if (WARN_ON_ONCE(rec->spin_lock_off < 0))
				continue;
			bpf_rb_root_free(field, field_ptr, obj + rec->spin_lock_off);
			break;
		case BPF_LIST_NODE:
		case BPF_RB_NODE:
		case BPF_REFCOUNT:
			break;
		default:
			WARN_ON_ONCE(1);
			continue;
		}
	}
}

/* called from workqueue */
static void bpf_map_free_deferred(struct work_struct *work)
{
	struct bpf_map *map = container_of(work, struct bpf_map, work);
	struct btf_record *rec = map->record;

	security_bpf_map_free(map);
	bpf_map_release_memcg(map);
	/* implementation dependent freeing */
	map->ops->map_free(map);
	/* Delay freeing of btf_record for maps, as map_free
	 * callback usually needs access to them. It is better to do it here
	 * than require each callback to do the free itself manually.
	 *
	 * Note that the btf_record stashed in map->inner_map_meta->record was
	 * already freed using the map_free callback for map in map case which
	 * eventually calls bpf_map_free_meta, since inner_map_meta is only a
	 * template bpf_map struct used during verification.
	 */
	btf_record_free(rec);
}

static void bpf_map_put_uref(struct bpf_map *map)
{
	if (atomic64_dec_and_test(&map->usercnt)) {
		if (map->ops->map_release_uref)
			map->ops->map_release_uref(map);
	}
}

/* decrement map refcnt and schedule it for freeing via workqueue
 * (underlying map implementation ops->map_free() might sleep)
 */
void bpf_map_put(struct bpf_map *map)
{
	if (atomic64_dec_and_test(&map->refcnt)) {
		/* bpf_map_free_id() must be called first */
		bpf_map_free_id(map);
		btf_put(map->btf);
		INIT_WORK(&map->work, bpf_map_free_deferred);
		/* Avoid spawning kworkers, since they all might contend
		 * for the same mutex like slab_mutex.
		 */
		queue_work(system_unbound_wq, &map->work);
	}
}
EXPORT_SYMBOL_GPL(bpf_map_put);

void bpf_map_put_with_uref(struct bpf_map *map)
{
	bpf_map_put_uref(map);
	bpf_map_put(map);
}

static int bpf_map_release(struct inode *inode, struct file *filp)
{
	struct bpf_map *map = filp->private_data;

	if (map->ops->map_release)
		map->ops->map_release(map, filp);

	bpf_map_put_with_uref(map);
	return 0;
}

static fmode_t map_get_sys_perms(struct bpf_map *map, struct fd f)
{
	fmode_t mode = f.file->f_mode;

	/* Our file permissions may have been overridden by global
	 * map permissions facing syscall side.
	 */
	if (READ_ONCE(map->frozen))
		mode &= ~FMODE_CAN_WRITE;
	return mode;
}

#ifdef CONFIG_PROC_FS
/* Show the memory usage of a bpf map */
static u64 bpf_map_memory_usage(const struct bpf_map *map)
{
	return map->ops->map_mem_usage(map);
}

static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
{
	struct bpf_map *map = filp->private_data;
	u32 type = 0, jited = 0;

	if (map_type_contains_progs(map)) {
		spin_lock(&map->owner.lock);
		type  = map->owner.type;
		jited = map->owner.jited;
		spin_unlock(&map->owner.lock);
	}

	seq_printf(m,
		   "map_type:\t%u\n"
		   "key_size:\t%u\n"
		   "value_size:\t%u\n"
		   "max_entries:\t%u\n"
		   "map_flags:\t%#x\n"
		   "map_extra:\t%#llx\n"
		   "memlock:\t%llu\n"
		   "map_id:\t%u\n"
		   "frozen:\t%u\n",
		   map->map_type,
		   map->key_size,
		   map->value_size,
		   map->max_entries,
		   map->map_flags,
		   (unsigned long long)map->map_extra,
		   bpf_map_memory_usage(map),
		   map->id,
		   READ_ONCE(map->frozen));
	if (type) {
		seq_printf(m, "owner_prog_type:\t%u\n", type);
		seq_printf(m, "owner_jited:\t%u\n", jited);
	}
}
#endif

static ssize_t bpf_dummy_read(struct file *filp, char __user *buf, size_t siz,
			      loff_t *ppos)
{
	/* We need this handler such that alloc_file() enables
	 * f_mode with FMODE_CAN_READ.
	 */
	return -EINVAL;
}

static ssize_t bpf_dummy_write(struct file *filp, const char __user *buf,
			       size_t siz, loff_t *ppos)
{
	/* We need this handler such that alloc_file() enables
	 * f_mode with FMODE_CAN_WRITE.
	 */
	return -EINVAL;
}

/* called for any extra memory-mapped regions (except the initial one) */
static void bpf_map_mmap_open(struct vm_area_struct *vma)
{
	struct bpf_map *map = vma->vm_file->private_data;

	if (vma->vm_flags & VM_MAYWRITE)
		bpf_map_write_active_inc(map);
}

/* called for all unmapped memory regions (including the initial one) */
static void bpf_map_mmap_close(struct vm_area_struct *vma)
{
	struct bpf_map *map = vma->vm_file->private_data;

	if (vma->vm_flags & VM_MAYWRITE)
		bpf_map_write_active_dec(map);
}

static const struct vm_operations_struct bpf_map_default_vmops = {
	.open		= bpf_map_mmap_open,
	.close		= bpf_map_mmap_close,
};

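/* mmap() of a map: only shared mappings are allowed, writable mappings
 * are refused for frozen and BPF_F_RDONLY_PROG maps, and successful
 * writable mappings are accounted via the map's write-active count.
 */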
static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma)
{
	struct bpf_map *map = filp->private_data;
	int err;

	if (!map->ops->map_mmap || !IS_ERR_OR_NULL(map->record))
		return -ENOTSUPP;

	if (!(vma->vm_flags & VM_SHARED))
		return -EINVAL;

	mutex_lock(&map->freeze_mutex);

	if (vma->vm_flags & VM_WRITE) {
		if (map->frozen) {
			err = -EPERM;
			goto out;
		}
		/* map is meant to be read-only, so do not allow mapping as
		 * writable, because it's possible to leak a writable page
		 * reference and allow user-space to still modify it after
		 * freezing, while the verifier assumes contents do not change
		 */
		if (map->map_flags & BPF_F_RDONLY_PROG) {
			err = -EACCES;
			goto out;
		}
	}

	/* set default open/close callbacks */
	vma->vm_ops = &bpf_map_default_vmops;
	vma->vm_private_data = map;
	vm_flags_clear(vma, VM_MAYEXEC);
	if (!(vma->vm_flags & VM_WRITE))
		/* disallow re-mapping with PROT_WRITE */
		vm_flags_clear(vma, VM_MAYWRITE);

	err = map->ops->map_mmap(map, vma);
	if (err)
		goto out;

	if (vma->vm_flags & VM_MAYWRITE)
		bpf_map_write_active_inc(map);
out:
	mutex_unlock(&map->freeze_mutex);
	return err;
}

static __poll_t bpf_map_poll(struct file *filp, struct poll_table_struct *pts)
{
	struct bpf_map *map = filp->private_data;

	if (map->ops->map_poll)
		return map->ops->map_poll(map, filp, pts);

	return EPOLLERR;
}

const struct file_operations bpf_map_fops = {
#ifdef CONFIG_PROC_FS
	.show_fdinfo	= bpf_map_show_fdinfo,
#endif
	.release	= bpf_map_release,
	.read		= bpf_dummy_read,
	.write		= bpf_dummy_write,
	.mmap		= bpf_map_mmap,
	.poll		= bpf_map_poll,
};

int bpf_map_new_fd(struct bpf_map *map, int flags)
{
	int ret;

	ret = security_bpf_map(map, OPEN_FMODE(flags));
	if (ret < 0)
		return ret;

	return anon_inode_getfd("bpf-map", &bpf_map_fops, map,
				flags | O_CLOEXEC);
}

int bpf_get_file_flag(int flags)
{
	if ((flags & BPF_F_RDONLY) && (flags & BPF_F_WRONLY))
		return -EINVAL;
	if (flags & BPF_F_RDONLY)
		return O_RDONLY;
	if (flags & BPF_F_WRONLY)
		return O_WRONLY;
	return O_RDWR;
}

/* helper macro to check that unused fields of 'union bpf_attr' are zero */
#define CHECK_ATTR(CMD) \
	memchr_inv((void *) &attr->CMD##_LAST_FIELD + \
		   sizeof(attr->CMD##_LAST_FIELD), 0, \
		   sizeof(*attr) - \
		   offsetof(union bpf_attr, CMD##_LAST_FIELD) - \
		   sizeof(attr->CMD##_LAST_FIELD)) != NULL

/* dst and src must have at least "size" number of bytes.
 * Return strlen on success and < 0 on error.
 */
int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size)
{
	const char *end = src + size;
	const char *orig_src = src;

	memset(dst, 0, size);
	/* Copy all isalnum(), '_' and '.' chars. */
	while (src < end && *src) {
		if (!isalnum(*src) &&
		    *src != '_' && *src != '.')
			return -EINVAL;
		*dst++ = *src++;
	}

	/* No '\0' found in "size" number of bytes */
	if (src == end)
		return -EINVAL;

	return src - orig_src;
}

int map_check_no_btf(const struct bpf_map *map,
		     const struct btf *btf,
		     const struct btf_type *key_type,
		     const struct btf_type *value_type)
{
	return -ENOTSUPP;
}

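/* Validate the BTF key/value types the map was created with, parse the
 * special fields (spin lock, timer, kptrs, list/rbtree roots, refcount)
 * out of the value type, and verify that each field type is supported
 * by this map type.
 */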
static int map_check_btf(struct bpf_map *map, const struct btf *btf,
			 u32 btf_key_id, u32 btf_value_id)
{
	const struct btf_type *key_type, *value_type;
	u32 key_size, value_size;
	int ret = 0;

	/* Some maps allow key to be unspecified. */
	if (btf_key_id) {
		key_type = btf_type_id_size(btf, &btf_key_id, &key_size);
		if (!key_type || key_size != map->key_size)
			return -EINVAL;
	} else {
		key_type = btf_type_by_id(btf, 0);
		if (!map->ops->map_check_btf)
			return -EINVAL;
	}

	value_type = btf_type_id_size(btf, &btf_value_id, &value_size);
	if (!value_type || value_size != map->value_size)
		return -EINVAL;

	map->record = btf_parse_fields(btf, value_type,
				       BPF_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD |
				       BPF_RB_ROOT | BPF_REFCOUNT,
				       map->value_size);
	if (!IS_ERR_OR_NULL(map->record)) {
		int i;

		if (!bpf_capable()) {
			ret = -EPERM;
			goto free_map_tab;
		}
		if (map->map_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG)) {
			ret = -EACCES;
			goto free_map_tab;
		}
		for (i = 0; i < sizeof(map->record->field_mask) * 8; i++) {
			switch (map->record->field_mask & (1 << i)) {
			case 0:
				continue;
			case BPF_SPIN_LOCK:
				if (map->map_type != BPF_MAP_TYPE_HASH &&
				    map->map_type != BPF_MAP_TYPE_ARRAY &&
				    map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE &&
				    map->map_type != BPF_MAP_TYPE_SK_STORAGE &&
				    map->map_type != BPF_MAP_TYPE_INODE_STORAGE &&
				    map->map_type != BPF_MAP_TYPE_TASK_STORAGE &&
				    map->map_type != BPF_MAP_TYPE_CGRP_STORAGE) {
					ret = -EOPNOTSUPP;
					goto free_map_tab;
				}
				break;
			case BPF_TIMER:
				if (map->map_type != BPF_MAP_TYPE_HASH &&
				    map->map_type != BPF_MAP_TYPE_LRU_HASH &&
				    map->map_type != BPF_MAP_TYPE_ARRAY) {
					ret = -EOPNOTSUPP;
					goto free_map_tab;
				}
				break;
			case BPF_KPTR_UNREF:
			case BPF_KPTR_REF:
			case BPF_REFCOUNT:
				if (map->map_type != BPF_MAP_TYPE_HASH &&
				    map->map_type != BPF_MAP_TYPE_PERCPU_HASH &&
				    map->map_type != BPF_MAP_TYPE_LRU_HASH &&
				    map->map_type != BPF_MAP_TYPE_LRU_PERCPU_HASH &&
				    map->map_type != BPF_MAP_TYPE_ARRAY &&
				    map->map_type != BPF_MAP_TYPE_PERCPU_ARRAY &&
				    map->map_type != BPF_MAP_TYPE_SK_STORAGE &&
				    map->map_type != BPF_MAP_TYPE_INODE_STORAGE &&
				    map->map_type != BPF_MAP_TYPE_TASK_STORAGE &&
				    map->map_type != BPF_MAP_TYPE_CGRP_STORAGE) {
					ret = -EOPNOTSUPP;
					goto free_map_tab;
				}
				break;
			case BPF_LIST_HEAD:
			case BPF_RB_ROOT:
				if (map->map_type != BPF_MAP_TYPE_HASH &&
				    map->map_type != BPF_MAP_TYPE_LRU_HASH &&
				    map->map_type != BPF_MAP_TYPE_ARRAY) {
					ret = -EOPNOTSUPP;
					goto free_map_tab;
				}
				break;
			default:
				/* Fail if map_type checks are missing for a field type */
				ret = -EOPNOTSUPP;
				goto free_map_tab;
			}
		}
	}

	ret = btf_check_and_fixup_fields(btf, map->record);
	if (ret < 0)
		goto free_map_tab;

	if (map->ops->map_check_btf) {
		ret = map->ops->map_check_btf(map, btf, key_type, value_type);
		if (ret < 0)
			goto free_map_tab;
	}

	return ret;
free_map_tab:
	bpf_map_free_record(map);
	return ret;
}

#define BPF_MAP_CREATE_LAST_FIELD map_extra
/* called via syscall */
static int map_create(union bpf_attr *attr)
{
	const struct bpf_map_ops *ops;
	int numa_node = bpf_map_attr_numa_node(attr);
	u32 map_type = attr->map_type;
	struct bpf_map *map;
	int f_flags;
	int err;

	err = CHECK_ATTR(BPF_MAP_CREATE);
	if (err)
		return -EINVAL;

	if (attr->btf_vmlinux_value_type_id) {
		if (attr->map_type != BPF_MAP_TYPE_STRUCT_OPS ||
		    attr->btf_key_type_id || attr->btf_value_type_id)
			return -EINVAL;
	} else if (attr->btf_key_type_id && !attr->btf_value_type_id) {
		return -EINVAL;
	}

	if (attr->map_type != BPF_MAP_TYPE_BLOOM_FILTER &&
	    attr->map_extra != 0)
		return -EINVAL;

	f_flags = bpf_get_file_flag(attr->map_flags);
	if (f_flags < 0)
		return f_flags;

	if (numa_node != NUMA_NO_NODE &&
	    ((unsigned int)numa_node >= nr_node_ids ||
	     !node_online(numa_node)))
		return -EINVAL;

	/* find map type and init map: hashtable vs rbtree vs bloom vs ... */
	map_type = attr->map_type;
	if (map_type >= ARRAY_SIZE(bpf_map_types))
		return -EINVAL;
	map_type = array_index_nospec(map_type, ARRAY_SIZE(bpf_map_types));
	ops = bpf_map_types[map_type];
	if (!ops)
		return -EINVAL;

	if (ops->map_alloc_check) {
		err = ops->map_alloc_check(attr);
		if (err)
			return err;
	}
	if (attr->map_ifindex)
		ops = &bpf_map_offload_ops;
	if (!ops->map_mem_usage)
		return -EINVAL;

	/* Intent here is for unprivileged_bpf_disabled to block BPF map
	 * creation for unprivileged users; other actions depend
	 * on fd availability and access to bpffs, so are dependent on
	 * object creation success. Even with unprivileged BPF disabled,
	 * capability checks are still carried out.
	 */
	if (sysctl_unprivileged_bpf_disabled && !bpf_capable())
		return -EPERM;

	/* check privileged map type permissions */
	switch (map_type) {
	case BPF_MAP_TYPE_ARRAY:
	case BPF_MAP_TYPE_PERCPU_ARRAY:
	case BPF_MAP_TYPE_PROG_ARRAY:
	case BPF_MAP_TYPE_PERF_EVENT_ARRAY:
	case BPF_MAP_TYPE_CGROUP_ARRAY:
	case BPF_MAP_TYPE_ARRAY_OF_MAPS:
	case BPF_MAP_TYPE_HASH:
	case BPF_MAP_TYPE_PERCPU_HASH:
	case BPF_MAP_TYPE_HASH_OF_MAPS:
	case BPF_MAP_TYPE_RINGBUF:
	case BPF_MAP_TYPE_USER_RINGBUF:
	case BPF_MAP_TYPE_CGROUP_STORAGE:
	case BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE:
		/* unprivileged */
		break;
	case BPF_MAP_TYPE_SK_STORAGE:
	case BPF_MAP_TYPE_INODE_STORAGE:
	case BPF_MAP_TYPE_TASK_STORAGE:
	case BPF_MAP_TYPE_CGRP_STORAGE:
	case BPF_MAP_TYPE_BLOOM_FILTER:
	case BPF_MAP_TYPE_LPM_TRIE:
	case BPF_MAP_TYPE_REUSEPORT_SOCKARRAY:
	case BPF_MAP_TYPE_STACK_TRACE:
	case BPF_MAP_TYPE_QUEUE:
	case BPF_MAP_TYPE_STACK:
	case BPF_MAP_TYPE_LRU_HASH:
	case BPF_MAP_TYPE_LRU_PERCPU_HASH:
	case BPF_MAP_TYPE_STRUCT_OPS:
	case BPF_MAP_TYPE_CPUMAP:
		if (!bpf_capable())
			return -EPERM;
		break;
	case BPF_MAP_TYPE_SOCKMAP:
	case BPF_MAP_TYPE_SOCKHASH:
	case BPF_MAP_TYPE_DEVMAP:
	case BPF_MAP_TYPE_DEVMAP_HASH:
	case BPF_MAP_TYPE_XSKMAP:
		if (!capable(CAP_NET_ADMIN))
			return -EPERM;
		break;
	default:
		WARN(1, "unsupported map type %d", map_type);
		return -EPERM;
	}

	map = ops->map_alloc(attr);
	if (IS_ERR(map))
		return PTR_ERR(map);
	map->ops = ops;
	map->map_type = map_type;

	err = bpf_obj_name_cpy(map->name, attr->map_name,
			       sizeof(attr->map_name));
	if (err < 0)
		goto free_map;

	atomic64_set(&map->refcnt, 1);
	atomic64_set(&map->usercnt, 1);
	mutex_init(&map->freeze_mutex);
	spin_lock_init(&map->owner.lock);

	if (attr->btf_key_type_id || attr->btf_value_type_id ||
	    /* Even when the map's value is a kernel struct, the
	     * bpf_prog.o must have BTF to begin with to figure out
	     * the corresponding kernel counterpart. Thus,
	     * attr->btf_fd has to be valid also.
	     */
	    attr->btf_vmlinux_value_type_id) {
		struct btf *btf;

		btf = btf_get_by_fd(attr->btf_fd);
		if (IS_ERR(btf)) {
			err = PTR_ERR(btf);
			goto free_map;
		}
		if (btf_is_kernel(btf)) {
			btf_put(btf);
			err = -EACCES;
			goto free_map;
		}
		map->btf = btf;

		if (attr->btf_value_type_id) {
			err = map_check_btf(map, btf, attr->btf_key_type_id,
					    attr->btf_value_type_id);
			if (err)
				goto free_map;
		}

		map->btf_key_type_id = attr->btf_key_type_id;
		map->btf_value_type_id = attr->btf_value_type_id;
		map->btf_vmlinux_value_type_id =
			attr->btf_vmlinux_value_type_id;
	}

	err = security_bpf_map_alloc(map);
	if (err)
		goto free_map;

	err = bpf_map_alloc_id(map);
	if (err)
		goto free_map_sec;

	bpf_map_save_memcg(map);

	err = bpf_map_new_fd(map, f_flags);
	if (err < 0) {
		/* failed to allocate fd.
		 * bpf_map_put_with_uref() is needed because the above
		 * bpf_map_alloc_id() has published the map
		 * to the userspace and the userspace may
		 * have refcnt-ed it through BPF_MAP_GET_FD_BY_ID.
		 */
		bpf_map_put_with_uref(map);
		return err;
	}

	return err;

free_map_sec:
	security_bpf_map_free(map);
free_map:
	btf_put(map->btf);
	map->ops->map_free(map);
	return err;
}

/* if error is returned, fd is released.
 * On success caller should complete fd access with matching fdput()
 */
struct bpf_map *__bpf_map_get(struct fd f)
{
	if (!f.file)
		return ERR_PTR(-EBADF);
	if (f.file->f_op != &bpf_map_fops) {
		fdput(f);
		return ERR_PTR(-EINVAL);
	}

	return f.file->private_data;
}

void bpf_map_inc(struct bpf_map *map)
{
	atomic64_inc(&map->refcnt);
}
EXPORT_SYMBOL_GPL(bpf_map_inc);

void bpf_map_inc_with_uref(struct bpf_map *map)
{
	atomic64_inc(&map->refcnt);
	atomic64_inc(&map->usercnt);
}
EXPORT_SYMBOL_GPL(bpf_map_inc_with_uref);

struct bpf_map *bpf_map_get(u32 ufd)
{
	struct fd f = fdget(ufd);
	struct bpf_map *map;

	map = __bpf_map_get(f);
	if (IS_ERR(map))
		return map;

	bpf_map_inc(map);
	fdput(f);

	return map;
}
EXPORT_SYMBOL(bpf_map_get);

struct bpf_map *bpf_map_get_with_uref(u32 ufd)
{
	struct fd f = fdget(ufd);
	struct bpf_map *map;

	map = __bpf_map_get(f);
	if (IS_ERR(map))
		return map;

	bpf_map_inc_with_uref(map);
	fdput(f);

	return map;
}

/* map_idr_lock should have been held or the map should have been
 * protected by rcu read lock.
 */
struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref)
{
	int refold;

	refold = atomic64_fetch_add_unless(&map->refcnt, 1, 0);
	if (!refold)
		return ERR_PTR(-ENOENT);
	if (uref)
		atomic64_inc(&map->usercnt);

	return map;
}

struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map)
{
	spin_lock_bh(&map_idr_lock);
	map = __bpf_map_inc_not_zero(map, false);
	spin_unlock_bh(&map_idr_lock);

	return map;
}
EXPORT_SYMBOL_GPL(bpf_map_inc_not_zero);

int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
{
	return -ENOTSUPP;
}

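/* Copy a map key from user space. Maps with zero key_size (e.g. queue
 * and stack maps) must pass a NULL key pointer instead.
 */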
1380 static void *__bpf_copy_key(void __user *ukey, u64 key_size)
1381 {
1382         if (key_size)
1383                 return vmemdup_user(ukey, key_size);
1384
1385         if (ukey)
1386                 return ERR_PTR(-EINVAL);
1387
1388         return NULL;
1389 }
1390
1391 static void *___bpf_copy_key(bpfptr_t ukey, u64 key_size)
1392 {
1393         if (key_size)
1394                 return kvmemdup_bpfptr(ukey, key_size);
1395
1396         if (!bpfptr_is_null(ukey))
1397                 return ERR_PTR(-EINVAL);
1398
1399         return NULL;
1400 }
1401
1402 /* last field in 'union bpf_attr' used by this command */
1403 #define BPF_MAP_LOOKUP_ELEM_LAST_FIELD flags
1404
1405 static int map_lookup_elem(union bpf_attr *attr)
1406 {
1407         void __user *ukey = u64_to_user_ptr(attr->key);
1408         void __user *uvalue = u64_to_user_ptr(attr->value);
1409         int ufd = attr->map_fd;
1410         struct bpf_map *map;
1411         void *key, *value;
1412         u32 value_size;
1413         struct fd f;
1414         int err;
1415
1416         if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM))
1417                 return -EINVAL;
1418
1419         if (attr->flags & ~BPF_F_LOCK)
1420                 return -EINVAL;
1421
1422         f = fdget(ufd);
1423         map = __bpf_map_get(f);
1424         if (IS_ERR(map))
1425                 return PTR_ERR(map);
1426         if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) {
1427                 err = -EPERM;
1428                 goto err_put;
1429         }
1430
1431         if ((attr->flags & BPF_F_LOCK) &&
1432             !btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
1433                 err = -EINVAL;
1434                 goto err_put;
1435         }
1436
1437         key = __bpf_copy_key(ukey, map->key_size);
1438         if (IS_ERR(key)) {
1439                 err = PTR_ERR(key);
1440                 goto err_put;
1441         }
1442
1443         value_size = bpf_map_value_size(map);
1444
1445         err = -ENOMEM;
1446         value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN);
1447         if (!value)
1448                 goto free_key;
1449
1450         if (map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) {
1451                 if (copy_from_user(value, uvalue, value_size))
1452                         err = -EFAULT;
1453                 else
1454                         err = bpf_map_copy_value(map, key, value, attr->flags);
1455                 goto free_value;
1456         }
1457
1458         err = bpf_map_copy_value(map, key, value, attr->flags);
1459         if (err)
1460                 goto free_value;
1461
1462         err = -EFAULT;
1463         if (copy_to_user(uvalue, value, value_size) != 0)
1464                 goto free_value;
1465
1466         err = 0;
1467
1468 free_value:
1469         kvfree(value);
1470 free_key:
1471         kvfree(key);
1472 err_put:
1473         fdput(f);
1474         return err;
1475 }
1476
1477
1478 #define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags
1479
1480 static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr)
1481 {
1482         bpfptr_t ukey = make_bpfptr(attr->key, uattr.is_kernel);
1483         bpfptr_t uvalue = make_bpfptr(attr->value, uattr.is_kernel);
1484         int ufd = attr->map_fd;
1485         struct bpf_map *map;
1486         void *key, *value;
1487         u32 value_size;
1488         struct fd f;
1489         int err;
1490
1491         if (CHECK_ATTR(BPF_MAP_UPDATE_ELEM))
1492                 return -EINVAL;
1493
1494         f = fdget(ufd);
1495         map = __bpf_map_get(f);
1496         if (IS_ERR(map))
1497                 return PTR_ERR(map);
1498         bpf_map_write_active_inc(map);
1499         if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
1500                 err = -EPERM;
1501                 goto err_put;
1502         }
1503
1504         if ((attr->flags & BPF_F_LOCK) &&
1505             !btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
1506                 err = -EINVAL;
1507                 goto err_put;
1508         }
1509
1510         key = ___bpf_copy_key(ukey, map->key_size);
1511         if (IS_ERR(key)) {
1512                 err = PTR_ERR(key);
1513                 goto err_put;
1514         }
1515
1516         value_size = bpf_map_value_size(map);
1517         value = kvmemdup_bpfptr(uvalue, value_size);
1518         if (IS_ERR(value)) {
1519                 err = PTR_ERR(value);
1520                 goto free_key;
1521         }
1522
1523         err = bpf_map_update_value(map, f.file, key, value, attr->flags);
1524
1525         kvfree(value);
1526 free_key:
1527         kvfree(key);
1528 err_put:
1529         bpf_map_write_active_dec(map);
1530         fdput(f);
1531         return err;
1532 }
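
/*
 * The matching userspace sketch for BPF_MAP_UPDATE_ELEM. The flags value
 * selects the policy that bpf_map_update_value() enforces: BPF_ANY (create
 * or overwrite), BPF_NOEXIST (create only, else -EEXIST), BPF_EXIST
 * (overwrite only, else -ENOENT). Helper name is illustrative.
 *
 *	static int map_update(int map_fd, const void *key, const void *value,
 *			      __u64 flags)
 *	{
 *		union bpf_attr attr;
 *
 *		memset(&attr, 0, sizeof(attr));
 *		attr.map_fd = map_fd;
 *		attr.key = (__u64)(unsigned long)key;
 *		attr.value = (__u64)(unsigned long)value;
 *		attr.flags = flags;	// BPF_ANY, BPF_NOEXIST or BPF_EXIST
 *
 *		return syscall(__NR_bpf, BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr));
 *	}
 */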
1533
1534 #define BPF_MAP_DELETE_ELEM_LAST_FIELD key
1535
1536 static int map_delete_elem(union bpf_attr *attr, bpfptr_t uattr)
1537 {
1538         bpfptr_t ukey = make_bpfptr(attr->key, uattr.is_kernel);
1539         int ufd = attr->map_fd;
1540         struct bpf_map *map;
1541         struct fd f;
1542         void *key;
1543         int err;
1544
1545         if (CHECK_ATTR(BPF_MAP_DELETE_ELEM))
1546                 return -EINVAL;
1547
1548         f = fdget(ufd);
1549         map = __bpf_map_get(f);
1550         if (IS_ERR(map))
1551                 return PTR_ERR(map);
1552         bpf_map_write_active_inc(map);
1553         if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
1554                 err = -EPERM;
1555                 goto err_put;
1556         }
1557
1558         key = ___bpf_copy_key(ukey, map->key_size);
1559         if (IS_ERR(key)) {
1560                 err = PTR_ERR(key);
1561                 goto err_put;
1562         }
1563
1564         if (bpf_map_is_offloaded(map)) {
1565                 err = bpf_map_offload_delete_elem(map, key);
1566                 goto out;
1567         } else if (IS_FD_PROG_ARRAY(map) ||
1568                    map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
1569                 /* These maps require sleepable context */
1570                 err = map->ops->map_delete_elem(map, key);
1571                 goto out;
1572         }
1573
1574         bpf_disable_instrumentation();
1575         rcu_read_lock();
1576         err = map->ops->map_delete_elem(map, key);
1577         rcu_read_unlock();
1578         bpf_enable_instrumentation();
1579         maybe_wait_bpf_programs(map);
1580 out:
1581         kvfree(key);
1582 err_put:
1583         bpf_map_write_active_dec(map);
1584         fdput(f);
1585         return err;
1586 }
1587
1588 /* last field in 'union bpf_attr' used by this command */
1589 #define BPF_MAP_GET_NEXT_KEY_LAST_FIELD next_key
1590
1591 static int map_get_next_key(union bpf_attr *attr)
1592 {
1593         void __user *ukey = u64_to_user_ptr(attr->key);
1594         void __user *unext_key = u64_to_user_ptr(attr->next_key);
1595         int ufd = attr->map_fd;
1596         struct bpf_map *map;
1597         void *key, *next_key;
1598         struct fd f;
1599         int err;
1600
1601         if (CHECK_ATTR(BPF_MAP_GET_NEXT_KEY))
1602                 return -EINVAL;
1603
1604         f = fdget(ufd);
1605         map = __bpf_map_get(f);
1606         if (IS_ERR(map))
1607                 return PTR_ERR(map);
1608         if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) {
1609                 err = -EPERM;
1610                 goto err_put;
1611         }
1612
1613         if (ukey) {
1614                 key = __bpf_copy_key(ukey, map->key_size);
1615                 if (IS_ERR(key)) {
1616                         err = PTR_ERR(key);
1617                         goto err_put;
1618                 }
1619         } else {
1620                 key = NULL;
1621         }
1622
1623         err = -ENOMEM;
1624         next_key = kvmalloc(map->key_size, GFP_USER);
1625         if (!next_key)
1626                 goto free_key;
1627
1628         if (bpf_map_is_offloaded(map)) {
1629                 err = bpf_map_offload_get_next_key(map, key, next_key);
1630                 goto out;
1631         }
1632
1633         rcu_read_lock();
1634         err = map->ops->map_get_next_key(map, key, next_key);
1635         rcu_read_unlock();
1636 out:
1637         if (err)
1638                 goto free_next_key;
1639
1640         err = -EFAULT;
1641         if (copy_to_user(unext_key, next_key, map->key_size) != 0)
1642                 goto free_next_key;
1643
1644         err = 0;
1645
1646 free_next_key:
1647         kvfree(next_key);
1648 free_key:
1649         kvfree(key);
1650 err_put:
1651         fdput(f);
1652         return err;
1653 }
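
/*
 * map_get_next_key() is the building block for userspace map iteration: a
 * NULL key returns the first key, -ENOENT signals the end, and for most map
 * types the walk restarts from the first key if the cursor key was deleted
 * in the meantime. A sketch of the canonical walk, assuming raw-syscall
 * wrappers in the style of the lookup example above and KEY_SIZE equal to
 * the map's key_size:
 *
 *	unsigned char key[KEY_SIZE], next_key[KEY_SIZE];
 *	void *prev = NULL;		// NULL asks for the first key
 *
 *	while (!map_get_next_key(map_fd, prev, next_key)) {
 *		map_lookup(map_fd, next_key, value);
 *		// ... consume the key/value pair ...
 *		memcpy(key, next_key, sizeof(key));
 *		prev = key;
 *	}
 *	// the loop terminates with errno == ENOENT after the last key
 */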
1654
1655 int generic_map_delete_batch(struct bpf_map *map,
1656                              const union bpf_attr *attr,
1657                              union bpf_attr __user *uattr)
1658 {
1659         void __user *keys = u64_to_user_ptr(attr->batch.keys);
1660         u32 cp, max_count;
1661         int err = 0;
1662         void *key;
1663
1664         if (attr->batch.elem_flags & ~BPF_F_LOCK)
1665                 return -EINVAL;
1666
1667         if ((attr->batch.elem_flags & BPF_F_LOCK) &&
1668             !btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
1669                 return -EINVAL;
1670         }
1671
1672         max_count = attr->batch.count;
1673         if (!max_count)
1674                 return 0;
1675
1676         key = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
1677         if (!key)
1678                 return -ENOMEM;
1679
1680         for (cp = 0; cp < max_count; cp++) {
1681                 err = -EFAULT;
1682                 if (copy_from_user(key, keys + cp * map->key_size,
1683                                    map->key_size))
1684                         break;
1685
1686                 if (bpf_map_is_offloaded(map)) {
1687                         err = bpf_map_offload_delete_elem(map, key);
1688                         break;
1689                 }
1690
1691                 bpf_disable_instrumentation();
1692                 rcu_read_lock();
1693                 err = map->ops->map_delete_elem(map, key);
1694                 rcu_read_unlock();
1695                 bpf_enable_instrumentation();
1696                 if (err)
1697                         break;
1698                 cond_resched();
1699         }
1700         if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp)))
1701                 err = -EFAULT;
1702
1703         kvfree(key);
1704
1705         maybe_wait_bpf_programs(map);
1706         return err;
1707 }
1708
1709 int generic_map_update_batch(struct bpf_map *map, struct file *map_file,
1710                              const union bpf_attr *attr,
1711                              union bpf_attr __user *uattr)
1712 {
1713         void __user *values = u64_to_user_ptr(attr->batch.values);
1714         void __user *keys = u64_to_user_ptr(attr->batch.keys);
1715         u32 value_size, cp, max_count;
1716         void *key, *value;
1717         int err = 0;
1718
1719         if (attr->batch.elem_flags & ~BPF_F_LOCK)
1720                 return -EINVAL;
1721
1722         if ((attr->batch.elem_flags & BPF_F_LOCK) &&
1723             !btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
1724                 return -EINVAL;
1725         }
1726
1727         value_size = bpf_map_value_size(map);
1728
1729         max_count = attr->batch.count;
1730         if (!max_count)
1731                 return 0;
1732
1733         key = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
1734         if (!key)
1735                 return -ENOMEM;
1736
1737         value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN);
1738         if (!value) {
1739                 kvfree(key);
1740                 return -ENOMEM;
1741         }
1742
1743         for (cp = 0; cp < max_count; cp++) {
1744                 err = -EFAULT;
1745                 if (copy_from_user(key, keys + cp * map->key_size,
1746                     map->key_size) ||
1747                     copy_from_user(value, values + cp * value_size, value_size))
1748                         break;
1749
1750                 err = bpf_map_update_value(map, map_file, key, value,
1751                                            attr->batch.elem_flags);
1752
1753                 if (err)
1754                         break;
1755                 cond_resched();
1756         }
1757
1758         if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp)))
1759                 err = -EFAULT;
1760
1761         kvfree(value);
1762         kvfree(key);
1763         return err;
1764 }
1765
1766 #define MAP_LOOKUP_RETRIES 3
1767
1768 int generic_map_lookup_batch(struct bpf_map *map,
1769                                     const union bpf_attr *attr,
1770                                     union bpf_attr __user *uattr)
1771 {
1772         void __user *uobatch = u64_to_user_ptr(attr->batch.out_batch);
1773         void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch);
1774         void __user *values = u64_to_user_ptr(attr->batch.values);
1775         void __user *keys = u64_to_user_ptr(attr->batch.keys);
1776         void *buf, *buf_prevkey, *prev_key, *key, *value;
1777         int err, retry = MAP_LOOKUP_RETRIES;
1778         u32 value_size, cp, max_count;
1779
1780         if (attr->batch.elem_flags & ~BPF_F_LOCK)
1781                 return -EINVAL;
1782
1783         if ((attr->batch.elem_flags & BPF_F_LOCK) &&
1784             !btf_record_has_field(map->record, BPF_SPIN_LOCK))
1785                 return -EINVAL;
1786
1787         value_size = bpf_map_value_size(map);
1788
1789         max_count = attr->batch.count;
1790         if (!max_count)
1791                 return 0;
1792
1793         if (put_user(0, &uattr->batch.count))
1794                 return -EFAULT;
1795
1796         buf_prevkey = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
1797         if (!buf_prevkey)
1798                 return -ENOMEM;
1799
1800         buf = kvmalloc(map->key_size + value_size, GFP_USER | __GFP_NOWARN);
1801         if (!buf) {
1802                 kvfree(buf_prevkey);
1803                 return -ENOMEM;
1804         }
1805
1806         err = -EFAULT;
1807         prev_key = NULL;
1808         if (ubatch && copy_from_user(buf_prevkey, ubatch, map->key_size))
1809                 goto free_buf;
1810         key = buf;
1811         value = key + map->key_size;
1812         if (ubatch)
1813                 prev_key = buf_prevkey;
1814
1815         for (cp = 0; cp < max_count;) {
1816                 rcu_read_lock();
1817                 err = map->ops->map_get_next_key(map, prev_key, key);
1818                 rcu_read_unlock();
1819                 if (err)
1820                         break;
1821                 err = bpf_map_copy_value(map, key, value,
1822                                          attr->batch.elem_flags);
1823
1824                 if (err == -ENOENT) {
1825                         if (retry) {
1826                                 retry--;
1827                                 continue;
1828                         }
1829                         err = -EINTR;
1830                         break;
1831                 }
1832
1833                 if (err)
1834                         goto free_buf;
1835
1836                 if (copy_to_user(keys + cp * map->key_size, key,
1837                                  map->key_size)) {
1838                         err = -EFAULT;
1839                         goto free_buf;
1840                 }
1841                 if (copy_to_user(values + cp * value_size, value, value_size)) {
1842                         err = -EFAULT;
1843                         goto free_buf;
1844                 }
1845
1846                 if (!prev_key)
1847                         prev_key = buf_prevkey;
1848
1849                 swap(prev_key, key);
1850                 retry = MAP_LOOKUP_RETRIES;
1851                 cp++;
1852                 cond_resched();
1853         }
1854
1855         if (err == -EFAULT)
1856                 goto free_buf;
1857
1858         if ((copy_to_user(&uattr->batch.count, &cp, sizeof(cp)) ||
1859                     (cp && copy_to_user(uobatch, prev_key, map->key_size))))
1860                 err = -EFAULT;
1861
1862 free_buf:
1863         kvfree(buf_prevkey);
1864         kvfree(buf);
1865         return err;
1866 }
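
/*
 * The batch interface folds the get_next_key/lookup loop above into one
 * syscall: in_batch is the opaque resume cursor (NULL on the first call),
 * out_batch receives the cursor for the next call, and batch.count is both
 * the requested and, on return, the delivered element count. elem_flags may
 * only be 0 or BPF_F_LOCK, as checked above. A hedged raw-syscall sketch
 * (buffer sizing is the caller's responsibility):
 *
 *	union bpf_attr attr;
 *
 *	memset(&attr, 0, sizeof(attr));
 *	attr.batch.map_fd = map_fd;
 *	attr.batch.in_batch = (__u64)(unsigned long)in_batch;	// NULL first
 *	attr.batch.out_batch = (__u64)(unsigned long)out_batch;
 *	attr.batch.keys = (__u64)(unsigned long)keys;	  // count * key_size
 *	attr.batch.values = (__u64)(unsigned long)values; // count * value_size
 *	attr.batch.count = count;
 *
 *	err = syscall(__NR_bpf, BPF_MAP_LOOKUP_BATCH, &attr, sizeof(attr));
 *	// attr.batch.count now holds the number of elements copied out;
 *	// -1 with errno == ENOENT means the iteration is complete
 */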
1867
1868 #define BPF_MAP_LOOKUP_AND_DELETE_ELEM_LAST_FIELD flags
1869
1870 static int map_lookup_and_delete_elem(union bpf_attr *attr)
1871 {
1872         void __user *ukey = u64_to_user_ptr(attr->key);
1873         void __user *uvalue = u64_to_user_ptr(attr->value);
1874         int ufd = attr->map_fd;
1875         struct bpf_map *map;
1876         void *key, *value;
1877         u32 value_size;
1878         struct fd f;
1879         int err;
1880
1881         if (CHECK_ATTR(BPF_MAP_LOOKUP_AND_DELETE_ELEM))
1882                 return -EINVAL;
1883
1884         if (attr->flags & ~BPF_F_LOCK)
1885                 return -EINVAL;
1886
1887         f = fdget(ufd);
1888         map = __bpf_map_get(f);
1889         if (IS_ERR(map))
1890                 return PTR_ERR(map);
1891         bpf_map_write_active_inc(map);
1892         if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ) ||
1893             !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
1894                 err = -EPERM;
1895                 goto err_put;
1896         }
1897
1898         if (attr->flags &&
1899             (map->map_type == BPF_MAP_TYPE_QUEUE ||
1900              map->map_type == BPF_MAP_TYPE_STACK)) {
1901                 err = -EINVAL;
1902                 goto err_put;
1903         }
1904
1905         if ((attr->flags & BPF_F_LOCK) &&
1906             !btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
1907                 err = -EINVAL;
1908                 goto err_put;
1909         }
1910
1911         key = __bpf_copy_key(ukey, map->key_size);
1912         if (IS_ERR(key)) {
1913                 err = PTR_ERR(key);
1914                 goto err_put;
1915         }
1916
1917         value_size = bpf_map_value_size(map);
1918
1919         err = -ENOMEM;
1920         value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN);
1921         if (!value)
1922                 goto free_key;
1923
1924         err = -ENOTSUPP;
1925         if (map->map_type == BPF_MAP_TYPE_QUEUE ||
1926             map->map_type == BPF_MAP_TYPE_STACK) {
1927                 err = map->ops->map_pop_elem(map, value);
1928         } else if (map->map_type == BPF_MAP_TYPE_HASH ||
1929                    map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
1930                    map->map_type == BPF_MAP_TYPE_LRU_HASH ||
1931                    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
1932                 if (!bpf_map_is_offloaded(map)) {
1933                         bpf_disable_instrumentation();
1934                         rcu_read_lock();
1935                         err = map->ops->map_lookup_and_delete_elem(map, key, value, attr->flags);
1936                         rcu_read_unlock();
1937                         bpf_enable_instrumentation();
1938                 }
1939         }
1940
1941         if (err)
1942                 goto free_value;
1943
1944         if (copy_to_user(uvalue, value, value_size) != 0) {
1945                 err = -EFAULT;
1946                 goto free_value;
1947         }
1948
1949         err = 0;
1950
1951 free_value:
1952         kvfree(value);
1953 free_key:
1954         kvfree(key);
1955 err_put:
1956         bpf_map_write_active_dec(map);
1957         fdput(f);
1958         return err;
1959 }
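
/*
 * For BPF_MAP_TYPE_QUEUE and BPF_MAP_TYPE_STACK this command is the "pop"
 * operation: those maps have key_size == 0, so per __bpf_copy_key() above
 * attr.key must be left NULL and only attr.value is used. A sketch popping
 * one element from a queue map (an 8-byte value_size is assumed):
 *
 *	union bpf_attr attr;
 *	__u64 value;
 *
 *	memset(&attr, 0, sizeof(attr));
 *	attr.map_fd = queue_fd;		// fd of a BPF_MAP_TYPE_QUEUE
 *	attr.value = (__u64)(unsigned long)&value;
 *
 *	err = syscall(__NR_bpf, BPF_MAP_LOOKUP_AND_DELETE_ELEM, &attr,
 *		      sizeof(attr));
 *	// 0: oldest element copied out and removed; ENOENT: queue empty
 */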
1960
1961 #define BPF_MAP_FREEZE_LAST_FIELD map_fd
1962
1963 static int map_freeze(const union bpf_attr *attr)
1964 {
1965         int err = 0, ufd = attr->map_fd;
1966         struct bpf_map *map;
1967         struct fd f;
1968
1969         if (CHECK_ATTR(BPF_MAP_FREEZE))
1970                 return -EINVAL;
1971
1972         f = fdget(ufd);
1973         map = __bpf_map_get(f);
1974         if (IS_ERR(map))
1975                 return PTR_ERR(map);
1976
1977         if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS || !IS_ERR_OR_NULL(map->record)) {
1978                 fdput(f);
1979                 return -ENOTSUPP;
1980         }
1981
1982         if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
1983                 fdput(f);
1984                 return -EPERM;
1985         }
1986
1987         mutex_lock(&map->freeze_mutex);
1988         if (bpf_map_write_active(map)) {
1989                 err = -EBUSY;
1990                 goto err_put;
1991         }
1992         if (READ_ONCE(map->frozen)) {
1993                 err = -EBUSY;
1994                 goto err_put;
1995         }
1996
1997         WRITE_ONCE(map->frozen, true);
1998 err_put:
1999         mutex_unlock(&map->freeze_mutex);
2000         fdput(f);
2001         return err;
2002 }
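
/*
 * Freezing makes a map read-only from the syscall side while leaving
 * BPF-program-side access untouched; this is how libbpf seals its .rodata
 * maps after writing the initial values. Sketch:
 *
 *	union bpf_attr attr;
 *
 *	// ... populate the map via BPF_MAP_UPDATE_ELEM first ...
 *	memset(&attr, 0, sizeof(attr));
 *	attr.map_fd = map_fd;
 *	err = syscall(__NR_bpf, BPF_MAP_FREEZE, &attr, sizeof(attr));
 *
 *	// from now on BPF_MAP_UPDATE_ELEM on this map fails with -EPERM,
 *	// and a second BPF_MAP_FREEZE fails with -EBUSY
 */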
2003
2004 static const struct bpf_prog_ops * const bpf_prog_types[] = {
2005 #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \
2006         [_id] = & _name ## _prog_ops,
2007 #define BPF_MAP_TYPE(_id, _ops)
2008 #define BPF_LINK_TYPE(_id, _name)
2009 #include <linux/bpf_types.h>
2010 #undef BPF_PROG_TYPE
2011 #undef BPF_MAP_TYPE
2012 #undef BPF_LINK_TYPE
2013 };
2014
2015 static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)
2016 {
2017         const struct bpf_prog_ops *ops;
2018
2019         if (type >= ARRAY_SIZE(bpf_prog_types))
2020                 return -EINVAL;
2021         type = array_index_nospec(type, ARRAY_SIZE(bpf_prog_types));
2022         ops = bpf_prog_types[type];
2023         if (!ops)
2024                 return -EINVAL;
2025
2026         if (!bpf_prog_is_offloaded(prog->aux))
2027                 prog->aux->ops = ops;
2028         else
2029                 prog->aux->ops = &bpf_offload_prog_ops;
2030         prog->type = type;
2031         return 0;
2032 }
2033
2034 enum bpf_audit {
2035         BPF_AUDIT_LOAD,
2036         BPF_AUDIT_UNLOAD,
2037         BPF_AUDIT_MAX,
2038 };
2039
2040 static const char * const bpf_audit_str[BPF_AUDIT_MAX] = {
2041         [BPF_AUDIT_LOAD]   = "LOAD",
2042         [BPF_AUDIT_UNLOAD] = "UNLOAD",
2043 };
2044
2045 static void bpf_audit_prog(const struct bpf_prog *prog, unsigned int op)
2046 {
2047         struct audit_context *ctx = NULL;
2048         struct audit_buffer *ab;
2049
2050         if (WARN_ON_ONCE(op >= BPF_AUDIT_MAX))
2051                 return;
2052         if (audit_enabled == AUDIT_OFF)
2053                 return;
2054         if (!in_irq() && !irqs_disabled())
2055                 ctx = audit_context();
2056         ab = audit_log_start(ctx, GFP_ATOMIC, AUDIT_BPF);
2057         if (unlikely(!ab))
2058                 return;
2059         audit_log_format(ab, "prog-id=%u op=%s",
2060                          prog->aux->id, bpf_audit_str[op]);
2061         audit_log_end(ab);
2062 }
2063
2064 static int bpf_prog_alloc_id(struct bpf_prog *prog)
2065 {
2066         int id;
2067
2068         idr_preload(GFP_KERNEL);
2069         spin_lock_bh(&prog_idr_lock);
2070         id = idr_alloc_cyclic(&prog_idr, prog, 1, INT_MAX, GFP_ATOMIC);
2071         if (id > 0)
2072                 prog->aux->id = id;
2073         spin_unlock_bh(&prog_idr_lock);
2074         idr_preload_end();
2075
2076         /* id is in [1, INT_MAX) */
2077         if (WARN_ON_ONCE(!id))
2078                 return -ENOSPC;
2079
2080         return id > 0 ? 0 : id;
2081 }
2082
2083 void bpf_prog_free_id(struct bpf_prog *prog)
2084 {
2085         unsigned long flags;
2086
2087         /* cBPF to eBPF migrations are currently not in the idr store.

2088          * Offloaded programs are removed from the store when their device
2089          * disappears - even if someone grabs an fd to them, they are unusable,
2090          * simply waiting for refcnt to drop to be freed.
2091          */
2092         if (!prog->aux->id)
2093                 return;
2094
2095         spin_lock_irqsave(&prog_idr_lock, flags);
2096         idr_remove(&prog_idr, prog->aux->id);
2097         prog->aux->id = 0;
2098         spin_unlock_irqrestore(&prog_idr_lock, flags);
2099 }
2100
2101 static void __bpf_prog_put_rcu(struct rcu_head *rcu)
2102 {
2103         struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu);
2104
2105         kvfree(aux->func_info);
2106         kfree(aux->func_info_aux);
2107         free_uid(aux->user);
2108         security_bpf_prog_free(aux);
2109         bpf_prog_free(aux->prog);
2110 }
2111
2112 static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred)
2113 {
2114         bpf_prog_kallsyms_del_all(prog);
2115         btf_put(prog->aux->btf);
2116         module_put(prog->aux->mod);
2117         kvfree(prog->aux->jited_linfo);
2118         kvfree(prog->aux->linfo);
2119         kfree(prog->aux->kfunc_tab);
2120         if (prog->aux->attach_btf)
2121                 btf_put(prog->aux->attach_btf);
2122
2123         if (deferred) {
2124                 if (prog->aux->sleepable)
2125                         call_rcu_tasks_trace(&prog->aux->rcu, __bpf_prog_put_rcu);
2126                 else
2127                         call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
2128         } else {
2129                 __bpf_prog_put_rcu(&prog->aux->rcu);
2130         }
2131 }
2132
2133 static void bpf_prog_put_deferred(struct work_struct *work)
2134 {
2135         struct bpf_prog_aux *aux;
2136         struct bpf_prog *prog;
2137
2138         aux = container_of(work, struct bpf_prog_aux, work);
2139         prog = aux->prog;
2140         perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0);
2141         bpf_audit_prog(prog, BPF_AUDIT_UNLOAD);
2142         bpf_prog_free_id(prog);
2143         __bpf_prog_put_noref(prog, true);
2144 }
2145
2146 static void __bpf_prog_put(struct bpf_prog *prog)
2147 {
2148         struct bpf_prog_aux *aux = prog->aux;
2149
2150         if (atomic64_dec_and_test(&aux->refcnt)) {
2151                 if (in_irq() || irqs_disabled()) {
2152                         INIT_WORK(&aux->work, bpf_prog_put_deferred);
2153                         schedule_work(&aux->work);
2154                 } else {
2155                         bpf_prog_put_deferred(&aux->work);
2156                 }
2157         }
2158 }
2159
2160 void bpf_prog_put(struct bpf_prog *prog)
2161 {
2162         __bpf_prog_put(prog);
2163 }
2164 EXPORT_SYMBOL_GPL(bpf_prog_put);
2165
2166 static int bpf_prog_release(struct inode *inode, struct file *filp)
2167 {
2168         struct bpf_prog *prog = filp->private_data;
2169
2170         bpf_prog_put(prog);
2171         return 0;
2172 }
2173
2174 struct bpf_prog_kstats {
2175         u64 nsecs;
2176         u64 cnt;
2177         u64 misses;
2178 };
2179
2180 void notrace bpf_prog_inc_misses_counter(struct bpf_prog *prog)
2181 {
2182         struct bpf_prog_stats *stats;
2183         unsigned int flags;
2184
2185         stats = this_cpu_ptr(prog->stats);
2186         flags = u64_stats_update_begin_irqsave(&stats->syncp);
2187         u64_stats_inc(&stats->misses);
2188         u64_stats_update_end_irqrestore(&stats->syncp, flags);
2189 }
2190
2191 static void bpf_prog_get_stats(const struct bpf_prog *prog,
2192                                struct bpf_prog_kstats *stats)
2193 {
2194         u64 nsecs = 0, cnt = 0, misses = 0;
2195         int cpu;
2196
2197         for_each_possible_cpu(cpu) {
2198                 const struct bpf_prog_stats *st;
2199                 unsigned int start;
2200                 u64 tnsecs, tcnt, tmisses;
2201
2202                 st = per_cpu_ptr(prog->stats, cpu);
2203                 do {
2204                         start = u64_stats_fetch_begin(&st->syncp);
2205                         tnsecs = u64_stats_read(&st->nsecs);
2206                         tcnt = u64_stats_read(&st->cnt);
2207                         tmisses = u64_stats_read(&st->misses);
2208                 } while (u64_stats_fetch_retry(&st->syncp, start));
2209                 nsecs += tnsecs;
2210                 cnt += tcnt;
2211                 misses += tmisses;
2212         }
2213         stats->nsecs = nsecs;
2214         stats->cnt = cnt;
2215         stats->misses = misses;
2216 }
2217
2218 #ifdef CONFIG_PROC_FS
2219 static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp)
2220 {
2221         const struct bpf_prog *prog = filp->private_data;
2222         char prog_tag[sizeof(prog->tag) * 2 + 1] = { };
2223         struct bpf_prog_kstats stats;
2224
2225         bpf_prog_get_stats(prog, &stats);
2226         bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
2227         seq_printf(m,
2228                    "prog_type:\t%u\n"
2229                    "prog_jited:\t%u\n"
2230                    "prog_tag:\t%s\n"
2231                    "memlock:\t%llu\n"
2232                    "prog_id:\t%u\n"
2233                    "run_time_ns:\t%llu\n"
2234                    "run_cnt:\t%llu\n"
2235                    "recursion_misses:\t%llu\n"
2236                    "verified_insns:\t%u\n",
2237                    prog->type,
2238                    prog->jited,
2239                    prog_tag,
2240                    prog->pages * 1ULL << PAGE_SHIFT,
2241                    prog->aux->id,
2242                    stats.nsecs,
2243                    stats.cnt,
2244                    stats.misses,
2245                    prog->aux->verified_insns);
2246 }
2247 #endif
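
/*
 * The fdinfo handler above surfaces per-program stats through procfs;
 * reading /proc/<pid>/fdinfo/<prog-fd> yields something like the below
 * (values illustrative; run_time_ns/run_cnt stay zero unless the
 * kernel.bpf_stats_enabled sysctl is set):
 *
 *	prog_type:	1
 *	prog_jited:	1
 *	prog_tag:	a04f5eef06a7f555
 *	memlock:	4096
 *	prog_id:	42
 *	run_time_ns:	0
 *	run_cnt:	0
 *	recursion_misses:	0
 *	verified_insns:	2
 */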
2248
2249 const struct file_operations bpf_prog_fops = {
2250 #ifdef CONFIG_PROC_FS
2251         .show_fdinfo    = bpf_prog_show_fdinfo,
2252 #endif
2253         .release        = bpf_prog_release,
2254         .read           = bpf_dummy_read,
2255         .write          = bpf_dummy_write,
2256 };
2257
2258 int bpf_prog_new_fd(struct bpf_prog *prog)
2259 {
2260         int ret;
2261
2262         ret = security_bpf_prog(prog);
2263         if (ret < 0)
2264                 return ret;
2265
2266         return anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog,
2267                                 O_RDWR | O_CLOEXEC);
2268 }
2269
2270 static struct bpf_prog *____bpf_prog_get(struct fd f)
2271 {
2272         if (!f.file)
2273                 return ERR_PTR(-EBADF);
2274         if (f.file->f_op != &bpf_prog_fops) {
2275                 fdput(f);
2276                 return ERR_PTR(-EINVAL);
2277         }
2278
2279         return f.file->private_data;
2280 }
2281
2282 void bpf_prog_add(struct bpf_prog *prog, int i)
2283 {
2284         atomic64_add(i, &prog->aux->refcnt);
2285 }
2286 EXPORT_SYMBOL_GPL(bpf_prog_add);
2287
2288 void bpf_prog_sub(struct bpf_prog *prog, int i)
2289 {
2290         /* Only to be used for undoing previous bpf_prog_add() in some
2291          * error path. We still know that another entity in our call
2292          * path holds a reference to the program, thus atomic_sub() can
2293          * be safely used in such cases!
2294          */
2295         WARN_ON(atomic64_sub_return(i, &prog->aux->refcnt) == 0);
2296 }
2297 EXPORT_SYMBOL_GPL(bpf_prog_sub);
2298
2299 void bpf_prog_inc(struct bpf_prog *prog)
2300 {
2301         atomic64_inc(&prog->aux->refcnt);
2302 }
2303 EXPORT_SYMBOL_GPL(bpf_prog_inc);
2304
2305 /* prog_idr_lock should have been held */
2306 struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog)
2307 {
2308         int refold;
2309
2310         refold = atomic64_fetch_add_unless(&prog->aux->refcnt, 1, 0);
2311
2312         if (!refold)
2313                 return ERR_PTR(-ENOENT);
2314
2315         return prog;
2316 }
2317 EXPORT_SYMBOL_GPL(bpf_prog_inc_not_zero);
2318
2319 bool bpf_prog_get_ok(struct bpf_prog *prog,
2320                             enum bpf_prog_type *attach_type, bool attach_drv)
2321 {
2322         /* not an attachment, just a refcount inc, always allow */
2323         if (!attach_type)
2324                 return true;
2325
2326         if (prog->type != *attach_type)
2327                 return false;
2328         if (bpf_prog_is_offloaded(prog->aux) && !attach_drv)
2329                 return false;
2330
2331         return true;
2332 }
2333
2334 static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *attach_type,
2335                                        bool attach_drv)
2336 {
2337         struct fd f = fdget(ufd);
2338         struct bpf_prog *prog;
2339
2340         prog = ____bpf_prog_get(f);
2341         if (IS_ERR(prog))
2342                 return prog;
2343         if (!bpf_prog_get_ok(prog, attach_type, attach_drv)) {
2344                 prog = ERR_PTR(-EINVAL);
2345                 goto out;
2346         }
2347
2348         bpf_prog_inc(prog);
2349 out:
2350         fdput(f);
2351         return prog;
2352 }
2353
2354 struct bpf_prog *bpf_prog_get(u32 ufd)
2355 {
2356         return __bpf_prog_get(ufd, NULL, false);
2357 }
2358
2359 struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type,
2360                                        bool attach_drv)
2361 {
2362         return __bpf_prog_get(ufd, &type, attach_drv);
2363 }
2364 EXPORT_SYMBOL_GPL(bpf_prog_get_type_dev);
2365
2366 /* Initially all BPF programs could be loaded w/o specifying
2367  * expected_attach_type. Later for some of them specifying expected_attach_type
2368  * at load time became required so that the program could be validated properly.
2369  * Programs of types that are allowed to be loaded both w/ and w/o (for
2370  * backward compatibility) expected_attach_type should have the default attach
2371  * type assigned to expected_attach_type for the latter case, so that it can be
2372  * validated later at attach time.
2373  *
2374  * bpf_prog_load_fixup_attach_type() sets expected_attach_type in @attr if
2375  * prog type requires it but has some attach types that have to be backward
2376  * compatible.
2377  */
2378 static void bpf_prog_load_fixup_attach_type(union bpf_attr *attr)
2379 {
2380         switch (attr->prog_type) {
2381         case BPF_PROG_TYPE_CGROUP_SOCK:
2382                 /* Unfortunately BPF_ATTACH_TYPE_UNSPEC enumeration doesn't
2383                  * exist so checking for non-zero is the way to go here.
2384                  */
2385                 if (!attr->expected_attach_type)
2386                         attr->expected_attach_type =
2387                                 BPF_CGROUP_INET_SOCK_CREATE;
2388                 break;
2389         case BPF_PROG_TYPE_SK_REUSEPORT:
2390                 if (!attr->expected_attach_type)
2391                         attr->expected_attach_type =
2392                                 BPF_SK_REUSEPORT_SELECT;
2393                 break;
2394         }
2395 }
2396
2397 static int
2398 bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
2399                            enum bpf_attach_type expected_attach_type,
2400                            struct btf *attach_btf, u32 btf_id,
2401                            struct bpf_prog *dst_prog)
2402 {
2403         if (btf_id) {
2404                 if (btf_id > BTF_MAX_TYPE)
2405                         return -EINVAL;
2406
2407                 if (!attach_btf && !dst_prog)
2408                         return -EINVAL;
2409
2410                 switch (prog_type) {
2411                 case BPF_PROG_TYPE_TRACING:
2412                 case BPF_PROG_TYPE_LSM:
2413                 case BPF_PROG_TYPE_STRUCT_OPS:
2414                 case BPF_PROG_TYPE_EXT:
2415                         break;
2416                 default:
2417                         return -EINVAL;
2418                 }
2419         }
2420
2421         if (attach_btf && (!btf_id || dst_prog))
2422                 return -EINVAL;
2423
2424         if (dst_prog && prog_type != BPF_PROG_TYPE_TRACING &&
2425             prog_type != BPF_PROG_TYPE_EXT)
2426                 return -EINVAL;
2427
2428         switch (prog_type) {
2429         case BPF_PROG_TYPE_CGROUP_SOCK:
2430                 switch (expected_attach_type) {
2431                 case BPF_CGROUP_INET_SOCK_CREATE:
2432                 case BPF_CGROUP_INET_SOCK_RELEASE:
2433                 case BPF_CGROUP_INET4_POST_BIND:
2434                 case BPF_CGROUP_INET6_POST_BIND:
2435                         return 0;
2436                 default:
2437                         return -EINVAL;
2438                 }
2439         case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
2440                 switch (expected_attach_type) {
2441                 case BPF_CGROUP_INET4_BIND:
2442                 case BPF_CGROUP_INET6_BIND:
2443                 case BPF_CGROUP_INET4_CONNECT:
2444                 case BPF_CGROUP_INET6_CONNECT:
2445                 case BPF_CGROUP_INET4_GETPEERNAME:
2446                 case BPF_CGROUP_INET6_GETPEERNAME:
2447                 case BPF_CGROUP_INET4_GETSOCKNAME:
2448                 case BPF_CGROUP_INET6_GETSOCKNAME:
2449                 case BPF_CGROUP_UDP4_SENDMSG:
2450                 case BPF_CGROUP_UDP6_SENDMSG:
2451                 case BPF_CGROUP_UDP4_RECVMSG:
2452                 case BPF_CGROUP_UDP6_RECVMSG:
2453                         return 0;
2454                 default:
2455                         return -EINVAL;
2456                 }
2457         case BPF_PROG_TYPE_CGROUP_SKB:
2458                 switch (expected_attach_type) {
2459                 case BPF_CGROUP_INET_INGRESS:
2460                 case BPF_CGROUP_INET_EGRESS:
2461                         return 0;
2462                 default:
2463                         return -EINVAL;
2464                 }
2465         case BPF_PROG_TYPE_CGROUP_SOCKOPT:
2466                 switch (expected_attach_type) {
2467                 case BPF_CGROUP_SETSOCKOPT:
2468                 case BPF_CGROUP_GETSOCKOPT:
2469                         return 0;
2470                 default:
2471                         return -EINVAL;
2472                 }
2473         case BPF_PROG_TYPE_SK_LOOKUP:
2474                 if (expected_attach_type == BPF_SK_LOOKUP)
2475                         return 0;
2476                 return -EINVAL;
2477         case BPF_PROG_TYPE_SK_REUSEPORT:
2478                 switch (expected_attach_type) {
2479                 case BPF_SK_REUSEPORT_SELECT:
2480                 case BPF_SK_REUSEPORT_SELECT_OR_MIGRATE:
2481                         return 0;
2482                 default:
2483                         return -EINVAL;
2484                 }
2485         case BPF_PROG_TYPE_NETFILTER:
2486                 if (expected_attach_type == BPF_NETFILTER)
2487                         return 0;
2488                 return -EINVAL;
2489         case BPF_PROG_TYPE_SYSCALL:
2490         case BPF_PROG_TYPE_EXT:
2491                 if (expected_attach_type)
2492                         return -EINVAL;
2493                 fallthrough;
2494         default:
2495                 return 0;
2496         }
2497 }
2498
2499 static bool is_net_admin_prog_type(enum bpf_prog_type prog_type)
2500 {
2501         switch (prog_type) {
2502         case BPF_PROG_TYPE_SCHED_CLS:
2503         case BPF_PROG_TYPE_SCHED_ACT:
2504         case BPF_PROG_TYPE_XDP:
2505         case BPF_PROG_TYPE_LWT_IN:
2506         case BPF_PROG_TYPE_LWT_OUT:
2507         case BPF_PROG_TYPE_LWT_XMIT:
2508         case BPF_PROG_TYPE_LWT_SEG6LOCAL:
2509         case BPF_PROG_TYPE_SK_SKB:
2510         case BPF_PROG_TYPE_SK_MSG:
2511         case BPF_PROG_TYPE_FLOW_DISSECTOR:
2512         case BPF_PROG_TYPE_CGROUP_DEVICE:
2513         case BPF_PROG_TYPE_CGROUP_SOCK:
2514         case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
2515         case BPF_PROG_TYPE_CGROUP_SOCKOPT:
2516         case BPF_PROG_TYPE_CGROUP_SYSCTL:
2517         case BPF_PROG_TYPE_SOCK_OPS:
2518         case BPF_PROG_TYPE_EXT: /* extends any prog */
2519         case BPF_PROG_TYPE_NETFILTER:
2520                 return true;
2521         case BPF_PROG_TYPE_CGROUP_SKB:
2522                 /* always unpriv */
2523         case BPF_PROG_TYPE_SK_REUSEPORT:
2524                 /* equivalent to SOCKET_FILTER. need CAP_BPF only */
2525         default:
2526                 return false;
2527         }
2528 }
2529
2530 static bool is_perfmon_prog_type(enum bpf_prog_type prog_type)
2531 {
2532         switch (prog_type) {
2533         case BPF_PROG_TYPE_KPROBE:
2534         case BPF_PROG_TYPE_TRACEPOINT:
2535         case BPF_PROG_TYPE_PERF_EVENT:
2536         case BPF_PROG_TYPE_RAW_TRACEPOINT:
2537         case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE:
2538         case BPF_PROG_TYPE_TRACING:
2539         case BPF_PROG_TYPE_LSM:
2540         case BPF_PROG_TYPE_STRUCT_OPS: /* has access to struct sock */
2541         case BPF_PROG_TYPE_EXT: /* extends any prog */
2542                 return true;
2543         default:
2544                 return false;
2545         }
2546 }
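
/*
 * Together with the checks in bpf_prog_load() below, the two helpers above
 * encode the capability model for program loading (bpf_capable() means
 * CAP_BPF or CAP_SYS_ADMIN, perfmon_capable() CAP_PERFMON or CAP_SYS_ADMIN):
 *
 *	SOCKET_FILTER, CGROUP_SKB:	loadable without capabilities, unless
 *					sysctl_unprivileged_bpf_disabled is set
 *	all other program types:	bpf_capable()
 *	is_net_admin_prog_type():	additionally CAP_NET_ADMIN or CAP_SYS_ADMIN
 *	is_perfmon_prog_type():		additionally perfmon_capable()
 */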
2547
2548 /* last field in 'union bpf_attr' used by this command */
2549 #define BPF_PROG_LOAD_LAST_FIELD log_true_size
2550
2551 static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
2552 {
2553         enum bpf_prog_type type = attr->prog_type;
2554         struct bpf_prog *prog, *dst_prog = NULL;
2555         struct btf *attach_btf = NULL;
2556         int err;
2557         char license[128];
2558
2559         if (CHECK_ATTR(BPF_PROG_LOAD))
2560                 return -EINVAL;
2561
2562         if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT |
2563                                  BPF_F_ANY_ALIGNMENT |
2564                                  BPF_F_TEST_STATE_FREQ |
2565                                  BPF_F_SLEEPABLE |
2566                                  BPF_F_TEST_RND_HI32 |
2567                                  BPF_F_XDP_HAS_FRAGS |
2568                                  BPF_F_XDP_DEV_BOUND_ONLY))
2569                 return -EINVAL;
2570
2571         if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
2572             (attr->prog_flags & BPF_F_ANY_ALIGNMENT) &&
2573             !bpf_capable())
2574                 return -EPERM;
2575
2576         /* The intent here is for unprivileged_bpf_disabled to block BPF program
2577          * creation for unprivileged users; other actions depend
2578          * on fd availability and access to bpffs, so they are dependent on
2579          * object creation success. Even with unprivileged BPF disabled,
2580          * capability checks are still carried out for these
2581          * and other operations.
2582          */
2583         if (sysctl_unprivileged_bpf_disabled && !bpf_capable())
2584                 return -EPERM;
2585
2586         if (attr->insn_cnt == 0 ||
2587             attr->insn_cnt > (bpf_capable() ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS))
2588                 return -E2BIG;
2589         if (type != BPF_PROG_TYPE_SOCKET_FILTER &&
2590             type != BPF_PROG_TYPE_CGROUP_SKB &&
2591             !bpf_capable())
2592                 return -EPERM;
2593
2594         if (is_net_admin_prog_type(type) && !capable(CAP_NET_ADMIN) && !capable(CAP_SYS_ADMIN))
2595                 return -EPERM;
2596         if (is_perfmon_prog_type(type) && !perfmon_capable())
2597                 return -EPERM;
2598
2599         /* attach_prog_fd/attach_btf_obj_fd can specify fd of either bpf_prog
2600          * or btf; we need to check which one it is
2601          */
2602         if (attr->attach_prog_fd) {
2603                 dst_prog = bpf_prog_get(attr->attach_prog_fd);
2604                 if (IS_ERR(dst_prog)) {
2605                         dst_prog = NULL;
2606                         attach_btf = btf_get_by_fd(attr->attach_btf_obj_fd);
2607                         if (IS_ERR(attach_btf))
2608                                 return -EINVAL;
2609                         if (!btf_is_kernel(attach_btf)) {
2610                                 /* attaching through specifying bpf_prog's BTF
2611                                  * objects directly might be supported eventually
2612                                  */
2613                                 btf_put(attach_btf);
2614                                 return -ENOTSUPP;
2615                         }
2616                 }
2617         } else if (attr->attach_btf_id) {
2618                 /* fall back to vmlinux BTF, if BTF type ID is specified */
2619                 attach_btf = bpf_get_btf_vmlinux();
2620                 if (IS_ERR(attach_btf))
2621                         return PTR_ERR(attach_btf);
2622                 if (!attach_btf)
2623                         return -EINVAL;
2624                 btf_get(attach_btf);
2625         }
2626
2627         bpf_prog_load_fixup_attach_type(attr);
2628         if (bpf_prog_load_check_attach(type, attr->expected_attach_type,
2629                                        attach_btf, attr->attach_btf_id,
2630                                        dst_prog)) {
2631                 if (dst_prog)
2632                         bpf_prog_put(dst_prog);
2633                 if (attach_btf)
2634                         btf_put(attach_btf);
2635                 return -EINVAL;
2636         }
2637
2638         /* plain bpf_prog allocation */
2639         prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER);
2640         if (!prog) {
2641                 if (dst_prog)
2642                         bpf_prog_put(dst_prog);
2643                 if (attach_btf)
2644                         btf_put(attach_btf);
2645                 return -ENOMEM;
2646         }
2647
2648         prog->expected_attach_type = attr->expected_attach_type;
2649         prog->aux->attach_btf = attach_btf;
2650         prog->aux->attach_btf_id = attr->attach_btf_id;
2651         prog->aux->dst_prog = dst_prog;
2652         prog->aux->dev_bound = !!attr->prog_ifindex;
2653         prog->aux->sleepable = attr->prog_flags & BPF_F_SLEEPABLE;
2654         prog->aux->xdp_has_frags = attr->prog_flags & BPF_F_XDP_HAS_FRAGS;
2655
2656         err = security_bpf_prog_alloc(prog->aux);
2657         if (err)
2658                 goto free_prog;
2659
2660         prog->aux->user = get_current_user();
2661         prog->len = attr->insn_cnt;
2662
2663         err = -EFAULT;
2664         if (copy_from_bpfptr(prog->insns,
2665                              make_bpfptr(attr->insns, uattr.is_kernel),
2666                              bpf_prog_insn_size(prog)) != 0)
2667                 goto free_prog_sec;
2668         /* copy eBPF program license from user space */
2669         if (strncpy_from_bpfptr(license,
2670                                 make_bpfptr(attr->license, uattr.is_kernel),
2671                                 sizeof(license) - 1) < 0)
2672                 goto free_prog_sec;
2673         license[sizeof(license) - 1] = 0;
2674
2675         /* eBPF programs must be GPL compatible to use GPL-ed functions */
2676         prog->gpl_compatible = license_is_gpl_compatible(license) ? 1 : 0;
2677
2678         prog->orig_prog = NULL;
2679         prog->jited = 0;
2680
2681         atomic64_set(&prog->aux->refcnt, 1);
2682
2683         if (bpf_prog_is_dev_bound(prog->aux)) {
2684                 err = bpf_prog_dev_bound_init(prog, attr);
2685                 if (err)
2686                         goto free_prog_sec;
2687         }
2688
2689         if (type == BPF_PROG_TYPE_EXT && dst_prog &&
2690             bpf_prog_is_dev_bound(dst_prog->aux)) {
2691                 err = bpf_prog_dev_bound_inherit(prog, dst_prog);
2692                 if (err)
2693                         goto free_prog_sec;
2694         }
2695
2696         /* find program type: socket_filter vs tracing_filter */
2697         err = find_prog_type(type, prog);
2698         if (err < 0)
2699                 goto free_prog_sec;
2700
2701         prog->aux->load_time = ktime_get_boottime_ns();
2702         err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name,
2703                                sizeof(attr->prog_name));
2704         if (err < 0)
2705                 goto free_prog_sec;
2706
2707         /* run eBPF verifier */
2708         err = bpf_check(&prog, attr, uattr, uattr_size);
2709         if (err < 0)
2710                 goto free_used_maps;
2711
2712         prog = bpf_prog_select_runtime(prog, &err);
2713         if (err < 0)
2714                 goto free_used_maps;
2715
2716         err = bpf_prog_alloc_id(prog);
2717         if (err)
2718                 goto free_used_maps;
2719
2720         /* Upon success of bpf_prog_alloc_id(), the BPF prog is
2721          * effectively publicly exposed. However, retrieving via
2722          * bpf_prog_get_fd_by_id() will take another reference,
2723          * therefore it cannot be gone underneath us.
2724          *
2725          * Only for the time /after/ successful bpf_prog_new_fd()
2726          * and before returning to userspace, we might just hold
2727          * one reference and any parallel close on that fd could
2728          * rip everything out. Hence, below notifications must
2729          * happen before bpf_prog_new_fd().
2730          *
2731          * Also, any failure handling from this point onwards must
2732          * be using bpf_prog_put() given the program is exposed.
2733          */
2734         bpf_prog_kallsyms_add(prog);
2735         perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_LOAD, 0);
2736         bpf_audit_prog(prog, BPF_AUDIT_LOAD);
2737
2738         err = bpf_prog_new_fd(prog);
2739         if (err < 0)
2740                 bpf_prog_put(prog);
2741         return err;
2742
2743 free_used_maps:
2744         /* In case we have subprogs, we need to wait for a grace
2745          * period before we can tear down JIT memory since symbols
2746          * are already exposed under kallsyms.
2747          */
2748         __bpf_prog_put_noref(prog, prog->aux->func_cnt);
2749         return err;
2750 free_prog_sec:
2751         free_uid(prog->aux->user);
2752         security_bpf_prog_free(prog->aux);
2753 free_prog:
2754         if (prog->aux->attach_btf)
2755                 btf_put(prog->aux->attach_btf);
2756         bpf_prog_free(prog);
2757         return err;
2758 }
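
/*
 * A hedged end-to-end sketch of BPF_PROG_LOAD from userspace: the smallest
 * valid program ("r0 = 0; exit") with a GPL license string, touching only
 * the attr fields consumed above. Real loaders would also pass
 * log_buf/log_size to capture the verifier log.
 *
 *	struct bpf_insn insns[] = {
 *		{ .code = BPF_ALU64 | BPF_MOV | BPF_K,		// r0 = 0
 *		  .dst_reg = BPF_REG_0, .imm = 0 },
 *		{ .code = BPF_JMP | BPF_EXIT },			// return r0
 *	};
 *	union bpf_attr attr;
 *
 *	memset(&attr, 0, sizeof(attr));
 *	attr.prog_type = BPF_PROG_TYPE_SOCKET_FILTER;
 *	attr.insns = (__u64)(unsigned long)insns;
 *	attr.insn_cnt = 2;
 *	attr.license = (__u64)(unsigned long)"GPL";
 *
 *	prog_fd = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
 *	// on success: the fd from bpf_prog_new_fd(); the program is now
 *	// also discoverable via its ID and /proc fdinfo
 */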
2759
2760 #define BPF_OBJ_LAST_FIELD path_fd
2761
2762 static int bpf_obj_pin(const union bpf_attr *attr)
2763 {
2764         int path_fd;
2765
2766         if (CHECK_ATTR(BPF_OBJ) || attr->file_flags & ~BPF_F_PATH_FD)
2767                 return -EINVAL;
2768
2769         /* path_fd has to be accompanied by BPF_F_PATH_FD flag */
2770         if (!(attr->file_flags & BPF_F_PATH_FD) && attr->path_fd)
2771                 return -EINVAL;
2772
2773         path_fd = attr->file_flags & BPF_F_PATH_FD ? attr->path_fd : AT_FDCWD;
2774         return bpf_obj_pin_user(attr->bpf_fd, path_fd,
2775                                 u64_to_user_ptr(attr->pathname));
2776 }
2777
2778 static int bpf_obj_get(const union bpf_attr *attr)
2779 {
2780         int path_fd;
2781
2782         if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0 ||
2783             attr->file_flags & ~(BPF_OBJ_FLAG_MASK | BPF_F_PATH_FD))
2784                 return -EINVAL;
2785
2786         /* path_fd has to be accompanied by BPF_F_PATH_FD flag */
2787         if (!(attr->file_flags & BPF_F_PATH_FD) && attr->path_fd)
2788                 return -EINVAL;
2789
2790         path_fd = attr->file_flags & BPF_F_PATH_FD ? attr->path_fd : AT_FDCWD;
2791         return bpf_obj_get_user(path_fd, u64_to_user_ptr(attr->pathname),
2792                                 attr->file_flags);
2793 }
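
/*
 * Pinning sketch: BPF_OBJ_PIN binds a prog/map/link fd to a name under
 * bpffs so the object outlives the creating process, and BPF_OBJ_GET
 * reopens it later. The path below is an assumption (bpffs is
 * conventionally mounted at /sys/fs/bpf); BPF_F_PATH_FD + path_fd would
 * resolve a relative pathname against a dirfd instead of AT_FDCWD.
 *
 *	union bpf_attr attr;
 *
 *	memset(&attr, 0, sizeof(attr));
 *	attr.pathname = (__u64)(unsigned long)"/sys/fs/bpf/my_prog";
 *	attr.bpf_fd = prog_fd;
 *	err = syscall(__NR_bpf, BPF_OBJ_PIN, &attr, sizeof(attr));
 *
 *	// later, possibly from another process:
 *	memset(&attr, 0, sizeof(attr));
 *	attr.pathname = (__u64)(unsigned long)"/sys/fs/bpf/my_prog";
 *	prog_fd = syscall(__NR_bpf, BPF_OBJ_GET, &attr, sizeof(attr));
 */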
2794
2795 void bpf_link_init(struct bpf_link *link, enum bpf_link_type type,
2796                    const struct bpf_link_ops *ops, struct bpf_prog *prog)
2797 {
2798         atomic64_set(&link->refcnt, 1);
2799         link->type = type;
2800         link->id = 0;
2801         link->ops = ops;
2802         link->prog = prog;
2803 }
2804
2805 static void bpf_link_free_id(int id)
2806 {
2807         if (!id)
2808                 return;
2809
2810         spin_lock_bh(&link_idr_lock);
2811         idr_remove(&link_idr, id);
2812         spin_unlock_bh(&link_idr_lock);
2813 }
2814
2815 /* Clean up bpf_link and corresponding anon_inode file and FD. After
2816  * anon_inode is created, bpf_link can't be just kfree()'d due to deferred
2817  * anon_inode's release() call. This helper marks bpf_link as
2818  * defunct, releases anon_inode file and puts reserved FD. bpf_prog's refcnt
2819  * is not decremented; it is the responsibility of the calling code that failed
2820  * to complete bpf_link initialization.
2821  * This helper eventually calls link's dealloc callback, but does not call
2822  * link's release callback.
2823  */
2824 void bpf_link_cleanup(struct bpf_link_primer *primer)
2825 {
2826         primer->link->prog = NULL;
2827         bpf_link_free_id(primer->id);
2828         fput(primer->file);
2829         put_unused_fd(primer->fd);
2830 }
2831
2832 void bpf_link_inc(struct bpf_link *link)
2833 {
2834         atomic64_inc(&link->refcnt);
2835 }
2836
2837 /* bpf_link_free is guaranteed to be called from process context */
2838 static void bpf_link_free(struct bpf_link *link)
2839 {
2840         bpf_link_free_id(link->id);
2841         if (link->prog) {
2842                 /* detach BPF program, clean up used resources */
2843                 link->ops->release(link);
2844                 bpf_prog_put(link->prog);
2845         }
2846         /* free bpf_link and its containing memory */
2847         link->ops->dealloc(link);
2848 }
2849
2850 static void bpf_link_put_deferred(struct work_struct *work)
2851 {
2852         struct bpf_link *link = container_of(work, struct bpf_link, work);
2853
2854         bpf_link_free(link);
2855 }
2856
2857 /* bpf_link_put() might be called from atomic context; freeing the link may
2858  * need to acquire sleeping locks, so defer the actual freeing to process context.
2859  */
2860 void bpf_link_put(struct bpf_link *link)
2861 {
2862         if (!atomic64_dec_and_test(&link->refcnt))
2863                 return;
2864
2865         INIT_WORK(&link->work, bpf_link_put_deferred);
2866         schedule_work(&link->work);
2867 }
2868 EXPORT_SYMBOL(bpf_link_put);
2869
2870 static void bpf_link_put_direct(struct bpf_link *link)
2871 {
2872         if (!atomic64_dec_and_test(&link->refcnt))
2873                 return;
2874         bpf_link_free(link);
2875 }
2876
2877 static int bpf_link_release(struct inode *inode, struct file *filp)
2878 {
2879         struct bpf_link *link = filp->private_data;
2880
2881         bpf_link_put_direct(link);
2882         return 0;
2883 }
2884
2885 #ifdef CONFIG_PROC_FS
2886 #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type)
2887 #define BPF_MAP_TYPE(_id, _ops)
2888 #define BPF_LINK_TYPE(_id, _name) [_id] = #_name,
2889 static const char *bpf_link_type_strs[] = {
2890         [BPF_LINK_TYPE_UNSPEC] = "<invalid>",
2891 #include <linux/bpf_types.h>
2892 };
2893 #undef BPF_PROG_TYPE
2894 #undef BPF_MAP_TYPE
2895 #undef BPF_LINK_TYPE
2896
2897 static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp)
2898 {
2899         const struct bpf_link *link = filp->private_data;
2900         const struct bpf_prog *prog = link->prog;
2901         char prog_tag[sizeof(prog->tag) * 2 + 1] = { };
2902
2903         seq_printf(m,
2904                    "link_type:\t%s\n"
2905                    "link_id:\t%u\n",
2906                    bpf_link_type_strs[link->type],
2907                    link->id);
2908         if (prog) {
2909                 bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
2910                 seq_printf(m,
2911                            "prog_tag:\t%s\n"
2912                            "prog_id:\t%u\n",
2913                            prog_tag,
2914                            prog->aux->id);
2915         }
2916         if (link->ops->show_fdinfo)
2917                 link->ops->show_fdinfo(link, m);
2918 }
2919 #endif
2920
2921 static const struct file_operations bpf_link_fops = {
2922 #ifdef CONFIG_PROC_FS
2923         .show_fdinfo    = bpf_link_show_fdinfo,
2924 #endif
2925         .release        = bpf_link_release,
2926         .read           = bpf_dummy_read,
2927         .write          = bpf_dummy_write,
2928 };
2929
2930 static int bpf_link_alloc_id(struct bpf_link *link)
2931 {
2932         int id;
2933
2934         idr_preload(GFP_KERNEL);
2935         spin_lock_bh(&link_idr_lock);
2936         id = idr_alloc_cyclic(&link_idr, link, 1, INT_MAX, GFP_ATOMIC);
2937         spin_unlock_bh(&link_idr_lock);
2938         idr_preload_end();
2939
2940         return id;
2941 }
2942
2943 /* Prepare bpf_link to be exposed to user-space by allocating anon_inode file,
2944  * reserving unused FD and allocating ID from link_idr. This is to be paired
2945  * with bpf_link_settle() to install FD and ID and expose bpf_link to
2946  * user-space, if bpf_link is successfully attached. If not, bpf_link and
2947  * pre-allocated resources are to be freed with a bpf_link_cleanup() call. All
2948  * the transient state is passed around in struct bpf_link_primer.
2949  * This is the preferred way to create and initialize bpf_link, especially when
2950  * there are complicated and expensive operations in between creating bpf_link
2951  * itself and attaching it to a BPF hook. By using bpf_link_prime() and
2952  * bpf_link_settle(), kernel code using bpf_link doesn't have to perform
2953  * expensive (and potentially failing) roll-back operations in the rare case
2954  * that the file, FD, or ID can't be allocated.
2955  */
2956 int bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer)
2957 {
2958         struct file *file;
2959         int fd, id;
2960
2961         fd = get_unused_fd_flags(O_CLOEXEC);
2962         if (fd < 0)
2963                 return fd;
2964
2965
2966         id = bpf_link_alloc_id(link);
2967         if (id < 0) {
2968                 put_unused_fd(fd);
2969                 return id;
2970         }
2971
2972         file = anon_inode_getfile("bpf_link", &bpf_link_fops, link, O_CLOEXEC);
2973         if (IS_ERR(file)) {
2974                 bpf_link_free_id(id);
2975                 put_unused_fd(fd);
2976                 return PTR_ERR(file);
2977         }
2978
2979         primer->link = link;
2980         primer->file = file;
2981         primer->fd = fd;
2982         primer->id = id;
2983         return 0;
2984 }
2985
2986 int bpf_link_settle(struct bpf_link_primer *primer)
2987 {
2988         /* make bpf_link fetchable by ID */
2989         spin_lock_bh(&link_idr_lock);
2990         primer->link->id = primer->id;
2991         spin_unlock_bh(&link_idr_lock);
2992         /* make bpf_link fetchable by FD */
2993         fd_install(primer->fd, primer->file);
2994         /* pass through installed FD */
2995         return primer->fd;
2996 }
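
/*
 * The canonical prime/settle pattern described above, as used by the
 * attach paths later in this file (e.g. bpf_tracing_prog_attach()). The
 * link type, ops and attach step are placeholders:
 *
 *	struct bpf_link_primer primer;
 *	struct my_link *link;
 *	int err;
 *
 *	link = kzalloc(sizeof(*link), GFP_USER);
 *	if (!link)
 *		return -ENOMEM;
 *	bpf_link_init(&link->link, BPF_LINK_TYPE_UNSPEC, &my_link_lops, prog);
 *
 *	err = bpf_link_prime(&link->link, &primer);
 *	if (err) {
 *		kfree(link);	// not exposed yet, plain kfree is fine
 *		return err;
 *	}
 *	err = attach_to_hook(link);	// expensive step that may fail
 *	if (err) {
 *		bpf_link_cleanup(&primer);	// calls dealloc, not release
 *		return err;
 *	}
 *	return bpf_link_settle(&primer);	// publishes FD and ID
 */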
2997
2998 int bpf_link_new_fd(struct bpf_link *link)
2999 {
3000         return anon_inode_getfd("bpf-link", &bpf_link_fops, link, O_CLOEXEC);
3001 }
3002
3003 struct bpf_link *bpf_link_get_from_fd(u32 ufd)
3004 {
3005         struct fd f = fdget(ufd);
3006         struct bpf_link *link;
3007
3008         if (!f.file)
3009                 return ERR_PTR(-EBADF);
3010         if (f.file->f_op != &bpf_link_fops) {
3011                 fdput(f);
3012                 return ERR_PTR(-EINVAL);
3013         }
3014
3015         link = f.file->private_data;
3016         bpf_link_inc(link);
3017         fdput(f);
3018
3019         return link;
3020 }
3021 EXPORT_SYMBOL(bpf_link_get_from_fd);
3022
3023 static void bpf_tracing_link_release(struct bpf_link *link)
3024 {
3025         struct bpf_tracing_link *tr_link =
3026                 container_of(link, struct bpf_tracing_link, link.link);
3027
3028         WARN_ON_ONCE(bpf_trampoline_unlink_prog(&tr_link->link,
3029                                                 tr_link->trampoline));
3030
3031         bpf_trampoline_put(tr_link->trampoline);
3032
3033         /* tgt_prog is NULL if target is a kernel function */
3034         if (tr_link->tgt_prog)
3035                 bpf_prog_put(tr_link->tgt_prog);
3036 }
3037
3038 static void bpf_tracing_link_dealloc(struct bpf_link *link)
3039 {
3040         struct bpf_tracing_link *tr_link =
3041                 container_of(link, struct bpf_tracing_link, link.link);
3042
3043         kfree(tr_link);
3044 }
3045
3046 static void bpf_tracing_link_show_fdinfo(const struct bpf_link *link,
3047                                          struct seq_file *seq)
3048 {
3049         struct bpf_tracing_link *tr_link =
3050                 container_of(link, struct bpf_tracing_link, link.link);
3051         u32 target_btf_id, target_obj_id;
3052
3053         bpf_trampoline_unpack_key(tr_link->trampoline->key,
3054                                   &target_obj_id, &target_btf_id);
3055         seq_printf(seq,
3056                    "attach_type:\t%d\n"
3057                    "target_obj_id:\t%u\n"
3058                    "target_btf_id:\t%u\n",
3059                    tr_link->attach_type,
3060                    target_obj_id,
3061                    target_btf_id);
3062 }
3063
3064 static int bpf_tracing_link_fill_link_info(const struct bpf_link *link,
3065                                            struct bpf_link_info *info)
3066 {
3067         struct bpf_tracing_link *tr_link =
3068                 container_of(link, struct bpf_tracing_link, link.link);
3069
3070         info->tracing.attach_type = tr_link->attach_type;
3071         bpf_trampoline_unpack_key(tr_link->trampoline->key,
3072                                   &info->tracing.target_obj_id,
3073                                   &info->tracing.target_btf_id);
3074
3075         return 0;
3076 }
3077
3078 static const struct bpf_link_ops bpf_tracing_link_lops = {
3079         .release = bpf_tracing_link_release,
3080         .dealloc = bpf_tracing_link_dealloc,
3081         .show_fdinfo = bpf_tracing_link_show_fdinfo,
3082         .fill_link_info = bpf_tracing_link_fill_link_info,
3083 };
3084
3085 static int bpf_tracing_prog_attach(struct bpf_prog *prog,
3086                                    int tgt_prog_fd,
3087                                    u32 btf_id,
3088                                    u64 bpf_cookie)
3089 {
3090         struct bpf_link_primer link_primer;
3091         struct bpf_prog *tgt_prog = NULL;
3092         struct bpf_trampoline *tr = NULL;
3093         struct bpf_tracing_link *link;
3094         u64 key = 0;
3095         int err;
3096
3097         switch (prog->type) {
3098         case BPF_PROG_TYPE_TRACING:
3099                 if (prog->expected_attach_type != BPF_TRACE_FENTRY &&
3100                     prog->expected_attach_type != BPF_TRACE_FEXIT &&
3101                     prog->expected_attach_type != BPF_MODIFY_RETURN) {
3102                         err = -EINVAL;
3103                         goto out_put_prog;
3104                 }
3105                 break;
3106         case BPF_PROG_TYPE_EXT:
3107                 if (prog->expected_attach_type != 0) {
3108                         err = -EINVAL;
3109                         goto out_put_prog;
3110                 }
3111                 break;
3112         case BPF_PROG_TYPE_LSM:
3113                 if (prog->expected_attach_type != BPF_LSM_MAC) {
3114                         err = -EINVAL;
3115                         goto out_put_prog;
3116                 }
3117                 break;
3118         default:
3119                 err = -EINVAL;
3120                 goto out_put_prog;
3121         }
3122
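	/* An explicit target must be fully specified or not at all:
	 * tgt_prog_fd without btf_id (or vice versa) is rejected, since a
	 * target program is meaningless without the BTF ID of the function
	 * to attach to inside it.
	 */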
3123         if (!!tgt_prog_fd != !!btf_id) {
3124                 err = -EINVAL;
3125                 goto out_put_prog;
3126         }
3127
3128         if (tgt_prog_fd) {
3129                 /* For now we only allow new targets for BPF_PROG_TYPE_EXT */
3130                 if (prog->type != BPF_PROG_TYPE_EXT) {
3131                         err = -EINVAL;
3132                         goto out_put_prog;
3133                 }
3134
3135                 tgt_prog = bpf_prog_get(tgt_prog_fd);
3136                 if (IS_ERR(tgt_prog)) {
3137                         err = PTR_ERR(tgt_prog);
3138                         tgt_prog = NULL;
3139                         goto out_put_prog;
3140                 }
3141
3142                 key = bpf_trampoline_compute_key(tgt_prog, NULL, btf_id);
3143         }
3144
3145         link = kzalloc(sizeof(*link), GFP_USER);
3146         if (!link) {
3147                 err = -ENOMEM;
3148                 goto out_put_prog;
3149         }
3150         bpf_link_init(&link->link.link, BPF_LINK_TYPE_TRACING,
3151                       &bpf_tracing_link_lops, prog);
3152         link->attach_type = prog->expected_attach_type;
3153         link->link.cookie = bpf_cookie;
3154
3155         mutex_lock(&prog->aux->dst_mutex);
3156
3157         /* There are a few possible cases here:
3158          *
3159          * - if prog->aux->dst_trampoline is set, the program was just loaded
3160          *   and not yet attached to anything, so we can use the values stored
3161          *   in prog->aux
3162          *
3163          * - if prog->aux->dst_trampoline is NULL, the program has already been
3164          *   attached to a target and its initial target was cleared (below)
3165          *
3166          * - if tgt_prog != NULL, the caller specified tgt_prog_fd +
3167          *   target_btf_id using the link_create API.
3168          *
3169          * - if tgt_prog == NULL, this function was called using the old
3170          *   raw_tracepoint_open API, and we need a target from prog->aux
3171          *
3172          * - if both prog->aux->dst_trampoline and tgt_prog are NULL, the
3173          *   program was detached and is going for re-attachment.
3174          */
3175         if (!prog->aux->dst_trampoline && !tgt_prog) {
3176                 /*
3177                  * Allow re-attach for TRACING and LSM programs. If it's
3178                  * currently linked, bpf_trampoline_link_prog will fail.
3179                  * EXT programs need to specify tgt_prog_fd, so they
3180                  * re-attach in a separate code path.
3181                  */
3182                 if (prog->type != BPF_PROG_TYPE_TRACING &&
3183                     prog->type != BPF_PROG_TYPE_LSM) {
3184                         err = -EINVAL;
3185                         goto out_unlock;
3186                 }
3187                 btf_id = prog->aux->attach_btf_id;
3188                 key = bpf_trampoline_compute_key(NULL, prog->aux->attach_btf, btf_id);
3189         }
3190
3191         if (!prog->aux->dst_trampoline ||
3192             (key && key != prog->aux->dst_trampoline->key)) {
3193                 /* If there is no saved target, or the specified target is
3194                  * different from the destination specified at load time, we
3195                  * need a new trampoline and a check for compatibility
3196                  */
3197                 struct bpf_attach_target_info tgt_info = {};
3198
3199                 err = bpf_check_attach_target(NULL, prog, tgt_prog, btf_id,
3200                                               &tgt_info);
3201                 if (err)
3202                         goto out_unlock;
3203
3204                 if (tgt_info.tgt_mod) {
3205                         module_put(prog->aux->mod);
3206                         prog->aux->mod = tgt_info.tgt_mod;
3207                 }
3208
3209                 tr = bpf_trampoline_get(key, &tgt_info);
3210                 if (!tr) {
3211                         err = -ENOMEM;
3212                         goto out_unlock;
3213                 }
3214         } else {
3215                 /* The caller didn't specify a target, or the target was the
3216                  * same as the destination supplied during program load. This
3217                  * means we can reuse the trampoline and reference from program
3218                  * load time, and there is no need to allocate a new one. This
3219                  * can only happen once for any program, as the saved values in
3220                  * prog->aux are cleared below.
3221                  */
3222                 tr = prog->aux->dst_trampoline;
3223                 tgt_prog = prog->aux->dst_prog;
3224         }
3225
3226         err = bpf_link_prime(&link->link.link, &link_primer);
3227         if (err)
3228                 goto out_unlock;
3229
3230         err = bpf_trampoline_link_prog(&link->link, tr);
3231         if (err) {
3232                 bpf_link_cleanup(&link_primer);
3233                 link = NULL;
3234                 goto out_unlock;
3235         }
3236
3237         link->tgt_prog = tgt_prog;
3238         link->trampoline = tr;
3239
3240         /* Always clear the trampoline and target prog from prog->aux to make
3241          * sure the original attach destination is not kept alive after a
3242          * program is (re-)attached to another target.
3243          */
3244         if (prog->aux->dst_prog &&
3245             (tgt_prog_fd || tr != prog->aux->dst_trampoline))
3246                 /* got extra prog ref from syscall, or attaching to different prog */
3247                 bpf_prog_put(prog->aux->dst_prog);
3248         if (prog->aux->dst_trampoline && tr != prog->aux->dst_trampoline)
3249                 /* we allocated a new trampoline, so free the old one */
3250                 bpf_trampoline_put(prog->aux->dst_trampoline);
3251
3252         prog->aux->dst_prog = NULL;
3253         prog->aux->dst_trampoline = NULL;
3254         mutex_unlock(&prog->aux->dst_mutex);
3255
3256         return bpf_link_settle(&link_primer);
3257 out_unlock:
3258         if (tr && tr != prog->aux->dst_trampoline)
3259                 bpf_trampoline_put(tr);
3260         mutex_unlock(&prog->aux->dst_mutex);
3261         kfree(link);
3262 out_put_prog:
3263         if (tgt_prog_fd && tgt_prog)
3264                 bpf_prog_put(tgt_prog);
3265         return err;
3266 }
3267
3268 struct bpf_raw_tp_link {
3269         struct bpf_link link;
3270         struct bpf_raw_event_map *btp;
3271 };
3272
3273 static void bpf_raw_tp_link_release(struct bpf_link *link)
3274 {
3275         struct bpf_raw_tp_link *raw_tp =
3276                 container_of(link, struct bpf_raw_tp_link, link);
3277
3278         bpf_probe_unregister(raw_tp->btp, raw_tp->link.prog);
3279         bpf_put_raw_tracepoint(raw_tp->btp);
3280 }
3281
3282 static void bpf_raw_tp_link_dealloc(struct bpf_link *link)
3283 {
3284         struct bpf_raw_tp_link *raw_tp =
3285                 container_of(link, struct bpf_raw_tp_link, link);
3286
3287         kfree(raw_tp);
3288 }
3289
3290 static void bpf_raw_tp_link_show_fdinfo(const struct bpf_link *link,
3291                                         struct seq_file *seq)
3292 {
3293         struct bpf_raw_tp_link *raw_tp_link =
3294                 container_of(link, struct bpf_raw_tp_link, link);
3295
3296         seq_printf(seq,
3297                    "tp_name:\t%s\n",
3298                    raw_tp_link->btp->tp->name);
3299 }
3300
3301 static int bpf_copy_to_user(char __user *ubuf, const char *buf, u32 ulen,
3302                             u32 len)
3303 {
3304         if (ulen >= len + 1) {
3305                 if (copy_to_user(ubuf, buf, len + 1))
3306                         return -EFAULT;
3307         } else {
3308                 char zero = '\0';
3309
3310                 if (copy_to_user(ubuf, buf, ulen - 1))
3311                         return -EFAULT;
3312                 if (put_user(zero, ubuf + ulen - 1))
3313                         return -EFAULT;
3314                 return -ENOSPC;
3315         }
3316
3317         return 0;
3318 }
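
/* Behaviour sketch of the helper above for buf = "foo" (len == 3), assuming
 * the caller guarantees ulen > 0 whenever ubuf != NULL (both callers below
 * enforce this via their (!ulen ^ !ubuf) checks):
 *
 *	ulen >= 4:  "foo\0" copied out, returns 0
 *	ulen == 2:  "f\0" copied out (truncated, NUL-terminated), returns -ENOSPC
 *	ulen == 1:  "\0" copied out, returns -ENOSPC
 */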
3319
3320 static int bpf_raw_tp_link_fill_link_info(const struct bpf_link *link,
3321                                           struct bpf_link_info *info)
3322 {
3323         struct bpf_raw_tp_link *raw_tp_link =
3324                 container_of(link, struct bpf_raw_tp_link, link);
3325         char __user *ubuf = u64_to_user_ptr(info->raw_tracepoint.tp_name);
3326         const char *tp_name = raw_tp_link->btp->tp->name;
3327         u32 ulen = info->raw_tracepoint.tp_name_len;
3328         size_t tp_len = strlen(tp_name);
3329
3330         if (!ulen ^ !ubuf)
3331                 return -EINVAL;
3332
3333         info->raw_tracepoint.tp_name_len = tp_len + 1;
3334
3335         if (!ubuf)
3336                 return 0;
3337
3338         return bpf_copy_to_user(ubuf, tp_name, ulen, tp_len);
3339 }
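
/* Userspace-side sketch (hedged; uses libbpf's bpf_obj_get_info_by_fd()
 * wrapper) of the two-call pattern the function above supports: a sizing
 * pass with tp_name == NULL / tp_name_len == 0, then a second call with an
 * adequately sized buffer:
 *
 *	struct bpf_link_info info = {};
 *	__u32 len = sizeof(info);
 *	char name[256];
 *
 *	bpf_obj_get_info_by_fd(link_fd, &info, &len);	// tp_name_len reported
 *	info.raw_tracepoint.tp_name = (__u64)(uintptr_t)name;
 *	info.raw_tracepoint.tp_name_len = sizeof(name);
 *	bpf_obj_get_info_by_fd(link_fd, &info, &len);	// name copied out
 */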
3340
3341 static const struct bpf_link_ops bpf_raw_tp_link_lops = {
3342         .release = bpf_raw_tp_link_release,
3343         .dealloc = bpf_raw_tp_link_dealloc,
3344         .show_fdinfo = bpf_raw_tp_link_show_fdinfo,
3345         .fill_link_info = bpf_raw_tp_link_fill_link_info,
3346 };
3347
3348 #ifdef CONFIG_PERF_EVENTS
3349 struct bpf_perf_link {
3350         struct bpf_link link;
3351         struct file *perf_file;
3352 };
3353
3354 static void bpf_perf_link_release(struct bpf_link *link)
3355 {
3356         struct bpf_perf_link *perf_link = container_of(link, struct bpf_perf_link, link);
3357         struct perf_event *event = perf_link->perf_file->private_data;
3358
3359         perf_event_free_bpf_prog(event);
3360         fput(perf_link->perf_file);
3361 }
3362
3363 static void bpf_perf_link_dealloc(struct bpf_link *link)
3364 {
3365         struct bpf_perf_link *perf_link = container_of(link, struct bpf_perf_link, link);
3366
3367         kfree(perf_link);
3368 }
3369
3370 static int bpf_perf_link_fill_common(const struct perf_event *event,
3371                                      char __user *uname, u32 ulen,
3372                                      u64 *probe_offset, u64 *probe_addr,
3373                                      u32 *fd_type)
3374 {
3375         const char *buf;
3376         u32 prog_id;
3377         size_t len;
3378         int err;
3379
3380         if (!ulen ^ !uname)
3381                 return -EINVAL;
3382
3383         err = bpf_get_perf_event_info(event, &prog_id, fd_type, &buf,
3384                                       probe_offset, probe_addr);
3385         if (err)
3386                 return err;
3387         if (!uname)
3388                 return 0;
3389         if (buf) {
3390                 len = strlen(buf);
3391                 err = bpf_copy_to_user(uname, buf, ulen, len);
3392                 if (err)
3393                         return err;
3394         } else {
3395                 char zero = '\0';
3396
3397                 if (put_user(zero, uname))
3398                         return -EFAULT;
3399         }
3400         return 0;
3401 }
3402
3403 #ifdef CONFIG_KPROBE_EVENTS
3404 static int bpf_perf_link_fill_kprobe(const struct perf_event *event,
3405                                      struct bpf_link_info *info)
3406 {
3407         char __user *uname;
3408         u64 addr, offset;
3409         u32 ulen, type;
3410         int err;
3411
3412         uname = u64_to_user_ptr(info->perf_event.kprobe.func_name);
3413         ulen = info->perf_event.kprobe.name_len;
3414         err = bpf_perf_link_fill_common(event, uname, ulen, &offset, &addr,
3415                                         &type);
3416         if (err)
3417                 return err;
3418         if (type == BPF_FD_TYPE_KRETPROBE)
3419                 info->perf_event.type = BPF_PERF_EVENT_KRETPROBE;
3420         else
3421                 info->perf_event.type = BPF_PERF_EVENT_KPROBE;
3422
3423         info->perf_event.kprobe.offset = offset;
3424         if (!kallsyms_show_value(current_cred()))
3425                 addr = 0;
3426         info->perf_event.kprobe.addr = addr;
3427         return 0;
3428 }
3429 #endif
3430
3431 #ifdef CONFIG_UPROBE_EVENTS
3432 static int bpf_perf_link_fill_uprobe(const struct perf_event *event,
3433                                      struct bpf_link_info *info)
3434 {
3435         char __user *uname;
3436         u64 addr, offset;
3437         u32 ulen, type;
3438         int err;
3439
3440         uname = u64_to_user_ptr(info->perf_event.uprobe.file_name);
3441         ulen = info->perf_event.uprobe.name_len;
3442         err = bpf_perf_link_fill_common(event, uname, ulen, &offset, &addr,
3443                                         &type);
3444         if (err)
3445                 return err;
3446
3447         if (type == BPF_FD_TYPE_URETPROBE)
3448                 info->perf_event.type = BPF_PERF_EVENT_URETPROBE;
3449         else
3450                 info->perf_event.type = BPF_PERF_EVENT_UPROBE;
3451         info->perf_event.uprobe.offset = offset;
3452         return 0;
3453 }
3454 #endif
3455
3456 static int bpf_perf_link_fill_probe(const struct perf_event *event,
3457                                     struct bpf_link_info *info)
3458 {
3459 #ifdef CONFIG_KPROBE_EVENTS
3460         if (event->tp_event->flags & TRACE_EVENT_FL_KPROBE)
3461                 return bpf_perf_link_fill_kprobe(event, info);
3462 #endif
3463 #ifdef CONFIG_UPROBE_EVENTS
3464         if (event->tp_event->flags & TRACE_EVENT_FL_UPROBE)
3465                 return bpf_perf_link_fill_uprobe(event, info);
3466 #endif
3467         return -EOPNOTSUPP;
3468 }
3469
3470 static int bpf_perf_link_fill_tracepoint(const struct perf_event *event,
3471                                          struct bpf_link_info *info)
3472 {
3473         char __user *uname;
3474         u32 ulen;
3475
3476         uname = u64_to_user_ptr(info->perf_event.tracepoint.tp_name);
3477         ulen = info->perf_event.tracepoint.name_len;
3478         info->perf_event.type = BPF_PERF_EVENT_TRACEPOINT;
3479         return bpf_perf_link_fill_common(event, uname, ulen, NULL, NULL, NULL);
3480 }
3481
3482 static int bpf_perf_link_fill_perf_event(const struct perf_event *event,
3483                                          struct bpf_link_info *info)
3484 {
3485         info->perf_event.event.type = event->attr.type;
3486         info->perf_event.event.config = event->attr.config;
3487         info->perf_event.type = BPF_PERF_EVENT_EVENT;
3488         return 0;
3489 }
3490
3491 static int bpf_perf_link_fill_link_info(const struct bpf_link *link,
3492                                         struct bpf_link_info *info)
3493 {
3494         struct bpf_perf_link *perf_link;
3495         const struct perf_event *event;
3496
3497         perf_link = container_of(link, struct bpf_perf_link, link);
3498         event = perf_get_event(perf_link->perf_file);
3499         if (IS_ERR(event))
3500                 return PTR_ERR(event);
3501
3502         switch (event->prog->type) {
3503         case BPF_PROG_TYPE_PERF_EVENT:
3504                 return bpf_perf_link_fill_perf_event(event, info);
3505         case BPF_PROG_TYPE_TRACEPOINT:
3506                 return bpf_perf_link_fill_tracepoint(event, info);
3507         case BPF_PROG_TYPE_KPROBE:
3508                 return bpf_perf_link_fill_probe(event, info);
3509         default:
3510                 return -EOPNOTSUPP;
3511         }
3512 }
3513
3514 static const struct bpf_link_ops bpf_perf_link_lops = {
3515         .release = bpf_perf_link_release,
3516         .dealloc = bpf_perf_link_dealloc,
3517         .fill_link_info = bpf_perf_link_fill_link_info,
3518 };
3519
3520 static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
3521 {
3522         struct bpf_link_primer link_primer;
3523         struct bpf_perf_link *link;
3524         struct perf_event *event;
3525         struct file *perf_file;
3526         int err;
3527
3528         if (attr->link_create.flags)
3529                 return -EINVAL;
3530
3531         perf_file = perf_event_get(attr->link_create.target_fd);
3532         if (IS_ERR(perf_file))
3533                 return PTR_ERR(perf_file);
3534
3535         link = kzalloc(sizeof(*link), GFP_USER);
3536         if (!link) {
3537                 err = -ENOMEM;
3538                 goto out_put_file;
3539         }
3540         bpf_link_init(&link->link, BPF_LINK_TYPE_PERF_EVENT, &bpf_perf_link_lops, prog);
3541         link->perf_file = perf_file;
3542
3543         err = bpf_link_prime(&link->link, &link_primer);
3544         if (err) {
3545                 kfree(link);
3546                 goto out_put_file;
3547         }
3548
3549         event = perf_file->private_data;
3550         err = perf_event_set_bpf_prog(event, prog, attr->link_create.perf_event.bpf_cookie);
3551         if (err) {
3552                 bpf_link_cleanup(&link_primer);
3553                 goto out_put_file;
3554         }
3555         /* perf_event_set_bpf_prog() doesn't take its own refcnt on prog */
3556         bpf_prog_inc(prog);
3557
3558         return bpf_link_settle(&link_primer);
3559
3560 out_put_file:
3561         fput(perf_file);
3562         return err;
3563 }
3564 #else
3565 static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
3566 {
3567         return -EOPNOTSUPP;
3568 }
3569 #endif /* CONFIG_PERF_EVENTS */
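
/* Userspace-side sketch (hedged; raw syscall form, error handling elided) of
 * reaching bpf_perf_link_attach(): open a perf event, then issue
 * BPF_LINK_CREATE against it with attach_type BPF_PERF_EVENT:
 *
 *	struct perf_event_attr pattr = { .type = PERF_TYPE_SOFTWARE, ... };
 *	int perf_fd = syscall(__NR_perf_event_open, &pattr, pid, cpu, -1, 0);
 *	union bpf_attr attr = {};
 *
 *	attr.link_create.prog_fd = prog_fd;
 *	attr.link_create.target_fd = perf_fd;
 *	attr.link_create.attach_type = BPF_PERF_EVENT;
 *	attr.link_create.perf_event.bpf_cookie = cookie;
 *	link_fd = syscall(__NR_bpf, BPF_LINK_CREATE, &attr, sizeof(attr));
 */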
3570
3571 static int bpf_raw_tp_link_attach(struct bpf_prog *prog,
3572                                   const char __user *user_tp_name)
3573 {
3574         struct bpf_link_primer link_primer;
3575         struct bpf_raw_tp_link *link;
3576         struct bpf_raw_event_map *btp;
3577         const char *tp_name;
3578         char buf[128];
3579         int err;
3580
3581         switch (prog->type) {
3582         case BPF_PROG_TYPE_TRACING:
3583         case BPF_PROG_TYPE_EXT:
3584         case BPF_PROG_TYPE_LSM:
3585                 if (user_tp_name)
3586                         /* The attach point for this category of programs
3587                          * should be specified via btf_id during program load.
3588                          */
3589                         return -EINVAL;
3590                 if (prog->type == BPF_PROG_TYPE_TRACING &&
3591                     prog->expected_attach_type == BPF_TRACE_RAW_TP) {
3592                         tp_name = prog->aux->attach_func_name;
3593                         break;
3594                 }
3595                 return bpf_tracing_prog_attach(prog, 0, 0, 0);
3596         case BPF_PROG_TYPE_RAW_TRACEPOINT:
3597         case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE:
3598                 if (strncpy_from_user(buf, user_tp_name, sizeof(buf) - 1) < 0)
3599                         return -EFAULT;
3600                 buf[sizeof(buf) - 1] = 0;
3601                 tp_name = buf;
3602                 break;
3603         default:
3604                 return -EINVAL;
3605         }
3606
3607         btp = bpf_get_raw_tracepoint(tp_name);
3608         if (!btp)
3609                 return -ENOENT;
3610
3611         link = kzalloc(sizeof(*link), GFP_USER);
3612         if (!link) {
3613                 err = -ENOMEM;
3614                 goto out_put_btp;
3615         }
3616         bpf_link_init(&link->link, BPF_LINK_TYPE_RAW_TRACEPOINT,
3617                       &bpf_raw_tp_link_lops, prog);
3618         link->btp = btp;
3619
3620         err = bpf_link_prime(&link->link, &link_primer);
3621         if (err) {
3622                 kfree(link);
3623                 goto out_put_btp;
3624         }
3625
3626         err = bpf_probe_register(link->btp, prog);
3627         if (err) {
3628                 bpf_link_cleanup(&link_primer);
3629                 goto out_put_btp;
3630         }
3631
3632         return bpf_link_settle(&link_primer);
3633
3634 out_put_btp:
3635         bpf_put_raw_tracepoint(btp);
3636         return err;
3637 }
3638
3639 #define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd
3640
3641 static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
3642 {
3643         struct bpf_prog *prog;
3644         int fd;
3645
3646         if (CHECK_ATTR(BPF_RAW_TRACEPOINT_OPEN))
3647                 return -EINVAL;
3648
3649         prog = bpf_prog_get(attr->raw_tracepoint.prog_fd);
3650         if (IS_ERR(prog))
3651                 return PTR_ERR(prog);
3652
3653         fd = bpf_raw_tp_link_attach(prog, u64_to_user_ptr(attr->raw_tracepoint.name));
3654         if (fd < 0)
3655                 bpf_prog_put(prog);
3656         return fd;
3657 }
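
/* Userspace-side sketch (hedged) of the legacy entry point above. For
 * BPF_PROG_TYPE_RAW_TRACEPOINT the tracepoint is named explicitly, while
 * TRACING/EXT/LSM programs must leave .name zero, since their attach point
 * was fixed via btf_id at load time:
 *
 *	union bpf_attr attr = {};
 *
 *	attr.raw_tracepoint.name = (__u64)(uintptr_t)"sched_switch";
 *	attr.raw_tracepoint.prog_fd = prog_fd;
 *	link_fd = syscall(__NR_bpf, BPF_RAW_TRACEPOINT_OPEN, &attr,
 *			  sizeof(attr));
 */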
3658
3659 static enum bpf_prog_type
3660 attach_type_to_prog_type(enum bpf_attach_type attach_type)
3661 {
3662         switch (attach_type) {
3663         case BPF_CGROUP_INET_INGRESS:
3664         case BPF_CGROUP_INET_EGRESS:
3665                 return BPF_PROG_TYPE_CGROUP_SKB;
3666         case BPF_CGROUP_INET_SOCK_CREATE:
3667         case BPF_CGROUP_INET_SOCK_RELEASE:
3668         case BPF_CGROUP_INET4_POST_BIND:
3669         case BPF_CGROUP_INET6_POST_BIND:
3670                 return BPF_PROG_TYPE_CGROUP_SOCK;
3671         case BPF_CGROUP_INET4_BIND:
3672         case BPF_CGROUP_INET6_BIND:
3673         case BPF_CGROUP_INET4_CONNECT:
3674         case BPF_CGROUP_INET6_CONNECT:
3675         case BPF_CGROUP_INET4_GETPEERNAME:
3676         case BPF_CGROUP_INET6_GETPEERNAME:
3677         case BPF_CGROUP_INET4_GETSOCKNAME:
3678         case BPF_CGROUP_INET6_GETSOCKNAME:
3679         case BPF_CGROUP_UDP4_SENDMSG:
3680         case BPF_CGROUP_UDP6_SENDMSG:
3681         case BPF_CGROUP_UDP4_RECVMSG:
3682         case BPF_CGROUP_UDP6_RECVMSG:
3683                 return BPF_PROG_TYPE_CGROUP_SOCK_ADDR;
3684         case BPF_CGROUP_SOCK_OPS:
3685                 return BPF_PROG_TYPE_SOCK_OPS;
3686         case BPF_CGROUP_DEVICE:
3687                 return BPF_PROG_TYPE_CGROUP_DEVICE;
3688         case BPF_SK_MSG_VERDICT:
3689                 return BPF_PROG_TYPE_SK_MSG;
3690         case BPF_SK_SKB_STREAM_PARSER:
3691         case BPF_SK_SKB_STREAM_VERDICT:
3692         case BPF_SK_SKB_VERDICT:
3693                 return BPF_PROG_TYPE_SK_SKB;
3694         case BPF_LIRC_MODE2:
3695                 return BPF_PROG_TYPE_LIRC_MODE2;
3696         case BPF_FLOW_DISSECTOR:
3697                 return BPF_PROG_TYPE_FLOW_DISSECTOR;
3698         case BPF_CGROUP_SYSCTL:
3699                 return BPF_PROG_TYPE_CGROUP_SYSCTL;
3700         case BPF_CGROUP_GETSOCKOPT:
3701         case BPF_CGROUP_SETSOCKOPT:
3702                 return BPF_PROG_TYPE_CGROUP_SOCKOPT;
3703         case BPF_TRACE_ITER:
3704         case BPF_TRACE_RAW_TP:
3705         case BPF_TRACE_FENTRY:
3706         case BPF_TRACE_FEXIT:
3707         case BPF_MODIFY_RETURN:
3708                 return BPF_PROG_TYPE_TRACING;
3709         case BPF_LSM_MAC:
3710                 return BPF_PROG_TYPE_LSM;
3711         case BPF_SK_LOOKUP:
3712                 return BPF_PROG_TYPE_SK_LOOKUP;
3713         case BPF_XDP:
3714                 return BPF_PROG_TYPE_XDP;
3715         case BPF_LSM_CGROUP:
3716                 return BPF_PROG_TYPE_LSM;
3717         case BPF_TCX_INGRESS:
3718         case BPF_TCX_EGRESS:
3719                 return BPF_PROG_TYPE_SCHED_CLS;
3720         default:
3721                 return BPF_PROG_TYPE_UNSPEC;
3722         }
3723 }
3724
3725 static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
3726                                              enum bpf_attach_type attach_type)
3727 {
3728         enum bpf_prog_type ptype;
3729
3730         switch (prog->type) {
3731         case BPF_PROG_TYPE_CGROUP_SOCK:
3732         case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
3733         case BPF_PROG_TYPE_CGROUP_SOCKOPT:
3734         case BPF_PROG_TYPE_SK_LOOKUP:
3735                 return attach_type == prog->expected_attach_type ? 0 : -EINVAL;
3736         case BPF_PROG_TYPE_CGROUP_SKB:
3737                 if (!capable(CAP_NET_ADMIN))
3738                         /* cg-skb progs can be loaded by an unprivileged
3739                          * user, so check permissions at attach time.
3740                          */
3741                         return -EPERM;
3742                 return prog->enforce_expected_attach_type &&
3743                         prog->expected_attach_type != attach_type ?
3744                         -EINVAL : 0;
3745         case BPF_PROG_TYPE_EXT:
3746                 return 0;
3747         case BPF_PROG_TYPE_NETFILTER:
3748                 if (attach_type != BPF_NETFILTER)
3749                         return -EINVAL;
3750                 return 0;
3751         case BPF_PROG_TYPE_PERF_EVENT:
3752         case BPF_PROG_TYPE_TRACEPOINT:
3753                 if (attach_type != BPF_PERF_EVENT)
3754                         return -EINVAL;
3755                 return 0;
3756         case BPF_PROG_TYPE_KPROBE:
3757                 if (prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI &&
3758                     attach_type != BPF_TRACE_KPROBE_MULTI)
3759                         return -EINVAL;
3760                 if (prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI &&
3761                     attach_type != BPF_TRACE_UPROBE_MULTI)
3762                         return -EINVAL;
3763                 if (attach_type != BPF_PERF_EVENT &&
3764                     attach_type != BPF_TRACE_KPROBE_MULTI &&
3765                     attach_type != BPF_TRACE_UPROBE_MULTI)
3766                         return -EINVAL;
3767                 return 0;
3768         case BPF_PROG_TYPE_SCHED_CLS:
3769                 if (attach_type != BPF_TCX_INGRESS &&
3770                     attach_type != BPF_TCX_EGRESS)
3771                         return -EINVAL;
3772                 return 0;
3773         default:
3774                 ptype = attach_type_to_prog_type(attach_type);
3775                 if (ptype == BPF_PROG_TYPE_UNSPEC || ptype != prog->type)
3776                         return -EINVAL;
3777                 return 0;
3778         }
3779 }
3780
3781 #define BPF_PROG_ATTACH_LAST_FIELD expected_revision
3782
3783 #define BPF_F_ATTACH_MASK_BASE  \
3784         (BPF_F_ALLOW_OVERRIDE | \
3785          BPF_F_ALLOW_MULTI |    \
3786          BPF_F_REPLACE)
3787
3788 #define BPF_F_ATTACH_MASK_MPROG \
3789         (BPF_F_REPLACE |        \
3790          BPF_F_BEFORE |         \
3791          BPF_F_AFTER |          \
3792          BPF_F_ID |             \
3793          BPF_F_LINK)
3794
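
/* BPF_F_ATTACH_MASK_BASE covers the classic cgroup-style attach flags;
 * BPF_F_ATTACH_MASK_MPROG covers the multi-program (mprog) API used e.g. by
 * tcx, where BPF_F_BEFORE/BPF_F_AFTER express ordering relative to another
 * program or, with BPF_F_LINK, another link, identified by FD or (with
 * BPF_F_ID) by ID.
 */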
3795 static int bpf_prog_attach(const union bpf_attr *attr)
3796 {
3797         enum bpf_prog_type ptype;
3798         struct bpf_prog *prog;
3799         int ret;
3800
3801         if (CHECK_ATTR(BPF_PROG_ATTACH))
3802                 return -EINVAL;
3803
3804         ptype = attach_type_to_prog_type(attr->attach_type);
3805         if (ptype == BPF_PROG_TYPE_UNSPEC)
3806                 return -EINVAL;
3807         if (bpf_mprog_supported(ptype)) {
3808                 if (attr->attach_flags & ~BPF_F_ATTACH_MASK_MPROG)
3809                         return -EINVAL;
3810         } else {
3811                 if (attr->attach_flags & ~BPF_F_ATTACH_MASK_BASE)
3812                         return -EINVAL;
3813                 if (attr->relative_fd ||
3814                     attr->expected_revision)
3815                         return -EINVAL;
3816         }
3817
3818         prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
3819         if (IS_ERR(prog))
3820                 return PTR_ERR(prog);
3821
3822         if (bpf_prog_attach_check_attach_type(prog, attr->attach_type)) {
3823                 bpf_prog_put(prog);
3824                 return -EINVAL;
3825         }
3826
3827         switch (ptype) {
3828         case BPF_PROG_TYPE_SK_SKB:
3829         case BPF_PROG_TYPE_SK_MSG:
3830                 ret = sock_map_get_from_fd(attr, prog);
3831                 break;
3832         case BPF_PROG_TYPE_LIRC_MODE2:
3833                 ret = lirc_prog_attach(attr, prog);
3834                 break;
3835         case BPF_PROG_TYPE_FLOW_DISSECTOR:
3836                 ret = netns_bpf_prog_attach(attr, prog);
3837                 break;
3838         case BPF_PROG_TYPE_CGROUP_DEVICE:
3839         case BPF_PROG_TYPE_CGROUP_SKB:
3840         case BPF_PROG_TYPE_CGROUP_SOCK:
3841         case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
3842         case BPF_PROG_TYPE_CGROUP_SOCKOPT:
3843         case BPF_PROG_TYPE_CGROUP_SYSCTL:
3844         case BPF_PROG_TYPE_SOCK_OPS:
3845         case BPF_PROG_TYPE_LSM:
3846                 if (ptype == BPF_PROG_TYPE_LSM &&
3847                     prog->expected_attach_type != BPF_LSM_CGROUP)
3848                         ret = -EINVAL;
3849                 else
3850                         ret = cgroup_bpf_prog_attach(attr, ptype, prog);
3851                 break;
3852         case BPF_PROG_TYPE_SCHED_CLS:
3853                 ret = tcx_prog_attach(attr, prog);
3854                 break;
3855         default:
3856                 ret = -EINVAL;
3857         }
3858
3859         if (ret)
3860                 bpf_prog_put(prog);
3861         return ret;
3862 }
3863
3864 #define BPF_PROG_DETACH_LAST_FIELD expected_revision
3865
3866 static int bpf_prog_detach(const union bpf_attr *attr)
3867 {
3868         struct bpf_prog *prog = NULL;
3869         enum bpf_prog_type ptype;
3870         int ret;
3871
3872         if (CHECK_ATTR(BPF_PROG_DETACH))
3873                 return -EINVAL;
3874
3875         ptype = attach_type_to_prog_type(attr->attach_type);
3876         if (bpf_mprog_supported(ptype)) {
3877                 if (ptype == BPF_PROG_TYPE_UNSPEC)
3878                         return -EINVAL;
3879                 if (attr->attach_flags & ~BPF_F_ATTACH_MASK_MPROG)
3880                         return -EINVAL;
3881                 if (attr->attach_bpf_fd) {
3882                         prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
3883                         if (IS_ERR(prog))
3884                                 return PTR_ERR(prog);
3885                 }
3886         } else if (attr->attach_flags ||
3887                    attr->relative_fd ||
3888                    attr->expected_revision) {
3889                 return -EINVAL;
3890         }
3891
3892         switch (ptype) {
3893         case BPF_PROG_TYPE_SK_MSG:
3894         case BPF_PROG_TYPE_SK_SKB:
3895                 ret = sock_map_prog_detach(attr, ptype);
3896                 break;
3897         case BPF_PROG_TYPE_LIRC_MODE2:
3898                 ret = lirc_prog_detach(attr);
3899                 break;
3900         case BPF_PROG_TYPE_FLOW_DISSECTOR:
3901                 ret = netns_bpf_prog_detach(attr, ptype);
3902                 break;
3903         case BPF_PROG_TYPE_CGROUP_DEVICE:
3904         case BPF_PROG_TYPE_CGROUP_SKB:
3905         case BPF_PROG_TYPE_CGROUP_SOCK:
3906         case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
3907         case BPF_PROG_TYPE_CGROUP_SOCKOPT:
3908         case BPF_PROG_TYPE_CGROUP_SYSCTL:
3909         case BPF_PROG_TYPE_SOCK_OPS:
3910         case BPF_PROG_TYPE_LSM:
3911                 ret = cgroup_bpf_prog_detach(attr, ptype);
3912                 break;
3913         case BPF_PROG_TYPE_SCHED_CLS:
3914                 ret = tcx_prog_detach(attr, prog);
3915                 break;
3916         default:
3917                 ret = -EINVAL;
3918         }
3919
3920         if (prog)
3921                 bpf_prog_put(prog);
3922         return ret;
3923 }
3924
3925 #define BPF_PROG_QUERY_LAST_FIELD query.revision
3926
3927 static int bpf_prog_query(const union bpf_attr *attr,
3928                           union bpf_attr __user *uattr)
3929 {
3930         if (!capable(CAP_NET_ADMIN))
3931                 return -EPERM;
3932         if (CHECK_ATTR(BPF_PROG_QUERY))
3933                 return -EINVAL;
3934         if (attr->query.query_flags & ~BPF_F_QUERY_EFFECTIVE)
3935                 return -EINVAL;
3936
3937         switch (attr->query.attach_type) {
3938         case BPF_CGROUP_INET_INGRESS:
3939         case BPF_CGROUP_INET_EGRESS:
3940         case BPF_CGROUP_INET_SOCK_CREATE:
3941         case BPF_CGROUP_INET_SOCK_RELEASE:
3942         case BPF_CGROUP_INET4_BIND:
3943         case BPF_CGROUP_INET6_BIND:
3944         case BPF_CGROUP_INET4_POST_BIND:
3945         case BPF_CGROUP_INET6_POST_BIND:
3946         case BPF_CGROUP_INET4_CONNECT:
3947         case BPF_CGROUP_INET6_CONNECT:
3948         case BPF_CGROUP_INET4_GETPEERNAME:
3949         case BPF_CGROUP_INET6_GETPEERNAME:
3950         case BPF_CGROUP_INET4_GETSOCKNAME:
3951         case BPF_CGROUP_INET6_GETSOCKNAME:
3952         case BPF_CGROUP_UDP4_SENDMSG:
3953         case BPF_CGROUP_UDP6_SENDMSG:
3954         case BPF_CGROUP_UDP4_RECVMSG:
3955         case BPF_CGROUP_UDP6_RECVMSG:
3956         case BPF_CGROUP_SOCK_OPS:
3957         case BPF_CGROUP_DEVICE:
3958         case BPF_CGROUP_SYSCTL:
3959         case BPF_CGROUP_GETSOCKOPT:
3960         case BPF_CGROUP_SETSOCKOPT:
3961         case BPF_LSM_CGROUP:
3962                 return cgroup_bpf_prog_query(attr, uattr);
3963         case BPF_LIRC_MODE2:
3964                 return lirc_prog_query(attr, uattr);
3965         case BPF_FLOW_DISSECTOR:
3966         case BPF_SK_LOOKUP:
3967                 return netns_bpf_prog_query(attr, uattr);
3968         case BPF_SK_SKB_STREAM_PARSER:
3969         case BPF_SK_SKB_STREAM_VERDICT:
3970         case BPF_SK_MSG_VERDICT:
3971         case BPF_SK_SKB_VERDICT:
3972                 return sock_map_bpf_prog_query(attr, uattr);
3973         case BPF_TCX_INGRESS:
3974         case BPF_TCX_EGRESS:
3975                 return tcx_prog_query(attr, uattr);
3976         default:
3977                 return -EINVAL;
3978         }
3979 }
3980
3981 #define BPF_PROG_TEST_RUN_LAST_FIELD test.batch_size
3982
3983 static int bpf_prog_test_run(const union bpf_attr *attr,
3984                              union bpf_attr __user *uattr)
3985 {
3986         struct bpf_prog *prog;
3987         int ret = -ENOTSUPP;
3988
3989         if (CHECK_ATTR(BPF_PROG_TEST_RUN))
3990                 return -EINVAL;
3991
3992         if ((attr->test.ctx_size_in && !attr->test.ctx_in) ||
3993             (!attr->test.ctx_size_in && attr->test.ctx_in))
3994                 return -EINVAL;
3995
3996         if ((attr->test.ctx_size_out && !attr->test.ctx_out) ||
3997             (!attr->test.ctx_size_out && attr->test.ctx_out))
3998                 return -EINVAL;
3999
4000         prog = bpf_prog_get(attr->test.prog_fd);
4001         if (IS_ERR(prog))
4002                 return PTR_ERR(prog);
4003
4004         if (prog->aux->ops->test_run)
4005                 ret = prog->aux->ops->test_run(prog, attr, uattr);
4006
4007         bpf_prog_put(prog);
4008         return ret;
4009 }
4010
4011 #define BPF_OBJ_GET_NEXT_ID_LAST_FIELD next_id
4012
4013 static int bpf_obj_get_next_id(const union bpf_attr *attr,
4014                                union bpf_attr __user *uattr,
4015                                struct idr *idr,
4016                                spinlock_t *lock)
4017 {
4018         u32 next_id = attr->start_id;
4019         int err = 0;
4020
4021         if (CHECK_ATTR(BPF_OBJ_GET_NEXT_ID) || next_id >= INT_MAX)
4022                 return -EINVAL;
4023
4024         if (!capable(CAP_SYS_ADMIN))
4025                 return -EPERM;
4026
4027         next_id++;
4028         spin_lock_bh(lock);
4029         if (!idr_get_next(idr, &next_id))
4030                 err = -ENOENT;
4031         spin_unlock_bh(lock);
4032
4033         if (!err)
4034                 err = put_user(next_id, &uattr->next_id);
4035
4036         return err;
4037 }
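
/* Userspace-side sketch (hedged; libbpf wrapper names) of walking an ID
 * space with the helper above; the kernel's -ENOENT ends the walk once the
 * last ID has been visited:
 *
 *	__u32 id = 0;
 *
 *	while (!bpf_prog_get_next_id(id, &id)) {
 *		int fd = bpf_prog_get_fd_by_id(id);	// needs CAP_SYS_ADMIN
 *
 *		if (fd < 0)
 *			continue;	// e.g. prog vanished in between
 *		... inspect the program ...
 *		close(fd);
 *	}
 */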
4038
4039 struct bpf_map *bpf_map_get_curr_or_next(u32 *id)
4040 {
4041         struct bpf_map *map;
4042
4043         spin_lock_bh(&map_idr_lock);
4044 again:
4045         map = idr_get_next(&map_idr, id);
4046         if (map) {
4047                 map = __bpf_map_inc_not_zero(map, false);
4048                 if (IS_ERR(map)) {
4049                         (*id)++;
4050                         goto again;
4051                 }
4052         }
4053         spin_unlock_bh(&map_idr_lock);
4054
4055         return map;
4056 }
4057
4058 struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id)
4059 {
4060         struct bpf_prog *prog;
4061
4062         spin_lock_bh(&prog_idr_lock);
4063 again:
4064         prog = idr_get_next(&prog_idr, id);
4065         if (prog) {
4066                 prog = bpf_prog_inc_not_zero(prog);
4067                 if (IS_ERR(prog)) {
4068                         (*id)++;
4069                         goto again;
4070                 }
4071         }
4072         spin_unlock_bh(&prog_idr_lock);
4073
4074         return prog;
4075 }
4076
4077 #define BPF_PROG_GET_FD_BY_ID_LAST_FIELD prog_id
4078
4079 struct bpf_prog *bpf_prog_by_id(u32 id)
4080 {
4081         struct bpf_prog *prog;
4082
4083         if (!id)
4084                 return ERR_PTR(-ENOENT);
4085
4086         spin_lock_bh(&prog_idr_lock);
4087         prog = idr_find(&prog_idr, id);
4088         if (prog)
4089                 prog = bpf_prog_inc_not_zero(prog);
4090         else
4091                 prog = ERR_PTR(-ENOENT);
4092         spin_unlock_bh(&prog_idr_lock);
4093         return prog;
4094 }
4095
4096 static int bpf_prog_get_fd_by_id(const union bpf_attr *attr)
4097 {
4098         struct bpf_prog *prog;
4099         u32 id = attr->prog_id;
4100         int fd;
4101
4102         if (CHECK_ATTR(BPF_PROG_GET_FD_BY_ID))
4103                 return -EINVAL;
4104
4105         if (!capable(CAP_SYS_ADMIN))
4106                 return -EPERM;
4107
4108         prog = bpf_prog_by_id(id);
4109         if (IS_ERR(prog))
4110                 return PTR_ERR(prog);
4111
4112         fd = bpf_prog_new_fd(prog);
4113         if (fd < 0)
4114                 bpf_prog_put(prog);
4115
4116         return fd;
4117 }
4118
4119 #define BPF_MAP_GET_FD_BY_ID_LAST_FIELD open_flags
4120
4121 static int bpf_map_get_fd_by_id(const union bpf_attr *attr)
4122 {
4123         struct bpf_map *map;
4124         u32 id = attr->map_id;
4125         int f_flags;
4126         int fd;
4127
4128         if (CHECK_ATTR(BPF_MAP_GET_FD_BY_ID) ||
4129             attr->open_flags & ~BPF_OBJ_FLAG_MASK)
4130                 return -EINVAL;
4131
4132         if (!capable(CAP_SYS_ADMIN))
4133                 return -EPERM;
4134
4135         f_flags = bpf_get_file_flag(attr->open_flags);
4136         if (f_flags < 0)
4137                 return f_flags;
4138
4139         spin_lock_bh(&map_idr_lock);
4140         map = idr_find(&map_idr, id);
4141         if (map)
4142                 map = __bpf_map_inc_not_zero(map, true);
4143         else
4144                 map = ERR_PTR(-ENOENT);
4145         spin_unlock_bh(&map_idr_lock);
4146
4147         if (IS_ERR(map))
4148                 return PTR_ERR(map);
4149
4150         fd = bpf_map_new_fd(map, f_flags);
4151         if (fd < 0)
4152                 bpf_map_put_with_uref(map);
4153
4154         return fd;
4155 }
4156
4157 static const struct bpf_map *bpf_map_from_imm(const struct bpf_prog *prog,
4158                                               unsigned long addr, u32 *off,
4159                                               u32 *type)
4160 {
4161         const struct bpf_map *map;
4162         int i;
4163
4164         mutex_lock(&prog->aux->used_maps_mutex);
4165         for (i = 0, *off = 0; i < prog->aux->used_map_cnt; i++) {
4166                 map = prog->aux->used_maps[i];
4167                 if (map == (void *)addr) {
4168                         *type = BPF_PSEUDO_MAP_FD;
4169                         goto out;
4170                 }
4171                 if (!map->ops->map_direct_value_meta)
4172                         continue;
4173                 if (!map->ops->map_direct_value_meta(map, addr, off)) {
4174                         *type = BPF_PSEUDO_MAP_VALUE;
4175                         goto out;
4176                 }
4177         }
4178         map = NULL;
4179
4180 out:
4181         mutex_unlock(&prog->aux->used_maps_mutex);
4182         return map;
4183 }
4184
4185 static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog,
4186                                               const struct cred *f_cred)
4187 {
4188         const struct bpf_map *map;
4189         struct bpf_insn *insns;
4190         u32 off, type;
4191         u64 imm;
4192         u8 code;
4193         int i;
4194
4195         insns = kmemdup(prog->insnsi, bpf_prog_insn_size(prog),
4196                         GFP_USER);
4197         if (!insns)
4198                 return insns;
4199
4200         for (i = 0; i < prog->len; i++) {
4201                 code = insns[i].code;
4202
4203                 if (code == (BPF_JMP | BPF_TAIL_CALL)) {
4204                         insns[i].code = BPF_JMP | BPF_CALL;
4205                         insns[i].imm = BPF_FUNC_tail_call;
4206                         /* fall-through */
4207                 }
4208                 if (code == (BPF_JMP | BPF_CALL) ||
4209                     code == (BPF_JMP | BPF_CALL_ARGS)) {
4210                         if (code == (BPF_JMP | BPF_CALL_ARGS))
4211                                 insns[i].code = BPF_JMP | BPF_CALL;
4212                         if (!bpf_dump_raw_ok(f_cred))
4213                                 insns[i].imm = 0;
4214                         continue;
4215                 }
4216                 if (BPF_CLASS(code) == BPF_LDX && BPF_MODE(code) == BPF_PROBE_MEM) {
4217                         insns[i].code = BPF_LDX | BPF_SIZE(code) | BPF_MEM;
4218                         continue;
4219                 }
4220
4221                 if (code != (BPF_LD | BPF_IMM | BPF_DW))
4222                         continue;
4223
4224                 imm = ((u64)insns[i + 1].imm << 32) | (u32)insns[i].imm;
4225                 map = bpf_map_from_imm(prog, imm, &off, &type);
4226                 if (map) {
4227                         insns[i].src_reg = type;
4228                         insns[i].imm = map->id;
4229                         insns[i + 1].imm = off;
4230                         continue;
4231                 }
4232         }
4233
4234         return insns;
4235 }
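
/* Summary of the sanitization above: 64-bit map-pointer immediates are
 * always rewritten to a (map id, offset) pair tagged through src_reg as
 * BPF_PSEUDO_MAP_FD or BPF_PSEUDO_MAP_VALUE, and BPF_PROBE_MEM loads are
 * always shown as ordinary BPF_MEM loads; helper-call immediates, which
 * would reveal kernel addresses, are additionally zeroed unless
 * bpf_dump_raw_ok() permits the reader to see raw values.
 */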
4236
4237 static int set_info_rec_size(struct bpf_prog_info *info)
4238 {
4239         /*
4240          * Ensure info.*_rec_size is the same as the kernel's expected size,
4241          *
4242          * or
4243          *
4244          * only allow a zero *_rec_size if both the _rec_size and the _cnt
4245          * are zero.  In this case, the kernel will write the expected
4246          * _rec_size back to the info.
4247          */
4248
4249         if ((info->nr_func_info || info->func_info_rec_size) &&
4250             info->func_info_rec_size != sizeof(struct bpf_func_info))
4251                 return -EINVAL;
4252
4253         if ((info->nr_line_info || info->line_info_rec_size) &&
4254             info->line_info_rec_size != sizeof(struct bpf_line_info))
4255                 return -EINVAL;
4256
4257         if ((info->nr_jited_line_info || info->jited_line_info_rec_size) &&
4258             info->jited_line_info_rec_size != sizeof(__u64))
4259                 return -EINVAL;
4260
4261         info->func_info_rec_size = sizeof(struct bpf_func_info);
4262         info->line_info_rec_size = sizeof(struct bpf_line_info);
4263         info->jited_line_info_rec_size = sizeof(__u64);
4264
4265         return 0;
4266 }
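
/* Example of the contract above: a userspace built against a differently
 * sized struct bpf_func_info is rejected with -EINVAL rather than silently
 * exchanging truncated or overflowing records, while passing zero for both
 * *_rec_size and the matching _cnt asks the kernel to report back the
 * record sizes it expects.
 */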
4267
4268 static int bpf_prog_get_info_by_fd(struct file *file,
4269                                    struct bpf_prog *prog,
4270                                    const union bpf_attr *attr,
4271                                    union bpf_attr __user *uattr)
4272 {
4273         struct bpf_prog_info __user *uinfo = u64_to_user_ptr(attr->info.info);
4274         struct btf *attach_btf = bpf_prog_get_target_btf(prog);
4275         struct bpf_prog_info info;
4276         u32 info_len = attr->info.info_len;
4277         struct bpf_prog_kstats stats;
4278         char __user *uinsns;
4279         u32 ulen;
4280         int err;
4281
4282         err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(info), info_len);
4283         if (err)
4284                 return err;
4285         info_len = min_t(u32, sizeof(info), info_len);
4286
4287         memset(&info, 0, sizeof(info));
4288         if (copy_from_user(&info, uinfo, info_len))
4289                 return -EFAULT;
4290
4291         info.type = prog->type;
4292         info.id = prog->aux->id;
4293         info.load_time = prog->aux->load_time;
4294         info.created_by_uid = from_kuid_munged(current_user_ns(),
4295                                                prog->aux->user->uid);
4296         info.gpl_compatible = prog->gpl_compatible;
4297
4298         memcpy(info.tag, prog->tag, sizeof(prog->tag));
4299         memcpy(info.name, prog->aux->name, sizeof(prog->aux->name));
4300
4301         mutex_lock(&prog->aux->used_maps_mutex);
4302         ulen = info.nr_map_ids;
4303         info.nr_map_ids = prog->aux->used_map_cnt;
4304         ulen = min_t(u32, info.nr_map_ids, ulen);
4305         if (ulen) {
4306                 u32 __user *user_map_ids = u64_to_user_ptr(info.map_ids);
4307                 u32 i;
4308
4309                 for (i = 0; i < ulen; i++)
4310                         if (put_user(prog->aux->used_maps[i]->id,
4311                                      &user_map_ids[i])) {
4312                                 mutex_unlock(&prog->aux->used_maps_mutex);
4313                                 return -EFAULT;
4314                         }
4315         }
4316         mutex_unlock(&prog->aux->used_maps_mutex);
4317
4318         err = set_info_rec_size(&info);
4319         if (err)
4320                 return err;
4321
4322         bpf_prog_get_stats(prog, &stats);
4323         info.run_time_ns = stats.nsecs;
4324         info.run_cnt = stats.cnt;
4325         info.recursion_misses = stats.misses;
4326
4327         info.verified_insns = prog->aux->verified_insns;
4328
4329         if (!bpf_capable()) {
4330                 info.jited_prog_len = 0;
4331                 info.xlated_prog_len = 0;
4332                 info.nr_jited_ksyms = 0;
4333                 info.nr_jited_func_lens = 0;
4334                 info.nr_func_info = 0;
4335                 info.nr_line_info = 0;
4336                 info.nr_jited_line_info = 0;
4337                 goto done;
4338         }
4339
4340         ulen = info.xlated_prog_len;
4341         info.xlated_prog_len = bpf_prog_insn_size(prog);
4342         if (info.xlated_prog_len && ulen) {
4343                 struct bpf_insn *insns_sanitized;
4344                 bool fault;
4345
4346                 if (prog->blinded && !bpf_dump_raw_ok(file->f_cred)) {
4347                         info.xlated_prog_insns = 0;
4348                         goto done;
4349                 }
4350                 insns_sanitized = bpf_insn_prepare_dump(prog, file->f_cred);
4351                 if (!insns_sanitized)
4352                         return -ENOMEM;
4353                 uinsns = u64_to_user_ptr(info.xlated_prog_insns);
4354                 ulen = min_t(u32, info.xlated_prog_len, ulen);
4355                 fault = copy_to_user(uinsns, insns_sanitized, ulen);
4356                 kfree(insns_sanitized);
4357                 if (fault)
4358                         return -EFAULT;
4359         }
4360
4361         if (bpf_prog_is_offloaded(prog->aux)) {
4362                 err = bpf_prog_offload_info_fill(&info, prog);
4363                 if (err)
4364                         return err;
4365                 goto done;
4366         }
4367
4368         /* NOTE: the following code is skipped for offloaded programs;
4369          * bpf_prog_offload_info_fill() is the place to fill similar fields
4370          * for offload.
4371          */
4372         ulen = info.jited_prog_len;
4373         if (prog->aux->func_cnt) {
4374                 u32 i;
4375
4376                 info.jited_prog_len = 0;
4377                 for (i = 0; i < prog->aux->func_cnt; i++)
4378                         info.jited_prog_len += prog->aux->func[i]->jited_len;
4379         } else {
4380                 info.jited_prog_len = prog->jited_len;
4381         }
4382
4383         if (info.jited_prog_len && ulen) {
4384                 if (bpf_dump_raw_ok(file->f_cred)) {
4385                         uinsns = u64_to_user_ptr(info.jited_prog_insns);
4386                         ulen = min_t(u32, info.jited_prog_len, ulen);
4387
4388                         /* for multi-function programs, copy the JITed
4389                          * instructions for all the functions
4390                          */
4391                         if (prog->aux->func_cnt) {
4392                                 u32 len, free, i;
4393                                 u8 *img;
4394
4395                                 free = ulen;
4396                                 for (i = 0; i < prog->aux->func_cnt; i++) {
4397                                         len = prog->aux->func[i]->jited_len;
4398                                         len = min_t(u32, len, free);
4399                                         img = (u8 *) prog->aux->func[i]->bpf_func;
4400                                         if (copy_to_user(uinsns, img, len))
4401                                                 return -EFAULT;
4402                                         uinsns += len;
4403                                         free -= len;
4404                                         if (!free)
4405                                                 break;
4406                                 }
4407                         } else {
4408                                 if (copy_to_user(uinsns, prog->bpf_func, ulen))
4409                                         return -EFAULT;
4410                         }
4411                 } else {
4412                         info.jited_prog_insns = 0;
4413                 }
4414         }
4415
4416         ulen = info.nr_jited_ksyms;
4417         info.nr_jited_ksyms = prog->aux->func_cnt ? : 1;
4418         if (ulen) {
4419                 if (bpf_dump_raw_ok(file->f_cred)) {
4420                         unsigned long ksym_addr;
4421                         u64 __user *user_ksyms;
4422                         u32 i;
4423
4424                         /* copy the address of the kernel symbol
4425                          * corresponding to each function
4426                          */
4427                         ulen = min_t(u32, info.nr_jited_ksyms, ulen);
4428                         user_ksyms = u64_to_user_ptr(info.jited_ksyms);
4429                         if (prog->aux->func_cnt) {
4430                                 for (i = 0; i < ulen; i++) {
4431                                         ksym_addr = (unsigned long)
4432                                                 prog->aux->func[i]->bpf_func;
4433                                         if (put_user((u64) ksym_addr,
4434                                                      &user_ksyms[i]))
4435                                                 return -EFAULT;
4436                                 }
4437                         } else {
4438                                 ksym_addr = (unsigned long) prog->bpf_func;
4439                                 if (put_user((u64) ksym_addr, &user_ksyms[0]))
4440                                         return -EFAULT;
4441                         }
4442                 } else {
4443                         info.jited_ksyms = 0;
4444                 }
4445         }
4446
4447         ulen = info.nr_jited_func_lens;
4448         info.nr_jited_func_lens = prog->aux->func_cnt ? : 1;
4449         if (ulen) {
4450                 if (bpf_dump_raw_ok(file->f_cred)) {
4451                         u32 __user *user_lens;
4452                         u32 func_len, i;
4453
4454                         /* copy the JITed image lengths for each function */
4455                         ulen = min_t(u32, info.nr_jited_func_lens, ulen);
4456                         user_lens = u64_to_user_ptr(info.jited_func_lens);
4457                         if (prog->aux->func_cnt) {
4458                                 for (i = 0; i < ulen; i++) {
4459                                         func_len =
4460                                                 prog->aux->func[i]->jited_len;
4461                                         if (put_user(func_len, &user_lens[i]))
4462                                                 return -EFAULT;
4463                                 }
4464                         } else {
4465                                 func_len = prog->jited_len;
4466                                 if (put_user(func_len, &user_lens[0]))
4467                                         return -EFAULT;
4468                         }
4469                 } else {
4470                         info.jited_func_lens = 0;
4471                 }
4472         }
4473
4474         if (prog->aux->btf)
4475                 info.btf_id = btf_obj_id(prog->aux->btf);
4476         info.attach_btf_id = prog->aux->attach_btf_id;
4477         if (attach_btf)
4478                 info.attach_btf_obj_id = btf_obj_id(attach_btf);
4479
4480         ulen = info.nr_func_info;
4481         info.nr_func_info = prog->aux->func_info_cnt;
4482         if (info.nr_func_info && ulen) {
4483                 char __user *user_finfo;
4484
4485                 user_finfo = u64_to_user_ptr(info.func_info);
4486                 ulen = min_t(u32, info.nr_func_info, ulen);
4487                 if (copy_to_user(user_finfo, prog->aux->func_info,
4488                                  info.func_info_rec_size * ulen))
4489                         return -EFAULT;
4490         }
4491
4492         ulen = info.nr_line_info;
4493         info.nr_line_info = prog->aux->nr_linfo;
4494         if (info.nr_line_info && ulen) {
4495                 __u8 __user *user_linfo;
4496
4497                 user_linfo = u64_to_user_ptr(info.line_info);
4498                 ulen = min_t(u32, info.nr_line_info, ulen);
4499                 if (copy_to_user(user_linfo, prog->aux->linfo,
4500                                  info.line_info_rec_size * ulen))
4501                         return -EFAULT;
4502         }
4503
4504         ulen = info.nr_jited_line_info;
4505         if (prog->aux->jited_linfo)
4506                 info.nr_jited_line_info = prog->aux->nr_linfo;
4507         else
4508                 info.nr_jited_line_info = 0;
4509         if (info.nr_jited_line_info && ulen) {
4510                 if (bpf_dump_raw_ok(file->f_cred)) {
4511                         unsigned long line_addr;
4512                         __u64 __user *user_linfo;
4513                         u32 i;
4514
4515                         user_linfo = u64_to_user_ptr(info.jited_line_info);
4516                         ulen = min_t(u32, info.nr_jited_line_info, ulen);
4517                         for (i = 0; i < ulen; i++) {
4518                                 line_addr = (unsigned long)prog->aux->jited_linfo[i];
4519                                 if (put_user((__u64)line_addr, &user_linfo[i]))
4520                                         return -EFAULT;
4521                         }
4522                 } else {
4523                         info.jited_line_info = 0;
4524                 }
4525         }
4526
4527         ulen = info.nr_prog_tags;
4528         info.nr_prog_tags = prog->aux->func_cnt ? : 1;
4529         if (ulen) {
4530                 __u8 __user (*user_prog_tags)[BPF_TAG_SIZE];
4531                 u32 i;
4532
4533                 user_prog_tags = u64_to_user_ptr(info.prog_tags);
4534                 ulen = min_t(u32, info.nr_prog_tags, ulen);
4535                 if (prog->aux->func_cnt) {
4536                         for (i = 0; i < ulen; i++) {
4537                                 if (copy_to_user(user_prog_tags[i],
4538                                                  prog->aux->func[i]->tag,
4539                                                  BPF_TAG_SIZE))
4540                                         return -EFAULT;
4541                         }
4542                 } else {
4543                         if (copy_to_user(user_prog_tags[0],
4544                                          prog->tag, BPF_TAG_SIZE))
4545                                 return -EFAULT;
4546                 }
4547         }
4548
4549 done:
4550         if (copy_to_user(uinfo, &info, info_len) ||
4551             put_user(info_len, &uattr->info.info_len))
4552                 return -EFAULT;
4553
4554         return 0;
4555 }
4556
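/* BPF_OBJ_GET_INFO_BY_FD on a map fd: fill a bpf_map_info from the map's
 * metadata. The kernel copy is zeroed first so that no uninitialized
 * bytes can leak to user space; offloaded maps get extra fields filled
 * in by the device driver.
 */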
4557 static int bpf_map_get_info_by_fd(struct file *file,
4558                                   struct bpf_map *map,
4559                                   const union bpf_attr *attr,
4560                                   union bpf_attr __user *uattr)
4561 {
4562         struct bpf_map_info __user *uinfo = u64_to_user_ptr(attr->info.info);
4563         struct bpf_map_info info;
4564         u32 info_len = attr->info.info_len;
4565         int err;
4566
4567         err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(info), info_len);
4568         if (err)
4569                 return err;
4570         info_len = min_t(u32, sizeof(info), info_len);
4571
4572         memset(&info, 0, sizeof(info));
4573         info.type = map->map_type;
4574         info.id = map->id;
4575         info.key_size = map->key_size;
4576         info.value_size = map->value_size;
4577         info.max_entries = map->max_entries;
4578         info.map_flags = map->map_flags;
4579         info.map_extra = map->map_extra;
4580         memcpy(info.name, map->name, sizeof(map->name));
4581
4582         if (map->btf) {
4583                 info.btf_id = btf_obj_id(map->btf);
4584                 info.btf_key_type_id = map->btf_key_type_id;
4585                 info.btf_value_type_id = map->btf_value_type_id;
4586         }
4587         info.btf_vmlinux_value_type_id = map->btf_vmlinux_value_type_id;
4588
4589         if (bpf_map_is_offloaded(map)) {
4590                 err = bpf_map_offload_info_fill(&info, map);
4591                 if (err)
4592                         return err;
4593         }
4594
4595         if (copy_to_user(uinfo, &info, info_len) ||
4596             put_user(info_len, &uattr->info.info_len))
4597                 return -EFAULT;
4598
4599         return 0;
4600 }
4601
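/* BPF_OBJ_GET_INFO_BY_FD on a BTF fd: validate the tail of the user
 * buffer, then let the BTF core fill in the info.
 */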
4602 static int bpf_btf_get_info_by_fd(struct file *file,
4603                                   struct btf *btf,
4604                                   const union bpf_attr *attr,
4605                                   union bpf_attr __user *uattr)
4606 {
4607         struct bpf_btf_info __user *uinfo = u64_to_user_ptr(attr->info.info);
4608         u32 info_len = attr->info.info_len;
4609         int err;
4610
4611         err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(*uinfo), info_len);
4612         if (err)
4613                 return err;
4614
4615         return btf_get_info_by_fd(btf, attr, uattr);
4616 }
4617
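/* BPF_OBJ_GET_INFO_BY_FD on a link fd. Unlike the map variant, the info
 * is copied in first: some link types read user-supplied buffer pointers
 * and lengths through their fill_link_info callback.
 */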
4618 static int bpf_link_get_info_by_fd(struct file *file,
4619                                    struct bpf_link *link,
4620                                    const union bpf_attr *attr,
4621                                    union bpf_attr __user *uattr)
4622 {
4623         struct bpf_link_info __user *uinfo = u64_to_user_ptr(attr->info.info);
4624         struct bpf_link_info info;
4625         u32 info_len = attr->info.info_len;
4626         int err;
4627
4628         err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(info), info_len);
4629         if (err)
4630                 return err;
4631         info_len = min_t(u32, sizeof(info), info_len);
4632
4633         memset(&info, 0, sizeof(info));
4634         if (copy_from_user(&info, uinfo, info_len))
4635                 return -EFAULT;
4636
4637         info.type = link->type;
4638         info.id = link->id;
4639         if (link->prog)
4640                 info.prog_id = link->prog->aux->id;
4641
4642         if (link->ops->fill_link_info) {
4643                 err = link->ops->fill_link_info(link, &info);
4644                 if (err)
4645                         return err;
4646         }
4647
4648         if (copy_to_user(uinfo, &info, info_len) ||
4649             put_user(info_len, &uattr->info.info_len))
4650                 return -EFAULT;
4651
4652         return 0;
4653 }
4654
4656 #define BPF_OBJ_GET_INFO_BY_FD_LAST_FIELD info.info
4657
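/* Dispatch BPF_OBJ_GET_INFO_BY_FD based on the fd's file_operations, so
 * a single command serves program, map, BTF and link fds alike.
 *
 * Illustrative user-space sketch (not part of this file), querying a
 * program fd through a raw syscall(2):
 *
 *	union bpf_attr attr = {};
 *	struct bpf_prog_info info = {};
 *
 *	attr.info.bpf_fd = prog_fd;
 *	attr.info.info_len = sizeof(info);
 *	attr.info.info = (__u64)(unsigned long)&info;
 *	err = syscall(__NR_bpf, BPF_OBJ_GET_INFO_BY_FD, &attr, sizeof(attr));
 */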
4658 static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
4659                                   union bpf_attr __user *uattr)
4660 {
4661         int ufd = attr->info.bpf_fd;
4662         struct fd f;
4663         int err;
4664
4665         if (CHECK_ATTR(BPF_OBJ_GET_INFO_BY_FD))
4666                 return -EINVAL;
4667
4668         f = fdget(ufd);
4669         if (!f.file)
4670                 return -EBADFD;
4671
4672         if (f.file->f_op == &bpf_prog_fops)
4673                 err = bpf_prog_get_info_by_fd(f.file, f.file->private_data, attr,
4674                                               uattr);
4675         else if (f.file->f_op == &bpf_map_fops)
4676                 err = bpf_map_get_info_by_fd(f.file, f.file->private_data, attr,
4677                                              uattr);
4678         else if (f.file->f_op == &btf_fops)
4679                 err = bpf_btf_get_info_by_fd(f.file, f.file->private_data, attr, uattr);
4680         else if (f.file->f_op == &bpf_link_fops)
4681                 err = bpf_link_get_info_by_fd(f.file, f.file->private_data,
4682                                               attr, uattr);
4683         else
4684                 err = -EINVAL;
4685
4686         fdput(f);
4687         return err;
4688 }
4689
4690 #define BPF_BTF_LOAD_LAST_FIELD btf_log_true_size
4691
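/* BPF_BTF_LOAD: verify a BTF blob supplied by user space and, on
 * success, return a new fd referencing it. Requires bpf_capable().
 */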
4692 static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size)
4693 {
4694         if (CHECK_ATTR(BPF_BTF_LOAD))
4695                 return -EINVAL;
4696
4697         if (!bpf_capable())
4698                 return -EPERM;
4699
4700         return btf_new_fd(attr, uattr, uattr_size);
4701 }
4702
4703 #define BPF_BTF_GET_FD_BY_ID_LAST_FIELD btf_id
4704
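/* BPF_BTF_GET_FD_BY_ID: CAP_SYS_ADMIN-only conversion of a global BTF ID
 * into a new fd.
 */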
4705 static int bpf_btf_get_fd_by_id(const union bpf_attr *attr)
4706 {
4707         if (CHECK_ATTR(BPF_BTF_GET_FD_BY_ID))
4708                 return -EINVAL;
4709
4710         if (!capable(CAP_SYS_ADMIN))
4711                 return -EPERM;
4712
4713         return btf_get_fd_by_id(attr->btf_id);
4714 }
4715
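/* Copy bpf_task_fd_query() results back to user space. The full string
 * length is always reported via buf_len; -ENOSPC indicates that the name
 * was truncated to fit the user buffer, which is still NUL terminated.
 */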
4716 static int bpf_task_fd_query_copy(const union bpf_attr *attr,
4717                                   union bpf_attr __user *uattr,
4718                                   u32 prog_id, u32 fd_type,
4719                                   const char *buf, u64 probe_offset,
4720                                   u64 probe_addr)
4721 {
4722         char __user *ubuf = u64_to_user_ptr(attr->task_fd_query.buf);
4723         u32 len = buf ? strlen(buf) : 0, input_len;
4724         int err = 0;
4725
4726         if (put_user(len, &uattr->task_fd_query.buf_len))
4727                 return -EFAULT;
4728         input_len = attr->task_fd_query.buf_len;
4729         if (input_len && ubuf) {
4730                 if (!len) {
4731                         /* nothing to copy; just NUL-terminate ubuf */
4732                         char zero = '\0';
4733
4734                         if (put_user(zero, ubuf))
4735                                 return -EFAULT;
4736                 } else if (input_len >= len + 1) {
4737                         /* ubuf can hold the string including the NUL terminator */
4738                         if (copy_to_user(ubuf, buf, len + 1))
4739                                 return -EFAULT;
4740                 } else {
4741                         /* ubuf cannot hold the whole string including the
4742                          * NUL terminator; do a partial, NUL-terminated copy.
4743                          */
4744                         char zero = '\0';
4745
4746                         err = -ENOSPC;
4747                         if (copy_to_user(ubuf, buf, input_len - 1))
4748                                 return -EFAULT;
4749                         if (put_user(zero, ubuf + input_len - 1))
4750                                 return -EFAULT;
4751                 }
4752         }
4753
4754         if (put_user(prog_id, &uattr->task_fd_query.prog_id) ||
4755             put_user(fd_type, &uattr->task_fd_query.fd_type) ||
4756             put_user(probe_offset, &uattr->task_fd_query.probe_offset) ||
4757             put_user(probe_addr, &uattr->task_fd_query.probe_addr))
4758                 return -EFAULT;
4759
4760         return err;
4761 }
4762
4763 #define BPF_TASK_FD_QUERY_LAST_FIELD task_fd_query.probe_addr
4764
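/* BPF_TASK_FD_QUERY: given a pid/fd pair, report which BPF program sits
 * behind the fd. Raw tracepoint links and perf events (kprobe, uprobe,
 * tracepoint) are supported; anything else returns -ENOTSUPP.
 */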
4765 static int bpf_task_fd_query(const union bpf_attr *attr,
4766                              union bpf_attr __user *uattr)
4767 {
4768         pid_t pid = attr->task_fd_query.pid;
4769         u32 fd = attr->task_fd_query.fd;
4770         const struct perf_event *event;
4771         struct task_struct *task;
4772         struct file *file;
4773         int err;
4774
4775         if (CHECK_ATTR(BPF_TASK_FD_QUERY))
4776                 return -EINVAL;
4777
4778         if (!capable(CAP_SYS_ADMIN))
4779                 return -EPERM;
4780
4781         if (attr->task_fd_query.flags != 0)
4782                 return -EINVAL;
4783
4784         rcu_read_lock();
4785         task = get_pid_task(find_vpid(pid), PIDTYPE_PID);
4786         rcu_read_unlock();
4787         if (!task)
4788                 return -ENOENT;
4789
4790         err = 0;
4791         file = fget_task(task, fd);
4792         put_task_struct(task);
4793         if (!file)
4794                 return -EBADF;
4795
4796         if (file->f_op == &bpf_link_fops) {
4797                 struct bpf_link *link = file->private_data;
4798
4799                 if (link->ops == &bpf_raw_tp_link_lops) {
4800                         struct bpf_raw_tp_link *raw_tp =
4801                                 container_of(link, struct bpf_raw_tp_link, link);
4802                         struct bpf_raw_event_map *btp = raw_tp->btp;
4803
4804                         err = bpf_task_fd_query_copy(attr, uattr,
4805                                                      raw_tp->link.prog->aux->id,
4806                                                      BPF_FD_TYPE_RAW_TRACEPOINT,
4807                                                      btp->tp->name, 0, 0);
4808                         goto put_file;
4809                 }
4810                 goto out_not_supp;
4811         }
4812
4813         event = perf_get_event(file);
4814         if (!IS_ERR(event)) {
4815                 u64 probe_offset, probe_addr;
4816                 u32 prog_id, fd_type;
4817                 const char *buf;
4818
4819                 err = bpf_get_perf_event_info(event, &prog_id, &fd_type,
4820                                               &buf, &probe_offset,
4821                                               &probe_addr);
4822                 if (!err)
4823                         err = bpf_task_fd_query_copy(attr, uattr, prog_id,
4824                                                      fd_type, buf,
4825                                                      probe_offset,
4826                                                      probe_addr);
4827                 goto put_file;
4828         }
4829
4830 out_not_supp:
4831         err = -ENOTSUPP;
4832 put_file:
4833         fput(file);
4834         return err;
4835 }
4836
4837 #define BPF_MAP_BATCH_LAST_FIELD batch.flags
4838
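/* Invoke a batch callback if the map implements it, otherwise fail the
 * command with -ENOTSUPP. Relies on the 'err' variable and err_put label
 * in bpf_map_do_batch() below.
 */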
4839 #define BPF_DO_BATCH(fn, ...)                   \
4840         do {                                    \
4841                 if (!fn) {                      \
4842                         err = -ENOTSUPP;        \
4843                         goto err_put;           \
4844                 }                               \
4845                 err = fn(__VA_ARGS__);          \
4846         } while (0)
4847
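/* Common handler for the four BPF_MAP_*_BATCH commands: resolve the map
 * fd, check FMODE_CAN_READ/FMODE_CAN_WRITE as the operation requires,
 * and hold the write-active count across modifying batches so that they
 * cannot race with map_freeze().
 */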
4848 static int bpf_map_do_batch(const union bpf_attr *attr,
4849                             union bpf_attr __user *uattr,
4850                             int cmd)
4851 {
4852         bool has_read  = cmd == BPF_MAP_LOOKUP_BATCH ||
4853                          cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH;
4854         bool has_write = cmd != BPF_MAP_LOOKUP_BATCH;
4855         struct bpf_map *map;
4856         int err, ufd;
4857         struct fd f;
4858
4859         if (CHECK_ATTR(BPF_MAP_BATCH))
4860                 return -EINVAL;
4861
4862         ufd = attr->batch.map_fd;
4863         f = fdget(ufd);
4864         map = __bpf_map_get(f);
4865         if (IS_ERR(map))
4866                 return PTR_ERR(map);
4867         if (has_write)
4868                 bpf_map_write_active_inc(map);
4869         if (has_read && !(map_get_sys_perms(map, f) & FMODE_CAN_READ)) {
4870                 err = -EPERM;
4871                 goto err_put;
4872         }
4873         if (has_write && !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
4874                 err = -EPERM;
4875                 goto err_put;
4876         }
4877
4878         if (cmd == BPF_MAP_LOOKUP_BATCH)
4879                 BPF_DO_BATCH(map->ops->map_lookup_batch, map, attr, uattr);
4880         else if (cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH)
4881                 BPF_DO_BATCH(map->ops->map_lookup_and_delete_batch, map, attr, uattr);
4882         else if (cmd == BPF_MAP_UPDATE_BATCH)
4883                 BPF_DO_BATCH(map->ops->map_update_batch, map, f.file, attr, uattr);
4884         else
4885                 BPF_DO_BATCH(map->ops->map_delete_batch, map, attr, uattr);
4886 err_put:
4887         if (has_write)
4888                 bpf_map_write_active_dec(map);
4889         fdput(f);
4890         return err;
4891 }
4892
4893 #define BPF_LINK_CREATE_LAST_FIELD link_create.uprobe_multi.pid
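/* BPF_LINK_CREATE: attach a program to a per-type target and wrap the
 * attachment in a new bpf_link, returning its fd. On success the link
 * owns the program reference; on failure it is dropped here.
 */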
4894 static int link_create(union bpf_attr *attr, bpfptr_t uattr)
4895 {
4896         struct bpf_prog *prog;
4897         int ret;
4898
4899         if (CHECK_ATTR(BPF_LINK_CREATE))
4900                 return -EINVAL;
4901
4902         if (attr->link_create.attach_type == BPF_STRUCT_OPS)
4903                 return bpf_struct_ops_link_create(attr);
4904
4905         prog = bpf_prog_get(attr->link_create.prog_fd);
4906         if (IS_ERR(prog))
4907                 return PTR_ERR(prog);
4908
4909         ret = bpf_prog_attach_check_attach_type(prog,
4910                                                 attr->link_create.attach_type);
4911         if (ret)
4912                 goto out;
4913
4914         switch (prog->type) {
4915         case BPF_PROG_TYPE_CGROUP_SKB:
4916         case BPF_PROG_TYPE_CGROUP_SOCK:
4917         case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
4918         case BPF_PROG_TYPE_SOCK_OPS:
4919         case BPF_PROG_TYPE_CGROUP_DEVICE:
4920         case BPF_PROG_TYPE_CGROUP_SYSCTL:
4921         case BPF_PROG_TYPE_CGROUP_SOCKOPT:
4922                 ret = cgroup_bpf_link_attach(attr, prog);
4923                 break;
4924         case BPF_PROG_TYPE_EXT:
4925                 ret = bpf_tracing_prog_attach(prog,
4926                                               attr->link_create.target_fd,
4927                                               attr->link_create.target_btf_id,
4928                                               attr->link_create.tracing.cookie);
4929                 break;
4930         case BPF_PROG_TYPE_LSM:
4931         case BPF_PROG_TYPE_TRACING:
4932                 if (attr->link_create.attach_type != prog->expected_attach_type) {
4933                         ret = -EINVAL;
4934                         goto out;
4935                 }
4936                 if (prog->expected_attach_type == BPF_TRACE_RAW_TP)
4937                         ret = bpf_raw_tp_link_attach(prog, NULL);
4938                 else if (prog->expected_attach_type == BPF_TRACE_ITER)
4939                         ret = bpf_iter_link_attach(attr, uattr, prog);
4940                 else if (prog->expected_attach_type == BPF_LSM_CGROUP)
4941                         ret = cgroup_bpf_link_attach(attr, prog);
4942                 else
4943                         ret = bpf_tracing_prog_attach(prog,
4944                                                       attr->link_create.target_fd,
4945                                                       attr->link_create.target_btf_id,
4946                                                       attr->link_create.tracing.cookie);
4947                 break;
4948         case BPF_PROG_TYPE_FLOW_DISSECTOR:
4949         case BPF_PROG_TYPE_SK_LOOKUP:
4950                 ret = netns_bpf_link_create(attr, prog);
4951                 break;
4952 #ifdef CONFIG_NET
4953         case BPF_PROG_TYPE_XDP:
4954                 ret = bpf_xdp_link_attach(attr, prog);
4955                 break;
4956         case BPF_PROG_TYPE_SCHED_CLS:
4957                 ret = tcx_link_attach(attr, prog);
4958                 break;
4959         case BPF_PROG_TYPE_NETFILTER:
4960                 ret = bpf_nf_link_attach(attr, prog);
4961                 break;
4962 #endif
4963         case BPF_PROG_TYPE_PERF_EVENT:
4964         case BPF_PROG_TYPE_TRACEPOINT:
4965                 ret = bpf_perf_link_attach(attr, prog);
4966                 break;
4967         case BPF_PROG_TYPE_KPROBE:
4968                 if (attr->link_create.attach_type == BPF_PERF_EVENT)
4969                         ret = bpf_perf_link_attach(attr, prog);
4970                 else if (attr->link_create.attach_type == BPF_TRACE_KPROBE_MULTI)
4971                         ret = bpf_kprobe_multi_link_attach(attr, prog);
4972                 else if (attr->link_create.attach_type == BPF_TRACE_UPROBE_MULTI)
4973                         ret = bpf_uprobe_multi_link_attach(attr, prog);
4974                 break;
4975         default:
4976                 ret = -EINVAL;
4977         }
4978
4979 out:
4980         if (ret < 0)
4981                 bpf_prog_put(prog);
4982         return ret;
4983 }
4984
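/* Update path for map-based links (currently struct_ops): swap in the
 * new map, verifying the expected old map when BPF_F_REPLACE is set.
 */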
4985 static int link_update_map(struct bpf_link *link, union bpf_attr *attr)
4986 {
4987         struct bpf_map *new_map, *old_map = NULL;
4988         int ret;
4989
4990         new_map = bpf_map_get(attr->link_update.new_map_fd);
4991         if (IS_ERR(new_map))
4992                 return PTR_ERR(new_map);
4993
4994         if (attr->link_update.flags & BPF_F_REPLACE) {
4995                 old_map = bpf_map_get(attr->link_update.old_map_fd);
4996                 if (IS_ERR(old_map)) {
4997                         ret = PTR_ERR(old_map);
4998                         goto out_put;
4999                 }
5000         } else if (attr->link_update.old_map_fd) {
5001                 ret = -EINVAL;
5002                 goto out_put;
5003         }
5004
5005         ret = link->ops->update_map(link, new_map, old_map);
5006
5007         if (old_map)
5008                 bpf_map_put(old_map);
5009 out_put:
5010         bpf_map_put(new_map);
5011         return ret;
5012 }
5013
5014 #define BPF_LINK_UPDATE_LAST_FIELD link_update.old_prog_fd
5015
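/* BPF_LINK_UPDATE: replace the program (or, for map-based links, the
 * map) behind an existing link, optionally verifying the currently
 * attached object against old_prog_fd/old_map_fd under BPF_F_REPLACE.
 */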
5016 static int link_update(union bpf_attr *attr)
5017 {
5018         struct bpf_prog *old_prog = NULL, *new_prog;
5019         struct bpf_link *link;
5020         u32 flags;
5021         int ret;
5022
5023         if (CHECK_ATTR(BPF_LINK_UPDATE))
5024                 return -EINVAL;
5025
5026         flags = attr->link_update.flags;
5027         if (flags & ~BPF_F_REPLACE)
5028                 return -EINVAL;
5029
5030         link = bpf_link_get_from_fd(attr->link_update.link_fd);
5031         if (IS_ERR(link))
5032                 return PTR_ERR(link);
5033
5034         if (link->ops->update_map) {
5035                 ret = link_update_map(link, attr);
5036                 goto out_put_link;
5037         }
5038
5039         new_prog = bpf_prog_get(attr->link_update.new_prog_fd);
5040         if (IS_ERR(new_prog)) {
5041                 ret = PTR_ERR(new_prog);
5042                 goto out_put_link;
5043         }
5044
5045         if (flags & BPF_F_REPLACE) {
5046                 old_prog = bpf_prog_get(attr->link_update.old_prog_fd);
5047                 if (IS_ERR(old_prog)) {
5048                         ret = PTR_ERR(old_prog);
5049                         old_prog = NULL;
5050                         goto out_put_progs;
5051                 }
5052         } else if (attr->link_update.old_prog_fd) {
5053                 ret = -EINVAL;
5054                 goto out_put_progs;
5055         }
5056
5057         if (link->ops->update_prog)
5058                 ret = link->ops->update_prog(link, new_prog, old_prog);
5059         else
5060                 ret = -EINVAL;
5061
5062 out_put_progs:
5063         if (old_prog)
5064                 bpf_prog_put(old_prog);
5065         if (ret)
5066                 bpf_prog_put(new_prog);
5067 out_put_link:
5068         bpf_link_put_direct(link);
5069         return ret;
5070 }
5071
5072 #define BPF_LINK_DETACH_LAST_FIELD link_detach.link_fd
5073
5074 static int link_detach(union bpf_attr *attr)
5075 {
5076         struct bpf_link *link;
5077         int ret;
5078
5079         if (CHECK_ATTR(BPF_LINK_DETACH))
5080                 return -EINVAL;
5081
5082         link = bpf_link_get_from_fd(attr->link_detach.link_fd);
5083         if (IS_ERR(link))
5084                 return PTR_ERR(link);
5085
5086         if (link->ops->detach)
5087                 ret = link->ops->detach(link);
5088         else
5089                 ret = -EOPNOTSUPP;
5090
5091         bpf_link_put_direct(link);
5092         return ret;
5093 }
5094
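/* Take a new reference on a link, unless its refcount has already
 * dropped to zero and the link is about to be freed.
 */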
5095 static struct bpf_link *bpf_link_inc_not_zero(struct bpf_link *link)
5096 {
5097         return atomic64_fetch_add_unless(&link->refcnt, 1, 0) ? link : ERR_PTR(-ENOENT);
5098 }
5099
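/* Look up a link by its global ID and take a reference on it. A link
 * that is still being created (ID not yet assigned) reports -EAGAIN.
 */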
5100 struct bpf_link *bpf_link_by_id(u32 id)
5101 {
5102         struct bpf_link *link;
5103
5104         if (!id)
5105                 return ERR_PTR(-ENOENT);
5106
5107         spin_lock_bh(&link_idr_lock);
5108         /* before the link is "settled" its ID is 0; pretend it doesn't exist yet */
5109         link = idr_find(&link_idr, id);
5110         if (link) {
5111                 if (link->id)
5112                         link = bpf_link_inc_not_zero(link);
5113                 else
5114                         link = ERR_PTR(-EAGAIN);
5115         } else {
5116                 link = ERR_PTR(-ENOENT);
5117         }
5118         spin_unlock_bh(&link_idr_lock);
5119         return link;
5120 }
5121
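/* Return a referenced link with ID >= *id, skipping links whose refcount
 * has already hit zero; on success *id holds the returned link's ID.
 */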
5122 struct bpf_link *bpf_link_get_curr_or_next(u32 *id)
5123 {
5124         struct bpf_link *link;
5125
5126         spin_lock_bh(&link_idr_lock);
5127 again:
5128         link = idr_get_next(&link_idr, id);
5129         if (link) {
5130                 link = bpf_link_inc_not_zero(link);
5131                 if (IS_ERR(link)) {
5132                         (*id)++;
5133                         goto again;
5134                 }
5135         }
5136         spin_unlock_bh(&link_idr_lock);
5137
5138         return link;
5139 }
5140
5141 #define BPF_LINK_GET_FD_BY_ID_LAST_FIELD link_id
5142
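/* BPF_LINK_GET_FD_BY_ID: CAP_SYS_ADMIN-only conversion of a global link
 * ID into a new fd.
 */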
5143 static int bpf_link_get_fd_by_id(const union bpf_attr *attr)
5144 {
5145         struct bpf_link *link;
5146         u32 id = attr->link_id;
5147         int fd;
5148
5149         if (CHECK_ATTR(BPF_LINK_GET_FD_BY_ID))
5150                 return -EINVAL;
5151
5152         if (!capable(CAP_SYS_ADMIN))
5153                 return -EPERM;
5154
5155         link = bpf_link_by_id(id);
5156         if (IS_ERR(link))
5157                 return PTR_ERR(link);
5158
5159         fd = bpf_link_new_fd(link);
5160         if (fd < 0)
5161                 bpf_link_put_direct(link);
5162
5163         return fd;
5164 }
5165
5166 DEFINE_MUTEX(bpf_stats_enabled_mutex);
5167
5168 static int bpf_stats_release(struct inode *inode, struct file *file)
5169 {
5170         mutex_lock(&bpf_stats_enabled_mutex);
5171         static_key_slow_dec(&bpf_stats_enabled_key.key);
5172         mutex_unlock(&bpf_stats_enabled_mutex);
5173         return 0;
5174 }
5175
5176 static const struct file_operations bpf_stats_fops = {
5177         .release = bpf_stats_release,
5178 };
5179
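/* Enable run-time statistics for all BPF programs and return an fd that
 * keeps them enabled; bpf_stats_release() drops the static key again
 * when that fd is closed.
 *
 * Illustrative user-space sketch (not part of this file):
 *
 *	attr.enable_stats.type = BPF_STATS_RUN_TIME;
 *	fd = syscall(__NR_bpf, BPF_ENABLE_STATS, &attr, sizeof(attr));
 *	... stats remain enabled until close(fd) ...
 */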
5180 static int bpf_enable_runtime_stats(void)
5181 {
5182         int fd;
5183
5184         mutex_lock(&bpf_stats_enabled_mutex);
5185
5186         /* Cap the count far below INT_MAX so the static key refcount cannot overflow */
5187         if (static_key_count(&bpf_stats_enabled_key.key) > INT_MAX / 2) {
5188                 mutex_unlock(&bpf_stats_enabled_mutex);
5189                 return -EBUSY;
5190         }
5191
5192         fd = anon_inode_getfd("bpf-stats", &bpf_stats_fops, NULL, O_CLOEXEC);
5193         if (fd >= 0)
5194                 static_key_slow_inc(&bpf_stats_enabled_key.key);
5195
5196         mutex_unlock(&bpf_stats_enabled_mutex);
5197         return fd;
5198 }
5199
5200 #define BPF_ENABLE_STATS_LAST_FIELD enable_stats.type
5201
5202 static int bpf_enable_stats(union bpf_attr *attr)
5203 {
5205         if (CHECK_ATTR(BPF_ENABLE_STATS))
5206                 return -EINVAL;
5207
5208         if (!capable(CAP_SYS_ADMIN))
5209                 return -EPERM;
5210
5211         switch (attr->enable_stats.type) {
5212         case BPF_STATS_RUN_TIME:
5213                 return bpf_enable_runtime_stats();
5214         default:
5215                 break;
5216         }
5217         return -EINVAL;
5218 }
5219
5220 #define BPF_ITER_CREATE_LAST_FIELD iter_create.flags
5221
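/* BPF_ITER_CREATE: create an iterator fd from an attached bpf_iter link;
 * read(2) on the returned fd drives the iterator program.
 */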
5222 static int bpf_iter_create(union bpf_attr *attr)
5223 {
5224         struct bpf_link *link;
5225         int err;
5226
5227         if (CHECK_ATTR(BPF_ITER_CREATE))
5228                 return -EINVAL;
5229
5230         if (attr->iter_create.flags)
5231                 return -EINVAL;
5232
5233         link = bpf_link_get_from_fd(attr->iter_create.link_fd);
5234         if (IS_ERR(link))
5235                 return PTR_ERR(link);
5236
5237         err = bpf_iter_new_fd(link);
5238         bpf_link_put_direct(link);
5239
5240         return err;
5241 }
5242
5243 #define BPF_PROG_BIND_MAP_LAST_FIELD prog_bind_map.flags
5244
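/* BPF_PROG_BIND_MAP: tie a map's lifetime to a program that does not
 * otherwise reference it (e.g. metadata maps) by appending it to the
 * program's used_maps array. Binding an already-bound map is a no-op.
 */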
5245 static int bpf_prog_bind_map(union bpf_attr *attr)
5246 {
5247         struct bpf_prog *prog;
5248         struct bpf_map *map;
5249         struct bpf_map **used_maps_old, **used_maps_new;
5250         int i, ret = 0;
5251
5252         if (CHECK_ATTR(BPF_PROG_BIND_MAP))
5253                 return -EINVAL;
5254
5255         if (attr->prog_bind_map.flags)
5256                 return -EINVAL;
5257
5258         prog = bpf_prog_get(attr->prog_bind_map.prog_fd);
5259         if (IS_ERR(prog))
5260                 return PTR_ERR(prog);
5261
5262         map = bpf_map_get(attr->prog_bind_map.map_fd);
5263         if (IS_ERR(map)) {
5264                 ret = PTR_ERR(map);
5265                 goto out_prog_put;
5266         }
5267
5268         mutex_lock(&prog->aux->used_maps_mutex);
5269
5270         used_maps_old = prog->aux->used_maps;
5271
5272         for (i = 0; i < prog->aux->used_map_cnt; i++)
5273                 if (used_maps_old[i] == map) {
5274                         bpf_map_put(map);
5275                         goto out_unlock;
5276                 }
5277
5278         used_maps_new = kmalloc_array(prog->aux->used_map_cnt + 1,
5279                                       sizeof(used_maps_new[0]),
5280                                       GFP_KERNEL);
5281         if (!used_maps_new) {
5282                 ret = -ENOMEM;
5283                 goto out_unlock;
5284         }
5285
5286         memcpy(used_maps_new, used_maps_old,
5287                sizeof(used_maps_old[0]) * prog->aux->used_map_cnt);
5288         used_maps_new[prog->aux->used_map_cnt] = map;
5289
5290         prog->aux->used_map_cnt++;
5291         prog->aux->used_maps = used_maps_new;
5292
5293         kfree(used_maps_old);
5294
5295 out_unlock:
5296         mutex_unlock(&prog->aux->used_maps_mutex);
5297
5298         if (ret)
5299                 bpf_map_put(map);
5300 out_prog_put:
5301         bpf_prog_put(prog);
5302         return ret;
5303 }
5304
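/* Top-level bpf(2) dispatcher, also reachable from BPF_PROG_TYPE_SYSCALL
 * programs via the bpf_sys_bpf() helper: copy in a possibly shorter
 * bpf_attr, consult the LSM, then fan out per command.
 */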
5305 static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size)
5306 {
5307         union bpf_attr attr;
5308         int err;
5309
5310         err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size);
5311         if (err)
5312                 return err;
5313         size = min_t(u32, size, sizeof(attr));
5314
5315         /* copy attributes from user space; may be less than sizeof(bpf_attr) */
5316         memset(&attr, 0, sizeof(attr));
5317         if (copy_from_bpfptr(&attr, uattr, size) != 0)
5318                 return -EFAULT;
5319
5320         err = security_bpf(cmd, &attr, size);
5321         if (err < 0)
5322                 return err;
5323
5324         switch (cmd) {
5325         case BPF_MAP_CREATE:
5326                 err = map_create(&attr);
5327                 break;
5328         case BPF_MAP_LOOKUP_ELEM:
5329                 err = map_lookup_elem(&attr);
5330                 break;
5331         case BPF_MAP_UPDATE_ELEM:
5332                 err = map_update_elem(&attr, uattr);
5333                 break;
5334         case BPF_MAP_DELETE_ELEM:
5335                 err = map_delete_elem(&attr, uattr);
5336                 break;
5337         case BPF_MAP_GET_NEXT_KEY:
5338                 err = map_get_next_key(&attr);
5339                 break;
5340         case BPF_MAP_FREEZE:
5341                 err = map_freeze(&attr);
5342                 break;
5343         case BPF_PROG_LOAD:
5344                 err = bpf_prog_load(&attr, uattr, size);
5345                 break;
5346         case BPF_OBJ_PIN:
5347                 err = bpf_obj_pin(&attr);
5348                 break;
5349         case BPF_OBJ_GET:
5350                 err = bpf_obj_get(&attr);
5351                 break;
5352         case BPF_PROG_ATTACH:
5353                 err = bpf_prog_attach(&attr);
5354                 break;
5355         case BPF_PROG_DETACH:
5356                 err = bpf_prog_detach(&attr);
5357                 break;
5358         case BPF_PROG_QUERY:
5359                 err = bpf_prog_query(&attr, uattr.user);
5360                 break;
5361         case BPF_PROG_TEST_RUN:
5362                 err = bpf_prog_test_run(&attr, uattr.user);
5363                 break;
5364         case BPF_PROG_GET_NEXT_ID:
5365                 err = bpf_obj_get_next_id(&attr, uattr.user,
5366                                           &prog_idr, &prog_idr_lock);
5367                 break;
5368         case BPF_MAP_GET_NEXT_ID:
5369                 err = bpf_obj_get_next_id(&attr, uattr.user,
5370                                           &map_idr, &map_idr_lock);
5371                 break;
5372         case BPF_BTF_GET_NEXT_ID:
5373                 err = bpf_obj_get_next_id(&attr, uattr.user,
5374                                           &btf_idr, &btf_idr_lock);
5375                 break;
5376         case BPF_PROG_GET_FD_BY_ID:
5377                 err = bpf_prog_get_fd_by_id(&attr);
5378                 break;
5379         case BPF_MAP_GET_FD_BY_ID:
5380                 err = bpf_map_get_fd_by_id(&attr);
5381                 break;
5382         case BPF_OBJ_GET_INFO_BY_FD:
5383                 err = bpf_obj_get_info_by_fd(&attr, uattr.user);
5384                 break;
5385         case BPF_RAW_TRACEPOINT_OPEN:
5386                 err = bpf_raw_tracepoint_open(&attr);
5387                 break;
5388         case BPF_BTF_LOAD:
5389                 err = bpf_btf_load(&attr, uattr, size);
5390                 break;
5391         case BPF_BTF_GET_FD_BY_ID:
5392                 err = bpf_btf_get_fd_by_id(&attr);
5393                 break;
5394         case BPF_TASK_FD_QUERY:
5395                 err = bpf_task_fd_query(&attr, uattr.user);
5396                 break;
5397         case BPF_MAP_LOOKUP_AND_DELETE_ELEM:
5398                 err = map_lookup_and_delete_elem(&attr);
5399                 break;
5400         case BPF_MAP_LOOKUP_BATCH:
5401                 err = bpf_map_do_batch(&attr, uattr.user, BPF_MAP_LOOKUP_BATCH);
5402                 break;
5403         case BPF_MAP_LOOKUP_AND_DELETE_BATCH:
5404                 err = bpf_map_do_batch(&attr, uattr.user,
5405                                        BPF_MAP_LOOKUP_AND_DELETE_BATCH);
5406                 break;
5407         case BPF_MAP_UPDATE_BATCH:
5408                 err = bpf_map_do_batch(&attr, uattr.user, BPF_MAP_UPDATE_BATCH);
5409                 break;
5410         case BPF_MAP_DELETE_BATCH:
5411                 err = bpf_map_do_batch(&attr, uattr.user, BPF_MAP_DELETE_BATCH);
5412                 break;
5413         case BPF_LINK_CREATE:
5414                 err = link_create(&attr, uattr);
5415                 break;
5416         case BPF_LINK_UPDATE:
5417                 err = link_update(&attr);
5418                 break;
5419         case BPF_LINK_GET_FD_BY_ID:
5420                 err = bpf_link_get_fd_by_id(&attr);
5421                 break;
5422         case BPF_LINK_GET_NEXT_ID:
5423                 err = bpf_obj_get_next_id(&attr, uattr.user,
5424                                           &link_idr, &link_idr_lock);
5425                 break;
5426         case BPF_ENABLE_STATS:
5427                 err = bpf_enable_stats(&attr);
5428                 break;
5429         case BPF_ITER_CREATE:
5430                 err = bpf_iter_create(&attr);
5431                 break;
5432         case BPF_LINK_DETACH:
5433                 err = link_detach(&attr);
5434                 break;
5435         case BPF_PROG_BIND_MAP:
5436                 err = bpf_prog_bind_map(&attr);
5437                 break;
5438         default:
5439                 err = -EINVAL;
5440                 break;
5441         }
5442
5443         return err;
5444 }
5445
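/* The bpf(2) system call entry point. */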
5446 SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
5447 {
5448         return __sys_bpf(cmd, USER_BPFPTR(uattr), size);
5449 }
5450
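/* The BPF_PROG_TYPE_SYSCALL context is a plain memory blob: any aligned,
 * in-bounds access is valid.
 */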
5451 static bool syscall_prog_is_valid_access(int off, int size,
5452                                          enum bpf_access_type type,
5453                                          const struct bpf_prog *prog,
5454                                          struct bpf_insn_access_aux *info)
5455 {
5456         if (off < 0 || off >= U16_MAX)
5457                 return false;
5458         if (off % size != 0)
5459                 return false;
5460         return true;
5461 }
5462
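/* bpf_sys_bpf() helper: lets BPF_PROG_TYPE_SYSCALL programs re-enter the
 * bpf syscall with a kernel-memory attr. Only an allowlist of commands
 * that are safe to nest is accepted.
 */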
5463 BPF_CALL_3(bpf_sys_bpf, int, cmd, union bpf_attr *, attr, u32, attr_size)
5464 {
5465         switch (cmd) {
5466         case BPF_MAP_CREATE:
5467         case BPF_MAP_DELETE_ELEM:
5468         case BPF_MAP_UPDATE_ELEM:
5469         case BPF_MAP_FREEZE:
5470         case BPF_MAP_GET_FD_BY_ID:
5471         case BPF_PROG_LOAD:
5472         case BPF_BTF_LOAD:
5473         case BPF_LINK_CREATE:
5474         case BPF_RAW_TRACEPOINT_OPEN:
5475                 break;
5476         default:
5477                 return -EINVAL;
5478         }
5479         return __sys_bpf(cmd, KERNEL_BPFPTR(attr), attr_size);
5480 }
5481
5483 /* To shut up -Wmissing-prototypes.
5484  * This function is used by the kernel light skeleton
5485  * to load bpf programs when modules are loaded or during kernel boot.
5486  * See tools/lib/bpf/skel_internal.h
5487  */
5488 int kern_sys_bpf(int cmd, union bpf_attr *attr, unsigned int size);
5489
5490 int kern_sys_bpf(int cmd, union bpf_attr *attr, unsigned int size)
5491 {
5492         struct bpf_prog * __maybe_unused prog;
5493         struct bpf_tramp_run_ctx __maybe_unused run_ctx;
5494
5495         switch (cmd) {
5496 #ifdef CONFIG_BPF_JIT /* __bpf_prog_enter_sleepable used by trampoline and JIT */
5497         case BPF_PROG_TEST_RUN:
5498                 if (attr->test.data_in || attr->test.data_out ||
5499                     attr->test.ctx_out || attr->test.duration ||
5500                     attr->test.repeat || attr->test.flags)
5501                         return -EINVAL;
5502
5503                 prog = bpf_prog_get_type(attr->test.prog_fd, BPF_PROG_TYPE_SYSCALL);
5504                 if (IS_ERR(prog))
5505                         return PTR_ERR(prog);
5506
5507                 if (attr->test.ctx_size_in < prog->aux->max_ctx_offset ||
5508                     attr->test.ctx_size_in > U16_MAX) {
5509                         bpf_prog_put(prog);
5510                         return -EINVAL;
5511                 }
5512
5513                 run_ctx.bpf_cookie = 0;
5514                 if (!__bpf_prog_enter_sleepable_recur(prog, &run_ctx)) {
5515                         /* recursion detected */
5516                         __bpf_prog_exit_sleepable_recur(prog, 0, &run_ctx);
5517                         bpf_prog_put(prog);
5518                         return -EBUSY;
5519                 }
5520                 attr->test.retval = bpf_prog_run(prog, (void *) (long) attr->test.ctx_in);
5521                 __bpf_prog_exit_sleepable_recur(prog, 0 /* bpf_prog_run does runtime stats */,
5522                                                 &run_ctx);
5523                 bpf_prog_put(prog);
5524                 return 0;
5525 #endif
5526         default:
5527                 return ____bpf_sys_bpf(cmd, attr, size);
5528         }
5529 }
5530 EXPORT_SYMBOL(kern_sys_bpf);
5531
5532 static const struct bpf_func_proto bpf_sys_bpf_proto = {
5533         .func           = bpf_sys_bpf,
5534         .gpl_only       = false,
5535         .ret_type       = RET_INTEGER,
5536         .arg1_type      = ARG_ANYTHING,
5537         .arg2_type      = ARG_PTR_TO_MEM | MEM_RDONLY,
5538         .arg3_type      = ARG_CONST_SIZE,
5539 };
5540
5541 const struct bpf_func_proto * __weak
5542 tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
5543 {
5544         return bpf_base_func_proto(func_id);
5545 }
5546
5547 BPF_CALL_1(bpf_sys_close, u32, fd)
5548 {
5549         /* When a bpf program calls this helper, there must not be
5550          * an outstanding fdget() without a matching, completed fdput().
5551          * This helper is only allowed in the following call chain:
5552          * sys_bpf->prog_test_run->bpf_prog->bpf_sys_close
5553          */
5554         return close_fd(fd);
5555 }
5556
5557 static const struct bpf_func_proto bpf_sys_close_proto = {
5558         .func           = bpf_sys_close,
5559         .gpl_only       = false,
5560         .ret_type       = RET_INTEGER,
5561         .arg1_type      = ARG_ANYTHING,
5562 };
5563
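/* bpf_kallsyms_lookup_name() helper: resolve a NUL-terminated symbol
 * name to its kernel address, gated on bpf_dump_raw_ok() so callers
 * cannot use it to leak kernel addresses.
 */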
5564 BPF_CALL_4(bpf_kallsyms_lookup_name, const char *, name, int, name_sz, int, flags, u64 *, res)
5565 {
5566         if (flags)
5567                 return -EINVAL;
5568
5569         if (name_sz <= 1 || name[name_sz - 1])
5570                 return -EINVAL;
5571
5572         if (!bpf_dump_raw_ok(current_cred()))
5573                 return -EPERM;
5574
5575         *res = kallsyms_lookup_name(name);
5576         return *res ? 0 : -ENOENT;
5577 }
5578
5579 static const struct bpf_func_proto bpf_kallsyms_lookup_name_proto = {
5580         .func           = bpf_kallsyms_lookup_name,
5581         .gpl_only       = false,
5582         .ret_type       = RET_INTEGER,
5583         .arg1_type      = ARG_PTR_TO_MEM,
5584         .arg2_type      = ARG_CONST_SIZE_OR_ZERO,
5585         .arg3_type      = ARG_ANYTHING,
5586         .arg4_type      = ARG_PTR_TO_LONG,
5587 };
5588
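/* Helper set available to BPF_PROG_TYPE_SYSCALL programs; bpf_sys_bpf()
 * itself additionally requires perfmon_capable().
 */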
5589 static const struct bpf_func_proto *
5590 syscall_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
5591 {
5592         switch (func_id) {
5593         case BPF_FUNC_sys_bpf:
5594                 return !perfmon_capable() ? NULL : &bpf_sys_bpf_proto;
5595         case BPF_FUNC_btf_find_by_name_kind:
5596                 return &bpf_btf_find_by_name_kind_proto;
5597         case BPF_FUNC_sys_close:
5598                 return &bpf_sys_close_proto;
5599         case BPF_FUNC_kallsyms_lookup_name:
5600                 return &bpf_kallsyms_lookup_name_proto;
5601         default:
5602                 return tracing_prog_func_proto(func_id, prog);
5603         }
5604 }
5605
5606 const struct bpf_verifier_ops bpf_syscall_verifier_ops = {
5607         .get_func_proto  = syscall_prog_func_proto,
5608         .is_valid_access = syscall_prog_is_valid_access,
5609 };
5610
5611 const struct bpf_prog_ops bpf_syscall_prog_ops = {
5612         .test_run = bpf_prog_test_run_syscall,
5613 };
5614
5615 #ifdef CONFIG_SYSCTL
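/* Handler for the kernel.bpf_stats_enabled sysctl: map the boolean value
 * onto static-key inc/dec, serialized against the fd-based
 * BPF_ENABLE_STATS interface by bpf_stats_enabled_mutex.
 */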
5616 static int bpf_stats_handler(struct ctl_table *table, int write,
5617                              void *buffer, size_t *lenp, loff_t *ppos)
5618 {
5619         struct static_key *key = (struct static_key *)table->data;
5620         static int saved_val;
5621         int val, ret;
5622         struct ctl_table tmp = {
5623                 .data   = &val,
5624                 .maxlen = sizeof(val),
5625                 .mode   = table->mode,
5626                 .extra1 = SYSCTL_ZERO,
5627                 .extra2 = SYSCTL_ONE,
5628         };
5629
5630         if (write && !capable(CAP_SYS_ADMIN))
5631                 return -EPERM;
5632
5633         mutex_lock(&bpf_stats_enabled_mutex);
5634         val = saved_val;
5635         ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
5636         if (write && !ret && val != saved_val) {
5637                 if (val)
5638                         static_key_slow_inc(key);
5639                 else
5640                         static_key_slow_dec(key);
5641                 saved_val = val;
5642         }
5643         mutex_unlock(&bpf_stats_enabled_mutex);
5644         return ret;
5645 }
5646
5647 void __weak unpriv_ebpf_notify(int new_state)
5648 {
5649 }
5650
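/* Handler for the kernel.unprivileged_bpf_disabled sysctl. Value 1 is a
 * one-way lock: once set, it cannot be changed back to 0 or 2 at
 * runtime.
 */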
5651 static int bpf_unpriv_handler(struct ctl_table *table, int write,
5652                               void *buffer, size_t *lenp, loff_t *ppos)
5653 {
5654         int ret, unpriv_enable = *(int *)table->data;
5655         bool locked_state = unpriv_enable == 1;
5656         struct ctl_table tmp = *table;
5657
5658         if (write && !capable(CAP_SYS_ADMIN))
5659                 return -EPERM;
5660
5661         tmp.data = &unpriv_enable;
5662         ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
5663         if (write && !ret) {
5664                 if (locked_state && unpriv_enable != 1)
5665                         return -EPERM;
5666                 *(int *)table->data = unpriv_enable;
5667         }
5668
5669         if (write)
5670                 unpriv_ebpf_notify(unpriv_enable);
5671
5672         return ret;
5673 }
5674
5675 static struct ctl_table bpf_syscall_table[] = {
5676         {
5677                 .procname       = "unprivileged_bpf_disabled",
5678                 .data           = &sysctl_unprivileged_bpf_disabled,
5679                 .maxlen         = sizeof(sysctl_unprivileged_bpf_disabled),
5680                 .mode           = 0644,
5681                 .proc_handler   = bpf_unpriv_handler,
5682                 .extra1         = SYSCTL_ZERO,
5683                 .extra2         = SYSCTL_TWO,
5684         },
5685         {
5686                 .procname       = "bpf_stats_enabled",
5687                 .data           = &bpf_stats_enabled_key.key,
5688                 .mode           = 0644,
5689                 .proc_handler   = bpf_stats_handler,
5690         },
5691         { }
5692 };
5693
5694 static int __init bpf_syscall_sysctl_init(void)
5695 {
5696         register_sysctl_init("kernel", bpf_syscall_table);
5697         return 0;
5698 }
5699 late_initcall(bpf_syscall_sysctl_init);
5700 #endif /* CONFIG_SYSCTL */