Commit | Line | Data |
---|---|---|
fec56f58 AS |
1 | // SPDX-License-Identifier: GPL-2.0-only |
2 | /* Copyright (c) 2019 Facebook */ | |
3 | #include <linux/hash.h> | |
4 | #include <linux/bpf.h> | |
5 | #include <linux/filter.h> | |
b91e014f | 6 | #include <linux/ftrace.h> |
e9b4e606 | 7 | #include <linux/rbtree_latch.h> |
a108f7dc | 8 | #include <linux/perf_event.h> |
9e4e01df | 9 | #include <linux/btf.h> |
1e6c62a8 AS |
10 | #include <linux/rcupdate_trace.h> |
11 | #include <linux/rcupdate_wait.h> | |
fec56f58 | 12 | |
/* dummy _ops. The verifier will operate on target program's ops.
 * BPF_PROG_TYPE_EXT programs are verified against the program they
 * extend, so these tables are intentionally empty.
 */
const struct bpf_verifier_ops bpf_extension_verifier_ops = {
};
const struct bpf_prog_ops bpf_extension_prog_ops = {
};
/* btf_vmlinux has ~22k attachable functions. 1k htab is enough. */
#define TRAMPOLINE_HASH_BITS 10
#define TRAMPOLINE_TABLE_SIZE (1 << TRAMPOLINE_HASH_BITS)

/* key -> bpf_trampoline map, populated lazily by bpf_trampoline_lookup() */
static struct hlist_head trampoline_table[TRAMPOLINE_TABLE_SIZE];

/* serializes access to trampoline_table */
static DEFINE_MUTEX(trampoline_mutex);
7ac88eba | 28 | void *bpf_jit_alloc_exec_page(void) |
98e8627e BT |
29 | { |
30 | void *image; | |
31 | ||
32 | image = bpf_jit_alloc_exec(PAGE_SIZE); | |
33 | if (!image) | |
34 | return NULL; | |
35 | ||
36 | set_vm_flush_reset_perms(image); | |
37 | /* Keep image as writeable. The alternative is to keep flipping ro/rw | |
38 | * everytime new program is attached or detached. | |
39 | */ | |
40 | set_memory_x((long)image, 1); | |
41 | return image; | |
42 | } | |
43 | ||
a108f7dc JO |
44 | void bpf_image_ksym_add(void *data, struct bpf_ksym *ksym) |
45 | { | |
46 | ksym->start = (unsigned long) data; | |
7ac88eba | 47 | ksym->end = ksym->start + PAGE_SIZE; |
a108f7dc JO |
48 | bpf_ksym_add(ksym); |
49 | perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF, ksym->start, | |
7ac88eba | 50 | PAGE_SIZE, false, ksym->name); |
a108f7dc JO |
51 | } |
52 | ||
53 | void bpf_image_ksym_del(struct bpf_ksym *ksym) | |
54 | { | |
55 | bpf_ksym_del(ksym); | |
56 | perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF, ksym->start, | |
7ac88eba | 57 | PAGE_SIZE, true, ksym->name); |
a108f7dc JO |
58 | } |
59 | ||
60 | static void bpf_trampoline_ksym_add(struct bpf_trampoline *tr) | |
61 | { | |
62 | struct bpf_ksym *ksym = &tr->ksym; | |
63 | ||
64 | snprintf(ksym->name, KSYM_NAME_LEN, "bpf_trampoline_%llu", tr->key); | |
65 | bpf_image_ksym_add(tr->image, ksym); | |
66 | } | |
67 | ||
/* Find the trampoline for @key, or allocate and hash a new one.
 * Returns with the trampoline's refcount taken; NULL on allocation
 * failure. Serialized by trampoline_mutex.
 */
static struct bpf_trampoline *bpf_trampoline_lookup(u64 key)
{
	struct bpf_trampoline *tr;
	struct hlist_head *head;
	void *image;
	int i;

	mutex_lock(&trampoline_mutex);
	head = &trampoline_table[hash_64(key, TRAMPOLINE_HASH_BITS)];
	hlist_for_each_entry(tr, head, hlist) {
		if (tr->key == key) {
			/* Existing trampoline: just take a reference. */
			refcount_inc(&tr->refcnt);
			goto out;
		}
	}
	tr = kzalloc(sizeof(*tr), GFP_KERNEL);
	if (!tr)
		goto out;

	/* is_root was checked earlier. No need for bpf_jit_charge_modmem() */
	image = bpf_jit_alloc_exec_page();
	if (!image) {
		/* Undo the kzalloc; caller sees plain allocation failure. */
		kfree(tr);
		tr = NULL;
		goto out;
	}

	tr->key = key;
	INIT_HLIST_NODE(&tr->hlist);
	hlist_add_head(&tr->hlist, head);
	refcount_set(&tr->refcnt, 1);
	mutex_init(&tr->mutex);
	for (i = 0; i < BPF_TRAMP_MAX; i++)
		INIT_HLIST_HEAD(&tr->progs_hlist[i]);
	tr->image = image;
	INIT_LIST_HEAD_RCU(&tr->ksym.lnode);
	bpf_trampoline_ksym_add(tr);
out:
	mutex_unlock(&trampoline_mutex);
	return tr;
}
b91e014f AS |
110 | static int is_ftrace_location(void *ip) |
111 | { | |
112 | long addr; | |
113 | ||
114 | addr = ftrace_location((long)ip); | |
115 | if (!addr) | |
116 | return 0; | |
117 | if (WARN_ON_ONCE(addr != (long)ip)) | |
118 | return -EFAULT; | |
119 | return 1; | |
120 | } | |
121 | ||
122 | static int unregister_fentry(struct bpf_trampoline *tr, void *old_addr) | |
123 | { | |
124 | void *ip = tr->func.addr; | |
125 | int ret; | |
126 | ||
127 | if (tr->func.ftrace_managed) | |
128 | ret = unregister_ftrace_direct((long)ip, (long)old_addr); | |
129 | else | |
130 | ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, NULL); | |
131 | return ret; | |
132 | } | |
133 | ||
134 | static int modify_fentry(struct bpf_trampoline *tr, void *old_addr, void *new_addr) | |
135 | { | |
136 | void *ip = tr->func.addr; | |
137 | int ret; | |
138 | ||
139 | if (tr->func.ftrace_managed) | |
140 | ret = modify_ftrace_direct((long)ip, (long)old_addr, (long)new_addr); | |
141 | else | |
142 | ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, new_addr); | |
143 | return ret; | |
144 | } | |
145 | ||
146 | /* first time registering */ | |
147 | static int register_fentry(struct bpf_trampoline *tr, void *new_addr) | |
148 | { | |
149 | void *ip = tr->func.addr; | |
150 | int ret; | |
151 | ||
152 | ret = is_ftrace_location(ip); | |
153 | if (ret < 0) | |
154 | return ret; | |
155 | tr->func.ftrace_managed = ret; | |
156 | ||
157 | if (tr->func.ftrace_managed) | |
158 | ret = register_ftrace_direct((long)ip, (long)new_addr); | |
159 | else | |
160 | ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, NULL, new_addr); | |
161 | return ret; | |
162 | } | |
163 | ||
/* Snapshot the progs attached to @tr into a kcalloc'ed array with one
 * bpf_tramp_progs entry per BPF_TRAMP_* kind; *total is set to the prog
 * count summed over all kinds. Both callers hold tr->mutex. The caller
 * must kfree() the result; returns ERR_PTR(-ENOMEM) on failure.
 */
static struct bpf_tramp_progs *
bpf_trampoline_get_progs(const struct bpf_trampoline *tr, int *total)
{
	const struct bpf_prog_aux *aux;
	struct bpf_tramp_progs *tprogs;
	struct bpf_prog **progs;
	int kind;

	*total = 0;
	tprogs = kcalloc(BPF_TRAMP_MAX, sizeof(*tprogs), GFP_KERNEL);
	if (!tprogs)
		return ERR_PTR(-ENOMEM);

	for (kind = 0; kind < BPF_TRAMP_MAX; kind++) {
		tprogs[kind].nr_progs = tr->progs_cnt[kind];
		*total += tr->progs_cnt[kind];
		progs = tprogs[kind].progs;

		hlist_for_each_entry(aux, &tr->progs_hlist[kind], tramp_hlist)
			*progs++ = aux->prog;
	}
	return tprogs;
}
fec56f58 AS |
187 | |
/* Regenerate the trampoline for the current set of attached progs and
 * re-point the patch site at it. The image page is split in two halves;
 * 'selector' parity alternates which half is generated next, so the half
 * a preempted task may still be executing is never rewritten in place.
 * Called with tr->mutex held.
 */
static int bpf_trampoline_update(struct bpf_trampoline *tr)
{
	/* old = half picked by the previous update, new = the other half */
	void *old_image = tr->image + ((tr->selector + 1) & 1) * PAGE_SIZE/2;
	void *new_image = tr->image + (tr->selector & 1) * PAGE_SIZE/2;
	struct bpf_tramp_progs *tprogs;
	u32 flags = BPF_TRAMP_F_RESTORE_REGS;
	int err, total;

	tprogs = bpf_trampoline_get_progs(tr, &total);
	if (IS_ERR(tprogs))
		return PTR_ERR(tprogs);

	if (total == 0) {
		/* No progs left: detach from the patch site entirely. */
		err = unregister_fentry(tr, old_image);
		tr->selector = 0;
		goto out;
	}

	/* fexit/fmod_ret need the original function called from inside
	 * the trampoline, which implies skipping the caller's frame.
	 */
	if (tprogs[BPF_TRAMP_FEXIT].nr_progs ||
	    tprogs[BPF_TRAMP_MODIFY_RETURN].nr_progs)
		flags = BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME;

	/* Though the second half of trampoline page is unused a task could be
	 * preempted in the middle of the first half of trampoline and two
	 * updates to trampoline would change the code from underneath the
	 * preempted task. Hence wait for tasks to voluntarily schedule or go
	 * to userspace.
	 * The same trampoline can hold both sleepable and non-sleepable progs.
	 * synchronize_rcu_tasks_trace() is needed to make sure all sleepable
	 * programs finish executing.
	 * Wait for these two grace periods together.
	 */
	synchronize_rcu_mult(call_rcu_tasks, call_rcu_tasks_trace);

	err = arch_prepare_bpf_trampoline(new_image, new_image + PAGE_SIZE / 2,
					  &tr->func.model, flags, tprogs,
					  tr->func.addr);
	if (err < 0)
		goto out;

	if (tr->selector)
		/* progs already running at this address */
		err = modify_fentry(tr, old_image, new_image);
	else
		/* first time registering */
		err = register_fentry(tr, new_image);
	if (err)
		goto out;
	/* Flip to the other half for the next update. */
	tr->selector++;
out:
	kfree(tprogs);
	return err;
}
241 | ||
9e4e01df | 242 | static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(struct bpf_prog *prog) |
fec56f58 | 243 | { |
9e4e01df | 244 | switch (prog->expected_attach_type) { |
fec56f58 AS |
245 | case BPF_TRACE_FENTRY: |
246 | return BPF_TRAMP_FENTRY; | |
ae240823 KS |
247 | case BPF_MODIFY_RETURN: |
248 | return BPF_TRAMP_MODIFY_RETURN; | |
be8704ff | 249 | case BPF_TRACE_FEXIT: |
fec56f58 | 250 | return BPF_TRAMP_FEXIT; |
9e4e01df KS |
251 | case BPF_LSM_MAC: |
252 | if (!prog->aux->attach_func_proto->type) | |
253 | /* The function returns void, we cannot modify its | |
254 | * return value. | |
255 | */ | |
256 | return BPF_TRAMP_FEXIT; | |
257 | else | |
258 | return BPF_TRAMP_MODIFY_RETURN; | |
be8704ff AS |
259 | default: |
260 | return BPF_TRAMP_REPLACE; | |
fec56f58 AS |
261 | } |
262 | } | |
263 | ||
3aac1ead | 264 | int bpf_trampoline_link_prog(struct bpf_prog *prog, struct bpf_trampoline *tr) |
fec56f58 AS |
265 | { |
266 | enum bpf_tramp_prog_type kind; | |
fec56f58 | 267 | int err = 0; |
be8704ff | 268 | int cnt; |
fec56f58 | 269 | |
9e4e01df | 270 | kind = bpf_attach_type_to_tramp(prog); |
fec56f58 | 271 | mutex_lock(&tr->mutex); |
be8704ff AS |
272 | if (tr->extension_prog) { |
273 | /* cannot attach fentry/fexit if extension prog is attached. | |
274 | * cannot overwrite extension prog either. | |
275 | */ | |
276 | err = -EBUSY; | |
277 | goto out; | |
278 | } | |
279 | cnt = tr->progs_cnt[BPF_TRAMP_FENTRY] + tr->progs_cnt[BPF_TRAMP_FEXIT]; | |
280 | if (kind == BPF_TRAMP_REPLACE) { | |
281 | /* Cannot attach extension if fentry/fexit are in use. */ | |
282 | if (cnt) { | |
283 | err = -EBUSY; | |
284 | goto out; | |
285 | } | |
286 | tr->extension_prog = prog; | |
287 | err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP, NULL, | |
288 | prog->bpf_func); | |
289 | goto out; | |
290 | } | |
291 | if (cnt >= BPF_MAX_TRAMP_PROGS) { | |
fec56f58 AS |
292 | err = -E2BIG; |
293 | goto out; | |
294 | } | |
295 | if (!hlist_unhashed(&prog->aux->tramp_hlist)) { | |
296 | /* prog already linked */ | |
297 | err = -EBUSY; | |
298 | goto out; | |
299 | } | |
300 | hlist_add_head(&prog->aux->tramp_hlist, &tr->progs_hlist[kind]); | |
301 | tr->progs_cnt[kind]++; | |
3aac1ead | 302 | err = bpf_trampoline_update(tr); |
fec56f58 AS |
303 | if (err) { |
304 | hlist_del(&prog->aux->tramp_hlist); | |
305 | tr->progs_cnt[kind]--; | |
306 | } | |
307 | out: | |
308 | mutex_unlock(&tr->mutex); | |
309 | return err; | |
310 | } | |
311 | ||
/* bpf_trampoline_unlink_prog() should never fail. */
int bpf_trampoline_unlink_prog(struct bpf_prog *prog, struct bpf_trampoline *tr)
{
	enum bpf_tramp_prog_type kind;
	int err;

	kind = bpf_attach_type_to_tramp(prog);
	mutex_lock(&tr->mutex);
	if (kind == BPF_TRAMP_REPLACE) {
		/* Extensions are attached as a direct jump at the target;
		 * detach by poking that jump back out.
		 */
		WARN_ON_ONCE(!tr->extension_prog);
		err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP,
					 tr->extension_prog->bpf_func, NULL);
		tr->extension_prog = NULL;
		goto out;
	}
	/* Drop the prog from its kind's list and regenerate the image. */
	hlist_del(&prog->aux->tramp_hlist);
	tr->progs_cnt[kind]--;
	err = bpf_trampoline_update(tr);
out:
	mutex_unlock(&tr->mutex);
	return err;
}
/* Look up (or create) the trampoline for @key and take a reference.
 * On first use, the attach target's function model and address from
 * @tgt_info are recorded in the trampoline. Returns NULL on allocation
 * failure.
 */
struct bpf_trampoline *bpf_trampoline_get(u64 key,
					  struct bpf_attach_target_info *tgt_info)
{
	struct bpf_trampoline *tr;

	tr = bpf_trampoline_lookup(key);
	if (!tr)
		return NULL;

	mutex_lock(&tr->mutex);
	if (tr->func.addr)
		/* Target already initialized by an earlier get. */
		goto out;

	memcpy(&tr->func.model, &tgt_info->fmodel, sizeof(tgt_info->fmodel));
	tr->func.addr = (void *)tgt_info->tgt_addr;
out:
	mutex_unlock(&tr->mutex);
	return tr;
}
fec56f58 AS |
355 | void bpf_trampoline_put(struct bpf_trampoline *tr) |
356 | { | |
357 | if (!tr) | |
358 | return; | |
359 | mutex_lock(&trampoline_mutex); | |
360 | if (!refcount_dec_and_test(&tr->refcnt)) | |
361 | goto out; | |
362 | WARN_ON_ONCE(mutex_is_locked(&tr->mutex)); | |
363 | if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[BPF_TRAMP_FENTRY]))) | |
364 | goto out; | |
365 | if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[BPF_TRAMP_FEXIT]))) | |
366 | goto out; | |
a108f7dc | 367 | bpf_image_ksym_del(&tr->ksym); |
1e6c62a8 AS |
368 | /* This code will be executed when all bpf progs (both sleepable and |
369 | * non-sleepable) went through | |
370 | * bpf_prog_put()->call_rcu[_tasks_trace]()->bpf_prog_free_deferred(). | |
371 | * Hence no need for another synchronize_rcu_tasks_trace() here, | |
372 | * but synchronize_rcu_tasks() is still needed, since trampoline | |
373 | * may not have had any sleepable programs and we need to wait | |
374 | * for tasks to get out of trampoline code before freeing it. | |
375 | */ | |
05d57f17 | 376 | synchronize_rcu_tasks(); |
7ac88eba | 377 | bpf_jit_free_exec(tr->image); |
fec56f58 AS |
378 | hlist_del(&tr->hlist); |
379 | kfree(tr); | |
380 | out: | |
381 | mutex_unlock(&trampoline_mutex); | |
382 | } | |
383 | ||
ca06f55b | 384 | #define NO_START_TIME 1 |
f2dd3b39 AS |
385 | static u64 notrace bpf_prog_start_time(void) |
386 | { | |
387 | u64 start = NO_START_TIME; | |
388 | ||
ca06f55b | 389 | if (static_branch_unlikely(&bpf_stats_enabled_key)) { |
f2dd3b39 | 390 | start = sched_clock(); |
ca06f55b AS |
391 | if (unlikely(!start)) |
392 | start = NO_START_TIME; | |
393 | } | |
f2dd3b39 AS |
394 | return start; |
395 | } | |
396 | ||
/* The logic is similar to BPF_PROG_RUN, but with an explicit
 * rcu_read_lock() and migrate_disable() which are required
 * for the trampoline. The macro is split into
 * call __bpf_prog_enter
 * call prog->bpf_func
 * call __bpf_prog_exit
 *
 * __bpf_prog_enter returns:
 * 0 - skip execution of the bpf prog
 * 1 - execute bpf prog
 * [2..MAX_U64] - execute bpf prog and record execution time.
 *     This is start time.
 */
u64 notrace __bpf_prog_enter(struct bpf_prog *prog)
	__acquires(RCU)
{
	rcu_read_lock();
	migrate_disable();
	/* Per-CPU recursion guard: if this prog is already active on this
	 * CPU, return 0 so its execution is skipped. __bpf_prog_exit()
	 * decrements the counter unconditionally, balancing this inc.
	 */
	if (unlikely(__this_cpu_inc_return(*(prog->active)) != 1))
		return 0;
	return bpf_prog_start_time();
}
f2dd3b39 AS |
420 | static void notrace update_prog_stats(struct bpf_prog *prog, |
421 | u64 start) | |
fec56f58 AS |
422 | { |
423 | struct bpf_prog_stats *stats; | |
424 | ||
425 | if (static_branch_unlikely(&bpf_stats_enabled_key) && | |
f2dd3b39 AS |
426 | /* static_key could be enabled in __bpf_prog_enter* |
427 | * and disabled in __bpf_prog_exit*. | |
fec56f58 | 428 | * And vice versa. |
f2dd3b39 | 429 | * Hence check that 'start' is valid. |
fec56f58 | 430 | */ |
f2dd3b39 | 431 | start > NO_START_TIME) { |
700d4796 | 432 | stats = this_cpu_ptr(prog->stats); |
fec56f58 AS |
433 | u64_stats_update_begin(&stats->syncp); |
434 | stats->cnt++; | |
435 | stats->nsecs += sched_clock() - start; | |
436 | u64_stats_update_end(&stats->syncp); | |
437 | } | |
f2dd3b39 AS |
438 | } |
/* Mirror of __bpf_prog_enter(): record stats, drop the per-CPU recursion
 * counter, then undo migrate_disable() and rcu_read_lock() in reverse
 * order.
 */
void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start)
	__releases(RCU)
{
	update_prog_stats(prog, start);
	__this_cpu_dec(*(prog->active));
	migrate_enable();
	rcu_read_unlock();
}
448 | ||
ca06f55b | 449 | u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog) |
1e6c62a8 AS |
450 | { |
451 | rcu_read_lock_trace(); | |
031d6e02 | 452 | migrate_disable(); |
f56407fa | 453 | might_fault(); |
ca06f55b AS |
454 | if (unlikely(__this_cpu_inc_return(*(prog->active)) != 1)) |
455 | return 0; | |
f2dd3b39 | 456 | return bpf_prog_start_time(); |
1e6c62a8 AS |
457 | } |
/* Mirror of __bpf_prog_enter_sleepable(): stats, recursion counter,
 * then migrate_enable() and rcu_read_unlock_trace() in reverse order.
 */
void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start)
{
	update_prog_stats(prog, start);
	__this_cpu_dec(*(prog->active));
	migrate_enable();
	rcu_read_unlock_trace();
}
466 | ||
/* Weak default: arch code overrides this to emit the actual trampoline
 * between [image, image_end). Returning -ENOTSUPP signals that this
 * architecture has no trampoline support.
 */
int __weak
arch_prepare_bpf_trampoline(void *image, void *image_end,
			    const struct btf_func_model *m, u32 flags,
			    struct bpf_tramp_progs *tprogs,
			    void *orig_call)
{
	return -ENOTSUPP;
}
476 | static int __init init_trampolines(void) | |
477 | { | |
478 | int i; | |
479 | ||
480 | for (i = 0; i < TRAMPOLINE_TABLE_SIZE; i++) | |
481 | INIT_HLIST_HEAD(&trampoline_table[i]); | |
482 | return 0; | |
483 | } | |
484 | late_initcall(init_trampolines); |