Commit | Line | Data |
---|---|---|
eaaacd23 YS |
1 | // SPDX-License-Identifier: GPL-2.0-only |
2 | /* Copyright (c) 2020 Facebook */ | |
3 | ||
4 | #include <linux/init.h> | |
5 | #include <linux/namei.h> | |
6 | #include <linux/pid_namespace.h> | |
7 | #include <linux/fs.h> | |
8 | #include <linux/fdtable.h> | |
9 | #include <linux/filter.h> | |
951cf368 | 10 | #include <linux/btf_ids.h> |
7c7e3d31 | 11 | #include "mmap_unlock_work.h" |
eaaacd23 | 12 | |
2c4fe44f KFL |
/* Human-readable names for enum bpf_iter_task_type, indexed by the enum
 * value; printed by bpf_iter_task_show_fdinfo() below.
 */
static const char * const iter_task_type_names[] = {
	"ALL",
	"TID",
	"PID",
};
18 | ||
eaaacd23 YS |
/* State shared by all three task iterators (task, task_file, task_vma).
 * Must be the first member of each per-iterator seq_file private struct;
 * {init,fini}_seq_pidns() rely on that layout.
 */
struct bpf_iter_seq_task_common {
	struct pid_namespace *ns;	/* pid namespace of the creator, pinned */
	enum bpf_iter_task_type type;	/* ALL, TID or TGID iteration mode */
	u32 pid;			/* target tid/pid for TID/TGID modes */
	u32 pid_visiting;		/* thread currently visited in TGID mode */
};
25 | ||
struct bpf_iter_seq_task_info {
	/* The first field must be struct bpf_iter_seq_task_common.
	 * this is assumed by {init, fini}_seq_pidns() callback functions.
	 */
	struct bpf_iter_seq_task_common common;
	u32 tid;	/* iteration cursor: next tid to visit */
};
33 | ||
f0d74c4d KFL |
34 | static struct task_struct *task_group_seq_get_next(struct bpf_iter_seq_task_common *common, |
35 | u32 *tid, | |
36 | bool skip_if_dup_files) | |
37 | { | |
38 | struct task_struct *task, *next_task; | |
39 | struct pid *pid; | |
40 | u32 saved_tid; | |
41 | ||
42 | if (!*tid) { | |
43 | /* The first time, the iterator calls this function. */ | |
44 | pid = find_pid_ns(common->pid, common->ns); | |
45 | if (!pid) | |
46 | return NULL; | |
47 | ||
48 | task = get_pid_task(pid, PIDTYPE_TGID); | |
49 | if (!task) | |
50 | return NULL; | |
51 | ||
52 | *tid = common->pid; | |
53 | common->pid_visiting = common->pid; | |
54 | ||
55 | return task; | |
56 | } | |
57 | ||
58 | /* If the control returns to user space and comes back to the | |
59 | * kernel again, *tid and common->pid_visiting should be the | |
60 | * same for task_seq_start() to pick up the correct task. | |
61 | */ | |
62 | if (*tid == common->pid_visiting) { | |
63 | pid = find_pid_ns(common->pid_visiting, common->ns); | |
64 | task = get_pid_task(pid, PIDTYPE_PID); | |
65 | ||
66 | return task; | |
67 | } | |
68 | ||
69 | pid = find_pid_ns(common->pid_visiting, common->ns); | |
70 | if (!pid) | |
71 | return NULL; | |
72 | ||
73 | task = get_pid_task(pid, PIDTYPE_PID); | |
74 | if (!task) | |
75 | return NULL; | |
76 | ||
77 | retry: | |
78 | if (!pid_alive(task)) { | |
79 | put_task_struct(task); | |
80 | return NULL; | |
81 | } | |
82 | ||
83 | next_task = next_thread(task); | |
84 | put_task_struct(task); | |
85 | if (!next_task) | |
86 | return NULL; | |
87 | ||
88 | saved_tid = *tid; | |
89 | *tid = __task_pid_nr_ns(next_task, PIDTYPE_PID, common->ns); | |
90 | if (!*tid || *tid == common->pid) { | |
91 | /* Run out of tasks of a process. The tasks of a | |
92 | * thread_group are linked as circular linked list. | |
93 | */ | |
94 | *tid = saved_tid; | |
95 | return NULL; | |
96 | } | |
97 | ||
98 | get_task_struct(next_task); | |
99 | common->pid_visiting = *tid; | |
100 | ||
101 | if (skip_if_dup_files && task->files == task->group_leader->files) { | |
102 | task = next_task; | |
103 | goto retry; | |
104 | } | |
105 | ||
106 | return next_task; | |
107 | } | |
108 | ||
/* Return the next task to visit according to common->type, holding a
 * task_struct reference the caller must drop, or NULL when done.
 *
 * TID mode:  return the single target task exactly once.
 * TGID mode: delegate to task_group_seq_get_next() under RCU.
 * ALL mode:  scan the pid namespace with find_ge_pid(), advancing *tid.
 */
static struct task_struct *task_seq_get_next(struct bpf_iter_seq_task_common *common,
					     u32 *tid,
					     bool skip_if_dup_files)
{
	struct task_struct *task = NULL;
	struct pid *pid;

	if (common->type == BPF_TASK_ITER_TID) {
		/* Single-task iteration: a second call (non-zero *tid
		 * that already equals the target) terminates.
		 */
		if (*tid && *tid != common->pid)
			return NULL;
		rcu_read_lock();
		pid = find_pid_ns(common->pid, common->ns);
		if (pid) {
			task = get_pid_task(pid, PIDTYPE_TGID);
			*tid = common->pid;
		}
		rcu_read_unlock();

		return task;
	}

	if (common->type == BPF_TASK_ITER_TGID) {
		rcu_read_lock();
		task = task_group_seq_get_next(common, tid, skip_if_dup_files);
		rcu_read_unlock();

		return task;
	}

	rcu_read_lock();
retry:
	/* ALL mode: find the first pid >= *tid in the namespace. */
	pid = find_ge_pid(*tid, common->ns);
	if (pid) {
		*tid = pid_nr_ns(pid, common->ns);
		task = get_pid_task(pid, PIDTYPE_PID);
		if (!task) {
			/* pid exists but task is gone; try the next tid. */
			++*tid;
			goto retry;
		} else if (skip_if_dup_files && !thread_group_leader(task) &&
			   task->files == task->group_leader->files) {
			/* Non-leader sharing the leader's files table:
			 * skip to avoid showing duplicate fd tables.
			 */
			put_task_struct(task);
			task = NULL;
			++*tid;
			goto retry;
		}
	}
	rcu_read_unlock();

	return task;
}
159 | ||
160 | static void *task_seq_start(struct seq_file *seq, loff_t *pos) | |
161 | { | |
162 | struct bpf_iter_seq_task_info *info = seq->private; | |
163 | struct task_struct *task; | |
164 | ||
f0d74c4d | 165 | task = task_seq_get_next(&info->common, &info->tid, false); |
eaaacd23 YS |
166 | if (!task) |
167 | return NULL; | |
168 | ||
3f9969f2 YS |
169 | if (*pos == 0) |
170 | ++*pos; | |
eaaacd23 YS |
171 | return task; |
172 | } | |
173 | ||
174 | static void *task_seq_next(struct seq_file *seq, void *v, loff_t *pos) | |
175 | { | |
176 | struct bpf_iter_seq_task_info *info = seq->private; | |
177 | struct task_struct *task; | |
178 | ||
179 | ++*pos; | |
180 | ++info->tid; | |
181 | put_task_struct((struct task_struct *)v); | |
f0d74c4d | 182 | task = task_seq_get_next(&info->common, &info->tid, false); |
eaaacd23 YS |
183 | if (!task) |
184 | return NULL; | |
185 | ||
186 | return task; | |
187 | } | |
188 | ||
/* Context passed to the attached BPF program for each visited task. */
struct bpf_iter__task {
	__bpf_md_ptr(struct bpf_iter_meta *, meta);
	__bpf_md_ptr(struct task_struct *, task);
};
193 | ||
194 | DEFINE_BPF_ITER_FUNC(task, struct bpf_iter_meta *meta, struct task_struct *task) | |
195 | ||
196 | static int __task_seq_show(struct seq_file *seq, struct task_struct *task, | |
197 | bool in_stop) | |
198 | { | |
199 | struct bpf_iter_meta meta; | |
200 | struct bpf_iter__task ctx; | |
201 | struct bpf_prog *prog; | |
202 | ||
203 | meta.seq = seq; | |
204 | prog = bpf_iter_get_info(&meta, in_stop); | |
205 | if (!prog) | |
206 | return 0; | |
207 | ||
eaaacd23 YS |
208 | ctx.meta = &meta; |
209 | ctx.task = task; | |
210 | return bpf_iter_run_prog(prog, &ctx); | |
211 | } | |
212 | ||
/* seq_file ->show: regular (non-stop) program invocation. */
static int task_seq_show(struct seq_file *seq, void *v)
{
	return __task_seq_show(seq, v, false);
}
217 | ||
/* seq_file ->stop: if iteration finished (v == NULL), give the program a
 * final in_stop call; otherwise drop the reference on the unconsumed task.
 */
static void task_seq_stop(struct seq_file *seq, void *v)
{
	if (!v)
		(void)__task_seq_show(seq, v, true);
	else
		put_task_struct((struct task_struct *)v);
}
225 | ||
f0d74c4d KFL |
226 | static int bpf_iter_attach_task(struct bpf_prog *prog, |
227 | union bpf_iter_link_info *linfo, | |
228 | struct bpf_iter_aux_info *aux) | |
229 | { | |
230 | unsigned int flags; | |
231 | struct pid *pid; | |
232 | pid_t tgid; | |
233 | ||
234 | if ((!!linfo->task.tid + !!linfo->task.pid + !!linfo->task.pid_fd) > 1) | |
235 | return -EINVAL; | |
236 | ||
237 | aux->task.type = BPF_TASK_ITER_ALL; | |
238 | if (linfo->task.tid != 0) { | |
239 | aux->task.type = BPF_TASK_ITER_TID; | |
240 | aux->task.pid = linfo->task.tid; | |
241 | } | |
242 | if (linfo->task.pid != 0) { | |
243 | aux->task.type = BPF_TASK_ITER_TGID; | |
244 | aux->task.pid = linfo->task.pid; | |
245 | } | |
246 | if (linfo->task.pid_fd != 0) { | |
247 | aux->task.type = BPF_TASK_ITER_TGID; | |
248 | ||
249 | pid = pidfd_get_pid(linfo->task.pid_fd, &flags); | |
250 | if (IS_ERR(pid)) | |
251 | return PTR_ERR(pid); | |
252 | ||
253 | tgid = pid_nr_ns(pid, task_active_pid_ns(current)); | |
254 | aux->task.pid = tgid; | |
255 | put_pid(pid); | |
256 | } | |
257 | ||
258 | return 0; | |
259 | } | |
260 | ||
eaaacd23 YS |
/* seq_file operations for the plain task iterator. */
static const struct seq_operations task_seq_ops = {
	.start	= task_seq_start,
	.next	= task_seq_next,
	.stop	= task_seq_stop,
	.show	= task_seq_show,
};
267 | ||
struct bpf_iter_seq_task_file_info {
	/* The first field must be struct bpf_iter_seq_task_common.
	 * this is assumed by {init, fini}_seq_pidns() callback functions.
	 */
	struct bpf_iter_seq_task_common common;
	struct task_struct *task;	/* task currently being walked (referenced) */
	u32 tid;			/* tid cursor */
	u32 fd;				/* fd cursor within the current task */
};
277 | ||
/* Return the next open file of the iterated task(s), advancing the
 * (tid, fd) cursor in info. On success the returned file carries a
 * reference and info->task holds a referenced task; on NULL return no
 * references are held.
 */
static struct file *
task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info)
{
	u32 saved_tid = info->tid;
	struct task_struct *curr_task;
	unsigned int curr_fd = info->fd;

	/* If this function returns a non-NULL file object,
	 * it held a reference to the task/file.
	 * Otherwise, it does not hold any reference.
	 */
again:
	if (info->task) {
		/* Resume within the task we already hold. */
		curr_task = info->task;
		curr_fd = info->fd;
	} else {
		curr_task = task_seq_get_next(&info->common, &info->tid, true);
		if (!curr_task) {
			info->task = NULL;
			return NULL;
		}

		/* set info->task */
		info->task = curr_task;
		/* Same tid means userspace is resuming a partially-read
		 * task; otherwise start from fd 0 of the new task.
		 */
		if (saved_tid == info->tid)
			curr_fd = info->fd;
		else
			curr_fd = 0;
	}

	rcu_read_lock();
	for (;; curr_fd++) {
		struct file *f;
		f = task_lookup_next_fd_rcu(curr_task, &curr_fd);
		if (!f)
			break;
		/* get_file_rcu() can fail if the file is concurrently
		 * being released; just move on to the next fd.
		 */
		if (!get_file_rcu(f))
			continue;

		/* set info->fd */
		info->fd = curr_fd;
		rcu_read_unlock();
		return f;
	}

	/* the current task is done, go to the next task */
	rcu_read_unlock();
	put_task_struct(curr_task);

	/* TID mode iterates a single task only; stop here. */
	if (info->common.type == BPF_TASK_ITER_TID) {
		info->task = NULL;
		return NULL;
	}

	info->task = NULL;
	info->fd = 0;
	saved_tid = ++(info->tid);
	goto again;
}
337 | ||
338 | static void *task_file_seq_start(struct seq_file *seq, loff_t *pos) | |
339 | { | |
340 | struct bpf_iter_seq_task_file_info *info = seq->private; | |
eaaacd23 YS |
341 | struct file *file; |
342 | ||
91b2db27 | 343 | info->task = NULL; |
91b2db27 SL |
344 | file = task_file_seq_get_next(info); |
345 | if (file && *pos == 0) | |
3f9969f2 | 346 | ++*pos; |
eaaacd23 YS |
347 | |
348 | return file; | |
349 | } | |
350 | ||
351 | static void *task_file_seq_next(struct seq_file *seq, void *v, loff_t *pos) | |
352 | { | |
353 | struct bpf_iter_seq_task_file_info *info = seq->private; | |
eaaacd23 YS |
354 | |
355 | ++*pos; | |
356 | ++info->fd; | |
357 | fput((struct file *)v); | |
91b2db27 | 358 | return task_file_seq_get_next(info); |
eaaacd23 YS |
359 | } |
360 | ||
/* Context passed to the attached BPF program for each (task, fd, file). */
struct bpf_iter__task_file {
	__bpf_md_ptr(struct bpf_iter_meta *, meta);
	__bpf_md_ptr(struct task_struct *, task);
	u32 fd __aligned(8);
	__bpf_md_ptr(struct file *, file);
};
367 | ||
368 | DEFINE_BPF_ITER_FUNC(task_file, struct bpf_iter_meta *meta, | |
369 | struct task_struct *task, u32 fd, | |
370 | struct file *file) | |
371 | ||
372 | static int __task_file_seq_show(struct seq_file *seq, struct file *file, | |
373 | bool in_stop) | |
374 | { | |
375 | struct bpf_iter_seq_task_file_info *info = seq->private; | |
376 | struct bpf_iter__task_file ctx; | |
377 | struct bpf_iter_meta meta; | |
378 | struct bpf_prog *prog; | |
379 | ||
380 | meta.seq = seq; | |
381 | prog = bpf_iter_get_info(&meta, in_stop); | |
382 | if (!prog) | |
383 | return 0; | |
384 | ||
385 | ctx.meta = &meta; | |
386 | ctx.task = info->task; | |
387 | ctx.fd = info->fd; | |
388 | ctx.file = file; | |
389 | return bpf_iter_run_prog(prog, &ctx); | |
390 | } | |
391 | ||
/* seq_file ->show: regular (non-stop) program invocation. */
static int task_file_seq_show(struct seq_file *seq, void *v)
{
	return __task_file_seq_show(seq, v, false);
}
396 | ||
/* seq_file ->stop: on normal end (v == NULL) give the program a final
 * in_stop call; otherwise release the unconsumed file and task.
 */
static void task_file_seq_stop(struct seq_file *seq, void *v)
{
	struct bpf_iter_seq_task_file_info *info = seq->private;

	if (!v) {
		(void)__task_file_seq_show(seq, v, true);
	} else {
		fput((struct file *)v);
		put_task_struct(info->task);
		info->task = NULL;
	}
}
409 | ||
/* Shared seq_file private-data initializer: pin the caller's pid
 * namespace and copy the iteration mode/target from the link aux info.
 * priv_data must begin with struct bpf_iter_seq_task_common.
 */
static int init_seq_pidns(void *priv_data, struct bpf_iter_aux_info *aux)
{
	struct bpf_iter_seq_task_common *common = priv_data;

	common->ns = get_pid_ns(task_active_pid_ns(current));
	common->type = aux->task.type;
	common->pid = aux->task.pid;

	return 0;
}
420 | ||
/* Shared seq_file private-data teardown: release the pinned namespace. */
static void fini_seq_pidns(void *priv_data)
{
	struct bpf_iter_seq_task_common *common = priv_data;

	put_pid_ns(common->ns);
}
427 | ||
/* seq_file operations for the task_file iterator. */
static const struct seq_operations task_file_seq_ops = {
	.start	= task_file_seq_start,
	.next	= task_file_seq_next,
	.stop	= task_file_seq_stop,
	.show	= task_file_seq_show,
};
434 | ||
3a7b35b8 SL |
struct bpf_iter_seq_task_vma_info {
	/* The first field must be struct bpf_iter_seq_task_common.
	 * this is assumed by {init, fini}_seq_pidns() callback functions.
	 */
	struct bpf_iter_seq_task_common common;
	struct task_struct *task;	/* task currently being walked (referenced) */
	struct mm_struct *mm;		/* its mm; mm_users refcount held */
	struct vm_area_struct *vma;	/* vma last returned to seq_file */
	u32 tid;			/* tid cursor */
	unsigned long prev_vm_start;	/* range of the last visited vma, */
	unsigned long prev_vm_end;	/* used to re-find position after relock */
};
447 | ||
/* How task_vma_seq_get_next() should locate the next vma. */
enum bpf_task_vma_iter_find_op {
	task_vma_iter_first_vma,   /* use find_vma() with addr 0 */
	task_vma_iter_next_vma,    /* use vma_next() with curr_vma */
	task_vma_iter_find_vma,    /* use find_vma() to find next vma */
};
453 | ||
/* Return the next vma of the iterated task(s), advancing the (tid, vma)
 * cursor in info. Handles dropping and re-taking mmap_lock under
 * contention, resuming after returning to userspace, and moving on to
 * the next task when an mm is exhausted.
 */
static struct vm_area_struct *
task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info)
{
	enum bpf_task_vma_iter_find_op op;
	struct vm_area_struct *curr_vma;
	struct task_struct *curr_task;
	struct mm_struct *curr_mm;
	u32 saved_tid = info->tid;

	/* If this function returns a non-NULL vma, it holds a reference to
	 * the task_struct, holds a refcount on mm->mm_users, and holds
	 * read lock on vma->mm->mmap_lock.
	 * If this function returns NULL, it does not hold any reference or
	 * lock.
	 */
	if (info->task) {
		curr_task = info->task;
		curr_vma = info->vma;
		curr_mm = info->mm;
		/* In case of lock contention, drop mmap_lock to unblock
		 * the writer.
		 *
		 * After relock, call find(mm, prev_vm_end - 1) to find
		 * new vma to process.
		 *
		 * +------+------+-----------+
		 * | VMA1 | VMA2 | VMA3      |
		 * +------+------+-----------+
		 * |      |      |           |
		 * 4k     8k     16k         400k
		 *
		 * For example, curr_vma == VMA2. Before unlock, we set
		 *
		 *    prev_vm_start = 8k
		 *    prev_vm_end   = 16k
		 *
		 * There are a few cases:
		 *
		 * 1) VMA2 is freed, but VMA3 exists.
		 *
		 *    find_vma() will return VMA3, just process VMA3.
		 *
		 * 2) VMA2 still exists.
		 *
		 *    find_vma() will return VMA2, process VMA2->next.
		 *
		 * 3) no more vma in this mm.
		 *
		 *    Process the next task.
		 *
		 * 4) find_vma() returns a different vma, VMA2'.
		 *
		 *    4.1) If VMA2 covers same range as VMA2', skip VMA2',
		 *         because we already covered the range;
		 *    4.2) VMA2 and VMA2' covers different ranges, process
		 *         VMA2'.
		 */
		if (mmap_lock_is_contended(curr_mm)) {
			info->prev_vm_start = curr_vma->vm_start;
			info->prev_vm_end = curr_vma->vm_end;
			op = task_vma_iter_find_vma;
			mmap_read_unlock(curr_mm);
			if (mmap_read_lock_killable(curr_mm)) {
				mmput(curr_mm);
				goto finish;
			}
		} else {
			op = task_vma_iter_next_vma;
		}
	} else {
again:
		curr_task = task_seq_get_next(&info->common, &info->tid, true);
		if (!curr_task) {
			info->tid++;
			goto finish;
		}

		if (saved_tid != info->tid) {
			/* new task, process the first vma */
			op = task_vma_iter_first_vma;
		} else {
			/* Found the same tid, which means the user space
			 * finished data in previous buffer and read more.
			 * We dropped mmap_lock before returning to user
			 * space, so it is necessary to use find_vma() to
			 * find the next vma to process.
			 */
			op = task_vma_iter_find_vma;
		}

		curr_mm = get_task_mm(curr_task);
		if (!curr_mm)
			goto next_task;

		if (mmap_read_lock_killable(curr_mm)) {
			mmput(curr_mm);
			goto finish;
		}
	}

	switch (op) {
	case task_vma_iter_first_vma:
		curr_vma = find_vma(curr_mm, 0);
		break;
	case task_vma_iter_next_vma:
		curr_vma = find_vma(curr_mm, curr_vma->vm_end);
		break;
	case task_vma_iter_find_vma:
		/* We dropped mmap_lock so it is necessary to use find_vma
		 * to find the next vma. This is similar to the  mechanism
		 * in show_smaps_rollup().
		 */
		curr_vma = find_vma(curr_mm, info->prev_vm_end - 1);
		/* case 1) and 4.2) above just use curr_vma */

		/* check for case 2) or case 4.1) above */
		if (curr_vma &&
		    curr_vma->vm_start == info->prev_vm_start &&
		    curr_vma->vm_end == info->prev_vm_end)
			curr_vma = find_vma(curr_mm, curr_vma->vm_end);
		break;
	}
	if (!curr_vma) {
		/* case 3) above, or case 2) 4.1) with vma->next == NULL */
		mmap_read_unlock(curr_mm);
		mmput(curr_mm);
		goto next_task;
	}
	info->task = curr_task;
	info->vma = curr_vma;
	info->mm = curr_mm;
	return curr_vma;

next_task:
	/* TID mode iterates a single task only; stop here. */
	if (info->common.type == BPF_TASK_ITER_TID)
		goto finish;

	put_task_struct(curr_task);
	info->task = NULL;
	info->mm = NULL;
	info->tid++;
	goto again;

finish:
	if (curr_task)
		put_task_struct(curr_task);
	info->task = NULL;
	info->vma = NULL;
	info->mm = NULL;
	return NULL;
}
605 | ||
606 | static void *task_vma_seq_start(struct seq_file *seq, loff_t *pos) | |
607 | { | |
608 | struct bpf_iter_seq_task_vma_info *info = seq->private; | |
609 | struct vm_area_struct *vma; | |
610 | ||
611 | vma = task_vma_seq_get_next(info); | |
612 | if (vma && *pos == 0) | |
613 | ++*pos; | |
614 | ||
615 | return vma; | |
616 | } | |
617 | ||
/* seq_file ->next for task_vma: advance to the following vma. No
 * reference to drop here — ownership tracking lives in info itself.
 */
static void *task_vma_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct bpf_iter_seq_task_vma_info *info = seq->private;

	++*pos;
	return task_vma_seq_get_next(info);
}
625 | ||
/* Context passed to the attached BPF program for each (task, vma) pair. */
struct bpf_iter__task_vma {
	__bpf_md_ptr(struct bpf_iter_meta *, meta);
	__bpf_md_ptr(struct task_struct *, task);
	__bpf_md_ptr(struct vm_area_struct *, vma);
};
631 | ||
632 | DEFINE_BPF_ITER_FUNC(task_vma, struct bpf_iter_meta *meta, | |
633 | struct task_struct *task, struct vm_area_struct *vma) | |
634 | ||
635 | static int __task_vma_seq_show(struct seq_file *seq, bool in_stop) | |
636 | { | |
637 | struct bpf_iter_seq_task_vma_info *info = seq->private; | |
638 | struct bpf_iter__task_vma ctx; | |
639 | struct bpf_iter_meta meta; | |
640 | struct bpf_prog *prog; | |
641 | ||
642 | meta.seq = seq; | |
643 | prog = bpf_iter_get_info(&meta, in_stop); | |
644 | if (!prog) | |
645 | return 0; | |
646 | ||
647 | ctx.meta = &meta; | |
648 | ctx.task = info->task; | |
649 | ctx.vma = info->vma; | |
650 | return bpf_iter_run_prog(prog, &ctx); | |
651 | } | |
652 | ||
/* seq_file ->show: regular (non-stop) program invocation. */
static int task_vma_seq_show(struct seq_file *seq, void *v)
{
	return __task_vma_seq_show(seq, false);
}
657 | ||
/* seq_file ->stop: on normal end give the program a final in_stop call;
 * otherwise release the mmap_lock, mm refcount and task reference held
 * for the unconsumed vma, after recording where to resume.
 */
static void task_vma_seq_stop(struct seq_file *seq, void *v)
{
	struct bpf_iter_seq_task_vma_info *info = seq->private;

	if (!v) {
		(void)__task_vma_seq_show(seq, true);
	} else {
		/* info->vma has not been seen by the BPF program. If the
		 * user space reads more, task_vma_seq_get_next should
		 * return this vma again. Set prev_vm_start to ~0UL,
		 * so that we don't skip the vma returned by the next
		 * find_vma() (case task_vma_iter_find_vma in
		 * task_vma_seq_get_next()).
		 */
		info->prev_vm_start = ~0UL;
		info->prev_vm_end = info->vma->vm_end;
		mmap_read_unlock(info->mm);
		mmput(info->mm);
		info->mm = NULL;
		put_task_struct(info->task);
		info->task = NULL;
	}
}
681 | ||
/* seq_file operations for the task_vma iterator. */
static const struct seq_operations task_vma_seq_ops = {
	.start	= task_vma_seq_start,
	.next	= task_vma_seq_next,
	.stop	= task_vma_seq_stop,
	.show	= task_vma_seq_show,
};
688 | ||
/* seq_file glue for the "task" iterator target. */
static const struct bpf_iter_seq_info task_seq_info = {
	.seq_ops		= &task_seq_ops,
	.init_seq_private	= init_seq_pidns,
	.fini_seq_private	= fini_seq_pidns,
	.seq_priv_size		= sizeof(struct bpf_iter_seq_task_info),
};
695 | ||
21fb6f2a KFL |
696 | static int bpf_iter_fill_link_info(const struct bpf_iter_aux_info *aux, struct bpf_link_info *info) |
697 | { | |
698 | switch (aux->task.type) { | |
699 | case BPF_TASK_ITER_TID: | |
700 | info->iter.task.tid = aux->task.pid; | |
701 | break; | |
702 | case BPF_TASK_ITER_TGID: | |
703 | info->iter.task.pid = aux->task.pid; | |
704 | break; | |
705 | default: | |
706 | break; | |
707 | } | |
708 | return 0; | |
709 | } | |
710 | ||
2c4fe44f KFL |
711 | static void bpf_iter_task_show_fdinfo(const struct bpf_iter_aux_info *aux, struct seq_file *seq) |
712 | { | |
713 | seq_printf(seq, "task_type:\t%s\n", iter_task_type_names[aux->task.type]); | |
714 | if (aux->task.type == BPF_TASK_ITER_TID) | |
715 | seq_printf(seq, "tid:\t%u\n", aux->task.pid); | |
716 | else if (aux->task.type == BPF_TASK_ITER_TGID) | |
717 | seq_printf(seq, "pid:\t%u\n", aux->task.pid); | |
718 | } | |
719 | ||
14fc6bd6 YS |
/* Registration record for the "task" iterator target. The btf_id of the
 * ctx arg is filled in at boot by task_iter_init().
 */
static struct bpf_iter_reg task_reg_info = {
	.target			= "task",
	.attach_target		= bpf_iter_attach_task,
	.feature		= BPF_ITER_RESCHED,
	.ctx_arg_info_size	= 1,
	.ctx_arg_info		= {
		{ offsetof(struct bpf_iter__task, task),
		  PTR_TO_BTF_ID_OR_NULL },
	},
	.seq_info		= &task_seq_info,
	.fill_link_info		= bpf_iter_fill_link_info,
	.show_fdinfo		= bpf_iter_task_show_fdinfo,
};
733 | ||
/* seq_file glue for the "task_file" iterator target. */
static const struct bpf_iter_seq_info task_file_seq_info = {
	.seq_ops		= &task_file_seq_ops,
	.init_seq_private	= init_seq_pidns,
	.fini_seq_private	= fini_seq_pidns,
	.seq_priv_size		= sizeof(struct bpf_iter_seq_task_file_info),
};
740 | ||
/* Registration record for the "task_file" iterator target. The btf_ids
 * of the ctx args are filled in at boot by task_iter_init().
 */
static struct bpf_iter_reg task_file_reg_info = {
	.target			= "task_file",
	.attach_target		= bpf_iter_attach_task,
	.feature		= BPF_ITER_RESCHED,
	.ctx_arg_info_size	= 2,
	.ctx_arg_info		= {
		{ offsetof(struct bpf_iter__task_file, task),
		  PTR_TO_BTF_ID_OR_NULL },
		{ offsetof(struct bpf_iter__task_file, file),
		  PTR_TO_BTF_ID_OR_NULL },
	},
	.seq_info		= &task_file_seq_info,
	.fill_link_info		= bpf_iter_fill_link_info,
	.show_fdinfo		= bpf_iter_task_show_fdinfo,
};
756 | ||
3a7b35b8 SL |
/* seq_file glue for the "task_vma" iterator target. */
static const struct bpf_iter_seq_info task_vma_seq_info = {
	.seq_ops		= &task_vma_seq_ops,
	.init_seq_private	= init_seq_pidns,
	.fini_seq_private	= fini_seq_pidns,
	.seq_priv_size		= sizeof(struct bpf_iter_seq_task_vma_info),
};
763 | ||
/* Registration record for the "task_vma" iterator target. The btf_ids
 * of the ctx args are filled in at boot by task_iter_init().
 */
static struct bpf_iter_reg task_vma_reg_info = {
	.target			= "task_vma",
	.attach_target		= bpf_iter_attach_task,
	.feature		= BPF_ITER_RESCHED,
	.ctx_arg_info_size	= 2,
	.ctx_arg_info		= {
		{ offsetof(struct bpf_iter__task_vma, task),
		  PTR_TO_BTF_ID_OR_NULL },
		{ offsetof(struct bpf_iter__task_vma, vma),
		  PTR_TO_BTF_ID_OR_NULL },
	},
	.seq_info		= &task_vma_seq_info,
	.fill_link_info		= bpf_iter_fill_link_info,
	.show_fdinfo		= bpf_iter_task_show_fdinfo,
};
779 | ||
7c7e3d31 SL |
/* bpf_find_vma() helper: find the vma of @task covering address @start
 * and invoke @callback_fn on it while holding mmap_lock for read.
 *
 * Returns 0 if a covering vma was found and the callback ran, -ENOENT if
 * task/mm/vma is absent, -EINVAL for non-zero flags, -EBUSY if the lock
 * (or the per-cpu unlock irq_work) could not be taken without blocking —
 * this helper must never sleep.
 */
BPF_CALL_5(bpf_find_vma, struct task_struct *, task, u64, start,
	   bpf_callback_t, callback_fn, void *, callback_ctx, u64, flags)
{
	struct mmap_unlock_irq_work *work = NULL;
	struct vm_area_struct *vma;
	bool irq_work_busy = false;
	struct mm_struct *mm;
	int ret = -ENOENT;

	if (flags)
		return -EINVAL;

	if (!task)
		return -ENOENT;

	mm = task->mm;
	if (!mm)
		return -ENOENT;

	/* The matching unlock may have to happen from irq_work context;
	 * reserve the per-cpu slot before taking the lock.
	 */
	irq_work_busy = bpf_mmap_unlock_get_irq_work(&work);

	if (irq_work_busy || !mmap_read_trylock(mm))
		return -EBUSY;

	vma = find_vma(mm, start);

	/* find_vma() returns the first vma ending above start; make sure
	 * it actually covers the address before running the callback.
	 */
	if (vma && vma->vm_start <= start && vma->vm_end > start) {
		callback_fn((u64)(long)task, (u64)(long)vma,
			    (u64)(long)callback_ctx, 0, 0);
		ret = 0;
	}
	bpf_mmap_unlock_mm(work, mm);
	return ret;
}
814 | ||
/* Verifier-facing prototype for the bpf_find_vma() helper. */
const struct bpf_func_proto bpf_find_vma_proto = {
	.func		= bpf_find_vma,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_BTF_ID,
	.arg1_btf_id	= &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
	.arg2_type	= ARG_ANYTHING,
	.arg3_type	= ARG_PTR_TO_FUNC,
	.arg4_type	= ARG_PTR_TO_STACK_OR_NULL,
	.arg5_type	= ARG_ANYTHING,
};
825 | ||
826 | DEFINE_PER_CPU(struct mmap_unlock_irq_work, mmap_unlock_work); | |
827 | ||
/* irq_work callback: release an mmap_lock that was taken in a context
 * which could not release it directly. Not usable on PREEMPT_RT, where
 * this deferral scheme is not set up.
 */
static void do_mmap_read_unlock(struct irq_work *entry)
{
	struct mmap_unlock_irq_work *work;

	if (WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_RT)))
		return;

	work = container_of(entry, struct mmap_unlock_irq_work, irq_work);
	mmap_read_unlock_non_owner(work->mm);
}
838 | ||
eaaacd23 YS |
/* Boot-time setup: initialize the per-cpu deferred-unlock irq_work slots
 * and register the task, task_file and task_vma iterator targets after
 * resolving the BTF ids of their context arguments.
 */
static int __init task_iter_init(void)
{
	struct mmap_unlock_irq_work *work;
	int ret, cpu;

	for_each_possible_cpu(cpu) {
		work = per_cpu_ptr(&mmap_unlock_work, cpu);
		init_irq_work(&work->irq_work, do_mmap_read_unlock);
	}

	task_reg_info.ctx_arg_info[0].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_TASK];
	ret = bpf_iter_reg_target(&task_reg_info);
	if (ret)
		return ret;

	task_file_reg_info.ctx_arg_info[0].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_TASK];
	task_file_reg_info.ctx_arg_info[1].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_FILE];
	ret = bpf_iter_reg_target(&task_file_reg_info);
	if (ret)
		return ret;

	task_vma_reg_info.ctx_arg_info[0].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_TASK];
	task_vma_reg_info.ctx_arg_info[1].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_VMA];
	return bpf_iter_reg_target(&task_vma_reg_info);
}
864 | late_initcall(task_iter_init); |