Commit | Line | Data |
---|---|---|
2e4913e0 AN |
1 | // SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) |
2 | /* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */ | |
3 | #include <ctype.h> | |
4 | #include <stdio.h> | |
5 | #include <stdlib.h> | |
6 | #include <string.h> | |
7 | #include <libelf.h> | |
8 | #include <gelf.h> | |
9 | #include <unistd.h> | |
10 | #include <linux/ptrace.h> | |
11 | #include <linux/kernel.h> | |
12 | ||
13 | #include "bpf.h" | |
14 | #include "libbpf.h" | |
15 | #include "libbpf_common.h" | |
16 | #include "libbpf_internal.h" | |
17 | #include "hashmap.h" | |
18 | ||
19 | /* libbpf's USDT support consists of BPF-side state/code and user-space | |
20 | * state/code working together in concert. BPF-side parts are defined in | |
21 | * usdt.bpf.h header library. User-space state is encapsulated by struct | |
22 | * usdt_manager and all the supporting code centered around usdt_manager. | |
23 | * | |
24 | * usdt.bpf.h defines two BPF maps that usdt_manager expects: USDT spec map | |
25 | * and IP-to-spec-ID map, which is auxiliary map necessary for kernels that | |
26 | * don't support BPF cookie (see below). These two maps are implicitly | |
27 | * embedded into user's end BPF object file when user's code included | |
28 | * usdt.bpf.h. This means that libbpf doesn't do anything special to create | |
29 | * these USDT support maps. They are created by normal libbpf logic of | |
30 | * instantiating BPF maps when opening and loading BPF object. | |
31 | * | |
32 | * As such, libbpf is basically unaware of the need to do anything | |
33 | * USDT-related until the very first call to bpf_program__attach_usdt(), which | |
34 | * can be called by user explicitly or happen automatically during skeleton | |
35 | * attach (or, equivalently, through generic bpf_program__attach() call). At | |
36 | * this point, libbpf will instantiate and initialize struct usdt_manager and | |
37 | * store it in bpf_object. USDT manager is per-BPF object construct, as each | |
38 | * independent BPF object might or might not have USDT programs, and thus all | |
39 | * the expected USDT-related state. There is no coordination between two | |
40 | * bpf_object in parts of USDT attachment, they are oblivious of each other's | |
41 | * existence and libbpf is just oblivious, dealing with bpf_object-specific | |
42 | * USDT state. | |
43 | * | |
44 | * Quick crash course on USDTs. | |
45 | * | |
46 | * From user-space application's point of view, USDT is essentially just | |
47 | * a slightly special function call that normally has zero overhead, unless it | |
48 | * is being traced by some external entity (e.g., BPF-based tool). Here's how | |
49 | * a typical application can trigger USDT probe: | |
50 | * | |
51 | * #include <sys/sdt.h> // provided by systemtap-sdt-devel package | |
52 | * // folly also provides similar functionality in folly/tracing/StaticTracepoint.h | |
53 | * | |
54 | * STAP_PROBE3(my_usdt_provider, my_usdt_probe_name, 123, x, &y); | |
55 | * | |
56 | * USDT is identified by its <provider-name>:<probe-name> pair of names. Each | |
57 | * individual USDT has a fixed number of arguments (3 in the above example) | |
58 | * and specifies values of each argument as if it was a function call. | |
59 | * | |
60 | * USDT call is actually not a function call, but is instead replaced by | |
61 | * a single NOP instruction (thus zero overhead, effectively). But in addition | |
62 | * to that, those USDT macros generate special SHT_NOTE ELF records in | |
63 | * .note.stapsdt ELF section. Here's an example USDT definition as emitted by | |
64 | * `readelf -n <binary>`: | |
65 | * | |
66 | * stapsdt 0x00000089 NT_STAPSDT (SystemTap probe descriptors) | |
67 | * Provider: test | |
68 | * Name: usdt12 | |
69 | * Location: 0x0000000000549df3, Base: 0x00000000008effa4, Semaphore: 0x0000000000a4606e | |
70 | * Arguments: -4@-1204(%rbp) -4@%edi -8@-1216(%rbp) -8@%r8 -4@$5 -8@%r9 8@%rdx 8@%r10 -4@$-9 -2@%cx -2@%ax -1@%sil | |
71 | * | |
72 | * In this case we have USDT test:usdt12 with 12 arguments. | |
73 | * | |
74 | * Location and base are offsets used to calculate absolute IP address of that | |
75 | * NOP instruction that kernel can replace with an interrupt instruction to | |
76 | * trigger instrumentation code (BPF program for all that we care about). | |
77 | * | |
78 | * Semaphore above is an optional feature. It records an address of a 2-byte | |
79 | * refcount variable (normally in '.probes' ELF section) used for signaling if | |
80 | * there is anything that is attached to USDT. This is useful for user | |
81 | * applications if, for example, they need to prepare some arguments that are | |
82 | * passed only to USDTs and preparation is expensive. By checking if USDT is | |
83 | * "activated", an application can avoid paying those costs unnecessarily. | |
84 | * Recent enough kernel has built-in support for automatically managing this | |
85 | * refcount, which libbpf expects and relies on. If USDT is defined without | |
86 | * associated semaphore, this value will be zero. See selftests for semaphore | |
87 | * examples. | |
88 | * | |
89 | * Arguments is the most interesting part. This USDT specification string is | |
90 | * providing information about all the USDT arguments and their locations. The | |
91 | * part before @ sign defines byte size of the argument (1, 2, 4, or 8) and | |
92 | * whether the argument is signed or unsigned (negative size means signed). | |
93 | * The part after @ sign is assembly-like definition of argument location | |
94 | * (see [0] for more details). Technically, assembler can provide some pretty | |
95 | * advanced definitions, but libbpf is currently supporting three most common | |
96 | * cases: | |
97 | * 1) immediate constant, see 5th and 9th args above (-4@$5 and -4@$-9); | |
98 | * 2) register value, e.g., 8@%rdx, which means "unsigned 8-byte integer | |
99 | * whose value is in register %rdx"; | |
100 | * 3) memory dereference addressed by register, e.g., -4@-1204(%rbp), which | |
101 | * specifies signed 32-bit integer stored at offset -1204 bytes from | |
102 | * memory address stored in %rbp. | |
103 | * | |
104 | * [0] https://sourceware.org/systemtap/wiki/UserSpaceProbeImplementation | |
105 | * | |
106 | * During attachment, libbpf parses all the relevant USDT specifications and | |
107 | * prepares `struct usdt_spec` (USDT spec), which is then provided to BPF-side | |
108 | * code through spec map. This allows BPF applications to quickly fetch the | |
109 | * actual value at runtime using a simple BPF-side code. | |
110 | * | |
e1b6df59 | 111 | * With basics out of the way, let's go over less immediately obvious aspects |
2e4913e0 AN |
112 | * of supporting USDTs. |
113 | * | |
114 | * First, there is no special USDT BPF program type. It is actually just | |
115 | * a uprobe BPF program (which for kernel, at least currently, is just a kprobe | |
116 | * program, so BPF_PROG_TYPE_KPROBE program type). With the only difference | |
117 | * that uprobe is usually attached at the function entry, while USDT will | |
118 | * normally be somewhere inside the function. But it should always be | |
119 | * pointing to NOP instruction, which makes such uprobes the fastest uprobe | |
120 | * kind. | |
121 | * | |
122 | * Second, it's important to realize that such STAP_PROBEn(provider, name, ...) | |
123 | * macro invocations can end up being inlined many-many times, depending on | |
124 | * specifics of each individual user application. So single conceptual USDT | |
125 | * (identified by provider:name pair of identifiers) is, generally speaking, | |
126 | * multiple uprobe locations (USDT call sites) in different places in user | |
127 | * application. Further, again due to inlining, each USDT call site might end | |
128 | * up having the same argument #N be located in a different place. In one call | |
129 | * site it could be a constant, in another will end up in a register, and in | |
130 | * yet another could be some other register or even somewhere on the stack. | |
131 | * | |
132 | * As such, "attaching to USDT" means (in general case) attaching the same | |
133 | * uprobe BPF program to multiple target locations in user application, each | |
134 | * potentially having a completely different USDT spec associated with it. | |
135 | * To wire all this up together libbpf allocates a unique integer spec ID for | |
136 | * each unique USDT spec. Spec IDs are allocated as sequential small integers | |
137 | * so that they can be used as keys in array BPF map (for performance reasons). | |
138 | * Spec ID allocation and accounting is big part of what usdt_manager is | |
139 | * about. This state has to be maintained per-BPF object and coordinate | |
140 | * between different USDT attachments within the same BPF object. | |
141 | * | |
142 | * Spec ID is the key in spec BPF map, value is the actual USDT spec laid out | |
143 | * as struct usdt_spec. Each invocation of BPF program at runtime needs to | |
144 | * know its associated spec ID. It gets it either through BPF cookie, which | |
145 | * libbpf sets to spec ID during attach time, or, if kernel is too old to | |
146 | * support BPF cookie, through IP-to-spec-ID map that libbpf maintains in such | |
147 | * case. The latter means that some modes of operation can't be supported | |
148 | * without BPF cookie. Such mode is attaching to shared library "generically", | |
149 | * without specifying target process. In such case, it's impossible to | |
150 | * calculate absolute IP addresses for IP-to-spec-ID map, and thus such mode | |
151 | * is not supported without BPF cookie support. | |
152 | * | |
153 | * Note that libbpf is using BPF cookie functionality for its own internal | |
154 | * needs, so user itself can't rely on BPF cookie feature. To that end, libbpf | |
155 | * provides conceptually equivalent USDT cookie support. It's still u64 | |
156 | * user-provided value that can be associated with USDT attachment. Note that | |
157 | * this will be the same value for all USDT call sites within the same single | |
158 | * *logical* USDT attachment. This makes sense because to user attaching to | |
159 | * USDT is a single BPF program triggered for singular USDT probe. The fact | |
160 | * that this is done at multiple actual locations is a mostly hidden | |
161 | * implementation detail. This USDT cookie value can be fetched with | |
162 | * bpf_usdt_cookie(ctx) API provided by usdt.bpf.h | |
163 | * | |
164 | * Lastly, while single USDT can have tons of USDT call sites, it doesn't | |
165 | * necessarily have that many different USDT specs. It very well might be | |
166 | * that 1000 USDT call sites only need 5 different USDT specs, because all the | |
167 | * arguments are typically contained in a small set of registers or stack | |
168 | * locations. As such, it's wasteful to allocate as many USDT spec IDs as | |
169 | * there are USDT call sites. So libbpf tries to be frugal and performs | |
170 | * on-the-fly deduplication during a single USDT attachment to only allocate | |
171 | * the minimal required amount of unique USDT specs (and thus spec IDs). This | |
172 | * is trivially achieved by using USDT spec string (Arguments string from USDT | |
173 | * note) as a lookup key in a hashmap. USDT spec string uniquely defines | |
174 | * everything about how to fetch USDT arguments, so two USDT call sites | |
175 | * sharing USDT spec string can safely share the same USDT spec and spec ID. | |
176 | * Note, this spec string deduplication is happening only during the same USDT | |
177 | * attachment, so each USDT spec shares the same USDT cookie value. This is | |
178 | * not generally true for other USDT attachments within the same BPF object, | |
179 | * as even if USDT spec string is the same, USDT cookie value can be | |
180 | * different. It was deemed excessive to try to deduplicate across independent | |
181 | * USDT attachments by taking into account USDT spec string *and* USDT cookie | |
182 | * value, which would complicate spec ID accounting significantly for little | |
183 | * gain. | |
184 | */ | |
185 | ||
74cc6311 AN |
/* Names of the ELF sections libbpf's USDT support looks at:
 *   - USDT_BASE_SEC is the optional marker section used to compensate for
 *     prelink address adjustments;
 *   - USDT_SEMA_SEC is where USDT semaphore (refcount) variables normally live;
 *   - USDT_NOTE_SEC holds the SHT_NOTE records describing each USDT call site.
 */
#define USDT_BASE_SEC	".stapsdt.base"
#define USDT_SEMA_SEC	".probes"
#define USDT_NOTE_SEC	".note.stapsdt"

/* ELF note type (shown as NT_STAPSDT by readelf) and owner name of USDT notes */
#define USDT_NOTE_TYPE	3
#define USDT_NOTE_NAME	"stapsdt"
191 | ||
/* Kind of location a USDT argument is fetched from.
 *
 * Numeric values are shared with BPF-side code and must match
 * enum __bpf_usdt_arg_type from usdt.bpf.h exactly.
 */
enum usdt_arg_type {
	USDT_ARG_CONST = 0,	/* immediate constant, e.g. -4@$5 */
	USDT_ARG_REG = 1,	/* value held in a register, e.g. 8@%rdx */
	USDT_ARG_REG_DEREF = 2,	/* memory at register + offset, e.g. -4@-1204(%rbp) */
};
198 | ||
e1b6df59 | 199 | /* should match exactly struct __bpf_usdt_arg_spec from usdt.bpf.h */ |
74cc6311 AN |
200 | struct usdt_arg_spec { |
201 | __u64 val_off; | |
202 | enum usdt_arg_type arg_type; | |
203 | short reg_off; | |
204 | bool arg_signed; | |
205 | char arg_bitshift; | |
206 | }; | |
207 | ||
/* Upper bound on the number of arguments in one USDT spec; keep in sync
 * with BPF_USDT_MAX_ARG_CNT in usdt.bpf.h
 */
#define USDT_MAX_ARG_CNT	12
210 | ||
211 | /* should match struct __bpf_usdt_spec from usdt.bpf.h */ | |
212 | struct usdt_spec { | |
213 | struct usdt_arg_spec args[USDT_MAX_ARG_CNT]; | |
214 | __u64 usdt_cookie; | |
215 | short arg_cnt; | |
216 | }; | |
217 | ||
/* Parsed contents of a single USDT ELF note (one USDT call site) */
struct usdt_note {
	/* <provider-name> part of the USDT identifier */
	const char *provider;
	/* <probe-name> part of the USDT identifier */
	const char *name;
	/* USDT args specification string, e.g.:
	 * "-4@%esi -4@-24(%rbp) -4@%ecx 2@%ax 8@%rdx"
	 */
	const char *args;
	/* "Location" recorded in the note: address of the probe site */
	long loc_addr;
	/* "Base" recorded in the note: link-time address of .stapsdt.base */
	long base_addr;
	/* "Semaphore" recorded in the note; zero when USDT has no semaphore */
	long sema_addr;
};
229 | ||
2e4913e0 AN |
230 | struct usdt_target { |
231 | long abs_ip; | |
232 | long rel_ip; | |
233 | long sema_off; | |
74cc6311 AN |
234 | struct usdt_spec spec; |
235 | const char *spec_str; | |
2e4913e0 AN |
236 | }; |
237 | ||
/* Per-BPF object user-space state of libbpf's USDT support */
struct usdt_manager {
	/* "__bpf_usdt_specs" map of the BPF object */
	struct bpf_map *specs_map;
	/* "__bpf_usdt_ip_to_spec_id" map of the BPF object */
	struct bpf_map *ip_to_spec_id_map;

	/* spec ID accounting; free_spec_ids is heap-allocated and owned by
	 * the manager (released in usdt_manager_free())
	 */
	int *free_spec_ids;
	size_t free_spec_cnt;
	size_t next_free_spec_id;

	/* kernel supports BPF cookie, so IP-to-spec-ID mapping isn't needed */
	bool has_bpf_cookie;
	/* kernel supports automatic refcounting of USDT semaphores */
	bool has_sema_refcnt;
};
249 | ||
250 | struct usdt_manager *usdt_manager_new(struct bpf_object *obj) | |
251 | { | |
252 | static const char *ref_ctr_sysfs_path = "/sys/bus/event_source/devices/uprobe/format/ref_ctr_offset"; | |
253 | struct usdt_manager *man; | |
254 | struct bpf_map *specs_map, *ip_to_spec_id_map; | |
255 | ||
256 | specs_map = bpf_object__find_map_by_name(obj, "__bpf_usdt_specs"); | |
257 | ip_to_spec_id_map = bpf_object__find_map_by_name(obj, "__bpf_usdt_ip_to_spec_id"); | |
258 | if (!specs_map || !ip_to_spec_id_map) { | |
259 | pr_warn("usdt: failed to find USDT support BPF maps, did you forget to include bpf/usdt.bpf.h?\n"); | |
260 | return ERR_PTR(-ESRCH); | |
261 | } | |
262 | ||
263 | man = calloc(1, sizeof(*man)); | |
264 | if (!man) | |
265 | return ERR_PTR(-ENOMEM); | |
266 | ||
267 | man->specs_map = specs_map; | |
268 | man->ip_to_spec_id_map = ip_to_spec_id_map; | |
269 | ||
270 | /* Detect if BPF cookie is supported for kprobes. | |
271 | * We don't need IP-to-ID mapping if we can use BPF cookies. | |
272 | * Added in: 7adfc6c9b315 ("bpf: Add bpf_get_attach_cookie() BPF helper to access bpf_cookie value") | |
273 | */ | |
274 | man->has_bpf_cookie = kernel_supports(obj, FEAT_BPF_COOKIE); | |
275 | ||
276 | /* Detect kernel support for automatic refcounting of USDT semaphore. | |
277 | * If this is not supported, USDTs with semaphores will not be supported. | |
278 | * Added in: a6ca88b241d5 ("trace_uprobe: support reference counter in fd-based uprobe") | |
279 | */ | |
280 | man->has_sema_refcnt = access(ref_ctr_sysfs_path, F_OK) == 0; | |
281 | ||
282 | return man; | |
283 | } | |
284 | ||
285 | void usdt_manager_free(struct usdt_manager *man) | |
286 | { | |
287 | if (IS_ERR_OR_NULL(man)) | |
288 | return; | |
289 | ||
999783c8 | 290 | free(man->free_spec_ids); |
2e4913e0 AN |
291 | free(man); |
292 | } | |
293 | ||
294 | static int sanity_check_usdt_elf(Elf *elf, const char *path) | |
295 | { | |
296 | GElf_Ehdr ehdr; | |
297 | int endianness; | |
298 | ||
299 | if (elf_kind(elf) != ELF_K_ELF) { | |
300 | pr_warn("usdt: unrecognized ELF kind %d for '%s'\n", elf_kind(elf), path); | |
301 | return -EBADF; | |
302 | } | |
303 | ||
304 | switch (gelf_getclass(elf)) { | |
305 | case ELFCLASS64: | |
306 | if (sizeof(void *) != 8) { | |
307 | pr_warn("usdt: attaching to 64-bit ELF binary '%s' is not supported\n", path); | |
308 | return -EBADF; | |
309 | } | |
310 | break; | |
311 | case ELFCLASS32: | |
312 | if (sizeof(void *) != 4) { | |
313 | pr_warn("usdt: attaching to 32-bit ELF binary '%s' is not supported\n", path); | |
314 | return -EBADF; | |
315 | } | |
316 | break; | |
317 | default: | |
318 | pr_warn("usdt: unsupported ELF class for '%s'\n", path); | |
319 | return -EBADF; | |
320 | } | |
321 | ||
322 | if (!gelf_getehdr(elf, &ehdr)) | |
323 | return -EINVAL; | |
324 | ||
325 | if (ehdr.e_type != ET_EXEC && ehdr.e_type != ET_DYN) { | |
326 | pr_warn("usdt: unsupported type of ELF binary '%s' (%d), only ET_EXEC and ET_DYN are supported\n", | |
327 | path, ehdr.e_type); | |
328 | return -EBADF; | |
329 | } | |
330 | ||
e1b6df59 | 331 | #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ |
2e4913e0 | 332 | endianness = ELFDATA2LSB; |
e1b6df59 | 333 | #elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ |
2e4913e0 AN |
334 | endianness = ELFDATA2MSB; |
335 | #else | |
336 | # error "Unrecognized __BYTE_ORDER__" | |
337 | #endif | |
338 | if (endianness != ehdr.e_ident[EI_DATA]) { | |
339 | pr_warn("usdt: ELF endianness mismatch for '%s'\n", path); | |
340 | return -EBADF; | |
341 | } | |
342 | ||
343 | return 0; | |
344 | } | |
345 | ||
74cc6311 AN |
346 | static int find_elf_sec_by_name(Elf *elf, const char *sec_name, GElf_Shdr *shdr, Elf_Scn **scn) |
347 | { | |
348 | Elf_Scn *sec = NULL; | |
349 | size_t shstrndx; | |
350 | ||
351 | if (elf_getshdrstrndx(elf, &shstrndx)) | |
352 | return -EINVAL; | |
353 | ||
354 | /* check if ELF is corrupted and avoid calling elf_strptr if yes */ | |
355 | if (!elf_rawdata(elf_getscn(elf, shstrndx), NULL)) | |
356 | return -EINVAL; | |
357 | ||
358 | while ((sec = elf_nextscn(elf, sec)) != NULL) { | |
359 | char *name; | |
360 | ||
361 | if (!gelf_getshdr(sec, shdr)) | |
362 | return -EINVAL; | |
363 | ||
364 | name = elf_strptr(elf, shstrndx, shdr->sh_name); | |
365 | if (name && strcmp(sec_name, name) == 0) { | |
366 | *scn = sec; | |
367 | return 0; | |
368 | } | |
369 | } | |
370 | ||
371 | return -ENOENT; | |
372 | } | |
373 | ||
/* A contiguous memory segment, either an ELF PT_LOAD program segment or
 * a mapping of a shared library found in /proc/<pid>/maps
 */
struct elf_seg {
	long start;	/* start address (inclusive) */
	long end;	/* end address (exclusive) */
	long offset;	/* file offset the segment starts at */
	bool is_exec;	/* segment is executable */
};

/* qsort() comparator ordering segments by ascending start address.
 *
 * Returns negative, zero, or positive value if @_a starts before, at the
 * same address as, or after @_b. Reporting equality for equal keys (rather
 * than claiming each element is greater than the other) keeps the ordering
 * consistent, as qsort() requires.
 */
static int cmp_elf_segs(const void *_a, const void *_b)
{
	const struct elf_seg *a = _a;
	const struct elf_seg *b = _b;

	/* comparison-based form can't overflow, unlike subtraction */
	return (a->start > b->start) - (a->start < b->start);
}
388 | ||
389 | static int parse_elf_segs(Elf *elf, const char *path, struct elf_seg **segs, size_t *seg_cnt) | |
390 | { | |
391 | GElf_Phdr phdr; | |
392 | size_t n; | |
393 | int i, err; | |
394 | struct elf_seg *seg; | |
395 | void *tmp; | |
396 | ||
397 | *seg_cnt = 0; | |
398 | ||
399 | if (elf_getphdrnum(elf, &n)) { | |
400 | err = -errno; | |
401 | return err; | |
402 | } | |
403 | ||
404 | for (i = 0; i < n; i++) { | |
405 | if (!gelf_getphdr(elf, i, &phdr)) { | |
406 | err = -errno; | |
407 | return err; | |
408 | } | |
409 | ||
410 | pr_debug("usdt: discovered PHDR #%d in '%s': vaddr 0x%lx memsz 0x%lx offset 0x%lx type 0x%lx flags 0x%lx\n", | |
411 | i, path, (long)phdr.p_vaddr, (long)phdr.p_memsz, (long)phdr.p_offset, | |
412 | (long)phdr.p_type, (long)phdr.p_flags); | |
413 | if (phdr.p_type != PT_LOAD) | |
414 | continue; | |
415 | ||
416 | tmp = libbpf_reallocarray(*segs, *seg_cnt + 1, sizeof(**segs)); | |
417 | if (!tmp) | |
418 | return -ENOMEM; | |
419 | ||
420 | *segs = tmp; | |
421 | seg = *segs + *seg_cnt; | |
422 | (*seg_cnt)++; | |
423 | ||
424 | seg->start = phdr.p_vaddr; | |
425 | seg->end = phdr.p_vaddr + phdr.p_memsz; | |
426 | seg->offset = phdr.p_offset; | |
427 | seg->is_exec = phdr.p_flags & PF_X; | |
428 | } | |
429 | ||
430 | if (*seg_cnt == 0) { | |
431 | pr_warn("usdt: failed to find PT_LOAD program headers in '%s'\n", path); | |
432 | return -ESRCH; | |
433 | } | |
434 | ||
435 | qsort(*segs, *seg_cnt, sizeof(**segs), cmp_elf_segs); | |
436 | return 0; | |
437 | } | |
438 | ||
439 | static int parse_lib_segs(int pid, const char *lib_path, struct elf_seg **segs, size_t *seg_cnt) | |
440 | { | |
441 | char path[PATH_MAX], line[PATH_MAX], mode[16]; | |
442 | size_t seg_start, seg_end, seg_off; | |
443 | struct elf_seg *seg; | |
444 | int tmp_pid, i, err; | |
445 | FILE *f; | |
446 | ||
447 | *seg_cnt = 0; | |
448 | ||
449 | /* Handle containerized binaries only accessible from | |
450 | * /proc/<pid>/root/<path>. They will be reported as just /<path> in | |
451 | * /proc/<pid>/maps. | |
452 | */ | |
453 | if (sscanf(lib_path, "/proc/%d/root%s", &tmp_pid, path) == 2 && pid == tmp_pid) | |
454 | goto proceed; | |
455 | ||
456 | if (!realpath(lib_path, path)) { | |
457 | pr_warn("usdt: failed to get absolute path of '%s' (err %d), using path as is...\n", | |
458 | lib_path, -errno); | |
3c0dfe6e | 459 | libbpf_strlcpy(path, lib_path, sizeof(path)); |
74cc6311 AN |
460 | } |
461 | ||
462 | proceed: | |
463 | sprintf(line, "/proc/%d/maps", pid); | |
464 | f = fopen(line, "r"); | |
465 | if (!f) { | |
466 | err = -errno; | |
467 | pr_warn("usdt: failed to open '%s' to get base addr of '%s': %d\n", | |
468 | line, lib_path, err); | |
469 | return err; | |
470 | } | |
471 | ||
472 | /* We need to handle lines with no path at the end: | |
473 | * | |
474 | * 7f5c6f5d1000-7f5c6f5d3000 rw-p 001c7000 08:04 21238613 /usr/lib64/libc-2.17.so | |
475 | * 7f5c6f5d3000-7f5c6f5d8000 rw-p 00000000 00:00 0 | |
476 | * 7f5c6f5d8000-7f5c6f5d9000 r-xp 00000000 103:01 362990598 /data/users/andriin/linux/tools/bpf/usdt/libhello_usdt.so | |
477 | */ | |
478 | while (fscanf(f, "%zx-%zx %s %zx %*s %*d%[^\n]\n", | |
479 | &seg_start, &seg_end, mode, &seg_off, line) == 5) { | |
480 | void *tmp; | |
481 | ||
482 | /* to handle no path case (see above) we need to capture line | |
483 | * without skipping any whitespaces. So we need to strip | |
484 | * leading whitespaces manually here | |
485 | */ | |
486 | i = 0; | |
487 | while (isblank(line[i])) | |
488 | i++; | |
489 | if (strcmp(line + i, path) != 0) | |
490 | continue; | |
491 | ||
492 | pr_debug("usdt: discovered segment for lib '%s': addrs %zx-%zx mode %s offset %zx\n", | |
493 | path, seg_start, seg_end, mode, seg_off); | |
494 | ||
495 | /* ignore non-executable sections for shared libs */ | |
496 | if (mode[2] != 'x') | |
497 | continue; | |
498 | ||
499 | tmp = libbpf_reallocarray(*segs, *seg_cnt + 1, sizeof(**segs)); | |
500 | if (!tmp) { | |
501 | err = -ENOMEM; | |
502 | goto err_out; | |
503 | } | |
504 | ||
505 | *segs = tmp; | |
506 | seg = *segs + *seg_cnt; | |
507 | *seg_cnt += 1; | |
508 | ||
509 | seg->start = seg_start; | |
510 | seg->end = seg_end; | |
511 | seg->offset = seg_off; | |
512 | seg->is_exec = true; | |
513 | } | |
514 | ||
515 | if (*seg_cnt == 0) { | |
516 | pr_warn("usdt: failed to find '%s' (resolved to '%s') within PID %d memory mappings\n", | |
517 | lib_path, path, pid); | |
518 | err = -ESRCH; | |
519 | goto err_out; | |
520 | } | |
521 | ||
522 | qsort(*segs, *seg_cnt, sizeof(**segs), cmp_elf_segs); | |
523 | err = 0; | |
524 | err_out: | |
525 | fclose(f); | |
526 | return err; | |
527 | } | |
528 | ||
529 | static struct elf_seg *find_elf_seg(struct elf_seg *segs, size_t seg_cnt, long addr, bool relative) | |
530 | { | |
531 | struct elf_seg *seg; | |
532 | int i; | |
533 | ||
534 | if (relative) { | |
535 | /* for shared libraries, address is relative offset and thus | |
536 | * should be fall within logical offset-based range of | |
537 | * [offset_start, offset_end) | |
538 | */ | |
539 | for (i = 0, seg = segs; i < seg_cnt; i++, seg++) { | |
540 | if (seg->offset <= addr && addr < seg->offset + (seg->end - seg->start)) | |
541 | return seg; | |
542 | } | |
543 | } else { | |
544 | /* for binaries, address is absolute and thus should be within | |
545 | * absolute address range of [seg_start, seg_end) | |
546 | */ | |
547 | for (i = 0, seg = segs; i < seg_cnt; i++, seg++) { | |
548 | if (seg->start <= addr && addr < seg->end) | |
549 | return seg; | |
550 | } | |
551 | } | |
552 | ||
553 | return NULL; | |
554 | } | |
555 | ||
556 | static int parse_usdt_note(Elf *elf, const char *path, long base_addr, | |
557 | GElf_Nhdr *nhdr, const char *data, size_t name_off, size_t desc_off, | |
558 | struct usdt_note *usdt_note); | |
559 | ||
560 | static int parse_usdt_spec(struct usdt_spec *spec, const struct usdt_note *note, long usdt_cookie); | |
561 | ||
2e4913e0 AN |
562 | static int collect_usdt_targets(struct usdt_manager *man, Elf *elf, const char *path, pid_t pid, |
563 | const char *usdt_provider, const char *usdt_name, long usdt_cookie, | |
564 | struct usdt_target **out_targets, size_t *out_target_cnt) | |
565 | { | |
74cc6311 AN |
566 | size_t off, name_off, desc_off, seg_cnt = 0, lib_seg_cnt = 0, target_cnt = 0; |
567 | struct elf_seg *segs = NULL, *lib_segs = NULL; | |
568 | struct usdt_target *targets = NULL, *target; | |
569 | long base_addr = 0; | |
570 | Elf_Scn *notes_scn, *base_scn; | |
571 | GElf_Shdr base_shdr, notes_shdr; | |
572 | GElf_Ehdr ehdr; | |
573 | GElf_Nhdr nhdr; | |
574 | Elf_Data *data; | |
575 | int err; | |
576 | ||
577 | *out_targets = NULL; | |
578 | *out_target_cnt = 0; | |
579 | ||
580 | err = find_elf_sec_by_name(elf, USDT_NOTE_SEC, ¬es_shdr, ¬es_scn); | |
581 | if (err) { | |
582 | pr_warn("usdt: no USDT notes section (%s) found in '%s'\n", USDT_NOTE_SEC, path); | |
583 | return err; | |
584 | } | |
585 | ||
586 | if (notes_shdr.sh_type != SHT_NOTE || !gelf_getehdr(elf, &ehdr)) { | |
587 | pr_warn("usdt: invalid USDT notes section (%s) in '%s'\n", USDT_NOTE_SEC, path); | |
588 | return -EINVAL; | |
589 | } | |
590 | ||
591 | err = parse_elf_segs(elf, path, &segs, &seg_cnt); | |
592 | if (err) { | |
593 | pr_warn("usdt: failed to process ELF program segments for '%s': %d\n", path, err); | |
594 | goto err_out; | |
595 | } | |
596 | ||
597 | /* .stapsdt.base ELF section is optional, but is used for prelink | |
598 | * offset compensation (see a big comment further below) | |
599 | */ | |
600 | if (find_elf_sec_by_name(elf, USDT_BASE_SEC, &base_shdr, &base_scn) == 0) | |
601 | base_addr = base_shdr.sh_addr; | |
602 | ||
603 | data = elf_getdata(notes_scn, 0); | |
604 | off = 0; | |
605 | while ((off = gelf_getnote(data, off, &nhdr, &name_off, &desc_off)) > 0) { | |
606 | long usdt_abs_ip, usdt_rel_ip, usdt_sema_off = 0; | |
607 | struct usdt_note note; | |
608 | struct elf_seg *seg = NULL; | |
609 | void *tmp; | |
610 | ||
611 | err = parse_usdt_note(elf, path, base_addr, &nhdr, | |
612 | data->d_buf, name_off, desc_off, ¬e); | |
613 | if (err) | |
614 | goto err_out; | |
615 | ||
616 | if (strcmp(note.provider, usdt_provider) != 0 || strcmp(note.name, usdt_name) != 0) | |
617 | continue; | |
618 | ||
619 | /* We need to compensate "prelink effect". See [0] for details, | |
620 | * relevant parts quoted here: | |
621 | * | |
622 | * Each SDT probe also expands into a non-allocated ELF note. You can | |
623 | * find this by looking at SHT_NOTE sections and decoding the format; | |
624 | * see below for details. Because the note is non-allocated, it means | |
625 | * there is no runtime cost, and also preserved in both stripped files | |
626 | * and .debug files. | |
627 | * | |
628 | * However, this means that prelink won't adjust the note's contents | |
629 | * for address offsets. Instead, this is done via the .stapsdt.base | |
630 | * section. This is a special section that is added to the text. We | |
631 | * will only ever have one of these sections in a final link and it | |
632 | * will only ever be one byte long. Nothing about this section itself | |
633 | * matters, we just use it as a marker to detect prelink address | |
634 | * adjustments. | |
635 | * | |
636 | * Each probe note records the link-time address of the .stapsdt.base | |
637 | * section alongside the probe PC address. The decoder compares the | |
638 | * base address stored in the note with the .stapsdt.base section's | |
639 | * sh_addr. Initially these are the same, but the section header will | |
640 | * be adjusted by prelink. So the decoder applies the difference to | |
641 | * the probe PC address to get the correct prelinked PC address; the | |
642 | * same adjustment is applied to the semaphore address, if any. | |
643 | * | |
644 | * [0] https://sourceware.org/systemtap/wiki/UserSpaceProbeImplementation | |
645 | */ | |
646 | usdt_rel_ip = usdt_abs_ip = note.loc_addr; | |
647 | if (base_addr) { | |
648 | usdt_abs_ip += base_addr - note.base_addr; | |
649 | usdt_rel_ip += base_addr - note.base_addr; | |
650 | } | |
651 | ||
652 | if (ehdr.e_type == ET_EXEC) { | |
653 | /* When attaching uprobes (which is what USDTs basically | |
654 | * are) kernel expects a relative IP to be specified, | |
655 | * so if we are attaching to an executable ELF binary | |
656 | * (i.e., not a shared library), we need to calculate | |
657 | * proper relative IP based on ELF's load address | |
658 | */ | |
659 | seg = find_elf_seg(segs, seg_cnt, usdt_abs_ip, false /* relative */); | |
660 | if (!seg) { | |
661 | err = -ESRCH; | |
662 | pr_warn("usdt: failed to find ELF program segment for '%s:%s' in '%s' at IP 0x%lx\n", | |
663 | usdt_provider, usdt_name, path, usdt_abs_ip); | |
664 | goto err_out; | |
665 | } | |
666 | if (!seg->is_exec) { | |
667 | err = -ESRCH; | |
668 | pr_warn("usdt: matched ELF binary '%s' segment [0x%lx, 0x%lx) for '%s:%s' at IP 0x%lx is not executable\n", | |
669 | path, seg->start, seg->end, usdt_provider, usdt_name, | |
670 | usdt_abs_ip); | |
671 | goto err_out; | |
672 | } | |
673 | ||
674 | usdt_rel_ip = usdt_abs_ip - (seg->start - seg->offset); | |
675 | } else if (!man->has_bpf_cookie) { /* ehdr.e_type == ET_DYN */ | |
676 | /* If we don't have BPF cookie support but need to | |
677 | * attach to a shared library, we'll need to know and | |
678 | * record absolute addresses of attach points due to | |
679 | * the need to lookup USDT spec by absolute IP of | |
680 | * triggered uprobe. Doing this resolution is only | |
681 | * possible when we have a specific PID of the process | |
682 | * that's using specified shared library. BPF cookie | |
683 | * removes the absolute address limitation as we don't | |
684 | * need to do this lookup (we just use BPF cookie as | |
685 | * an index of USDT spec), so for newer kernels with | |
686 | * BPF cookie support libbpf supports USDT attachment | |
687 | * to shared libraries with no PID filter. | |
688 | */ | |
689 | if (pid < 0) { | |
a8d600f6 | 690 | pr_warn("usdt: attaching to shared libraries without specific PID is not supported on current kernel\n"); |
74cc6311 AN |
691 | err = -ENOTSUP; |
692 | goto err_out; | |
693 | } | |
694 | ||
695 | /* lib_segs are lazily initialized only if necessary */ | |
696 | if (lib_seg_cnt == 0) { | |
697 | err = parse_lib_segs(pid, path, &lib_segs, &lib_seg_cnt); | |
698 | if (err) { | |
699 | pr_warn("usdt: failed to get memory segments in PID %d for shared library '%s': %d\n", | |
700 | pid, path, err); | |
701 | goto err_out; | |
702 | } | |
703 | } | |
704 | ||
705 | seg = find_elf_seg(lib_segs, lib_seg_cnt, usdt_rel_ip, true /* relative */); | |
706 | if (!seg) { | |
707 | err = -ESRCH; | |
708 | pr_warn("usdt: failed to find shared lib memory segment for '%s:%s' in '%s' at relative IP 0x%lx\n", | |
709 | usdt_provider, usdt_name, path, usdt_rel_ip); | |
710 | goto err_out; | |
711 | } | |
712 | ||
713 | usdt_abs_ip = seg->start + (usdt_rel_ip - seg->offset); | |
714 | } | |
715 | ||
716 | pr_debug("usdt: probe for '%s:%s' in %s '%s': addr 0x%lx base 0x%lx (resolved abs_ip 0x%lx rel_ip 0x%lx) args '%s' in segment [0x%lx, 0x%lx) at offset 0x%lx\n", | |
717 | usdt_provider, usdt_name, ehdr.e_type == ET_EXEC ? "exec" : "lib ", path, | |
718 | note.loc_addr, note.base_addr, usdt_abs_ip, usdt_rel_ip, note.args, | |
719 | seg ? seg->start : 0, seg ? seg->end : 0, seg ? seg->offset : 0); | |
720 | ||
721 | /* Adjust semaphore address to be a relative offset */ | |
722 | if (note.sema_addr) { | |
723 | if (!man->has_sema_refcnt) { | |
724 | pr_warn("usdt: kernel doesn't support USDT semaphore refcounting for '%s:%s' in '%s'\n", | |
725 | usdt_provider, usdt_name, path); | |
726 | err = -ENOTSUP; | |
727 | goto err_out; | |
728 | } | |
729 | ||
730 | seg = find_elf_seg(segs, seg_cnt, note.sema_addr, false /* relative */); | |
731 | if (!seg) { | |
732 | err = -ESRCH; | |
733 | pr_warn("usdt: failed to find ELF loadable segment with semaphore of '%s:%s' in '%s' at 0x%lx\n", | |
734 | usdt_provider, usdt_name, path, note.sema_addr); | |
735 | goto err_out; | |
736 | } | |
737 | if (seg->is_exec) { | |
738 | err = -ESRCH; | |
739 | pr_warn("usdt: matched ELF binary '%s' segment [0x%lx, 0x%lx] for semaphore of '%s:%s' at 0x%lx is executable\n", | |
740 | path, seg->start, seg->end, usdt_provider, usdt_name, | |
741 | note.sema_addr); | |
742 | goto err_out; | |
743 | } | |
744 | ||
745 | usdt_sema_off = note.sema_addr - (seg->start - seg->offset); | |
746 | ||
747 | pr_debug("usdt: sema for '%s:%s' in %s '%s': addr 0x%lx base 0x%lx (resolved 0x%lx) in segment [0x%lx, 0x%lx] at offset 0x%lx\n", | |
748 | usdt_provider, usdt_name, ehdr.e_type == ET_EXEC ? "exec" : "lib ", | |
749 | path, note.sema_addr, note.base_addr, usdt_sema_off, | |
750 | seg->start, seg->end, seg->offset); | |
751 | } | |
752 | ||
753 | /* Record adjusted addresses and offsets and parse USDT spec */ | |
754 | tmp = libbpf_reallocarray(targets, target_cnt + 1, sizeof(*targets)); | |
755 | if (!tmp) { | |
756 | err = -ENOMEM; | |
757 | goto err_out; | |
758 | } | |
759 | targets = tmp; | |
760 | ||
761 | target = &targets[target_cnt]; | |
762 | memset(target, 0, sizeof(*target)); | |
763 | ||
764 | target->abs_ip = usdt_abs_ip; | |
765 | target->rel_ip = usdt_rel_ip; | |
766 | target->sema_off = usdt_sema_off; | |
767 | ||
768 | /* notes->args references strings from Elf itself, so they can | |
769 | * be referenced safely until elf_end() call | |
770 | */ | |
771 | target->spec_str = note.args; | |
772 | ||
773 | err = parse_usdt_spec(&target->spec, ¬e, usdt_cookie); | |
774 | if (err) | |
775 | goto err_out; | |
776 | ||
777 | target_cnt++; | |
778 | } | |
779 | ||
780 | *out_targets = targets; | |
781 | *out_target_cnt = target_cnt; | |
782 | err = target_cnt; | |
783 | ||
784 | err_out: | |
785 | free(segs); | |
786 | free(lib_segs); | |
787 | if (err < 0) | |
788 | free(targets); | |
789 | return err; | |
2e4913e0 AN |
790 | } |
791 | ||
792 | struct bpf_link_usdt { | |
793 | struct bpf_link link; | |
794 | ||
795 | struct usdt_manager *usdt_man; | |
796 | ||
999783c8 AN |
797 | size_t spec_cnt; |
798 | int *spec_ids; | |
799 | ||
2e4913e0 AN |
800 | size_t uprobe_cnt; |
801 | struct { | |
802 | long abs_ip; | |
803 | struct bpf_link *link; | |
804 | } *uprobes; | |
805 | }; | |
806 | ||
807 | static int bpf_link_usdt_detach(struct bpf_link *link) | |
808 | { | |
809 | struct bpf_link_usdt *usdt_link = container_of(link, struct bpf_link_usdt, link); | |
999783c8 | 810 | struct usdt_manager *man = usdt_link->usdt_man; |
2e4913e0 AN |
811 | int i; |
812 | ||
813 | for (i = 0; i < usdt_link->uprobe_cnt; i++) { | |
814 | /* detach underlying uprobe link */ | |
815 | bpf_link__destroy(usdt_link->uprobes[i].link); | |
999783c8 AN |
816 | /* there is no need to update specs map because it will be |
817 | * unconditionally overwritten on subsequent USDT attaches, | |
818 | * but if BPF cookies are not used we need to remove entry | |
819 | * from ip_to_spec_id map, otherwise we'll run into false | |
820 | * conflicting IP errors | |
821 | */ | |
822 | if (!man->has_bpf_cookie) { | |
823 | /* not much we can do about errors here */ | |
824 | (void)bpf_map_delete_elem(bpf_map__fd(man->ip_to_spec_id_map), | |
825 | &usdt_link->uprobes[i].abs_ip); | |
826 | } | |
827 | } | |
828 | ||
829 | /* try to return the list of previously used spec IDs to usdt_manager | |
830 | * for future reuse for subsequent USDT attaches | |
831 | */ | |
832 | if (!man->free_spec_ids) { | |
833 | /* if there were no free spec IDs yet, just transfer our IDs */ | |
834 | man->free_spec_ids = usdt_link->spec_ids; | |
835 | man->free_spec_cnt = usdt_link->spec_cnt; | |
836 | usdt_link->spec_ids = NULL; | |
837 | } else { | |
838 | /* otherwise concat IDs */ | |
839 | size_t new_cnt = man->free_spec_cnt + usdt_link->spec_cnt; | |
840 | int *new_free_ids; | |
841 | ||
842 | new_free_ids = libbpf_reallocarray(man->free_spec_ids, new_cnt, | |
843 | sizeof(*new_free_ids)); | |
844 | /* If we couldn't resize free_spec_ids, we'll just leak | |
845 | * a bunch of free IDs; this is very unlikely to happen and if | |
e1b6df59 | 846 | * system is so exhausted on memory, it's the least of user's |
999783c8 AN |
847 | * concerns, probably. |
848 | * So just do our best here to return those IDs to usdt_manager. | |
849 | */ | |
850 | if (new_free_ids) { | |
851 | memcpy(new_free_ids + man->free_spec_cnt, usdt_link->spec_ids, | |
852 | usdt_link->spec_cnt * sizeof(*usdt_link->spec_ids)); | |
853 | man->free_spec_ids = new_free_ids; | |
854 | man->free_spec_cnt = new_cnt; | |
855 | } | |
2e4913e0 AN |
856 | } |
857 | ||
858 | return 0; | |
859 | } | |
860 | ||
861 | static void bpf_link_usdt_dealloc(struct bpf_link *link) | |
862 | { | |
863 | struct bpf_link_usdt *usdt_link = container_of(link, struct bpf_link_usdt, link); | |
864 | ||
999783c8 | 865 | free(usdt_link->spec_ids); |
2e4913e0 AN |
866 | free(usdt_link->uprobes); |
867 | free(usdt_link); | |
868 | } | |
869 | ||
999783c8 AN |
/* Hash callback for the spec-string hashmap: keys are C strings, ctx is unused */
static size_t specs_hash_fn(const void *key, void *ctx)
{
	return str_hash((const char *)key);
}
876 | ||
/* Equality callback for the spec-string hashmap: two keys are equal when
 * they compare equal as C strings; ctx is unused.
 */
static bool specs_equal_fn(const void *key1, const void *key2, void *ctx)
{
	return !strcmp((const char *)key1, (const char *)key2);
}
884 | ||
885 | static int allocate_spec_id(struct usdt_manager *man, struct hashmap *specs_hash, | |
886 | struct bpf_link_usdt *link, struct usdt_target *target, | |
887 | int *spec_id, bool *is_new) | |
888 | { | |
889 | void *tmp; | |
890 | int err; | |
891 | ||
892 | /* check if we already allocated spec ID for this spec string */ | |
893 | if (hashmap__find(specs_hash, target->spec_str, &tmp)) { | |
894 | *spec_id = (long)tmp; | |
895 | *is_new = false; | |
896 | return 0; | |
897 | } | |
898 | ||
899 | /* otherwise it's a new ID that needs to be set up in specs map and | |
900 | * returned back to usdt_manager when USDT link is detached | |
901 | */ | |
902 | tmp = libbpf_reallocarray(link->spec_ids, link->spec_cnt + 1, sizeof(*link->spec_ids)); | |
903 | if (!tmp) | |
904 | return -ENOMEM; | |
905 | link->spec_ids = tmp; | |
906 | ||
907 | /* get next free spec ID, giving preference to free list, if not empty */ | |
908 | if (man->free_spec_cnt) { | |
909 | *spec_id = man->free_spec_ids[man->free_spec_cnt - 1]; | |
910 | ||
911 | /* cache spec ID for current spec string for future lookups */ | |
912 | err = hashmap__add(specs_hash, target->spec_str, (void *)(long)*spec_id); | |
913 | if (err) | |
914 | return err; | |
915 | ||
916 | man->free_spec_cnt--; | |
917 | } else { | |
918 | /* don't allocate spec ID bigger than what fits in specs map */ | |
919 | if (man->next_free_spec_id >= bpf_map__max_entries(man->specs_map)) | |
920 | return -E2BIG; | |
921 | ||
922 | *spec_id = man->next_free_spec_id; | |
923 | ||
924 | /* cache spec ID for current spec string for future lookups */ | |
925 | err = hashmap__add(specs_hash, target->spec_str, (void *)(long)*spec_id); | |
926 | if (err) | |
927 | return err; | |
928 | ||
929 | man->next_free_spec_id++; | |
930 | } | |
931 | ||
932 | /* remember new spec ID in the link for later return back to free list on detach */ | |
933 | link->spec_ids[link->spec_cnt] = *spec_id; | |
934 | link->spec_cnt++; | |
935 | *is_new = true; | |
936 | return 0; | |
937 | } | |
938 | ||
2e4913e0 AN |
939 | struct bpf_link *usdt_manager_attach_usdt(struct usdt_manager *man, const struct bpf_program *prog, |
940 | pid_t pid, const char *path, | |
941 | const char *usdt_provider, const char *usdt_name, | |
942 | long usdt_cookie) | |
943 | { | |
999783c8 | 944 | int i, fd, err, spec_map_fd, ip_map_fd; |
2e4913e0 | 945 | LIBBPF_OPTS(bpf_uprobe_opts, opts); |
999783c8 | 946 | struct hashmap *specs_hash = NULL; |
2e4913e0 AN |
947 | struct bpf_link_usdt *link = NULL; |
948 | struct usdt_target *targets = NULL; | |
949 | size_t target_cnt; | |
2e4913e0 AN |
950 | Elf *elf; |
951 | ||
999783c8 AN |
952 | spec_map_fd = bpf_map__fd(man->specs_map); |
953 | ip_map_fd = bpf_map__fd(man->ip_to_spec_id_map); | |
954 | ||
2e4913e0 AN |
955 | /* TODO: perform path resolution similar to uprobe's */ |
956 | fd = open(path, O_RDONLY); | |
957 | if (fd < 0) { | |
958 | err = -errno; | |
959 | pr_warn("usdt: failed to open ELF binary '%s': %d\n", path, err); | |
960 | return libbpf_err_ptr(err); | |
961 | } | |
962 | ||
963 | elf = elf_begin(fd, ELF_C_READ_MMAP, NULL); | |
964 | if (!elf) { | |
965 | err = -EBADF; | |
966 | pr_warn("usdt: failed to parse ELF binary '%s': %s\n", path, elf_errmsg(-1)); | |
967 | goto err_out; | |
968 | } | |
969 | ||
970 | err = sanity_check_usdt_elf(elf, path); | |
971 | if (err) | |
972 | goto err_out; | |
973 | ||
974 | /* normalize PID filter */ | |
975 | if (pid < 0) | |
976 | pid = -1; | |
977 | else if (pid == 0) | |
978 | pid = getpid(); | |
979 | ||
980 | /* discover USDT in given binary, optionally limiting | |
981 | * activations to a given PID, if pid > 0 | |
982 | */ | |
983 | err = collect_usdt_targets(man, elf, path, pid, usdt_provider, usdt_name, | |
984 | usdt_cookie, &targets, &target_cnt); | |
985 | if (err <= 0) { | |
986 | err = (err == 0) ? -ENOENT : err; | |
987 | goto err_out; | |
988 | } | |
989 | ||
999783c8 AN |
990 | specs_hash = hashmap__new(specs_hash_fn, specs_equal_fn, NULL); |
991 | if (IS_ERR(specs_hash)) { | |
992 | err = PTR_ERR(specs_hash); | |
993 | goto err_out; | |
994 | } | |
995 | ||
2e4913e0 AN |
996 | link = calloc(1, sizeof(*link)); |
997 | if (!link) { | |
998 | err = -ENOMEM; | |
999 | goto err_out; | |
1000 | } | |
1001 | ||
1002 | link->usdt_man = man; | |
1003 | link->link.detach = &bpf_link_usdt_detach; | |
1004 | link->link.dealloc = &bpf_link_usdt_dealloc; | |
1005 | ||
1006 | link->uprobes = calloc(target_cnt, sizeof(*link->uprobes)); | |
1007 | if (!link->uprobes) { | |
1008 | err = -ENOMEM; | |
1009 | goto err_out; | |
1010 | } | |
1011 | ||
1012 | for (i = 0; i < target_cnt; i++) { | |
1013 | struct usdt_target *target = &targets[i]; | |
1014 | struct bpf_link *uprobe_link; | |
999783c8 AN |
1015 | bool is_new; |
1016 | int spec_id; | |
1017 | ||
1018 | /* Spec ID can be either reused or newly allocated. If it is | |
1019 | * newly allocated, we'll need to fill out spec map, otherwise | |
1020 | * entire spec should be valid and can be just used by a new | |
1021 | * uprobe. We reuse spec when USDT arg spec is identical. We | |
1022 | * also never share specs between two different USDT | |
1023 | * attachments ("links"), so all the reused specs already | |
1024 | * share USDT cookie value implicitly. | |
1025 | */ | |
1026 | err = allocate_spec_id(man, specs_hash, link, target, &spec_id, &is_new); | |
1027 | if (err) | |
1028 | goto err_out; | |
1029 | ||
1030 | if (is_new && bpf_map_update_elem(spec_map_fd, &spec_id, &target->spec, BPF_ANY)) { | |
1031 | err = -errno; | |
1032 | pr_warn("usdt: failed to set USDT spec #%d for '%s:%s' in '%s': %d\n", | |
1033 | spec_id, usdt_provider, usdt_name, path, err); | |
1034 | goto err_out; | |
1035 | } | |
1036 | if (!man->has_bpf_cookie && | |
1037 | bpf_map_update_elem(ip_map_fd, &target->abs_ip, &spec_id, BPF_NOEXIST)) { | |
1038 | err = -errno; | |
1039 | if (err == -EEXIST) { | |
1040 | pr_warn("usdt: IP collision detected for spec #%d for '%s:%s' in '%s'\n", | |
1041 | spec_id, usdt_provider, usdt_name, path); | |
1042 | } else { | |
1043 | pr_warn("usdt: failed to map IP 0x%lx to spec #%d for '%s:%s' in '%s': %d\n", | |
1044 | target->abs_ip, spec_id, usdt_provider, usdt_name, | |
1045 | path, err); | |
1046 | } | |
1047 | goto err_out; | |
1048 | } | |
2e4913e0 AN |
1049 | |
1050 | opts.ref_ctr_offset = target->sema_off; | |
999783c8 | 1051 | opts.bpf_cookie = man->has_bpf_cookie ? spec_id : 0; |
2e4913e0 AN |
1052 | uprobe_link = bpf_program__attach_uprobe_opts(prog, pid, path, |
1053 | target->rel_ip, &opts); | |
1054 | err = libbpf_get_error(uprobe_link); | |
1055 | if (err) { | |
1056 | pr_warn("usdt: failed to attach uprobe #%d for '%s:%s' in '%s': %d\n", | |
1057 | i, usdt_provider, usdt_name, path, err); | |
1058 | goto err_out; | |
1059 | } | |
1060 | ||
1061 | link->uprobes[i].link = uprobe_link; | |
1062 | link->uprobes[i].abs_ip = target->abs_ip; | |
1063 | link->uprobe_cnt++; | |
1064 | } | |
1065 | ||
74cc6311 | 1066 | free(targets); |
999783c8 | 1067 | hashmap__free(specs_hash); |
2e4913e0 AN |
1068 | elf_end(elf); |
1069 | close(fd); | |
1070 | ||
1071 | return &link->link; | |
1072 | ||
1073 | err_out: | |
e58c5c97 HB |
1074 | if (link) |
1075 | bpf_link__destroy(&link->link); | |
74cc6311 | 1076 | free(targets); |
999783c8 | 1077 | hashmap__free(specs_hash); |
2e4913e0 AN |
1078 | if (elf) |
1079 | elf_end(elf); | |
1080 | close(fd); | |
1081 | return libbpf_err_ptr(err); | |
1082 | } | |
74cc6311 AN |
1083 | |
1084 | /* Parse out USDT ELF note from '.note.stapsdt' section. | |
1085 | * Logic inspired by perf's code. | |
1086 | */ | |
1087 | static int parse_usdt_note(Elf *elf, const char *path, long base_addr, | |
1088 | GElf_Nhdr *nhdr, const char *data, size_t name_off, size_t desc_off, | |
1089 | struct usdt_note *note) | |
1090 | { | |
1091 | const char *provider, *name, *args; | |
1092 | long addrs[3]; | |
1093 | size_t len; | |
1094 | ||
1095 | /* sanity check USDT note name and type first */ | |
1096 | if (strncmp(data + name_off, USDT_NOTE_NAME, nhdr->n_namesz) != 0) | |
1097 | return -EINVAL; | |
1098 | if (nhdr->n_type != USDT_NOTE_TYPE) | |
1099 | return -EINVAL; | |
1100 | ||
1101 | /* sanity check USDT note contents ("description" in ELF terminology) */ | |
1102 | len = nhdr->n_descsz; | |
1103 | data = data + desc_off; | |
1104 | ||
1105 | /* +3 is the very minimum required to store three empty strings */ | |
1106 | if (len < sizeof(addrs) + 3) | |
1107 | return -EINVAL; | |
1108 | ||
1109 | /* get location, base, and semaphore addrs */ | |
1110 | memcpy(&addrs, data, sizeof(addrs)); | |
1111 | ||
1112 | /* parse string fields: provider, name, args */ | |
1113 | provider = data + sizeof(addrs); | |
1114 | ||
1115 | name = (const char *)memchr(provider, '\0', data + len - provider); | |
1116 | if (!name) /* non-zero-terminated provider */ | |
1117 | return -EINVAL; | |
1118 | name++; | |
1119 | if (name >= data + len || *name == '\0') /* missing or empty name */ | |
1120 | return -EINVAL; | |
1121 | ||
1122 | args = memchr(name, '\0', data + len - name); | |
1123 | if (!args) /* non-zero-terminated name */ | |
1124 | return -EINVAL; | |
1125 | ++args; | |
1126 | if (args >= data + len) /* missing arguments spec */ | |
1127 | return -EINVAL; | |
1128 | ||
1129 | note->provider = provider; | |
1130 | note->name = name; | |
1131 | if (*args == '\0' || *args == ':') | |
1132 | note->args = ""; | |
1133 | else | |
1134 | note->args = args; | |
1135 | note->loc_addr = addrs[0]; | |
1136 | note->base_addr = addrs[1]; | |
1137 | note->sema_addr = addrs[2]; | |
1138 | ||
1139 | return 0; | |
1140 | } | |
1141 | ||
1142 | static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec *arg); | |
1143 | ||
1144 | static int parse_usdt_spec(struct usdt_spec *spec, const struct usdt_note *note, long usdt_cookie) | |
1145 | { | |
1146 | const char *s; | |
1147 | int len; | |
1148 | ||
1149 | spec->usdt_cookie = usdt_cookie; | |
1150 | spec->arg_cnt = 0; | |
1151 | ||
1152 | s = note->args; | |
1153 | while (s[0]) { | |
1154 | if (spec->arg_cnt >= USDT_MAX_ARG_CNT) { | |
1155 | pr_warn("usdt: too many USDT arguments (> %d) for '%s:%s' with args spec '%s'\n", | |
1156 | USDT_MAX_ARG_CNT, note->provider, note->name, note->args); | |
1157 | return -E2BIG; | |
1158 | } | |
1159 | ||
1160 | len = parse_usdt_arg(s, spec->arg_cnt, &spec->args[spec->arg_cnt]); | |
1161 | if (len < 0) | |
1162 | return len; | |
1163 | ||
1164 | s += len; | |
1165 | spec->arg_cnt++; | |
1166 | } | |
1167 | ||
1168 | return 0; | |
1169 | } | |
1170 | ||
4c59e584 AN |
1171 | /* Architecture-specific logic for parsing USDT argument location specs */ |
1172 | ||
1173 | #if defined(__x86_64__) || defined(__i386__) | |
1174 | ||
1175 | static int calc_pt_regs_off(const char *reg_name) | |
1176 | { | |
1177 | static struct { | |
1178 | const char *names[4]; | |
1179 | size_t pt_regs_off; | |
1180 | } reg_map[] = { | |
ded6dffa | 1181 | #ifdef __x86_64__ |
4c59e584 AN |
1182 | #define reg_off(reg64, reg32) offsetof(struct pt_regs, reg64) |
1183 | #else | |
1184 | #define reg_off(reg64, reg32) offsetof(struct pt_regs, reg32) | |
1185 | #endif | |
1186 | { {"rip", "eip", "", ""}, reg_off(rip, eip) }, | |
1187 | { {"rax", "eax", "ax", "al"}, reg_off(rax, eax) }, | |
1188 | { {"rbx", "ebx", "bx", "bl"}, reg_off(rbx, ebx) }, | |
1189 | { {"rcx", "ecx", "cx", "cl"}, reg_off(rcx, ecx) }, | |
1190 | { {"rdx", "edx", "dx", "dl"}, reg_off(rdx, edx) }, | |
1191 | { {"rsi", "esi", "si", "sil"}, reg_off(rsi, esi) }, | |
1192 | { {"rdi", "edi", "di", "dil"}, reg_off(rdi, edi) }, | |
1193 | { {"rbp", "ebp", "bp", "bpl"}, reg_off(rbp, ebp) }, | |
1194 | { {"rsp", "esp", "sp", "spl"}, reg_off(rsp, esp) }, | |
1195 | #undef reg_off | |
ded6dffa | 1196 | #ifdef __x86_64__ |
4c59e584 AN |
1197 | { {"r8", "r8d", "r8w", "r8b"}, offsetof(struct pt_regs, r8) }, |
1198 | { {"r9", "r9d", "r9w", "r9b"}, offsetof(struct pt_regs, r9) }, | |
1199 | { {"r10", "r10d", "r10w", "r10b"}, offsetof(struct pt_regs, r10) }, | |
1200 | { {"r11", "r11d", "r11w", "r11b"}, offsetof(struct pt_regs, r11) }, | |
1201 | { {"r12", "r12d", "r12w", "r12b"}, offsetof(struct pt_regs, r12) }, | |
1202 | { {"r13", "r13d", "r13w", "r13b"}, offsetof(struct pt_regs, r13) }, | |
1203 | { {"r14", "r14d", "r14w", "r14b"}, offsetof(struct pt_regs, r14) }, | |
1204 | { {"r15", "r15d", "r15w", "r15b"}, offsetof(struct pt_regs, r15) }, | |
1205 | #endif | |
1206 | }; | |
1207 | int i, j; | |
1208 | ||
1209 | for (i = 0; i < ARRAY_SIZE(reg_map); i++) { | |
1210 | for (j = 0; j < ARRAY_SIZE(reg_map[i].names); j++) { | |
1211 | if (strcmp(reg_name, reg_map[i].names[j]) == 0) | |
1212 | return reg_map[i].pt_regs_off; | |
1213 | } | |
1214 | } | |
1215 | ||
1216 | pr_warn("usdt: unrecognized register '%s'\n", reg_name); | |
1217 | return -ENOENT; | |
1218 | } | |
1219 | ||
1220 | static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec *arg) | |
1221 | { | |
1222 | char *reg_name = NULL; | |
1223 | int arg_sz, len, reg_off; | |
1224 | long off; | |
1225 | ||
1226 | if (sscanf(arg_str, " %d @ %ld ( %%%m[^)] ) %n", &arg_sz, &off, ®_name, &len) == 3) { | |
1227 | /* Memory dereference case, e.g., -4@-20(%rbp) */ | |
1228 | arg->arg_type = USDT_ARG_REG_DEREF; | |
1229 | arg->val_off = off; | |
1230 | reg_off = calc_pt_regs_off(reg_name); | |
1231 | free(reg_name); | |
1232 | if (reg_off < 0) | |
1233 | return reg_off; | |
1234 | arg->reg_off = reg_off; | |
1235 | } else if (sscanf(arg_str, " %d @ %%%ms %n", &arg_sz, ®_name, &len) == 2) { | |
1236 | /* Register read case, e.g., -4@%eax */ | |
1237 | arg->arg_type = USDT_ARG_REG; | |
1238 | arg->val_off = 0; | |
1239 | ||
1240 | reg_off = calc_pt_regs_off(reg_name); | |
1241 | free(reg_name); | |
1242 | if (reg_off < 0) | |
1243 | return reg_off; | |
1244 | arg->reg_off = reg_off; | |
1245 | } else if (sscanf(arg_str, " %d @ $%ld %n", &arg_sz, &off, &len) == 2) { | |
1246 | /* Constant value case, e.g., 4@$71 */ | |
1247 | arg->arg_type = USDT_ARG_CONST; | |
1248 | arg->val_off = off; | |
1249 | arg->reg_off = 0; | |
1250 | } else { | |
1251 | pr_warn("usdt: unrecognized arg #%d spec '%s'\n", arg_num, arg_str); | |
1252 | return -EINVAL; | |
1253 | } | |
1254 | ||
1255 | arg->arg_signed = arg_sz < 0; | |
1256 | if (arg_sz < 0) | |
1257 | arg_sz = -arg_sz; | |
1258 | ||
1259 | switch (arg_sz) { | |
1260 | case 1: case 2: case 4: case 8: | |
1261 | arg->arg_bitshift = 64 - arg_sz * 8; | |
1262 | break; | |
1263 | default: | |
1264 | pr_warn("usdt: unsupported arg #%d (spec '%s') size: %d\n", | |
1265 | arg_num, arg_str, arg_sz); | |
1266 | return -EINVAL; | |
1267 | } | |
1268 | ||
1269 | return len; | |
1270 | } | |
1271 | ||
bd022685 IL |
1272 | #elif defined(__s390x__) |
1273 | ||
1274 | /* Do not support __s390__ for now, since user_pt_regs is broken with -m31. */ | |
1275 | ||
1276 | static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec *arg) | |
1277 | { | |
1278 | unsigned int reg; | |
1279 | int arg_sz, len; | |
1280 | long off; | |
1281 | ||
1282 | if (sscanf(arg_str, " %d @ %ld ( %%r%u ) %n", &arg_sz, &off, ®, &len) == 3) { | |
1283 | /* Memory dereference case, e.g., -2@-28(%r15) */ | |
1284 | arg->arg_type = USDT_ARG_REG_DEREF; | |
1285 | arg->val_off = off; | |
1286 | if (reg > 15) { | |
1287 | pr_warn("usdt: unrecognized register '%%r%u'\n", reg); | |
1288 | return -EINVAL; | |
1289 | } | |
1290 | arg->reg_off = offsetof(user_pt_regs, gprs[reg]); | |
1291 | } else if (sscanf(arg_str, " %d @ %%r%u %n", &arg_sz, ®, &len) == 2) { | |
1292 | /* Register read case, e.g., -8@%r0 */ | |
1293 | arg->arg_type = USDT_ARG_REG; | |
1294 | arg->val_off = 0; | |
1295 | if (reg > 15) { | |
1296 | pr_warn("usdt: unrecognized register '%%r%u'\n", reg); | |
1297 | return -EINVAL; | |
1298 | } | |
1299 | arg->reg_off = offsetof(user_pt_regs, gprs[reg]); | |
1300 | } else if (sscanf(arg_str, " %d @ %ld %n", &arg_sz, &off, &len) == 2) { | |
1301 | /* Constant value case, e.g., 4@71 */ | |
1302 | arg->arg_type = USDT_ARG_CONST; | |
1303 | arg->val_off = off; | |
1304 | arg->reg_off = 0; | |
1305 | } else { | |
1306 | pr_warn("usdt: unrecognized arg #%d spec '%s'\n", arg_num, arg_str); | |
1307 | return -EINVAL; | |
1308 | } | |
1309 | ||
1310 | arg->arg_signed = arg_sz < 0; | |
1311 | if (arg_sz < 0) | |
1312 | arg_sz = -arg_sz; | |
1313 | ||
1314 | switch (arg_sz) { | |
1315 | case 1: case 2: case 4: case 8: | |
1316 | arg->arg_bitshift = 64 - arg_sz * 8; | |
1317 | break; | |
1318 | default: | |
1319 | pr_warn("usdt: unsupported arg #%d (spec '%s') size: %d\n", | |
1320 | arg_num, arg_str, arg_sz); | |
1321 | return -EINVAL; | |
1322 | } | |
1323 | ||
1324 | return len; | |
1325 | } | |
1326 | ||
0f861992 AM |
1327 | #elif defined(__aarch64__) |
1328 | ||
1329 | static int calc_pt_regs_off(const char *reg_name) | |
1330 | { | |
1331 | int reg_num; | |
1332 | ||
1333 | if (sscanf(reg_name, "x%d", ®_num) == 1) { | |
1334 | if (reg_num >= 0 && reg_num < 31) | |
1335 | return offsetof(struct user_pt_regs, regs[reg_num]); | |
1336 | } else if (strcmp(reg_name, "sp") == 0) { | |
1337 | return offsetof(struct user_pt_regs, sp); | |
1338 | } | |
1339 | pr_warn("usdt: unrecognized register '%s'\n", reg_name); | |
1340 | return -ENOENT; | |
1341 | } | |
1342 | ||
1343 | static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec *arg) | |
1344 | { | |
1345 | char *reg_name = NULL; | |
1346 | int arg_sz, len, reg_off; | |
1347 | long off; | |
1348 | ||
1349 | if (sscanf(arg_str, " %d @ \[ %m[a-z0-9], %ld ] %n", &arg_sz, ®_name, &off, &len) == 3) { | |
1350 | /* Memory dereference case, e.g., -4@[sp, 96] */ | |
1351 | arg->arg_type = USDT_ARG_REG_DEREF; | |
1352 | arg->val_off = off; | |
1353 | reg_off = calc_pt_regs_off(reg_name); | |
1354 | free(reg_name); | |
1355 | if (reg_off < 0) | |
1356 | return reg_off; | |
1357 | arg->reg_off = reg_off; | |
1358 | } else if (sscanf(arg_str, " %d @ \[ %m[a-z0-9] ] %n", &arg_sz, ®_name, &len) == 2) { | |
1359 | /* Memory dereference case, e.g., -4@[sp] */ | |
1360 | arg->arg_type = USDT_ARG_REG_DEREF; | |
1361 | arg->val_off = 0; | |
1362 | reg_off = calc_pt_regs_off(reg_name); | |
1363 | free(reg_name); | |
1364 | if (reg_off < 0) | |
1365 | return reg_off; | |
1366 | arg->reg_off = reg_off; | |
1367 | } else if (sscanf(arg_str, " %d @ %ld %n", &arg_sz, &off, &len) == 2) { | |
1368 | /* Constant value case, e.g., 4@5 */ | |
1369 | arg->arg_type = USDT_ARG_CONST; | |
1370 | arg->val_off = off; | |
1371 | arg->reg_off = 0; | |
1372 | } else if (sscanf(arg_str, " %d @ %m[a-z0-9] %n", &arg_sz, ®_name, &len) == 2) { | |
1373 | /* Register read case, e.g., -8@x4 */ | |
1374 | arg->arg_type = USDT_ARG_REG; | |
1375 | arg->val_off = 0; | |
1376 | reg_off = calc_pt_regs_off(reg_name); | |
1377 | free(reg_name); | |
1378 | if (reg_off < 0) | |
1379 | return reg_off; | |
1380 | arg->reg_off = reg_off; | |
1381 | } else { | |
1382 | pr_warn("usdt: unrecognized arg #%d spec '%s'\n", arg_num, arg_str); | |
1383 | return -EINVAL; | |
1384 | } | |
1385 | ||
1386 | arg->arg_signed = arg_sz < 0; | |
1387 | if (arg_sz < 0) | |
1388 | arg_sz = -arg_sz; | |
1389 | ||
1390 | switch (arg_sz) { | |
1391 | case 1: case 2: case 4: case 8: | |
1392 | arg->arg_bitshift = 64 - arg_sz * 8; | |
1393 | break; | |
1394 | default: | |
1395 | pr_warn("usdt: unsupported arg #%d (spec '%s') size: %d\n", | |
1396 | arg_num, arg_str, arg_sz); | |
1397 | return -EINVAL; | |
1398 | } | |
1399 | ||
1400 | return len; | |
1401 | } | |
1402 | ||
4c59e584 AN |
1403 | #else |
1404 | ||
74cc6311 AN |
/* Fallback for architectures without USDT argument parsing support:
 * every argument spec is rejected with -ENOTSUP.
 */
static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec *arg)
{
	/* none of the parameters are inspected on unsupported architectures */
	(void)arg_str;
	(void)arg_num;
	(void)arg;

	pr_warn("usdt: libbpf doesn't support USDTs on current architecture\n");

	return -ENOTSUP;
}
4c59e584 AN |
1410 | |
1411 | #endif |