Commit | Line | Data |
---|---|---|
bdbda395 DV |
1 | .. SPDX-License-Identifier: GPL-2.0 |
2 | ||
3 | .. _cpumasks-header-label: | |
4 | ||
5 | ================== | |
6 | BPF cpumask kfuncs | |
7 | ================== | |
8 | ||
9 | 1. Introduction | |
10 | =============== | |
11 | ||
12 | ``struct cpumask`` is a bitmap data structure in the kernel whose indices | |
13 | reflect the CPUs on the system. Commonly, cpumasks are used to track which CPUs | |
14 | a task is affinitized to, but they can also be used to e.g. track which cores | |
15 | are associated with a scheduling domain, which cores on a machine are idle, | |
16 | etc. | |
17 | ||
18 | BPF provides programs with a set of :ref:`kfuncs-header-label` that can be | |
19 | used to allocate, mutate, query, and free cpumasks. | |
20 | ||
21 | 2. BPF cpumask objects | |
22 | ====================== | |
23 | ||
24 | There are two different types of cpumasks that can be used by BPF programs. | |
25 | ||
26 | 2.1 ``struct bpf_cpumask *`` | |
27 | ---------------------------- | |
28 | ||
29 | ``struct bpf_cpumask *`` is a cpumask that is allocated by BPF, on behalf of a | |
30 | BPF program, and whose lifecycle is entirely controlled by BPF. These cpumasks | |
31 | are RCU-protected, can be mutated, can be used as kptrs, and can be safely cast | |
32 | to a ``struct cpumask *``. | |
33 | ||
34 | 2.1.1 ``struct bpf_cpumask *`` lifecycle | |
35 | ---------------------------------------- | |
36 | ||
37 | A ``struct bpf_cpumask *`` is allocated, acquired, and released, using the | |
38 | following functions: | |
39 | ||
40 | .. kernel-doc:: kernel/bpf/cpumask.c | |
41 | :identifiers: bpf_cpumask_create | |
42 | ||
43 | .. kernel-doc:: kernel/bpf/cpumask.c | |
44 | :identifiers: bpf_cpumask_acquire | |
45 | ||
46 | .. kernel-doc:: kernel/bpf/cpumask.c | |
47 | :identifiers: bpf_cpumask_release | |
48 | ||
49 | For example: | |
50 | ||
51 | .. code-block:: c | |
52 | ||
53 | struct cpumask_map_value { | |
03b77e17 | 54 | struct bpf_cpumask __kptr * cpumask; |
bdbda395 DV |
55 | }; |
56 | ||
57 | struct array_map { | |
58 | __uint(type, BPF_MAP_TYPE_ARRAY); | |
59 | __type(key, int); | |
60 | __type(value, struct cpumask_map_value); | |
61 | __uint(max_entries, 65536); | |
62 | } cpumask_map SEC(".maps"); | |
63 | ||
64 | static int cpumask_map_insert(struct bpf_cpumask *mask, u32 pid) | |
65 | { | |
66 | struct cpumask_map_value local, *v; | |
67 | long status; | |
68 | struct bpf_cpumask *old; | |
69 | u32 key = pid; | |
70 | ||
71 | local.cpumask = NULL; | |
72 | status = bpf_map_update_elem(&cpumask_map, &key, &local, 0); | |
73 | if (status) { | |
74 | bpf_cpumask_release(mask); | |
75 | return status; | |
76 | } | |
77 | ||
78 | v = bpf_map_lookup_elem(&cpumask_map, &key); | |
79 | if (!v) { | |
80 | bpf_cpumask_release(mask); | |
81 | return -ENOENT; | |
82 | } | |
83 | ||
84 | old = bpf_kptr_xchg(&v->cpumask, mask); | |
85 | if (old) | |
86 | bpf_cpumask_release(old); | |
87 | ||
88 | return 0; | |
89 | } | |
90 | ||
91 | /** | |
92 | * A sample tracepoint showing how a task's cpumask can be queried and | |
93 | * recorded as a kptr. | |
94 | */ | |
95 | SEC("tp_btf/task_newtask") | |
96 | int BPF_PROG(record_task_cpumask, struct task_struct *task, u64 clone_flags) | |
97 | { | |
98 | struct bpf_cpumask *cpumask; | |
99 | int ret; | |
100 | ||
101 | cpumask = bpf_cpumask_create(); | |
102 | if (!cpumask) | |
103 | return -ENOMEM; | |
104 | ||
105 | if (!bpf_cpumask_full(task->cpus_ptr)) | |
106 | bpf_printk("task %s has CPU affinity", task->comm); | |
107 | ||
108 | bpf_cpumask_copy(cpumask, task->cpus_ptr); | |
109 | return cpumask_map_insert(cpumask, task->pid); | |
110 | } | |
111 | ||
112 | ---- | |
113 | ||
114 | 2.1.2 ``struct bpf_cpumask *`` as kptrs | |
115 | --------------------------------------- | |
116 | ||
117 | As mentioned and illustrated above, these ``struct bpf_cpumask *`` objects can | |
118 | also be stored in a map and used as kptrs. If a ``struct bpf_cpumask *`` is in | |
119 | a map, the reference can be removed from the map with bpf_kptr_xchg(), or | |
fec2c6d1 | 120 | opportunistically acquired using RCU: |
bdbda395 DV |
121 | |
122 | .. code-block:: c | |
123 | ||
124 | /* struct containing the struct bpf_cpumask kptr which is stored in the map. */ | |
125 | struct cpumasks_kfunc_map_value { | |
03b77e17 | 126 | struct bpf_cpumask __kptr * bpf_cpumask; |
bdbda395 DV |
127 | }; |
128 | ||
129 | /* The map containing struct cpumasks_kfunc_map_value entries. */ | |
130 | struct { | |
131 | __uint(type, BPF_MAP_TYPE_ARRAY); | |
132 | __type(key, int); | |
133 | __type(value, struct cpumasks_kfunc_map_value); | |
134 | __uint(max_entries, 1); | |
135 | } cpumasks_kfunc_map SEC(".maps"); | |
136 | ||
137 | /* ... */ | |
138 | ||
139 | /** | |
140 | * A simple example tracepoint program showing how a | |
141 | * struct bpf_cpumask * kptr that is stored in a map can | |
fec2c6d1 | 142 | * be passed to kfuncs using RCU protection. |
bdbda395 DV |
143 | */ |
144 | SEC("tp_btf/cgroup_mkdir") | |
145 | int BPF_PROG(cgrp_ancestor_example, struct cgroup *cgrp, const char *path) | |
146 | { | |
147 | struct bpf_cpumask *kptr; | |
148 | struct cpumasks_kfunc_map_value *v; | |
149 | u32 key = 0; | |
150 | ||
151 | /* Assume a bpf_cpumask * kptr was previously stored in the map. */ | |
152 | v = bpf_map_lookup_elem(&cpumasks_kfunc_map, &key); | |
153 | if (!v) | |
154 | return -ENOENT; | |
155 | ||
fec2c6d1 | 156 | bpf_rcu_read_lock(); |
bdbda395 | 157 | /* Load the bpf_cpumask * kptr stored in the map. No reference is taken; the pointer is only used inside the RCU read-side section. */ |
fec2c6d1 DV |
158 | kptr = v->bpf_cpumask; |
159 | if (!kptr) { | |
bdbda395 DV |
160 | /* If no bpf_cpumask was present in the map, it's because |
161 | * we're racing with another CPU that removed it with | |
162 | * bpf_kptr_xchg() between the bpf_map_lookup_elem() | |
fec2c6d1 | 163 | * above, and our load of the pointer from the map. |
bdbda395 | 164 | */ |
fec2c6d1 | 165 | bpf_rcu_read_unlock(); |
bdbda395 | 166 | return -EBUSY; |
fec2c6d1 | 167 | } |
bdbda395 | 168 | |
fec2c6d1 DV |
169 | bpf_cpumask_setall(kptr); |
170 | bpf_rcu_read_unlock(); | |
bdbda395 DV |
171 | |
172 | return 0; | |
173 | } | |
174 | ||
175 | ---- | |
176 | ||
177 | 2.2 ``struct cpumask`` | |
178 | ---------------------- | |
179 | ||
180 | ``struct cpumask`` is the object that actually contains the cpumask bitmap | |
181 | being queried, mutated, etc. A ``struct bpf_cpumask`` wraps a ``struct | |
182 | cpumask``, which is why it's safe to cast it as such (note however that it is | |
183 | **not** safe to cast a ``struct cpumask *`` to a ``struct bpf_cpumask *``, and | |
184 | the verifier will reject any program that tries to do so). | |
185 | ||
186 | As we'll see below, any kfunc that mutates its cpumask argument will take a | |
187 | ``struct bpf_cpumask *`` as that argument. Any kfunc that simply queries the | |
188 | cpumask will instead take a ``struct cpumask *``. | |
189 | ||
190 | 3. cpumask kfuncs | |
191 | ================= | |
192 | ||
193 | Above, we described the kfuncs that can be used to allocate, acquire, release, | |
194 | etc. a ``struct bpf_cpumask *``. This section of the document will describe the | |
195 | kfuncs for mutating and querying cpumasks. | |
196 | ||
197 | 3.1 Mutating cpumasks | |
198 | --------------------- | |
199 | ||
200 | Some cpumask kfuncs are "read-only" in that they don't mutate any of their | |
201 | arguments, whereas others mutate at least one argument (which means that the | |
202 | argument must be a ``struct bpf_cpumask *``, as described above). | |
203 | ||
204 | This section will describe all of the cpumask kfuncs which mutate at least one | |
205 | argument. :ref:`cpumasks-querying-label` below describes the read-only kfuncs. | |
206 | ||
207 | 3.1.1 Setting and clearing CPUs | |
208 | ------------------------------- | |
209 | ||
210 | bpf_cpumask_set_cpu() and bpf_cpumask_clear_cpu() can be used to set and clear | |
211 | a CPU in a ``struct bpf_cpumask`` respectively: | |
212 | ||
213 | .. kernel-doc:: kernel/bpf/cpumask.c | |
214 | :identifiers: bpf_cpumask_set_cpu bpf_cpumask_clear_cpu | |
215 | ||
216 | These kfuncs are pretty straightforward, and can be used, for example, as | |
217 | follows: | |
218 | ||
219 | .. code-block:: c | |
220 | ||
221 | /** | |
222 | * A sample tracepoint showing how CPUs in a cpumask can be set, cleared, and queried. | |
223 | */ | |
224 | SEC("tp_btf/task_newtask") | |
225 | int BPF_PROG(test_set_clear_cpu, struct task_struct *task, u64 clone_flags) | |
226 | { | |
227 | struct bpf_cpumask *cpumask; | |
228 | ||
229 | cpumask = bpf_cpumask_create(); | |
230 | if (!cpumask) | |
231 | return -ENOMEM; | |
232 | ||
233 | bpf_cpumask_set_cpu(0, cpumask); | |
234 | if (!bpf_cpumask_test_cpu(0, cast(cpumask))) | |
235 | /* Should never happen. */ | |
236 | goto release_exit; | |
237 | ||
238 | bpf_cpumask_clear_cpu(0, cpumask); | |
239 | if (bpf_cpumask_test_cpu(0, cast(cpumask))) | |
240 | /* Should never happen. */ | |
241 | goto release_exit; | |
242 | ||
243 | /* struct cpumask * pointers such as task->cpus_ptr can also be queried. */ | |
244 | if (bpf_cpumask_test_cpu(0, task->cpus_ptr)) | |
245 | bpf_printk("task %s can use CPU %d", task->comm, 0); | |
246 | ||
247 | release_exit: | |
248 | bpf_cpumask_release(cpumask); | |
249 | return 0; | |
250 | } | |
251 | ||
252 | ---- | |
253 | ||
254 | bpf_cpumask_test_and_set_cpu() and bpf_cpumask_test_and_clear_cpu() are | |
255 | complementary kfuncs that allow callers to atomically test and set (or clear) | |
256 | CPUs: | |
257 | ||
258 | .. kernel-doc:: kernel/bpf/cpumask.c | |
259 | :identifiers: bpf_cpumask_test_and_set_cpu bpf_cpumask_test_and_clear_cpu | |
260 | ||
261 | ---- | |
262 | ||
263 | We can also set and clear entire ``struct bpf_cpumask *`` objects in one | |
264 | operation using bpf_cpumask_setall() and bpf_cpumask_clear(): | |
265 | ||
266 | .. kernel-doc:: kernel/bpf/cpumask.c | |
267 | :identifiers: bpf_cpumask_setall bpf_cpumask_clear | |
268 | ||
269 | 3.1.2 Operations between cpumasks | |
270 | --------------------------------- | |
271 | ||
272 | In addition to setting and clearing individual CPUs in a single cpumask, | |
273 | callers can also perform bitwise operations between multiple cpumasks using | |
274 | bpf_cpumask_and(), bpf_cpumask_or(), and bpf_cpumask_xor(): | |
275 | ||
276 | .. kernel-doc:: kernel/bpf/cpumask.c | |
277 | :identifiers: bpf_cpumask_and bpf_cpumask_or bpf_cpumask_xor | |
278 | ||
279 | The following is an example of how they may be used. Note that some of the | |
280 | kfuncs shown in this example will be covered in more detail below. | |
281 | ||
282 | .. code-block:: c | |
283 | ||
284 | /** | |
285 | * A sample tracepoint showing how a cpumask can be mutated using | |
286 | * bitwise operators (and queried). | |
287 | */ | |
288 | SEC("tp_btf/task_newtask") | |
289 | int BPF_PROG(test_and_or_xor, struct task_struct *task, u64 clone_flags) | |
290 | { | |
291 | struct bpf_cpumask *mask1, *mask2, *dst1, *dst2; | |
292 | ||
293 | mask1 = bpf_cpumask_create(); | |
294 | if (!mask1) | |
295 | return -ENOMEM; | |
296 | ||
297 | mask2 = bpf_cpumask_create(); | |
298 | if (!mask2) { | |
299 | bpf_cpumask_release(mask1); | |
300 | return -ENOMEM; | |
301 | } | |
302 | ||
303 | /* ...Safely create the other two masks... */ | |
304 | ||
305 | bpf_cpumask_set_cpu(0, mask1); | |
306 | bpf_cpumask_set_cpu(1, mask2); | |
307 | bpf_cpumask_and(dst1, (const struct cpumask *)mask1, (const struct cpumask *)mask2); | |
308 | if (!bpf_cpumask_empty((const struct cpumask *)dst1)) | |
309 | /* Should never happen. */ | |
310 | goto release_exit; | |
311 | ||
312 | bpf_cpumask_or(dst1, (const struct cpumask *)mask1, (const struct cpumask *)mask2); | |
313 | if (!bpf_cpumask_test_cpu(0, (const struct cpumask *)dst1)) | |
314 | /* Should never happen. */ | |
315 | goto release_exit; | |
316 | ||
317 | if (!bpf_cpumask_test_cpu(1, (const struct cpumask *)dst1)) | |
318 | /* Should never happen. */ | |
319 | goto release_exit; | |
320 | ||
321 | bpf_cpumask_xor(dst2, (const struct cpumask *)mask1, (const struct cpumask *)mask2); | |
322 | if (!bpf_cpumask_equal((const struct cpumask *)dst1, | |
323 | (const struct cpumask *)dst2)) | |
324 | /* Should never happen. */ | |
325 | goto release_exit; | |
326 | ||
327 | release_exit: | |
328 | bpf_cpumask_release(mask1); | |
329 | bpf_cpumask_release(mask2); | |
330 | bpf_cpumask_release(dst1); | |
331 | bpf_cpumask_release(dst2); | |
332 | return 0; | |
333 | } | |
334 | ||
335 | ---- | |
336 | ||
337 | The contents of an entire cpumask may be copied to another using | |
338 | bpf_cpumask_copy(): | |
339 | ||
340 | .. kernel-doc:: kernel/bpf/cpumask.c | |
341 | :identifiers: bpf_cpumask_copy | |
342 | ||
343 | ---- | |
344 | ||
345 | .. _cpumasks-querying-label: | |
346 | ||
347 | 3.2 Querying cpumasks | |
348 | --------------------- | |
349 | ||
350 | In addition to the above kfuncs, there is also a set of read-only kfuncs that | |
351 | can be used to query the contents of cpumasks. | |
352 | ||
353 | .. kernel-doc:: kernel/bpf/cpumask.c | |
25085b4e | 354 | :identifiers: bpf_cpumask_first bpf_cpumask_first_zero bpf_cpumask_first_and |
a6de18f3 | 355 | bpf_cpumask_test_cpu bpf_cpumask_weight |
bdbda395 DV |
356 | |
357 | .. kernel-doc:: kernel/bpf/cpumask.c | |
358 | :identifiers: bpf_cpumask_equal bpf_cpumask_intersects bpf_cpumask_subset | |
359 | bpf_cpumask_empty bpf_cpumask_full | |
360 | ||
361 | .. kernel-doc:: kernel/bpf/cpumask.c | |
25085b4e | 362 | :identifiers: bpf_cpumask_any_distribute bpf_cpumask_any_and_distribute |
bdbda395 DV |
363 | |
364 | ---- | |
365 | ||
366 | Some example usages of these querying kfuncs were shown above. We will not | |
d56b699d | 367 | replicate those examples here. Note, however, that all of the aforementioned |
bdbda395 DV |
368 | kfuncs are tested in `tools/testing/selftests/bpf/progs/cpumask_success.c`_, so |
369 | please take a look there if you're looking for more examples of how they can be | |
370 | used. | |
371 | ||
372 | .. _tools/testing/selftests/bpf/progs/cpumask_success.c: | |
373 | https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/tree/tools/testing/selftests/bpf/progs/cpumask_success.c | |
374 | ||
375 | ||
376 | 4. Adding BPF cpumask kfuncs | |
377 | ============================ | |
378 | ||
379 | The set of supported BPF cpumask kfuncs is not (yet) a 1-1 match with the | |
380 | cpumask operations in include/linux/cpumask.h. Any of those cpumask operations | |
381 | could easily be encapsulated in a new kfunc if and when required. If you'd like | |
382 | to support a new cpumask operation, please feel free to submit a patch. If you | |
383 | do add a new cpumask kfunc, please document it here, and add any relevant | |
384 | selftest testcases to the cpumask selftest suite. |