| 1 | .. SPDX-License-Identifier: GPL-2.0 |
| 2 | |
| 3 | .. _cpumasks-header-label: |
| 4 | |
| 5 | ================== |
| 6 | BPF cpumask kfuncs |
| 7 | ================== |
| 8 | |
| 9 | 1. Introduction |
| 10 | =============== |
| 11 | |
| 12 | ``struct cpumask`` is a bitmap data structure in the kernel whose indices |
| 13 | reflect the CPUs on the system. Commonly, cpumasks are used to track which CPUs |
| 14 | a task is affinitized to, but they can also be used to e.g. track which cores |
| 15 | are associated with a scheduling domain, which cores on a machine are idle, |
| 16 | etc. |
| 17 | |
| 18 | BPF provides programs with a set of :ref:`kfuncs-header-label` that can be |
| 19 | used to allocate, mutate, query, and free cpumasks. |
| 20 | |
| 21 | 2. BPF cpumask objects |
| 22 | ====================== |
| 23 | |
| 24 | There are two different types of cpumasks that can be used by BPF programs. |
| 25 | |
| 26 | 2.1 ``struct bpf_cpumask *`` |
| 27 | ---------------------------- |
| 28 | |
| 29 | ``struct bpf_cpumask *`` is a cpumask that is allocated by BPF, on behalf of a |
| 30 | BPF program, and whose lifecycle is entirely controlled by BPF. These cpumasks |
| 31 | are RCU-protected, can be mutated, can be used as kptrs, and can be safely cast |
| 32 | to a ``struct cpumask *``. |
| 33 | |
| 34 | 2.1.1 ``struct bpf_cpumask *`` lifecycle |
| 35 | ---------------------------------------- |
| 36 | |
| 37 | A ``struct bpf_cpumask *`` is allocated, acquired, and released, using the |
| 38 | following functions: |
| 39 | |
| 40 | .. kernel-doc:: kernel/bpf/cpumask.c |
| 41 | :identifiers: bpf_cpumask_create |
| 42 | |
| 43 | .. kernel-doc:: kernel/bpf/cpumask.c |
| 44 | :identifiers: bpf_cpumask_acquire |
| 45 | |
| 46 | .. kernel-doc:: kernel/bpf/cpumask.c |
| 47 | :identifiers: bpf_cpumask_release |
| 48 | |
| 49 | For example: |
| 50 | |
| 51 | .. code-block:: c |
| 52 | |
| 53 | struct cpumask_map_value { |
| 54 | struct bpf_cpumask __kptr * cpumask; |
| 55 | }; |
| 56 | |
| 57 | struct array_map { |
| 58 | __uint(type, BPF_MAP_TYPE_ARRAY); |
| 59 | __type(key, int); |
| 60 | __type(value, struct cpumask_map_value); |
| 61 | __uint(max_entries, 65536); |
| 62 | } cpumask_map SEC(".maps"); |
| 63 | |
| 64 | static int cpumask_map_insert(struct bpf_cpumask *mask, u32 pid) |
| 65 | { |
| 66 | struct cpumask_map_value local, *v; |
| 67 | long status; |
| 68 | struct bpf_cpumask *old; |
| 69 | u32 key = pid; |
| 70 | |
| 71 | local.cpumask = NULL; |
| 72 | status = bpf_map_update_elem(&cpumask_map, &key, &local, 0); |
| 73 | if (status) { |
| 74 | bpf_cpumask_release(mask); |
| 75 | return status; |
| 76 | } |
| 77 | |
| 78 | v = bpf_map_lookup_elem(&cpumask_map, &key); |
| 79 | if (!v) { |
| 80 | bpf_cpumask_release(mask); |
| 81 | return -ENOENT; |
| 82 | } |
| 83 | |
| 84 | old = bpf_kptr_xchg(&v->cpumask, mask); |
| 85 | if (old) |
| 86 | bpf_cpumask_release(old); |
| 87 | |
| 88 | return 0; |
| 89 | } |
| 90 | |
| 91 | /** |
| 92 | * A sample tracepoint showing how a task's cpumask can be queried and |
| 93 | * recorded as a kptr. |
| 94 | */ |
| 95 | SEC("tp_btf/task_newtask") |
| 96 | int BPF_PROG(record_task_cpumask, struct task_struct *task, u64 clone_flags) |
| 97 | { |
| 98 | struct bpf_cpumask *cpumask; |
| 99 | int ret; |
| 100 | |
| 101 | cpumask = bpf_cpumask_create(); |
| 102 | if (!cpumask) |
| 103 | return -ENOMEM; |
| 104 | |
| 105 | if (!bpf_cpumask_full(task->cpus_ptr)) |
| 106 | bpf_printk("task %s has CPU affinity", task->comm); |
| 107 | |
| 108 | bpf_cpumask_copy(cpumask, task->cpus_ptr); |
| 109 | return cpumask_map_insert(cpumask, task->pid); |
| 110 | } |
| 111 | |
| 112 | ---- |
| 113 | |
| 114 | 2.1.2 ``struct bpf_cpumask *`` as kptrs |
| 115 | --------------------------------------- |
| 116 | |
| 117 | As mentioned and illustrated above, these ``struct bpf_cpumask *`` objects can |
| 118 | also be stored in a map and used as kptrs. If a ``struct bpf_cpumask *`` is in |
| 119 | a map, the reference can be removed from the map with bpf_kptr_xchg(), or |
| 120 | opportunistically acquired using RCU: |
| 121 | |
| 122 | .. code-block:: c |
| 123 | |
| 124 | /* struct containing the struct bpf_cpumask kptr which is stored in the map. */ |
| 125 | struct cpumasks_kfunc_map_value { |
| 126 | struct bpf_cpumask __kptr * bpf_cpumask; |
| 127 | }; |
| 128 | |
| 129 | /* The map containing struct cpumasks_kfunc_map_value entries. */ |
| 130 | struct { |
| 131 | __uint(type, BPF_MAP_TYPE_ARRAY); |
| 132 | __type(key, int); |
| 133 | __type(value, struct cpumasks_kfunc_map_value); |
| 134 | __uint(max_entries, 1); |
| 135 | } cpumasks_kfunc_map SEC(".maps"); |
| 136 | |
| 137 | /* ... */ |
| 138 | |
| 139 | /** |
| 140 | * A simple example tracepoint program showing how a |
| 141 | * struct bpf_cpumask * kptr that is stored in a map can |
| 142 | * be passed to kfuncs using RCU protection. |
| 143 | */ |
| 144 | SEC("tp_btf/cgroup_mkdir") |
| 145 | int BPF_PROG(cgrp_ancestor_example, struct cgroup *cgrp, const char *path) |
| 146 | { |
| 147 | struct bpf_cpumask *kptr; |
| 148 | struct cpumasks_kfunc_map_value *v; |
| 149 | u32 key = 0; |
| 150 | |
| 151 | /* Assume a bpf_cpumask * kptr was previously stored in the map. */ |
| 152 | v = bpf_map_lookup_elem(&cpumasks_kfunc_map, &key); |
| 153 | if (!v) |
| 154 | return -ENOENT; |
| 155 | |
| 156 | bpf_rcu_read_lock(); |
| 157 | /* Read the bpf_cpumask * kptr that's stored in the map, under RCU protection. */ |
| 158 | kptr = v->bpf_cpumask; |
| 159 | if (!kptr) { |
| 160 | /* If no bpf_cpumask was present in the map, it's because |
| 161 | * we're racing with another CPU that removed it with |
| 162 | * bpf_kptr_xchg() between the bpf_map_lookup_elem() |
| 163 | * above, and our load of the pointer from the map. |
| 164 | */ |
| 165 | bpf_rcu_read_unlock(); |
| 166 | return -EBUSY; |
| 167 | } |
| 168 | |
| 169 | bpf_cpumask_setall(kptr); |
| 170 | bpf_rcu_read_unlock(); |
| 171 | |
| 172 | return 0; |
| 173 | } |
| 174 | |
| 175 | ---- |
| 176 | |
| 177 | 2.2 ``struct cpumask`` |
| 178 | ---------------------- |
| 179 | |
| 180 | ``struct cpumask`` is the object that actually contains the cpumask bitmap |
| 181 | being queried, mutated, etc. A ``struct bpf_cpumask`` wraps a ``struct |
| 182 | cpumask``, which is why it's safe to cast it as such (note however that it is |
| 183 | **not** safe to cast a ``struct cpumask *`` to a ``struct bpf_cpumask *``, and |
| 184 | the verifier will reject any program that tries to do so). |
| 185 | |
| 186 | As we'll see below, any kfunc that mutates its cpumask argument will take a |
| 187 | ``struct bpf_cpumask *`` as that argument. Any argument that is simply queried |
| 188 | will instead be a ``struct cpumask *``. |
| 189 | |
| 190 | 3. cpumask kfuncs |
| 191 | ================= |
| 192 | |
| 193 | Above, we described the kfuncs that can be used to allocate, acquire, release, |
| 194 | etc. a ``struct bpf_cpumask *``. This section of the document will describe the |
| 195 | kfuncs for mutating and querying cpumasks. |
| 196 | |
| 197 | 3.1 Mutating cpumasks |
| 198 | --------------------- |
| 199 | |
| 200 | Some cpumask kfuncs are "read-only" in that they don't mutate any of their |
| 201 | arguments, whereas others mutate at least one argument (which means that the |
| 202 | argument must be a ``struct bpf_cpumask *``, as described above). |
| 203 | |
| 204 | This section will describe all of the cpumask kfuncs which mutate at least one |
| 205 | argument. :ref:`cpumasks-querying-label` below describes the read-only kfuncs. |
| 206 | |
| 207 | 3.1.1 Setting and clearing CPUs |
| 208 | ------------------------------- |
| 209 | |
| 210 | bpf_cpumask_set_cpu() and bpf_cpumask_clear_cpu() can be used to set and clear |
| 211 | a CPU in a ``struct bpf_cpumask`` respectively: |
| 212 | |
| 213 | .. kernel-doc:: kernel/bpf/cpumask.c |
| 214 | :identifiers: bpf_cpumask_set_cpu bpf_cpumask_clear_cpu |
| 215 | |
| 216 | These kfuncs are pretty straightforward, and can be used, for example, as |
| 217 | follows: |
| 218 | |
| 219 | .. code-block:: c |
| 220 | |
| 221 | /** |
| 222 | * A sample tracepoint showing how a cpumask can be queried. |
| 223 | */ |
| 224 | SEC("tp_btf/task_newtask") |
| 225 | int BPF_PROG(test_set_clear_cpu, struct task_struct *task, u64 clone_flags) |
| 226 | { |
| 227 | struct bpf_cpumask *cpumask; |
| 228 | |
| 229 | cpumask = bpf_cpumask_create(); |
| 230 | if (!cpumask) |
| 231 | return -ENOMEM; |
| 232 | |
| 233 | bpf_cpumask_set_cpu(0, cpumask); |
| 234 | if (!bpf_cpumask_test_cpu(0, cast(cpumask))) |
| 235 | /* Should never happen. */ |
| 236 | goto release_exit; |
| 237 | |
| 238 | bpf_cpumask_clear_cpu(0, cpumask); |
| 239 | if (bpf_cpumask_test_cpu(0, cast(cpumask))) |
| 240 | /* Should never happen. */ |
| 241 | goto release_exit; |
| 242 | |
| 243 | /* struct cpumask * pointers such as task->cpus_ptr can also be queried. */ |
| 244 | if (bpf_cpumask_test_cpu(0, task->cpus_ptr)) |
| 245 | bpf_printk("task %s can use CPU %d", task->comm, 0); |
| 246 | |
| 247 | release_exit: |
| 248 | bpf_cpumask_release(cpumask); |
| 249 | return 0; |
| 250 | } |
| 251 | |
| 252 | ---- |
| 253 | |
| 254 | bpf_cpumask_test_and_set_cpu() and bpf_cpumask_test_and_clear_cpu() are |
| 255 | complementary kfuncs that allow callers to atomically test and set (or clear) |
| 256 | CPUs: |
| 257 | |
| 258 | .. kernel-doc:: kernel/bpf/cpumask.c |
| 259 | :identifiers: bpf_cpumask_test_and_set_cpu bpf_cpumask_test_and_clear_cpu |
| 260 | |
| 261 | ---- |
| 262 | |
| 263 | We can also set and clear entire ``struct bpf_cpumask *`` objects in one |
| 264 | operation using bpf_cpumask_setall() and bpf_cpumask_clear(): |
| 265 | |
| 266 | .. kernel-doc:: kernel/bpf/cpumask.c |
| 267 | :identifiers: bpf_cpumask_setall bpf_cpumask_clear |
| 268 | |
| 269 | 3.1.2 Operations between cpumasks |
| 270 | --------------------------------- |
| 271 | |
| 272 | In addition to setting and clearing individual CPUs in a single cpumask, |
| 273 | callers can also perform bitwise operations between multiple cpumasks using |
| 274 | bpf_cpumask_and(), bpf_cpumask_or(), and bpf_cpumask_xor(): |
| 275 | |
| 276 | .. kernel-doc:: kernel/bpf/cpumask.c |
| 277 | :identifiers: bpf_cpumask_and bpf_cpumask_or bpf_cpumask_xor |
| 278 | |
| 279 | The following is an example of how they may be used. Note that some of the |
| 280 | kfuncs shown in this example will be covered in more detail below. |
| 281 | |
| 282 | .. code-block:: c |
| 283 | |
| 284 | /** |
| 285 | * A sample tracepoint showing how a cpumask can be mutated using |
| 286 | * bitwise operators (and queried). |
| 287 | */ |
| 288 | SEC("tp_btf/task_newtask") |
| 289 | int BPF_PROG(test_and_or_xor, struct task_struct *task, u64 clone_flags) |
| 290 | { |
| 291 | struct bpf_cpumask *mask1, *mask2, *dst1, *dst2; |
| 292 | |
| 293 | mask1 = bpf_cpumask_create(); |
| 294 | if (!mask1) |
| 295 | return -ENOMEM; |
| 296 | |
| 297 | mask2 = bpf_cpumask_create(); |
| 298 | if (!mask2) { |
| 299 | bpf_cpumask_release(mask1); |
| 300 | return -ENOMEM; |
| 301 | } |
| 302 | |
| 303 | /* ...Safely create the other two masks... */ |
| 304 | |
| 305 | bpf_cpumask_set_cpu(0, mask1); |
| 306 | bpf_cpumask_set_cpu(1, mask2); |
| 307 | bpf_cpumask_and(dst1, (const struct cpumask *)mask1, (const struct cpumask *)mask2); |
| 308 | if (!bpf_cpumask_empty((const struct cpumask *)dst1)) |
| 309 | /* Should never happen. */ |
| 310 | goto release_exit; |
| 311 | |
| 312 | bpf_cpumask_or(dst1, (const struct cpumask *)mask1, (const struct cpumask *)mask2); |
| 313 | if (!bpf_cpumask_test_cpu(0, (const struct cpumask *)dst1)) |
| 314 | /* Should never happen. */ |
| 315 | goto release_exit; |
| 316 | |
| 317 | if (!bpf_cpumask_test_cpu(1, (const struct cpumask *)dst1)) |
| 318 | /* Should never happen. */ |
| 319 | goto release_exit; |
| 320 | |
| 321 | bpf_cpumask_xor(dst2, (const struct cpumask *)mask1, (const struct cpumask *)mask2); |
| 322 | if (!bpf_cpumask_equal((const struct cpumask *)dst1, |
| 323 | (const struct cpumask *)dst2)) |
| 324 | /* Should never happen. */ |
| 325 | goto release_exit; |
| 326 | |
| 327 | release_exit: |
| 328 | bpf_cpumask_release(mask1); |
| 329 | bpf_cpumask_release(mask2); |
| 330 | bpf_cpumask_release(dst1); |
| 331 | bpf_cpumask_release(dst2); |
| 332 | return 0; |
| 333 | } |
| 334 | |
| 335 | ---- |
| 336 | |
| 337 | The contents of an entire cpumask may be copied to another using |
| 338 | bpf_cpumask_copy(): |
| 339 | |
| 340 | .. kernel-doc:: kernel/bpf/cpumask.c |
| 341 | :identifiers: bpf_cpumask_copy |
| 342 | |
| 343 | ---- |
| 344 | |
| 345 | .. _cpumasks-querying-label: |
| 346 | |
| 347 | 3.2 Querying cpumasks |
| 348 | --------------------- |
| 349 | |
| 350 | In addition to the above kfuncs, there is also a set of read-only kfuncs that |
| 351 | can be used to query the contents of cpumasks. |
| 352 | |
| 353 | .. kernel-doc:: kernel/bpf/cpumask.c |
| 354 | :identifiers: bpf_cpumask_first bpf_cpumask_first_zero bpf_cpumask_test_cpu |
| 355 | |
| 356 | .. kernel-doc:: kernel/bpf/cpumask.c |
| 357 | :identifiers: bpf_cpumask_equal bpf_cpumask_intersects bpf_cpumask_subset |
| 358 | bpf_cpumask_empty bpf_cpumask_full |
| 359 | |
| 360 | .. kernel-doc:: kernel/bpf/cpumask.c |
| 361 | :identifiers: bpf_cpumask_any bpf_cpumask_any_and |
| 362 | |
| 363 | ---- |
| 364 | |
| 365 | Some example usages of these querying kfuncs were shown above. We will not |
| 366 | replicate those examples here. Note, however, that all of the aforementioned |
| 367 | kfuncs are tested in `tools/testing/selftests/bpf/progs/cpumask_success.c`_, so |
| 368 | please take a look there if you're looking for more examples of how they can be |
| 369 | used. |
| 370 | |
| 371 | .. _tools/testing/selftests/bpf/progs/cpumask_success.c: |
| 372 | https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/tree/tools/testing/selftests/bpf/progs/cpumask_success.c |
| 373 | |
| 374 | |
| 375 | 4. Adding BPF cpumask kfuncs |
| 376 | ============================ |
| 377 | |
| 378 | The set of supported BPF cpumask kfuncs is not (yet) a 1-1 match with the |
| 379 | cpumask operations in include/linux/cpumask.h. Any of those cpumask operations |
| 380 | could easily be encapsulated in a new kfunc if and when required. If you'd like |
| 381 | to support a new cpumask operation, please feel free to submit a patch. If you |
| 382 | do add a new cpumask kfunc, please document it here, and add any relevant |
| 383 | selftest testcases to the cpumask selftest suite. |