// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2016 Thomas Gleixner.
 * Copyright (C) 2016-2017 Christoph Hellwig.
 */
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/sort.h>
#include <linux/group_cpus.h>

#ifdef CONFIG_SMP

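/*
 * grp_spread_init_one - Move @cpus_per_grp CPUs from @nmsk into @irqmsk
 *
 * CPUs are taken from @nmsk lowest-numbered first, and whenever a CPU is
 * picked its SMT siblings are picked next, so that siblings land in the
 * same group.  For a hypothetical topology where CPUs 0/1 and 2/3 are
 * sibling pairs, nmsk = {0,1,2,3} with cpus_per_grp = 2 yields
 * irqmsk = {0,1} rather than {0,2}.
 */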
static void grp_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk,
				unsigned int cpus_per_grp)
{
	const struct cpumask *siblmsk;
	int cpu, sibl;

	while (cpus_per_grp > 0) {
		cpu = cpumask_first(nmsk);

		/* The mask ran out of CPUs before the group was filled */
		if (cpu >= nr_cpu_ids)
			return;

		cpumask_clear_cpu(cpu, nmsk);
		cpumask_set_cpu(cpu, irqmsk);
		cpus_per_grp--;

		/* If the cpu has siblings, use them first */
		siblmsk = topology_sibling_cpumask(cpu);
		for (sibl = -1; cpus_per_grp > 0; ) {
			sibl = cpumask_next(sibl, siblmsk);
			if (sibl >= nr_cpu_ids)
				break;
			if (!cpumask_test_and_clear_cpu(sibl, nmsk))
				continue;
			cpumask_set_cpu(sibl, irqmsk);
			cpus_per_grp--;
		}
	}
}

static cpumask_var_t *alloc_node_to_cpumask(void)
{
	cpumask_var_t *masks;
	int node;

	masks = kcalloc(nr_node_ids, sizeof(cpumask_var_t), GFP_KERNEL);
	if (!masks)
		return NULL;

	for (node = 0; node < nr_node_ids; node++) {
		if (!zalloc_cpumask_var(&masks[node], GFP_KERNEL))
			goto out_unwind;
	}

	return masks;

out_unwind:
	while (--node >= 0)
		free_cpumask_var(masks[node]);
	kfree(masks);
	return NULL;
}

static void free_node_to_cpumask(cpumask_var_t *masks)
{
	int node;

	for (node = 0; node < nr_node_ids; node++)
		free_cpumask_var(masks[node]);
	kfree(masks);
}

static void build_node_to_cpumask(cpumask_var_t *masks)
{
	int cpu;

	for_each_possible_cpu(cpu)
		cpumask_set_cpu(cpu, masks[cpu_to_node(cpu)]);
}

static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask,
				const struct cpumask *mask, nodemask_t *nodemsk)
{
	int n, nodes = 0;

	/* Calculate the number of nodes in the supplied affinity mask */
	for_each_node(n) {
		if (cpumask_intersects(mask, node_to_cpumask[n])) {
			node_set(n, *nodemsk);
			nodes++;
		}
	}
	return nodes;
}

struct node_groups {
	unsigned id;

	union {
		/* output: number of groups assigned to this node */
		unsigned ngroups;
		/* input: number of active CPUs of this node, the sort key */
		unsigned ncpus;
	};
};

static int ncpus_cmp_func(const void *l, const void *r)
{
	const struct node_groups *ln = l;
	const struct node_groups *rn = r;

	/*
	 * Compare without subtracting: ncpus is UINT_MAX for nodes
	 * outside the mask, and unsigned subtraction truncated to int
	 * can misorder those entries.
	 */
	return (ln->ncpus > rn->ncpus) - (ln->ncpus < rn->ncpus);
}

/*
 * Allocate a group count for each node, so that for each node:
 *
 * 1) the allocated number is >= 1
 *
 * 2) the allocated number is <= the number of active CPUs in this node
 *
 * The actual total of allocated groups may be less than @numgrps when
 * the total number of active CPUs is less than @numgrps.
 *
 * Active CPUs means the CPUs in '@cpu_mask AND @node_to_cpumask[]'
 * for each node.
 */
static void alloc_nodes_groups(unsigned int numgrps,
			       cpumask_var_t *node_to_cpumask,
			       const struct cpumask *cpu_mask,
			       const nodemask_t nodemsk,
			       struct cpumask *nmsk,
			       struct node_groups *node_groups)
{
	unsigned n, remaining_ncpus = 0;

	for (n = 0; n < nr_node_ids; n++) {
		node_groups[n].id = n;
		node_groups[n].ncpus = UINT_MAX;
	}

	for_each_node_mask(n, nodemsk) {
		unsigned ncpus;

		cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
		ncpus = cpumask_weight(nmsk);

		if (!ncpus)
			continue;
		remaining_ncpus += ncpus;
		node_groups[n].ncpus = ncpus;
	}

	numgrps = min_t(unsigned, remaining_ncpus, numgrps);

	sort(node_groups, nr_node_ids, sizeof(node_groups[0]),
	     ncpus_cmp_func, NULL);

	/*
	 * Allocate groups for each node according to the ratio of this
	 * node's CPU count to the remaining un-assigned ncpus. 'numgrps'
	 * is bigger than the number of active numa nodes. Always start
	 * the allocation from the node with the smallest CPU count.
	 *
	 * This guarantees that each active node is allocated at least
	 * one group, and the reasoning is simple: the max(1, ...) lower
	 * bound only over-allocates when a node would otherwise receive
	 * zero groups, i.e. it consumes a single group, and since
	 * 'numgrps' is bigger than the number of numa nodes, enough
	 * groups remain for every other node to get >= 1 group.
	 *
	 * One perfect invariant is that the number of groups allocated
	 * to each node is <= the CPU count of this node:
	 *
	 * 1) suppose there are two nodes: A and B
	 *	ncpu(X) is the CPU count of node X
	 *	grps(X) is the group count allocated to node X via this
	 *	algorithm
	 *
	 *	ncpu(A) <= ncpu(B)
	 *	ncpu(A) + ncpu(B) = N
	 *	grps(A) + grps(B) = G
	 *
	 *	grps(A) = max(1, round_down(G * ncpu(A) / N))
	 *	grps(B) = G - grps(A)
	 *
	 *	both N and G are integers, and 2 <= G <= N; suppose
	 *	G = N - delta, with 0 <= delta <= N - 2
	 *
	 * 2) obviously grps(A) <= ncpu(A) because:
	 *
	 *	if grps(A) is 1, then grps(A) <= ncpu(A) given
	 *	ncpu(A) >= 1
	 *
	 *	otherwise,
	 *		grps(A) <= G * ncpu(A) / N <= ncpu(A), given G <= N
	 *
	 * 3) prove that grps(B) <= ncpu(B):
	 *
	 *	if round_down(G * ncpu(A) / N) == 0, node B won't be
	 *	over-allocated, so grps(B) <= ncpu(B),
	 *
	 *	otherwise:
	 *
	 *	grps(A) =
	 *		round_down(G * ncpu(A) / N) =
	 *		round_down((N - delta) * ncpu(A) / N) =
	 *		round_down((N * ncpu(A) - delta * ncpu(A)) / N) >=
	 *		round_down((N * ncpu(A) - delta * N) / N) =
	 *		ncpu(A) - delta
	 *
	 *	then, using grps(B) = G - grps(A) and G + delta = N:
	 *
	 *	grps(A) >= ncpu(A) - delta
	 *	=>
	 *	grps(B) = G - grps(A) <= G + delta - ncpu(A) = N - ncpu(A)
	 *	=>
	 *	grps(B) <= ncpu(B)
	 *
	 * For three or more nodes, the same argument applies by treating
	 * the node with the smallest CPU count as A and the rest as one
	 * combined node B: that is exactly how this algorithm proceeds,
	 * since 'remaining_ncpus' and 'numgrps' are re-calculated after
	 * each node, so finally for each node X: grps(X) <= ncpu(X).
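	 *
	 * As a worked instance of the proof with small numbers:
	 * ncpu(A) = 2, ncpu(B) = 6, so N = 8, and G = 3. Node A is
	 * handled first: grps(A) = max(1, round_down(3 * 2 / 8)) = 1.
	 * The remaining ncpus and groups are then 6 and 2, so node B
	 * gets grps(B) = max(1, round_down(2 * 6 / 6)) = 2, and both
	 * invariants hold: 1 <= grps(X) <= ncpu(X) for each node.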
	 */
	for (n = 0; n < nr_node_ids; n++) {
		unsigned ngroups, ncpus;

		if (node_groups[n].ncpus == UINT_MAX)
			continue;

		WARN_ON_ONCE(numgrps == 0);

		ncpus = node_groups[n].ncpus;
		ngroups = max_t(unsigned, 1,
				numgrps * ncpus / remaining_ncpus);
		WARN_ON_ONCE(ngroups > ncpus);

		node_groups[n].ngroups = ngroups;

		remaining_ncpus -= ncpus;
		numgrps -= ngroups;
	}
}

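/*
 * __group_cpus_evenly - Distribute the CPUs in @cpu_mask over groups
 *
 * Assign CPUs to the masks[] slots starting at @startgrp and wrapping
 * around to slot 0 after slot @numgrps - 1, keeping CPUs of the same
 * NUMA node (and, within a node, SMT siblings) together.  @nmsk is
 * scratch space.  Returns the number of groups populated, 0 if
 * @cpu_mask is empty, or -ENOMEM on allocation failure.
 */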
static int __group_cpus_evenly(unsigned int startgrp, unsigned int numgrps,
			       cpumask_var_t *node_to_cpumask,
			       const struct cpumask *cpu_mask,
			       struct cpumask *nmsk, struct cpumask *masks)
{
	unsigned int i, n, nodes, cpus_per_grp, extra_grps, done = 0;
	unsigned int last_grp = numgrps;
	unsigned int curgrp = startgrp;
	nodemask_t nodemsk = NODE_MASK_NONE;
	struct node_groups *node_groups;

	if (cpumask_empty(cpu_mask))
		return 0;

	nodes = get_nodes_in_cpumask(node_to_cpumask, cpu_mask, &nodemsk);

	/*
	 * If the number of nodes in the mask is greater than or equal to
	 * the number of groups, just spread the groups across the nodes.
	 */
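	/*
	 * For example, with numgrps == 2 and four nodes in the mask, the
	 * wrap of 'curgrp' below lands the CPUs of two nodes in each of
	 * the two groups.
	 */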
	if (numgrps <= nodes) {
		for_each_node_mask(n, nodemsk) {
			/* Ensure that only CPUs which are in both masks are set */
			cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
			cpumask_or(&masks[curgrp], &masks[curgrp], nmsk);
			if (++curgrp == last_grp)
				curgrp = 0;
		}
		return numgrps;
	}

	node_groups = kcalloc(nr_node_ids,
			      sizeof(struct node_groups),
			      GFP_KERNEL);
	if (!node_groups)
		return -ENOMEM;

	/* allocate a group count for each node */
	alloc_nodes_groups(numgrps, node_to_cpumask, cpu_mask,
			   nodemsk, nmsk, node_groups);
	for (i = 0; i < nr_node_ids; i++) {
		unsigned int ncpus, v;
		struct node_groups *nv = &node_groups[i];

		if (nv->ngroups == UINT_MAX)
			continue;

		/* Get the cpus on this node which are in the mask */
		cpumask_and(nmsk, cpu_mask, node_to_cpumask[nv->id]);
		ncpus = cpumask_weight(nmsk);
		if (!ncpus)
			continue;

		WARN_ON_ONCE(nv->ngroups > ncpus);

		/* CPUs left over after even division across the groups */
		extra_grps = ncpus % nv->ngroups;

		/* Spread the allocated groups over the CPUs of this node */
		for (v = 0; v < nv->ngroups; v++, curgrp++) {
			cpus_per_grp = ncpus / nv->ngroups;

			/* The first 'extra_grps' groups get one extra CPU */
			if (extra_grps) {
				cpus_per_grp++;
				--extra_grps;
			}

			/*
			 * Wrapping has to be considered, given 'startgrp'
			 * may start anywhere.
			 */
			if (curgrp >= last_grp)
				curgrp = 0;
			grp_spread_init_one(&masks[curgrp], nmsk,
					    cpus_per_grp);
		}
		done += nv->ngroups;
	}
	kfree(node_groups);
	return done;
}

/**
 * group_cpus_evenly - Group all CPUs evenly per NUMA/CPU locality
 * @numgrps: number of groups
 *
 * Return: cpumask array if successful, NULL otherwise.  Each element
 * includes the CPUs assigned to that group.
 *
 * Try to put CPUs that are close in terms of CPU and NUMA locality into
 * the same group, using two-stage grouping:
 * 1) allocate present CPUs to these groups evenly first
 * 2) allocate the other possible CPUs to these groups evenly
 *
 * The resulting grouping guarantees that all possible CPUs are covered
 * and that no CPU is assigned to more than one group.
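 *
 * A minimal usage sketch (hypothetical caller; 'nr_queues' and
 * 'setup_queue_affinity' are illustrative, not kernel APIs):
 *
 *	struct cpumask *masks = group_cpus_evenly(nr_queues);
 *	unsigned int i;
 *
 *	if (masks) {
 *		for (i = 0; i < nr_queues; i++)
 *			setup_queue_affinity(i, &masks[i]);
 *		kfree(masks);
 *	}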
 */
struct cpumask *group_cpus_evenly(unsigned int numgrps)
{
	unsigned int curgrp = 0, nr_present = 0, nr_others = 0;
	cpumask_var_t *node_to_cpumask;
	cpumask_var_t nmsk, npresmsk;
	int ret = -ENOMEM;
	struct cpumask *masks = NULL;

	if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL))
		return NULL;

	if (!zalloc_cpumask_var(&npresmsk, GFP_KERNEL))
		goto fail_nmsk;

	node_to_cpumask = alloc_node_to_cpumask();
	if (!node_to_cpumask)
		goto fail_npresmsk;

	masks = kcalloc(numgrps, sizeof(*masks), GFP_KERNEL);
	if (!masks)
		goto fail_node_to_cpumask;

	/* Stabilize the cpumasks */
	cpus_read_lock();
	build_node_to_cpumask(node_to_cpumask);

	/* group present CPUs first */
	ret = __group_cpus_evenly(curgrp, numgrps, node_to_cpumask,
				  cpu_present_mask, nmsk, masks);
	if (ret < 0)
		goto fail_build_affinity;
	nr_present = ret;

	/*
	 * Allocate non-present CPUs starting from the next group to be
	 * handled.  If the grouping of present CPUs already exhausted
	 * the group space, assign the non-present CPUs to the already
	 * allocated groups.
	 */
	if (nr_present >= numgrps)
		curgrp = 0;
	else
		curgrp = nr_present;
	cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask);
	ret = __group_cpus_evenly(curgrp, numgrps, node_to_cpumask,
				  npresmsk, nmsk, masks);
	if (ret >= 0)
		nr_others = ret;

 fail_build_affinity:
	cpus_read_unlock();

	if (ret >= 0)
		WARN_ON(nr_present + nr_others < numgrps);

 fail_node_to_cpumask:
	free_node_to_cpumask(node_to_cpumask);

 fail_npresmsk:
	free_cpumask_var(npresmsk);

 fail_nmsk:
	free_cpumask_var(nmsk);
	if (ret < 0) {
		kfree(masks);
		return NULL;
	}
	return masks;
}
#else /* CONFIG_SMP */
struct cpumask *group_cpus_evenly(unsigned int numgrps)
{
	struct cpumask *masks = kcalloc(numgrps, sizeof(*masks), GFP_KERNEL);

	if (!masks)
		return NULL;

	/* assign all CPUs (i.e. CPU 0 on UP) to the first group only */
	cpumask_copy(&masks[0], cpu_possible_mask);
	return masks;
}
#endif /* CONFIG_SMP */
EXPORT_SYMBOL_GPL(group_cpus_evenly);