Commit | Line | Data |
---|---|---|
b2441318 | 1 | // SPDX-License-Identifier: GPL-2.0 |
9a0ef98e CH |
2 | /* |
3 | * Copyright (C) 2016 Thomas Gleixner. | |
4 | * Copyright (C) 2016-2017 Christoph Hellwig. | |
5 | */ | |
5e385a6e CH |
6 | #include <linux/interrupt.h> |
7 | #include <linux/kernel.h> | |
8 | #include <linux/slab.h> | |
9 | #include <linux/cpu.h> | |
b1a5a73e | 10 | #include <linux/sort.h> |
5e385a6e | 11 | |
34c3d981 | 12 | static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk, |
0145c30e | 13 | unsigned int cpus_per_vec) |
34c3d981 TG |
14 | { |
15 | const struct cpumask *siblmsk; | |
16 | int cpu, sibl; | |
17 | ||
18 | for ( ; cpus_per_vec > 0; ) { | |
19 | cpu = cpumask_first(nmsk); | |
20 | ||
21 | /* Should not happen, but I'm too lazy to think about it */ | |
22 | if (cpu >= nr_cpu_ids) | |
23 | return; | |
24 | ||
25 | cpumask_clear_cpu(cpu, nmsk); | |
26 | cpumask_set_cpu(cpu, irqmsk); | |
27 | cpus_per_vec--; | |
28 | ||
29 | /* If the cpu has siblings, use them first */ | |
30 | siblmsk = topology_sibling_cpumask(cpu); | |
31 | for (sibl = -1; cpus_per_vec > 0; ) { | |
32 | sibl = cpumask_next(sibl, siblmsk); | |
33 | if (sibl >= nr_cpu_ids) | |
34 | break; | |
35 | if (!cpumask_test_and_clear_cpu(sibl, nmsk)) | |
36 | continue; | |
37 | cpumask_set_cpu(sibl, irqmsk); | |
38 | cpus_per_vec--; | |
39 | } | |
40 | } | |
41 | } | |
42 | ||
47778f33 | 43 | static cpumask_var_t *alloc_node_to_cpumask(void) |
9a0ef98e CH |
44 | { |
45 | cpumask_var_t *masks; | |
46 | int node; | |
47 | ||
48 | masks = kcalloc(nr_node_ids, sizeof(cpumask_var_t), GFP_KERNEL); | |
49 | if (!masks) | |
50 | return NULL; | |
51 | ||
52 | for (node = 0; node < nr_node_ids; node++) { | |
53 | if (!zalloc_cpumask_var(&masks[node], GFP_KERNEL)) | |
54 | goto out_unwind; | |
55 | } | |
56 | ||
57 | return masks; | |
58 | ||
59 | out_unwind: | |
60 | while (--node >= 0) | |
61 | free_cpumask_var(masks[node]); | |
62 | kfree(masks); | |
63 | return NULL; | |
64 | } | |
65 | ||
47778f33 | 66 | static void free_node_to_cpumask(cpumask_var_t *masks) |
9a0ef98e CH |
67 | { |
68 | int node; | |
69 | ||
70 | for (node = 0; node < nr_node_ids; node++) | |
71 | free_cpumask_var(masks[node]); | |
72 | kfree(masks); | |
73 | } | |
74 | ||
47778f33 | 75 | static void build_node_to_cpumask(cpumask_var_t *masks) |
9a0ef98e CH |
76 | { |
77 | int cpu; | |
78 | ||
84676c1f | 79 | for_each_possible_cpu(cpu) |
9a0ef98e CH |
80 | cpumask_set_cpu(cpu, masks[cpu_to_node(cpu)]); |
81 | } | |
82 | ||
47778f33 | 83 | static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask, |
9a0ef98e | 84 | const struct cpumask *mask, nodemask_t *nodemsk) |
34c3d981 | 85 | { |
c0af5243 | 86 | int n, nodes = 0; |
34c3d981 TG |
87 | |
88 | /* Calculate the number of nodes in the supplied affinity mask */ | |
9a0ef98e | 89 | for_each_node(n) { |
47778f33 | 90 | if (cpumask_intersects(mask, node_to_cpumask[n])) { |
34c3d981 TG |
91 | node_set(n, *nodemsk); |
92 | nodes++; | |
93 | } | |
94 | } | |
95 | return nodes; | |
96 | } | |
97 | ||
b1a5a73e ML |
98 | struct node_vectors { |
99 | unsigned id; | |
100 | ||
101 | union { | |
102 | unsigned nvectors; | |
103 | unsigned ncpus; | |
104 | }; | |
105 | }; | |
106 | ||
107 | static int ncpus_cmp_func(const void *l, const void *r) | |
108 | { | |
109 | const struct node_vectors *ln = l; | |
110 | const struct node_vectors *rn = r; | |
111 | ||
112 | return ln->ncpus - rn->ncpus; | |
113 | } | |
114 | ||
115 | /* | |
116 | * Allocate vector number for each node, so that for each node: | |
117 | * | |
118 | * 1) the allocated number is >= 1 | |
119 | * | |
120 | * 2) the allocated numbver is <= active CPU number of this node | |
121 | * | |
122 | * The actual allocated total vectors may be less than @numvecs when | |
123 | * active total CPU number is less than @numvecs. | |
124 | * | |
125 | * Active CPUs means the CPUs in '@cpu_mask AND @node_to_cpumask[]' | |
126 | * for each node. | |
127 | */ | |
128 | static void alloc_nodes_vectors(unsigned int numvecs, | |
101f85b5 | 129 | cpumask_var_t *node_to_cpumask, |
b1a5a73e ML |
130 | const struct cpumask *cpu_mask, |
131 | const nodemask_t nodemsk, | |
132 | struct cpumask *nmsk, | |
133 | struct node_vectors *node_vectors) | |
134 | { | |
135 | unsigned n, remaining_ncpus = 0; | |
136 | ||
137 | for (n = 0; n < nr_node_ids; n++) { | |
138 | node_vectors[n].id = n; | |
139 | node_vectors[n].ncpus = UINT_MAX; | |
140 | } | |
141 | ||
142 | for_each_node_mask(n, nodemsk) { | |
143 | unsigned ncpus; | |
144 | ||
145 | cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]); | |
146 | ncpus = cpumask_weight(nmsk); | |
147 | ||
148 | if (!ncpus) | |
149 | continue; | |
150 | remaining_ncpus += ncpus; | |
151 | node_vectors[n].ncpus = ncpus; | |
152 | } | |
153 | ||
154 | numvecs = min_t(unsigned, remaining_ncpus, numvecs); | |
155 | ||
156 | sort(node_vectors, nr_node_ids, sizeof(node_vectors[0]), | |
157 | ncpus_cmp_func, NULL); | |
158 | ||
159 | /* | |
160 | * Allocate vectors for each node according to the ratio of this | |
161 | * node's nr_cpus to remaining un-assigned ncpus. 'numvecs' is | |
162 | * bigger than number of active numa nodes. Always start the | |
163 | * allocation from the node with minimized nr_cpus. | |
164 | * | |
165 | * This way guarantees that each active node gets allocated at | |
166 | * least one vector, and the theory is simple: over-allocation | |
167 | * is only done when this node is assigned by one vector, so | |
168 | * other nodes will be allocated >= 1 vector, since 'numvecs' is | |
169 | * bigger than number of numa nodes. | |
170 | * | |
171 | * One perfect invariant is that number of allocated vectors for | |
172 | * each node is <= CPU count of this node: | |
173 | * | |
174 | * 1) suppose there are two nodes: A and B | |
175 | * ncpu(X) is CPU count of node X | |
176 | * vecs(X) is the vector count allocated to node X via this | |
177 | * algorithm | |
178 | * | |
179 | * ncpu(A) <= ncpu(B) | |
180 | * ncpu(A) + ncpu(B) = N | |
181 | * vecs(A) + vecs(B) = V | |
182 | * | |
183 | * vecs(A) = max(1, round_down(V * ncpu(A) / N)) | |
184 | * vecs(B) = V - vecs(A) | |
185 | * | |
186 | * both N and V are integer, and 2 <= V <= N, suppose | |
187 | * V = N - delta, and 0 <= delta <= N - 2 | |
188 | * | |
189 | * 2) obviously vecs(A) <= ncpu(A) because: | |
190 | * | |
191 | * if vecs(A) is 1, then vecs(A) <= ncpu(A) given | |
192 | * ncpu(A) >= 1 | |
193 | * | |
194 | * otherwise, | |
195 | * vecs(A) <= V * ncpu(A) / N <= ncpu(A), given V <= N | |
196 | * | |
197 | * 3) prove how vecs(B) <= ncpu(B): | |
198 | * | |
199 | * if round_down(V * ncpu(A) / N) == 0, vecs(B) won't be | |
200 | * over-allocated, so vecs(B) <= ncpu(B), | |
201 | * | |
202 | * otherwise: | |
203 | * | |
204 | * vecs(A) = | |
205 | * round_down(V * ncpu(A) / N) = | |
206 | * round_down((N - delta) * ncpu(A) / N) = | |
207 | * round_down((N * ncpu(A) - delta * ncpu(A)) / N) >= | |
208 | * round_down((N * ncpu(A) - delta * N) / N) = | |
209 | * cpu(A) - delta | |
210 | * | |
211 | * then: | |
212 | * | |
213 | * vecs(A) - V >= ncpu(A) - delta - V | |
214 | * => | |
215 | * V - vecs(A) <= V + delta - ncpu(A) | |
216 | * => | |
217 | * vecs(B) <= N - ncpu(A) | |
218 | * => | |
219 | * vecs(B) <= cpu(B) | |
220 | * | |
221 | * For nodes >= 3, it can be thought as one node and another big | |
222 | * node given that is exactly what this algorithm is implemented, | |
223 | * and we always re-calculate 'remaining_ncpus' & 'numvecs', and | |
224 | * finally for each node X: vecs(X) <= ncpu(X). | |
225 | * | |
226 | */ | |
227 | for (n = 0; n < nr_node_ids; n++) { | |
228 | unsigned nvectors, ncpus; | |
229 | ||
230 | if (node_vectors[n].ncpus == UINT_MAX) | |
231 | continue; | |
232 | ||
233 | WARN_ON_ONCE(numvecs == 0); | |
234 | ||
235 | ncpus = node_vectors[n].ncpus; | |
236 | nvectors = max_t(unsigned, 1, | |
237 | numvecs * ncpus / remaining_ncpus); | |
238 | WARN_ON_ONCE(nvectors > ncpus); | |
239 | ||
240 | node_vectors[n].nvectors = nvectors; | |
241 | ||
242 | remaining_ncpus -= ncpus; | |
243 | numvecs -= nvectors; | |
244 | } | |
245 | } | |
246 | ||
0e518330 | 247 | static int __irq_build_affinity_masks(unsigned int startvec, |
0145c30e TG |
248 | unsigned int numvecs, |
249 | unsigned int firstvec, | |
c2899c34 TG |
250 | cpumask_var_t *node_to_cpumask, |
251 | const struct cpumask *cpu_mask, | |
252 | struct cpumask *nmsk, | |
bec04037 | 253 | struct irq_affinity_desc *masks) |
34c3d981 | 254 | { |
b1a5a73e | 255 | unsigned int i, n, nodes, cpus_per_vec, extra_vecs, done = 0; |
0145c30e TG |
256 | unsigned int last_affv = firstvec + numvecs; |
257 | unsigned int curvec = startvec; | |
34c3d981 | 258 | nodemask_t nodemsk = NODE_MASK_NONE; |
b1a5a73e | 259 | struct node_vectors *node_vectors; |
34c3d981 | 260 | |
d3056812 ML |
261 | if (!cpumask_weight(cpu_mask)) |
262 | return 0; | |
263 | ||
b3e6aaa8 | 264 | nodes = get_nodes_in_cpumask(node_to_cpumask, cpu_mask, &nodemsk); |
34c3d981 TG |
265 | |
266 | /* | |
c0af5243 | 267 | * If the number of nodes in the mask is greater than or equal the |
34c3d981 TG |
268 | * number of vectors we just spread the vectors across the nodes. |
269 | */ | |
1a2d0914 | 270 | if (numvecs <= nodes) { |
34c3d981 | 271 | for_each_node_mask(n, nodemsk) { |
0145c30e TG |
272 | cpumask_or(&masks[curvec].mask, &masks[curvec].mask, |
273 | node_to_cpumask[n]); | |
1a2d0914 | 274 | if (++curvec == last_affv) |
060746d9 | 275 | curvec = firstvec; |
34c3d981 | 276 | } |
0145c30e | 277 | return numvecs; |
34c3d981 TG |
278 | } |
279 | ||
b1a5a73e ML |
280 | node_vectors = kcalloc(nr_node_ids, |
281 | sizeof(struct node_vectors), | |
282 | GFP_KERNEL); | |
283 | if (!node_vectors) | |
284 | return -ENOMEM; | |
285 | ||
286 | /* allocate vector number for each node */ | |
287 | alloc_nodes_vectors(numvecs, node_to_cpumask, cpu_mask, | |
288 | nodemsk, nmsk, node_vectors); | |
289 | ||
290 | for (i = 0; i < nr_node_ids; i++) { | |
291 | unsigned int ncpus, v; | |
292 | struct node_vectors *nv = &node_vectors[i]; | |
293 | ||
294 | if (nv->nvectors == UINT_MAX) | |
295 | continue; | |
7bf8222b | 296 | |
34c3d981 | 297 | /* Get the cpus on this node which are in the mask */ |
b1a5a73e | 298 | cpumask_and(nmsk, cpu_mask, node_to_cpumask[nv->id]); |
34c3d981 | 299 | ncpus = cpumask_weight(nmsk); |
53c1788b ML |
300 | if (!ncpus) |
301 | continue; | |
302 | ||
b1a5a73e | 303 | WARN_ON_ONCE(nv->nvectors > ncpus); |
7bf8222b KB |
304 | |
305 | /* Account for rounding errors */ | |
b1a5a73e | 306 | extra_vecs = ncpus - nv->nvectors * (ncpus / nv->nvectors); |
34c3d981 | 307 | |
b1a5a73e ML |
308 | /* Spread allocated vectors on CPUs of the current node */ |
309 | for (v = 0; v < nv->nvectors; v++, curvec++) { | |
310 | cpus_per_vec = ncpus / nv->nvectors; | |
34c3d981 TG |
311 | |
312 | /* Account for extra vectors to compensate rounding errors */ | |
313 | if (extra_vecs) { | |
314 | cpus_per_vec++; | |
7bf8222b | 315 | --extra_vecs; |
34c3d981 | 316 | } |
b1a5a73e ML |
317 | |
318 | /* | |
319 | * wrapping has to be considered given 'startvec' | |
320 | * may start anywhere | |
321 | */ | |
322 | if (curvec >= last_affv) | |
323 | curvec = firstvec; | |
bec04037 DL |
324 | irq_spread_init_one(&masks[curvec].mask, nmsk, |
325 | cpus_per_vec); | |
34c3d981 | 326 | } |
b1a5a73e | 327 | done += nv->nvectors; |
34c3d981 | 328 | } |
b1a5a73e ML |
329 | kfree(node_vectors); |
330 | return done; | |
b3e6aaa8 ML |
331 | } |
332 | ||
5c903e10 ML |
333 | /* |
334 | * build affinity in two stages: | |
335 | * 1) spread present CPU on these vectors | |
336 | * 2) spread other possible CPUs on these vectors | |
337 | */ | |
0e518330 | 338 | static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs, |
0145c30e | 339 | unsigned int firstvec, |
bec04037 | 340 | struct irq_affinity_desc *masks) |
5c903e10 | 341 | { |
b1a5a73e | 342 | unsigned int curvec = startvec, nr_present = 0, nr_others = 0; |
347253c4 | 343 | cpumask_var_t *node_to_cpumask; |
0145c30e TG |
344 | cpumask_var_t nmsk, npresmsk; |
345 | int ret = -ENOMEM; | |
5c903e10 ML |
346 | |
347 | if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL)) | |
c2899c34 | 348 | return ret; |
5c903e10 ML |
349 | |
350 | if (!zalloc_cpumask_var(&npresmsk, GFP_KERNEL)) | |
347253c4 ML |
351 | goto fail_nmsk; |
352 | ||
353 | node_to_cpumask = alloc_node_to_cpumask(); | |
354 | if (!node_to_cpumask) | |
355 | goto fail_npresmsk; | |
5c903e10 ML |
356 | |
357 | /* Stabilize the cpumasks */ | |
358 | get_online_cpus(); | |
359 | build_node_to_cpumask(node_to_cpumask); | |
360 | ||
361 | /* Spread on present CPUs starting from affd->pre_vectors */ | |
b1a5a73e ML |
362 | ret = __irq_build_affinity_masks(curvec, numvecs, firstvec, |
363 | node_to_cpumask, cpu_present_mask, | |
364 | nmsk, masks); | |
365 | if (ret < 0) | |
366 | goto fail_build_affinity; | |
367 | nr_present = ret; | |
5c903e10 ML |
368 | |
369 | /* | |
370 | * Spread on non present CPUs starting from the next vector to be | |
371 | * handled. If the spreading of present CPUs already exhausted the | |
372 | * vector space, assign the non present CPUs to the already spread | |
373 | * out vectors. | |
374 | */ | |
6da4b3ab JA |
375 | if (nr_present >= numvecs) |
376 | curvec = firstvec; | |
5c903e10 | 377 | else |
6da4b3ab | 378 | curvec = firstvec + nr_present; |
5c903e10 | 379 | cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask); |
b1a5a73e ML |
380 | ret = __irq_build_affinity_masks(curvec, numvecs, firstvec, |
381 | node_to_cpumask, npresmsk, nmsk, | |
382 | masks); | |
383 | if (ret >= 0) | |
384 | nr_others = ret; | |
385 | ||
386 | fail_build_affinity: | |
5c903e10 ML |
387 | put_online_cpus(); |
388 | ||
b1a5a73e | 389 | if (ret >= 0) |
c2899c34 | 390 | WARN_ON(nr_present + nr_others < numvecs); |
6da4b3ab | 391 | |
347253c4 ML |
392 | free_node_to_cpumask(node_to_cpumask); |
393 | ||
394 | fail_npresmsk: | |
5c903e10 ML |
395 | free_cpumask_var(npresmsk); |
396 | ||
347253c4 | 397 | fail_nmsk: |
5c903e10 | 398 | free_cpumask_var(nmsk); |
b1a5a73e | 399 | return ret < 0 ? ret : 0; |
5c903e10 ML |
400 | } |
401 | ||
c66d4bd1 ML |
402 | static void default_calc_sets(struct irq_affinity *affd, unsigned int affvecs) |
403 | { | |
404 | affd->nr_sets = 1; | |
405 | affd->set_size[0] = affvecs; | |
406 | } | |
407 | ||
b3e6aaa8 ML |
408 | /** |
409 | * irq_create_affinity_masks - Create affinity masks for multiqueue spreading | |
410 | * @nvecs: The total number of vectors | |
411 | * @affd: Description of the affinity requirements | |
412 | * | |
bec04037 | 413 | * Returns the irq_affinity_desc pointer or NULL if allocation failed. |
b3e6aaa8 | 414 | */ |
bec04037 | 415 | struct irq_affinity_desc * |
9cfef55b | 416 | irq_create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd) |
b3e6aaa8 | 417 | { |
c66d4bd1 | 418 | unsigned int affvecs, curvec, usedvecs, i; |
bec04037 | 419 | struct irq_affinity_desc *masks = NULL; |
b3e6aaa8 ML |
420 | |
421 | /* | |
c66d4bd1 ML |
422 | * Determine the number of vectors which need interrupt affinities |
423 | * assigned. If the pre/post request exhausts the available vectors | |
424 | * then nothing to do here except for invoking the calc_sets() | |
491beed3 | 425 | * callback so the device driver can adjust to the situation. |
b3e6aaa8 | 426 | */ |
491beed3 | 427 | if (nvecs > affd->pre_vectors + affd->post_vectors) |
c66d4bd1 ML |
428 | affvecs = nvecs - affd->pre_vectors - affd->post_vectors; |
429 | else | |
430 | affvecs = 0; | |
431 | ||
432 | /* | |
433 | * Simple invocations do not provide a calc_sets() callback. Install | |
a6a309ed | 434 | * the generic one. |
c66d4bd1 | 435 | */ |
a6a309ed | 436 | if (!affd->calc_sets) |
c66d4bd1 ML |
437 | affd->calc_sets = default_calc_sets; |
438 | ||
a6a309ed TG |
439 | /* Recalculate the sets */ |
440 | affd->calc_sets(affd, affvecs); | |
b3e6aaa8 | 441 | |
9cfef55b ML |
442 | if (WARN_ON_ONCE(affd->nr_sets > IRQ_AFFINITY_MAX_SETS)) |
443 | return NULL; | |
444 | ||
c66d4bd1 ML |
445 | /* Nothing to assign? */ |
446 | if (!affvecs) | |
447 | return NULL; | |
448 | ||
b3e6aaa8 ML |
449 | masks = kcalloc(nvecs, sizeof(*masks), GFP_KERNEL); |
450 | if (!masks) | |
347253c4 | 451 | return NULL; |
b3e6aaa8 ML |
452 | |
453 | /* Fill out vectors at the beginning that don't need affinity */ | |
454 | for (curvec = 0; curvec < affd->pre_vectors; curvec++) | |
bec04037 | 455 | cpumask_copy(&masks[curvec].mask, irq_default_affinity); |
c66d4bd1 | 456 | |
6da4b3ab JA |
457 | /* |
458 | * Spread on present CPUs starting from affd->pre_vectors. If we | |
459 | * have multiple sets, build each sets affinity mask separately. | |
460 | */ | |
c66d4bd1 ML |
461 | for (i = 0, usedvecs = 0; i < affd->nr_sets; i++) { |
462 | unsigned int this_vecs = affd->set_size[i]; | |
6da4b3ab JA |
463 | int ret; |
464 | ||
0e518330 | 465 | ret = irq_build_affinity_masks(curvec, this_vecs, |
0145c30e | 466 | curvec, masks); |
6da4b3ab | 467 | if (ret) { |
c2899c34 | 468 | kfree(masks); |
347253c4 | 469 | return NULL; |
6da4b3ab JA |
470 | } |
471 | curvec += this_vecs; | |
472 | usedvecs += this_vecs; | |
473 | } | |
67c93c21 CH |
474 | |
475 | /* Fill out vectors at the end that don't need affinity */ | |
d3056812 ML |
476 | if (usedvecs >= affvecs) |
477 | curvec = affd->pre_vectors + affvecs; | |
478 | else | |
479 | curvec = affd->pre_vectors + usedvecs; | |
67c93c21 | 480 | for (; curvec < nvecs; curvec++) |
bec04037 | 481 | cpumask_copy(&masks[curvec].mask, irq_default_affinity); |
d3056812 | 482 | |
c410abbb DL |
483 | /* Mark the managed interrupts */ |
484 | for (i = affd->pre_vectors; i < nvecs - affd->post_vectors; i++) | |
485 | masks[i].is_managed = 1; | |
486 | ||
34c3d981 TG |
487 | return masks; |
488 | } | |
489 | ||
490 | /** | |
212bd846 | 491 | * irq_calc_affinity_vectors - Calculate the optimal number of vectors |
6f9a22bc | 492 | * @minvec: The minimum number of vectors available |
212bd846 CH |
493 | * @maxvec: The maximum number of vectors available |
494 | * @affd: Description of the affinity requirements | |
34c3d981 | 495 | */ |
0145c30e TG |
496 | unsigned int irq_calc_affinity_vectors(unsigned int minvec, unsigned int maxvec, |
497 | const struct irq_affinity *affd) | |
34c3d981 | 498 | { |
0145c30e TG |
499 | unsigned int resv = affd->pre_vectors + affd->post_vectors; |
500 | unsigned int set_vecs; | |
34c3d981 | 501 | |
6f9a22bc MH |
502 | if (resv > minvec) |
503 | return 0; | |
504 | ||
c66d4bd1 ML |
505 | if (affd->calc_sets) { |
506 | set_vecs = maxvec - resv; | |
6da4b3ab JA |
507 | } else { |
508 | get_online_cpus(); | |
509 | set_vecs = cpumask_weight(cpu_possible_mask); | |
510 | put_online_cpus(); | |
511 | } | |
512 | ||
0145c30e | 513 | return resv + min(set_vecs, maxvec - resv); |
34c3d981 | 514 | } |