// SPDX-License-Identifier: GPL-2.0
/*
 * Scheduler topology setup/handling methods
 */

#include <linux/bsearch.h>

DEFINE_MUTEX(sched_domains_mutex);

/* Protected by sched_domains_mutex: */
static cpumask_var_t sched_domains_tmpmask;
static cpumask_var_t sched_domains_tmpmask2;

#ifdef CONFIG_SCHED_DEBUG

static int __init sched_debug_setup(char *str)
{
	sched_debug_verbose = true;

	return 0;
}
early_param("sched_verbose", sched_debug_setup);

static inline bool sched_debug(void)
{
	return sched_debug_verbose;
}

#define SD_FLAG(_name, mflags) [__##_name] = { .meta_flags = mflags, .name = #_name },
const struct sd_flag_debug sd_flag_debug[] = {
#include <linux/sched/sd_flags.h>
};
#undef SD_FLAG

static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
				  struct cpumask *groupmask)
{
	struct sched_group *group = sd->groups;
	unsigned long flags = sd->flags;
	unsigned int idx;

	cpumask_clear(groupmask);

	printk(KERN_DEBUG "%*s domain-%d: ", level, "", level);
	printk(KERN_CONT "span=%*pbl level=%s\n",
	       cpumask_pr_args(sched_domain_span(sd)), sd->name);

	if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
		printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu);
	}
	if (group && !cpumask_test_cpu(cpu, sched_group_span(group))) {
		printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu);
	}

	for_each_set_bit(idx, &flags, __SD_FLAG_CNT) {
		unsigned int flag = BIT(idx);
		unsigned int meta_flags = sd_flag_debug[idx].meta_flags;

		if ((meta_flags & SDF_SHARED_CHILD) && sd->child &&
		    !(sd->child->flags & flag))
			printk(KERN_ERR "ERROR: flag %s set here but not in child\n",
			       sd_flag_debug[idx].name);

		if ((meta_flags & SDF_SHARED_PARENT) && sd->parent &&
		    !(sd->parent->flags & flag))
			printk(KERN_ERR "ERROR: flag %s set here but not in parent\n",
			       sd_flag_debug[idx].name);
	}

	printk(KERN_DEBUG "%*s groups:", level + 1, "");
	do {
		if (!group) {
			printk("\n");
			printk(KERN_ERR "ERROR: group is NULL\n");
			break;
		}

		if (cpumask_empty(sched_group_span(group))) {
			printk(KERN_CONT "\n");
			printk(KERN_ERR "ERROR: empty group\n");
			break;
		}

		if (!(sd->flags & SD_OVERLAP) &&
		    cpumask_intersects(groupmask, sched_group_span(group))) {
			printk(KERN_CONT "\n");
			printk(KERN_ERR "ERROR: repeated CPUs\n");
			break;
		}

		cpumask_or(groupmask, groupmask, sched_group_span(group));

		printk(KERN_CONT " %d:{ span=%*pbl",
		       group->sgc->id,
		       cpumask_pr_args(sched_group_span(group)));

		if ((sd->flags & SD_OVERLAP) &&
		    !cpumask_equal(group_balance_mask(group), sched_group_span(group))) {
			printk(KERN_CONT " mask=%*pbl",
			       cpumask_pr_args(group_balance_mask(group)));
		}

		if (group->sgc->capacity != SCHED_CAPACITY_SCALE)
			printk(KERN_CONT " cap=%lu", group->sgc->capacity);

		if (group == sd->groups && sd->child &&
		    !cpumask_equal(sched_domain_span(sd->child),
				   sched_group_span(group))) {
			printk(KERN_ERR "ERROR: domain->groups does not match domain->child\n");
		}

		printk(KERN_CONT " }");

		group = group->next;

		if (group != sd->groups)
			printk(KERN_CONT ",");

	} while (group != sd->groups);
	printk(KERN_CONT "\n");

	if (!cpumask_equal(sched_domain_span(sd), groupmask))
		printk(KERN_ERR "ERROR: groups don't span domain->span\n");

	if (sd->parent &&
	    !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
		printk(KERN_ERR "ERROR: parent span is not a superset of domain->span\n");
	return 0;
}

static void sched_domain_debug(struct sched_domain *sd, int cpu)
{
	int level = 0;

	if (!sched_debug_verbose)
		return;

	if (!sd) {
		printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
		return;
	}

	printk(KERN_DEBUG "CPU%d attaching sched-domain(s):\n", cpu);

	for (;;) {
		if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
			break;
		level++;
		sd = sd->parent;
		if (!sd)
			break;
	}
}
#else /* !CONFIG_SCHED_DEBUG */

# define sched_debug_verbose 0
# define sched_domain_debug(sd, cpu) do { } while (0)
static inline bool sched_debug(void)
{
	return false;
}
#endif /* CONFIG_SCHED_DEBUG */

/* Generate a mask of SD flags with the SDF_NEEDS_GROUPS metaflag */
#define SD_FLAG(name, mflags) (name * !!((mflags) & SDF_NEEDS_GROUPS)) |
static const unsigned int SD_DEGENERATE_GROUPS_MASK =
#include <linux/sched/sd_flags.h>
0;
#undef SD_FLAG

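/*
 * Illustrative sketch (not from the kernel source): each SD_FLAG(name, mflags)
 * line in <linux/sched/sd_flags.h> expands to
 * "(name * !!((mflags) & SDF_NEEDS_GROUPS)) |", so, assuming two hypothetical
 * flags FOO (tagged SDF_NEEDS_GROUPS) and BAR (not tagged), the initializer
 * above would expand roughly to:
 *
 *	static const unsigned int SD_DEGENERATE_GROUPS_MASK =
 *		(FOO * 1) | (BAR * 0) | 0;
 *
 * i.e. only flags carrying SDF_NEEDS_GROUPS contribute their bit, and the
 * trailing 0 terminates the expression.
 */
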
static int sd_degenerate(struct sched_domain *sd)
{
	if (cpumask_weight(sched_domain_span(sd)) == 1)
		return 1;

	/* Following flags need at least 2 groups */
	if ((sd->flags & SD_DEGENERATE_GROUPS_MASK) &&
	    (sd->groups != sd->groups->next))
		return 0;

	/* Following flags don't use groups */
	if (sd->flags & (SD_WAKE_AFFINE))
		return 0;

	return 1;
}

static int
sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
{
	unsigned long cflags = sd->flags, pflags = parent->flags;

	if (sd_degenerate(parent))
		return 1;

	if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
		return 0;

	/* Flags needing groups don't count if only 1 group in parent */
	if (parent->groups == parent->groups->next)
		pflags &= ~SD_DEGENERATE_GROUPS_MASK;

	if (~cflags & pflags)
		return 0;

	return 1;
}

#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
DEFINE_STATIC_KEY_FALSE(sched_energy_present);
static unsigned int sysctl_sched_energy_aware = 1;
static DEFINE_MUTEX(sched_energy_mutex);
static bool sched_energy_update;

static bool sched_is_eas_possible(const struct cpumask *cpu_mask)
{
	bool any_asym_capacity = false;
	struct cpufreq_policy *policy;
	struct cpufreq_governor *gov;
	int i;

	/* EAS is enabled for asymmetric CPU capacity topologies. */
	for_each_cpu(i, cpu_mask) {
		if (rcu_access_pointer(per_cpu(sd_asym_cpucapacity, i))) {
			any_asym_capacity = true;
			break;
		}
	}
	if (!any_asym_capacity) {
		if (sched_debug()) {
			pr_info("rd %*pbl: Checking EAS, CPUs do not have asymmetric capacities\n",
				cpumask_pr_args(cpu_mask));
		}
		return false;
	}

	/* EAS definitely does *not* handle SMT */
	if (sched_smt_active()) {
		if (sched_debug()) {
			pr_info("rd %*pbl: Checking EAS, SMT is not supported\n",
				cpumask_pr_args(cpu_mask));
		}
		return false;
	}

	if (!arch_scale_freq_invariant()) {
		if (sched_debug()) {
			pr_info("rd %*pbl: Checking EAS: frequency-invariant load tracking not yet supported",
				cpumask_pr_args(cpu_mask));
		}
		return false;
	}

	/* Do not attempt EAS if schedutil is not being used. */
	for_each_cpu(i, cpu_mask) {
		policy = cpufreq_cpu_get(i);
		if (!policy) {
			if (sched_debug()) {
				pr_info("rd %*pbl: Checking EAS, cpufreq policy not set for CPU: %d",
					cpumask_pr_args(cpu_mask), i);
			}
			return false;
		}
		gov = policy->governor;
		cpufreq_cpu_put(policy);
		if (gov != &schedutil_gov) {
			if (sched_debug()) {
				pr_info("rd %*pbl: Checking EAS, schedutil is mandatory\n",
					cpumask_pr_args(cpu_mask));
			}
			return false;
		}
	}

	return true;
}

void rebuild_sched_domains_energy(void)
{
	mutex_lock(&sched_energy_mutex);
	sched_energy_update = true;
	rebuild_sched_domains();
	sched_energy_update = false;
	mutex_unlock(&sched_energy_mutex);
}

#ifdef CONFIG_PROC_SYSCTL
static int sched_energy_aware_handler(struct ctl_table *table, int write,
		void *buffer, size_t *lenp, loff_t *ppos)
{
	int ret, state;

	if (write && !capable(CAP_SYS_ADMIN))
		return -EPERM;

	if (!sched_is_eas_possible(cpu_active_mask)) {
		if (write) {
			return -EOPNOTSUPP;
		} else {
			*lenp = 0;
			return 0;
		}
	}

	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
	if (!ret && write) {
		state = static_branch_unlikely(&sched_energy_present);
		if (state != sysctl_sched_energy_aware)
			rebuild_sched_domains_energy();
	}

	return ret;
}

static struct ctl_table sched_energy_aware_sysctls[] = {
	{
		.procname	= "sched_energy_aware",
		.data		= &sysctl_sched_energy_aware,
		.maxlen		= sizeof(unsigned int),
		.mode		= 0644,
		.proc_handler	= sched_energy_aware_handler,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE,
	},
};

static int __init sched_energy_aware_sysctl_init(void)
{
	register_sysctl_init("kernel", sched_energy_aware_sysctls);
	return 0;
}

late_initcall(sched_energy_aware_sysctl_init);
#endif

static void free_pd(struct perf_domain *pd)
{
	struct perf_domain *tmp;

	while (pd) {
		tmp = pd->next;
		kfree(pd);
		pd = tmp;
	}
}

static struct perf_domain *find_pd(struct perf_domain *pd, int cpu)
{
	while (pd) {
		if (cpumask_test_cpu(cpu, perf_domain_span(pd)))
			return pd;
		pd = pd->next;
	}

	return NULL;
}

static struct perf_domain *pd_init(int cpu)
{
	struct em_perf_domain *obj = em_cpu_get(cpu);
	struct perf_domain *pd;

	if (!obj) {
		if (sched_debug())
			pr_info("%s: no EM found for CPU%d\n", __func__, cpu);
		return NULL;
	}

	pd = kzalloc(sizeof(*pd), GFP_KERNEL);
	if (!pd)
		return NULL;
	pd->em_pd = obj;

	return pd;
}

static void perf_domain_debug(const struct cpumask *cpu_map,
						struct perf_domain *pd)
{
	if (!sched_debug() || !pd)
		return;

	printk(KERN_DEBUG "root_domain %*pbl:", cpumask_pr_args(cpu_map));

	while (pd) {
		printk(KERN_CONT " pd%d:{ cpus=%*pbl nr_pstate=%d }",
				cpumask_first(perf_domain_span(pd)),
				cpumask_pr_args(perf_domain_span(pd)),
				em_pd_nr_perf_states(pd->em_pd));
		pd = pd->next;
	}

	printk(KERN_CONT "\n");
}

static void destroy_perf_domain_rcu(struct rcu_head *rp)
{
	struct perf_domain *pd;

	pd = container_of(rp, struct perf_domain, rcu);
	free_pd(pd);
}

static void sched_energy_set(bool has_eas)
{
	if (!has_eas && static_branch_unlikely(&sched_energy_present)) {
		if (sched_debug())
			pr_info("%s: stopping EAS\n", __func__);
		static_branch_disable_cpuslocked(&sched_energy_present);
	} else if (has_eas && !static_branch_unlikely(&sched_energy_present)) {
		if (sched_debug())
			pr_info("%s: starting EAS\n", __func__);
		static_branch_enable_cpuslocked(&sched_energy_present);
	}
}

/*
 * EAS can be used on a root domain if it meets all the following conditions:
 *    1. an Energy Model (EM) is available;
 *    2. the SD_ASYM_CPUCAPACITY flag is set in the sched_domain hierarchy;
 *    3. no SMT is detected;
 *    4. schedutil is driving the frequency of all CPUs of the rd;
 *    5. frequency invariance support is present.
 */
static bool build_perf_domains(const struct cpumask *cpu_map)
{
	int i;
	struct perf_domain *pd = NULL, *tmp;
	int cpu = cpumask_first(cpu_map);
	struct root_domain *rd = cpu_rq(cpu)->rd;

	if (!sysctl_sched_energy_aware)
		goto free;

	if (!sched_is_eas_possible(cpu_map))
		goto free;

	for_each_cpu(i, cpu_map) {
		/* Skip already covered CPUs. */
		if (find_pd(pd, i))
			continue;

		/* Create the new pd and add it to the local list. */
		tmp = pd_init(i);
		if (!tmp)
			goto free;
		tmp->next = pd;
		pd = tmp;
	}

	perf_domain_debug(cpu_map, pd);

	/* Attach the new list of performance domains to the root domain. */
	tmp = rd->pd;
	rcu_assign_pointer(rd->pd, pd);
	if (tmp)
		call_rcu(&tmp->rcu, destroy_perf_domain_rcu);

	return !!pd;

free:
	free_pd(pd);
	tmp = rd->pd;
	rcu_assign_pointer(rd->pd, NULL);
	if (tmp)
		call_rcu(&tmp->rcu, destroy_perf_domain_rcu);

	return false;
}
#else
static void free_pd(struct perf_domain *pd) { }
#endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL */

static void free_rootdomain(struct rcu_head *rcu)
{
	struct root_domain *rd = container_of(rcu, struct root_domain, rcu);

	cpupri_cleanup(&rd->cpupri);
	cpudl_cleanup(&rd->cpudl);
	free_cpumask_var(rd->dlo_mask);
	free_cpumask_var(rd->rto_mask);
	free_cpumask_var(rd->online);
	free_cpumask_var(rd->span);
	free_pd(rd->pd);
	kfree(rd);
}

void rq_attach_root(struct rq *rq, struct root_domain *rd)
{
	struct root_domain *old_rd = NULL;
	struct rq_flags rf;

	rq_lock_irqsave(rq, &rf);

	if (rq->rd) {
		old_rd = rq->rd;

		if (cpumask_test_cpu(rq->cpu, old_rd->online))
			set_rq_offline(rq);

		cpumask_clear_cpu(rq->cpu, old_rd->span);

		/*
		 * If we don't want to free the old_rd yet then
		 * set old_rd to NULL to skip the freeing later
		 * in this function:
		 */
		if (!atomic_dec_and_test(&old_rd->refcount))
			old_rd = NULL;
	}

	atomic_inc(&rd->refcount);
	rq->rd = rd;

	cpumask_set_cpu(rq->cpu, rd->span);
	if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
		set_rq_online(rq);

	rq_unlock_irqrestore(rq, &rf);

	if (old_rd)
		call_rcu(&old_rd->rcu, free_rootdomain);
}

void sched_get_rd(struct root_domain *rd)
{
	atomic_inc(&rd->refcount);
}

void sched_put_rd(struct root_domain *rd)
{
	if (!atomic_dec_and_test(&rd->refcount))
		return;

	call_rcu(&rd->rcu, free_rootdomain);
}

static int init_rootdomain(struct root_domain *rd)
{
	if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL))
		goto out;
	if (!zalloc_cpumask_var(&rd->online, GFP_KERNEL))
		goto free_span;
	if (!zalloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL))
		goto free_online;
	if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
		goto free_dlo_mask;

#ifdef HAVE_RT_PUSH_IPI
	rd->rto_cpu = -1;
	raw_spin_lock_init(&rd->rto_lock);
	rd->rto_push_work = IRQ_WORK_INIT_HARD(rto_push_irq_work_func);
#endif

	rd->visit_gen = 0;
	init_dl_bw(&rd->dl_bw);
	if (cpudl_init(&rd->cpudl) != 0)
		goto free_rto_mask;

	if (cpupri_init(&rd->cpupri) != 0)
		goto free_cpudl;
	return 0;

free_cpudl:
	cpudl_cleanup(&rd->cpudl);
free_rto_mask:
	free_cpumask_var(rd->rto_mask);
free_dlo_mask:
	free_cpumask_var(rd->dlo_mask);
free_online:
	free_cpumask_var(rd->online);
free_span:
	free_cpumask_var(rd->span);
out:
	return -ENOMEM;
}

/*
 * By default the system creates a single root-domain with all CPUs as
 * members (mimicking the global state we have today).
 */
struct root_domain def_root_domain;

void __init init_defrootdomain(void)
{
	init_rootdomain(&def_root_domain);

	atomic_set(&def_root_domain.refcount, 1);
}

static struct root_domain *alloc_rootdomain(void)
{
	struct root_domain *rd;

	rd = kzalloc(sizeof(*rd), GFP_KERNEL);
	if (!rd)
		return NULL;

	if (init_rootdomain(rd) != 0) {
		kfree(rd);
		return NULL;
	}

	return rd;
}

static void free_sched_groups(struct sched_group *sg, int free_sgc)
{
	struct sched_group *tmp, *first;

	if (!sg)
		return;

	first = sg;
	do {
		tmp = sg->next;

		if (free_sgc && atomic_dec_and_test(&sg->sgc->ref))
			kfree(sg->sgc);

		if (atomic_dec_and_test(&sg->ref))
			kfree(sg);
		sg = tmp;
	} while (sg != first);
}

static void destroy_sched_domain(struct sched_domain *sd)
{
	/*
	 * A normal sched domain may have multiple group references; an
	 * overlapping domain, which has private groups, has only one. Iterate,
	 * dropping group/capacity references and freeing where none remain.
	 */
	free_sched_groups(sd->groups, 1);

	if (sd->shared && atomic_dec_and_test(&sd->shared->ref))
		kfree(sd->shared);
	kfree(sd);
}

static void destroy_sched_domains_rcu(struct rcu_head *rcu)
{
	struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);

	while (sd) {
		struct sched_domain *parent = sd->parent;
		destroy_sched_domain(sd);
		sd = parent;
	}
}

static void destroy_sched_domains(struct sched_domain *sd)
{
	if (sd)
		call_rcu(&sd->rcu, destroy_sched_domains_rcu);
}

/*
 * Keep a special pointer to the highest sched_domain that has SD_SHARE_LLC set
 * (Last Level Cache Domain); this allows us to avoid some pointer chasing in
 * select_idle_sibling().
 *
 * Also keep a unique ID per domain (we use the first CPU number in the cpumask
 * of the domain), this allows us to quickly tell if two CPUs are in the same
 * cache domain, see cpus_share_cache().
 */
DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
DEFINE_PER_CPU(int, sd_llc_size);
DEFINE_PER_CPU(int, sd_llc_id);
DEFINE_PER_CPU(int, sd_share_id);
DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa);
DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);

DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
DEFINE_STATIC_KEY_FALSE(sched_cluster_active);

static void update_top_cache_domain(int cpu)
{
	struct sched_domain_shared *sds = NULL;
	struct sched_domain *sd;
	int id = cpu;
	int size = 1;

	sd = highest_flag_domain(cpu, SD_SHARE_LLC);
	if (sd) {
		id = cpumask_first(sched_domain_span(sd));
		size = cpumask_weight(sched_domain_span(sd));
		sds = sd->shared;
	}

	rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
	per_cpu(sd_llc_size, cpu) = size;
	per_cpu(sd_llc_id, cpu) = id;
	rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);

	sd = lowest_flag_domain(cpu, SD_CLUSTER);
	if (sd)
		id = cpumask_first(sched_domain_span(sd));

	/*
	 * This assignment should be placed after sd_llc_id, as we want this
	 * id to equal the cluster id on cluster machines but the LLC id on
	 * non-cluster machines.
	 */
	per_cpu(sd_share_id, cpu) = id;

	sd = lowest_flag_domain(cpu, SD_NUMA);
	rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);

	sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
	rcu_assign_pointer(per_cpu(sd_asym_packing, cpu), sd);

	sd = lowest_flag_domain(cpu, SD_ASYM_CPUCAPACITY_FULL);
	rcu_assign_pointer(per_cpu(sd_asym_cpucapacity, cpu), sd);
}

/*
 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
 * hold the hotplug lock.
 */
static void
cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	struct sched_domain *tmp;

	/* Remove the sched domains which do not contribute to scheduling. */
	for (tmp = sd; tmp; ) {
		struct sched_domain *parent = tmp->parent;
		if (!parent)
			break;

		if (sd_parent_degenerate(tmp, parent)) {
			tmp->parent = parent->parent;

			if (parent->parent) {
				parent->parent->child = tmp;
				parent->parent->groups->flags = tmp->flags;
			}

			/*
			 * Transfer SD_PREFER_SIBLING down in case of a
			 * degenerate parent; the spans match for this
			 * so the property transfers.
			 */
			if (parent->flags & SD_PREFER_SIBLING)
				tmp->flags |= SD_PREFER_SIBLING;
			destroy_sched_domain(parent);
		} else
			tmp = tmp->parent;
	}

	if (sd && sd_degenerate(sd)) {
		tmp = sd;
		sd = sd->parent;
		destroy_sched_domain(tmp);
		if (sd) {
			struct sched_group *sg = sd->groups;

			/*
			 * sched groups hold the flags of the child sched
			 * domain for convenience. Clear such flags since
			 * the child is being destroyed.
			 */
			do {
				sg->flags = 0;
			} while (sg != sd->groups);

			sd->child = NULL;
		}
	}

	sched_domain_debug(sd, cpu);

	rq_attach_root(rq, rd);
	tmp = rq->sd;
	rcu_assign_pointer(rq->sd, sd);
	dirty_sched_domain_sysctl(cpu);
	destroy_sched_domains(tmp);

	update_top_cache_domain(cpu);
}

struct s_data {
	struct sched_domain * __percpu *sd;
	struct root_domain	*rd;
};

enum s_alloc {
	sa_rootdomain,
	sa_sd,
	sa_sd_storage,
	sa_none,
};

/*
 * Return the canonical balance CPU for this group, this is the first CPU
 * of this group that's also in the balance mask.
 *
 * The balance mask contains all those CPUs that could actually end up at this
 * group. See build_balance_mask().
 *
 * Also see should_we_balance().
 */
int group_balance_cpu(struct sched_group *sg)
{
	return cpumask_first(group_balance_mask(sg));
}


/*
 * NUMA topology (first read the regular topology blurb below)
 *
 * Given a node-distance table, for example:
 *
 *   node   0   1   2   3
 *     0:  10  20  30  20
 *     1:  20  10  20  30
 *     2:  30  20  10  20
 *     3:  20  30  20  10
 *
 * which represents a 4 node ring topology like:
 *
 *   0 ----- 1
 *   |       |
 *   |       |
 *   |       |
 *   3 ----- 2
 *
 * We want to construct domains and groups to represent this. The way we go
 * about doing this is to build the domains on 'hops'. For each NUMA level we
 * construct the mask of all nodes reachable in @level hops.
 *
 * For the above NUMA topology that gives 3 levels:
 *
 * NUMA-2	0-3		0-3		0-3		0-3
 *  groups:	{0-1,3},{1-3}	{0-2},{0,2-3}	{1-3},{0-1,3}	{0,2-3},{0-2}
 *
 * NUMA-1	0-1,3		0-2		1-3		0,2-3
 *  groups:	{0},{1},{3}	{0},{1},{2}	{1},{2},{3}	{0},{2},{3}
 *
 * NUMA-0	0		1		2		3
 *
 *
 * As can be seen, things don't line up as nicely as with the regular topology.
 * When we iterate a domain in child domain chunks some nodes can be
 * represented multiple times -- hence the "overlap" naming for this part of
 * the topology.
 *
 * In order to minimize this overlap, we only build enough groups to cover the
 * domain. For instance Node-0 NUMA-2 would only get groups: 0-1,3 and 1-3.
 *
 * Because:
 *
 *  - the first group of each domain is its child domain; this
 *    gets us the first 0-1,3
 *  - the only uncovered node is 2, whose child domain is 1-3.
 *
 * However, because of the overlap, computing a unique CPU for each group is
 * more complicated. Consider for instance the groups of NODE-1 NUMA-2, both
 * groups include the CPUs of Node-0, while those CPUs would not in fact ever
 * end up at those groups (they would end up in group: 0-1,3).
 *
 * To correct this we have to introduce the group balance mask. This mask
 * will contain those CPUs in the group that can reach this group given the
 * (child) domain tree.
 *
 * With this we can once again compute balance_cpu and sched_group_capacity
 * relations.
 *
 * XXX include words on how balance_cpu is unique and therefore can be
 * used for sched_group_capacity links.
 *
 *
 * Another 'interesting' topology is:
 *
 *   node   0   1   2   3
 *     0:  10  20  20  30
 *     1:  20  10  20  20
 *     2:  20  20  10  20
 *     3:  30  20  20  10
 *
 * Which looks a little like:
 *
 *   0 ----- 1
 *   |     / |
 *   |   /   |
 *   | /     |
 *   2 ----- 3
 *
 * This topology is asymmetric, nodes 1,2 are fully connected, but nodes 0,3
 * are not.
 *
 * This leads to a few particularly weird cases where the sched_domain's are
 * not of the same number for each CPU. Consider:
 *
 * NUMA-2	0-3						0-3
 *  groups:	{0-2},{1-3}					{1-3},{0-2}
 *
 * NUMA-1	0-2		0-3		0-3		1-3
 *
 * NUMA-0	0		1		2		3
 *
 */


/*
 * Build the balance mask; it contains only those CPUs that can arrive at this
 * group and should be considered to continue balancing.
 *
 * We do this during the group creation pass, therefore the group information
 * isn't complete yet, however since each group represents a (child) domain we
 * can fully construct this using the sched_domain bits (which are already
 * complete).
 */
static void
build_balance_mask(struct sched_domain *sd, struct sched_group *sg, struct cpumask *mask)
{
	const struct cpumask *sg_span = sched_group_span(sg);
	struct sd_data *sdd = sd->private;
	struct sched_domain *sibling;
	int i;

	cpumask_clear(mask);

	for_each_cpu(i, sg_span) {
		sibling = *per_cpu_ptr(sdd->sd, i);

		/*
		 * Can happen in the asymmetric case, where these siblings are
		 * unused. The mask will not be empty because those CPUs that
		 * do have the top domain _should_ span the domain.
		 */
		if (!sibling->child)
			continue;

		/* If we would not end up here, we can't continue from here */
		if (!cpumask_equal(sg_span, sched_domain_span(sibling->child)))
			continue;

		cpumask_set_cpu(i, mask);
	}

	/* We must not have empty masks here */
	WARN_ON_ONCE(cpumask_empty(mask));
}

/*
 * XXX: This creates per-node group entries; since the load-balancer will
 * immediately access remote memory to construct this group's load-balance
 * statistics having the groups node local is of dubious benefit.
 */
static struct sched_group *
build_group_from_child_sched_domain(struct sched_domain *sd, int cpu)
{
	struct sched_group *sg;
	struct cpumask *sg_span;

	sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
			GFP_KERNEL, cpu_to_node(cpu));

	if (!sg)
		return NULL;

	sg_span = sched_group_span(sg);
	if (sd->child) {
		cpumask_copy(sg_span, sched_domain_span(sd->child));
		sg->flags = sd->child->flags;
	} else {
		cpumask_copy(sg_span, sched_domain_span(sd));
	}

	atomic_inc(&sg->ref);
	return sg;
}

static void init_overlap_sched_group(struct sched_domain *sd,
				     struct sched_group *sg)
{
	struct cpumask *mask = sched_domains_tmpmask2;
	struct sd_data *sdd = sd->private;
	struct cpumask *sg_span;
	int cpu;

	build_balance_mask(sd, sg, mask);
	cpu = cpumask_first(mask);

	sg->sgc = *per_cpu_ptr(sdd->sgc, cpu);
	if (atomic_inc_return(&sg->sgc->ref) == 1)
		cpumask_copy(group_balance_mask(sg), mask);
	else
		WARN_ON_ONCE(!cpumask_equal(group_balance_mask(sg), mask));

	/*
	 * Initialize sgc->capacity such that even if we mess up the
	 * domains and no possible iteration will get us here, we won't
	 * die on a /0 trap.
	 */
	sg_span = sched_group_span(sg);
	sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
	sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
	sg->sgc->max_capacity = SCHED_CAPACITY_SCALE;
}

static struct sched_domain *
find_descended_sibling(struct sched_domain *sd, struct sched_domain *sibling)
{
	/*
	 * The proper descendant would be the one whose child won't span out
	 * of sd
	 */
	while (sibling->child &&
	       !cpumask_subset(sched_domain_span(sibling->child),
			       sched_domain_span(sd)))
		sibling = sibling->child;

	/*
	 * As we are referencing sgc across different topology levels, we need
	 * to go down to skip those sched_domains which don't contribute to
	 * scheduling because they will be degenerated in cpu_attach_domain().
	 */
	while (sibling->child &&
	       cpumask_equal(sched_domain_span(sibling->child),
			     sched_domain_span(sibling)))
		sibling = sibling->child;

	return sibling;
}

static int
build_overlap_sched_groups(struct sched_domain *sd, int cpu)
{
	struct sched_group *first = NULL, *last = NULL, *sg;
	const struct cpumask *span = sched_domain_span(sd);
	struct cpumask *covered = sched_domains_tmpmask;
	struct sd_data *sdd = sd->private;
	struct sched_domain *sibling;
	int i;

	cpumask_clear(covered);

	for_each_cpu_wrap(i, span, cpu) {
		struct cpumask *sg_span;

		if (cpumask_test_cpu(i, covered))
			continue;

		sibling = *per_cpu_ptr(sdd->sd, i);

		/*
		 * Asymmetric node setups can result in situations where the
		 * domain tree is of unequal depth, make sure to skip domains
		 * that already cover the entire range.
		 *
		 * In that case build_sched_domains() will have terminated the
		 * iteration early and our sibling sd spans will be empty.
		 * Domains should always include the CPU they're built on, so
		 * check that.
		 */
		if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
			continue;

		/*
		 * Usually we build sched_group by sibling's child sched_domain
		 * But for machines whose NUMA diameter are 3 or above, we move
		 * to build sched_group by sibling's proper descendant's child
		 * domain because sibling's child sched_domain will span out of
		 * the sched_domain being built as below.
		 *
		 * Smallest diameter=3 topology is:
		 *
		 *   node	0	1	2	3
		 *     0:	10	20	30	40
		 *     1:	20	10	20	30
		 *     2:	30	20	10	20
		 *     3:	40	30	20	10
		 *
		 *   0 --- 1 --- 2 --- 3
		 *
		 * NUMA-3	0-3		N/A		N/A		0-3
		 *  groups:	{0-2},{1-3}					{1-3},{0-2}
		 *
		 * NUMA-2	0-2		0-3		0-3		1-3
		 *  groups:	{0-1},{1-3}	{0-2},{2-3}	{1-3},{0-1}	{2-3},{0-2}
		 *
		 * NUMA-1	0-1		0-2		1-3		2-3
		 *  groups:	{0},{1}		{1},{2},{0}	{2},{3},{1}	{3},{2}
		 *
		 * NUMA-0	0		1		2		3
		 *
		 * The NUMA-2 groups for nodes 0 and 3 are obviously buggered, as the
		 * group span isn't a subset of the domain span.
		 */
		if (sibling->child &&
		    !cpumask_subset(sched_domain_span(sibling->child), span))
			sibling = find_descended_sibling(sd, sibling);

		sg = build_group_from_child_sched_domain(sibling, cpu);
		if (!sg)
			goto fail;

		sg_span = sched_group_span(sg);
		cpumask_or(covered, covered, sg_span);

		init_overlap_sched_group(sibling, sg);

		if (!first)
			first = sg;
		if (last)
			last->next = sg;
		last = sg;
		last->next = first;
	}
	sd->groups = first;

	return 0;

fail:
	free_sched_groups(first, 0);

	return -ENOMEM;
}


/*
 * Package topology (also see the load-balance blurb in fair.c)
 *
 * The scheduler builds a tree structure to represent a number of important
 * topology features. By default (default_topology[]) these include:
 *
 *  - Simultaneous multithreading (SMT)
 *  - Multi-Core Cache (MC)
 *  - Package (PKG)
 *
 * Where the last one more or less denotes everything up to a NUMA node.
 *
 * The tree consists of 3 primary data structures:
 *
 *	sched_domain -> sched_group -> sched_group_capacity
 *	    ^ ^             ^ ^
 *          `-'             `-'
 *
 * The sched_domains are per-CPU and have a two way link (parent & child) and
 * denote the ever growing mask of CPUs belonging to that level of topology.
 *
 * Each sched_domain has a circular (double) linked list of sched_group's, each
 * denoting the domains of the level below (or individual CPUs in case of the
 * first domain level). The sched_group linked by a sched_domain includes the
 * CPU of that sched_domain [*].
 *
 * Take for instance a 2 threaded, 2 core, 2 cache cluster part:
 *
 * CPU   0   1   2   3   4   5   6   7
 *
 * PKG  [                             ]
 * MC   [             ] [             ]
 * SMT  [     ] [     ] [     ] [     ]
 *
 *  - or -
 *
 * PKG  0-7 0-7 0-7 0-7 0-7 0-7 0-7 0-7
 * MC	0-3 0-3 0-3 0-3 4-7 4-7 4-7 4-7
 * SMT  0-1 0-1 2-3 2-3 4-5 4-5 6-7 6-7
 *
 * CPU   0   1   2   3   4   5   6   7
 *
 * One way to think about it is: sched_domain moves you up and down among these
 * topology levels, while sched_group moves you sideways through it, at child
 * domain granularity.
 *
 * sched_group_capacity ensures each unique sched_group has shared storage.
 *
 * There are two related construction problems, both require a CPU that
 * uniquely identify each group (for a given domain):
 *
 *  - The first is the balance_cpu (see should_we_balance() and the
 *    load-balance blub in fair.c); for each group we only want 1 CPU to
 *    continue balancing at a higher domain.
 *
 *  - The second is the sched_group_capacity; we want all identical groups
 *    to share a single sched_group_capacity.
 *
 * These topologies are exclusive by construction: it is impossible for an
 * SMT thread to belong to multiple cores, and for cores to be part of
 * multiple caches. There is a very clear and unique location for each CPU
 * in the hierarchy.
 *
 * Therefore computing a unique CPU for each group is trivial (the iteration
 * mask is redundant and set all 1s; all CPUs in a group will end up at _that_
 * group), we can simply pick the first CPU in each group.
 *
 *
 * [*] in other words, the first group of each domain is its child domain.
 */

static struct sched_group *get_group(int cpu, struct sd_data *sdd)
{
	struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
	struct sched_domain *child = sd->child;
	struct sched_group *sg;
	bool already_visited;

	if (child)
		cpu = cpumask_first(sched_domain_span(child));

	sg = *per_cpu_ptr(sdd->sg, cpu);
	sg->sgc = *per_cpu_ptr(sdd->sgc, cpu);

	/* Increase refcounts for claim_allocations: */
	already_visited = atomic_inc_return(&sg->ref) > 1;
	/* sgc visits should follow a similar trend as sg */
	WARN_ON(already_visited != (atomic_inc_return(&sg->sgc->ref) > 1));

	/* If we have already visited that group, it's already initialized. */
	if (already_visited)
		return sg;

	if (child) {
		cpumask_copy(sched_group_span(sg), sched_domain_span(child));
		cpumask_copy(group_balance_mask(sg), sched_group_span(sg));
		sg->flags = child->flags;
	} else {
		cpumask_set_cpu(cpu, sched_group_span(sg));
		cpumask_set_cpu(cpu, group_balance_mask(sg));
	}

	sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sched_group_span(sg));
	sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
	sg->sgc->max_capacity = SCHED_CAPACITY_SCALE;

	return sg;
}

/*
 * build_sched_groups will build a circular linked list of the groups
 * covered by the given span, will set each group's ->cpumask correctly,
 * and will initialize their ->sgc.
 *
 * Assumes the sched_domain tree is fully constructed
 */
static int
build_sched_groups(struct sched_domain *sd, int cpu)
{
	struct sched_group *first = NULL, *last = NULL;
	struct sd_data *sdd = sd->private;
	const struct cpumask *span = sched_domain_span(sd);
	struct cpumask *covered;
	int i;

	lockdep_assert_held(&sched_domains_mutex);
	covered = sched_domains_tmpmask;

	cpumask_clear(covered);

	for_each_cpu_wrap(i, span, cpu) {
		struct sched_group *sg;

		if (cpumask_test_cpu(i, covered))
			continue;

		sg = get_group(i, sdd);

		cpumask_or(covered, covered, sched_group_span(sg));

		if (!first)
			first = sg;
		if (last)
			last->next = sg;
		last = sg;
	}
	last->next = first;
	sd->groups = first;

	return 0;
}

/*
 * Initialize sched groups cpu_capacity.
 *
 * cpu_capacity indicates the capacity of sched group, which is used while
 * distributing the load between different sched groups in a sched domain.
 * Typically cpu_capacity for all the groups in a sched domain will be same
 * unless there are asymmetries in the topology. If there are asymmetries,
 * group having more cpu_capacity will pickup more load compared to the
 * group having less cpu_capacity.
 */
static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
{
	struct sched_group *sg = sd->groups;
	struct cpumask *mask = sched_domains_tmpmask2;

	WARN_ON(!sg);

	do {
		int cpu, cores = 0, max_cpu = -1;

		sg->group_weight = cpumask_weight(sched_group_span(sg));

		cpumask_copy(mask, sched_group_span(sg));
		for_each_cpu(cpu, mask) {
			cores++;
#ifdef CONFIG_SCHED_SMT
			cpumask_andnot(mask, mask, cpu_smt_mask(cpu));
#endif
		}
		sg->cores = cores;

		if (!(sd->flags & SD_ASYM_PACKING))
			goto next;

		for_each_cpu(cpu, sched_group_span(sg)) {
			if (max_cpu < 0)
				max_cpu = cpu;
			else if (sched_asym_prefer(cpu, max_cpu))
				max_cpu = cpu;
		}
		sg->asym_prefer_cpu = max_cpu;

next:
		sg = sg->next;
	} while (sg != sd->groups);

	if (cpu != group_balance_cpu(sg))
		return;

	update_group_capacity(sd, cpu);
}

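/*
 * Illustrative example (not from the kernel source): with CONFIG_SCHED_SMT,
 * the cores count above removes each visited CPU's SMT siblings from the
 * scratch mask as it goes, so a hypothetical group spanning CPUs 0-3 with
 * sibling pairs {0,1} and {2,3} ends up with cores = 2 while group_weight
 * stays 4.
 */
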
/*
 * Set of available CPUs grouped by their corresponding capacities
 * Each list entry contains a CPU mask reflecting CPUs that share the same
 * capacity.
 * The lifespan of data is unlimited.
 */
LIST_HEAD(asym_cap_list);

/*
 * Verify whether there is any CPU capacity asymmetry in a given sched domain.
 * Provides sd_flags reflecting the asymmetry scope.
 */
static inline int
asym_cpu_capacity_classify(const struct cpumask *sd_span,
			   const struct cpumask *cpu_map)
{
	struct asym_cap_data *entry;
	int count = 0, miss = 0;

	/*
	 * Count how many unique CPU capacities this domain spans across
	 * (compare sched_domain CPUs mask with ones representing available
	 * CPUs capacities). Take into account CPUs that might be offline:
	 * skip those.
	 */
	list_for_each_entry(entry, &asym_cap_list, link) {
		if (cpumask_intersects(sd_span, cpu_capacity_span(entry)))
			++count;
		else if (cpumask_intersects(cpu_map, cpu_capacity_span(entry)))
			++miss;
	}

	WARN_ON_ONCE(!count && !list_empty(&asym_cap_list));

	/* No asymmetry detected */
	if (count < 2)
		return 0;
	/* Some of the available CPU capacity values have not been detected */
	if (miss)
		return SD_ASYM_CPUCAPACITY;

	/* Full asymmetry */
	return SD_ASYM_CPUCAPACITY | SD_ASYM_CPUCAPACITY_FULL;

}

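/*
 * Worked example (hypothetical, for illustration only): on a big.LITTLE-style
 * system with two capacity values, say 1024 and 512, asym_cap_list holds two
 * entries. A domain spanning CPUs of both kinds sees count == 2, miss == 0 and
 * is tagged SD_ASYM_CPUCAPACITY | SD_ASYM_CPUCAPACITY_FULL. A domain spanning
 * only the little CPUs (while big CPUs exist elsewhere in cpu_map) sees
 * count == 1 and reports no asymmetry at its level. With a third capacity
 * value present only outside the span, count == 2 and miss == 1 yields
 * SD_ASYM_CPUCAPACITY without the _FULL flag.
 */
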
static void free_asym_cap_entry(struct rcu_head *head)
{
	struct asym_cap_data *entry = container_of(head, struct asym_cap_data, rcu);
	kfree(entry);
}

static inline void asym_cpu_capacity_update_data(int cpu)
{
	unsigned long capacity = arch_scale_cpu_capacity(cpu);
	struct asym_cap_data *insert_entry = NULL;
	struct asym_cap_data *entry;

	/*
	 * Search if the capacity already exists. If not, track the entry
	 * after which we should insert, to keep the list ordered in
	 * descending order.
	 */
	list_for_each_entry(entry, &asym_cap_list, link) {
		if (capacity == entry->capacity)
			goto done;
		else if (!insert_entry && capacity > entry->capacity)
			insert_entry = list_prev_entry(entry, link);
	}

	entry = kzalloc(sizeof(*entry) + cpumask_size(), GFP_KERNEL);
	if (WARN_ONCE(!entry, "Failed to allocate memory for asymmetry data\n"))
		return;
	entry->capacity = capacity;

	/* If NULL then the new capacity is the smallest, add last. */
	if (!insert_entry)
		list_add_tail_rcu(&entry->link, &asym_cap_list);
	else
		list_add_rcu(&entry->link, &insert_entry->link);
done:
	__cpumask_set_cpu(cpu, cpu_capacity_span(entry));
}

/*
 * Build-up/update list of CPUs grouped by their capacities
 * An update requires explicit request to rebuild sched domains
 * with state indicating CPU topology changes.
 */
static void asym_cpu_capacity_scan(void)
{
	struct asym_cap_data *entry, *next;
	int cpu;

	list_for_each_entry(entry, &asym_cap_list, link)
		cpumask_clear(cpu_capacity_span(entry));

	for_each_cpu_and(cpu, cpu_possible_mask, housekeeping_cpumask(HK_TYPE_DOMAIN))
		asym_cpu_capacity_update_data(cpu);

	list_for_each_entry_safe(entry, next, &asym_cap_list, link) {
		if (cpumask_empty(cpu_capacity_span(entry))) {
			list_del_rcu(&entry->link);
			call_rcu(&entry->rcu, free_asym_cap_entry);
		}
	}

	/*
	 * Only one capacity value has been detected i.e. this system is symmetric.
	 * No need to keep this data around.
	 */
	if (list_is_singular(&asym_cap_list)) {
		entry = list_first_entry(&asym_cap_list, typeof(*entry), link);
		list_del_rcu(&entry->link);
		call_rcu(&entry->rcu, free_asym_cap_entry);
	}
}

/*
 * Initializers for schedule domains
 * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
 */

static int default_relax_domain_level = -1;
int sched_domain_level_max;

static int __init setup_relax_domain_level(char *str)
{
	if (kstrtoint(str, 0, &default_relax_domain_level))
		pr_warn("Unable to set relax_domain_level\n");

	return 1;
}
__setup("relax_domain_level=", setup_relax_domain_level);

static void set_domain_attribute(struct sched_domain *sd,
				 struct sched_domain_attr *attr)
{
	int request;

	if (!attr || attr->relax_domain_level < 0) {
		if (default_relax_domain_level < 0)
			return;
		request = default_relax_domain_level;
	} else
		request = attr->relax_domain_level;

	if (sd->level >= request) {
		/* Turn off idle balance on this domain: */
		sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
	}
}

static void __sdt_free(const struct cpumask *cpu_map);
static int __sdt_alloc(const struct cpumask *cpu_map);

static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
				 const struct cpumask *cpu_map)
{
	switch (what) {
	case sa_rootdomain:
		if (!atomic_read(&d->rd->refcount))
			free_rootdomain(&d->rd->rcu);
		fallthrough;
	case sa_sd:
		free_percpu(d->sd);
		fallthrough;
	case sa_sd_storage:
		__sdt_free(cpu_map);
		fallthrough;
	case sa_none:
		break;
	}
}

static enum s_alloc
__visit_domain_allocation_hell(struct s_data *d, const struct cpumask *cpu_map)
{
	memset(d, 0, sizeof(*d));

	if (__sdt_alloc(cpu_map))
		return sa_sd_storage;
	d->sd = alloc_percpu(struct sched_domain *);
	if (!d->sd)
		return sa_sd_storage;
	d->rd = alloc_rootdomain();
	if (!d->rd)
		return sa_sd;

	return sa_rootdomain;
}

/*
 * NULL the sd_data elements we've used to build the sched_domain and
 * sched_group structure so that the subsequent __free_domain_allocs()
 * will not free the data we're using.
 */
static void claim_allocations(int cpu, struct sched_domain *sd)
{
	struct sd_data *sdd = sd->private;

	WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
	*per_cpu_ptr(sdd->sd, cpu) = NULL;

	if (atomic_read(&(*per_cpu_ptr(sdd->sds, cpu))->ref))
		*per_cpu_ptr(sdd->sds, cpu) = NULL;

	if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
		*per_cpu_ptr(sdd->sg, cpu) = NULL;

	if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref))
		*per_cpu_ptr(sdd->sgc, cpu) = NULL;
}

#ifdef CONFIG_NUMA
enum numa_topology_type sched_numa_topology_type;

static int			sched_domains_numa_levels;
static int			sched_domains_curr_level;

int				sched_max_numa_distance;
static int			*sched_domains_numa_distance;
static struct cpumask		***sched_domains_numa_masks;
#endif

/*
 * SD_flags allowed in topology descriptions.
 *
 * These flags are purely descriptive of the topology and do not prescribe
 * behaviour. Behaviour is artificial and mapped in the below sd_init()
 * function. For details, see include/linux/sched/sd_flags.h.
 *
 *   SD_SHARE_CPUCAPACITY
 *   SD_SHARE_LLC
 *   SD_CLUSTER
 *   SD_NUMA
 *
 * Odd one out, which beside describing the topology has a quirk also
 * prescribes the desired behaviour that goes along with it:
 *
 *   SD_ASYM_PACKING        - describes SMT quirks
 */
#define TOPOLOGY_SD_FLAGS		\
	(SD_SHARE_CPUCAPACITY	|	\
	 SD_CLUSTER		|	\
	 SD_SHARE_LLC		|	\
	 SD_NUMA		|	\
	 SD_ASYM_PACKING)

static struct sched_domain *
sd_init(struct sched_domain_topology_level *tl,
	const struct cpumask *cpu_map,
	struct sched_domain *child, int cpu)
{
	struct sd_data *sdd = &tl->data;
	struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
	int sd_id, sd_weight, sd_flags = 0;
	struct cpumask *sd_span;

#ifdef CONFIG_NUMA
	/*
	 * Ugly hack to pass state to sd_numa_mask()...
	 */
	sched_domains_curr_level = tl->numa_level;
#endif

	sd_weight = cpumask_weight(tl->mask(cpu));

	if (tl->sd_flags)
		sd_flags = (*tl->sd_flags)();
	if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,
			"wrong sd_flags in topology description\n"))
		sd_flags &= TOPOLOGY_SD_FLAGS;

	*sd = (struct sched_domain){
		.min_interval		= sd_weight,
		.max_interval		= 2*sd_weight,
		.busy_factor		= 16,
		.imbalance_pct		= 117,

		.cache_nice_tries	= 0,

		.flags			= 1*SD_BALANCE_NEWIDLE
					| 1*SD_BALANCE_EXEC
					| 1*SD_BALANCE_FORK
					| 0*SD_BALANCE_WAKE
					| 1*SD_WAKE_AFFINE
					| 0*SD_SHARE_CPUCAPACITY
					| 0*SD_SHARE_LLC
					| 0*SD_SERIALIZE
					| 1*SD_PREFER_SIBLING
					| 0*SD_NUMA
					| sd_flags
					,

		.last_balance		= jiffies,
		.balance_interval	= sd_weight,
		.max_newidle_lb_cost	= 0,
		.last_decay_max_lb_cost	= jiffies,
		.child			= child,
#ifdef CONFIG_SCHED_DEBUG
		.name			= tl->name,
#endif
	};

	sd_span = sched_domain_span(sd);
	cpumask_and(sd_span, cpu_map, tl->mask(cpu));
	sd_id = cpumask_first(sd_span);

	sd->flags |= asym_cpu_capacity_classify(sd_span, cpu_map);

	WARN_ONCE((sd->flags & (SD_SHARE_CPUCAPACITY | SD_ASYM_CPUCAPACITY)) ==
		  (SD_SHARE_CPUCAPACITY | SD_ASYM_CPUCAPACITY),
		  "CPU capacity asymmetry not supported on SMT\n");

	/*
	 * Convert topological properties into behaviour.
	 */
	/* Don't attempt to spread across CPUs of different capacities. */
	if ((sd->flags & SD_ASYM_CPUCAPACITY) && sd->child)
		sd->child->flags &= ~SD_PREFER_SIBLING;

	if (sd->flags & SD_SHARE_CPUCAPACITY) {
		sd->imbalance_pct = 110;

	} else if (sd->flags & SD_SHARE_LLC) {
		sd->imbalance_pct = 117;
		sd->cache_nice_tries = 1;

#ifdef CONFIG_NUMA
	} else if (sd->flags & SD_NUMA) {
		sd->cache_nice_tries = 2;

		sd->flags &= ~SD_PREFER_SIBLING;
		sd->flags |= SD_SERIALIZE;
		if (sched_domains_numa_distance[tl->numa_level] > node_reclaim_distance) {
			sd->flags &= ~(SD_BALANCE_EXEC |
				       SD_BALANCE_FORK |
				       SD_WAKE_AFFINE);
		}

#endif
	} else {
		sd->cache_nice_tries = 1;
	}

	/*
	 * For all levels sharing cache; connect a sched_domain_shared
	 * instance.
	 */
	if (sd->flags & SD_SHARE_LLC) {
		sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
		atomic_inc(&sd->shared->ref);
		atomic_set(&sd->shared->nr_busy_cpus, sd_weight);
	}

	sd->private = sdd;

	return sd;
}

/*
 * Topology list, bottom-up.
 */
static struct sched_domain_topology_level default_topology[] = {
#ifdef CONFIG_SCHED_SMT
	{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
#endif

#ifdef CONFIG_SCHED_CLUSTER
	{ cpu_clustergroup_mask, cpu_cluster_flags, SD_INIT_NAME(CLS) },
#endif

#ifdef CONFIG_SCHED_MC
	{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
#endif
	{ cpu_cpu_mask, SD_INIT_NAME(PKG) },
	{ NULL, },
};

static struct sched_domain_topology_level *sched_domain_topology =
	default_topology;
static struct sched_domain_topology_level *sched_domain_topology_saved;

#define for_each_sd_topology(tl)			\
	for (tl = sched_domain_topology; tl->mask; tl++)

void __init set_sched_topology(struct sched_domain_topology_level *tl)
{
	if (WARN_ON_ONCE(sched_smp_initialized))
		return;

	sched_domain_topology = tl;
	sched_domain_topology_saved = NULL;
}

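/*
 * Illustrative sketch (not part of this file): an architecture can install its
 * own bottom-up table before SMP bring-up, e.g. with a hypothetical
 * arch_mc_mask() helper:
 *
 *	static struct sched_domain_topology_level my_arch_topology[] = {
 *		{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
 *		{ arch_mc_mask, cpu_core_flags, SD_INIT_NAME(MC) },
 *		{ cpu_cpu_mask, SD_INIT_NAME(PKG) },
 *		{ NULL, },
 *	};
 *
 *	set_sched_topology(my_arch_topology);
 *
 * set_sched_topology() must run before sched_smp_initialized is set, as the
 * WARN_ON_ONCE() above enforces.
 */
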
1726#ifdef CONFIG_NUMA
1727
1728static const struct cpumask *sd_numa_mask(int cpu)
1729{
1730 return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
1731}
1732
1733static void sched_numa_warn(const char *str)
1734{
1735 static int done = false;
1736 int i,j;
1737
1738 if (done)
1739 return;
1740
1741 done = true;
1742
1743 printk(KERN_WARNING "ERROR: %s\n\n", str);
1744
1745 for (i = 0; i < nr_node_ids; i++) {
1746 printk(KERN_WARNING " ");
0fb3978b
HY
1747 for (j = 0; j < nr_node_ids; j++) {
1748 if (!node_state(i, N_CPU) || !node_state(j, N_CPU))
1749 printk(KERN_CONT "(%02d) ", node_distance(i,j));
1750 else
1751 printk(KERN_CONT " %02d ", node_distance(i,j));
1752 }
f2cb1360
IM
1753 printk(KERN_CONT "\n");
1754 }
1755 printk(KERN_WARNING "\n");
1756}
1757
1758bool find_numa_distance(int distance)
1759{
0fb3978b
HY
1760 bool found = false;
1761 int i, *distances;
f2cb1360
IM
1762
1763 if (distance == node_distance(0, 0))
1764 return true;
1765
0fb3978b
HY
1766 rcu_read_lock();
1767 distances = rcu_dereference(sched_domains_numa_distance);
1768 if (!distances)
1769 goto unlock;
f2cb1360 1770 for (i = 0; i < sched_domains_numa_levels; i++) {
0fb3978b
HY
1771 if (distances[i] == distance) {
1772 found = true;
1773 break;
1774 }
f2cb1360 1775 }
0fb3978b
HY
1776unlock:
1777 rcu_read_unlock();
f2cb1360 1778
0fb3978b 1779 return found;
f2cb1360
IM
1780}
1781
0fb3978b
HY
1782#define for_each_cpu_node_but(n, nbut) \
1783 for_each_node_state(n, N_CPU) \
1784 if (n == nbut) \
1785 continue; \
1786 else
1787
f2cb1360
IM
1788/*
1789 * A system can have three types of NUMA topology:
1790 * NUMA_DIRECT: all nodes are directly connected, or not a NUMA system
1791 * NUMA_GLUELESS_MESH: some nodes reachable through intermediary nodes
1792 * NUMA_BACKPLANE: nodes can reach other nodes through a backplane
1793 *
1794 * The difference between a glueless mesh topology and a backplane
1795 * topology lies in whether communication between not directly
1796 * connected nodes goes through intermediary nodes (where programs
1797 * could run), or through backplane controllers. This affects
1798 * placement of programs.
1799 *
1800 * The type of topology can be discerned with the following tests:
1801 * - If the maximum distance between any nodes is 1 hop, the system
1802 * is directly connected.
1803 * - If for two nodes A and B, located N > 1 hops away from each other,
1804 * there is an intermediary node C, which is < N hops away from both
1805 * nodes A and B, the system is a glueless mesh.
1806 */
0fb3978b 1807static void init_numa_topology_type(int offline_node)
f2cb1360
IM
1808{
1809 int a, b, c, n;
1810
1811 n = sched_max_numa_distance;
1812
e5e96faf 1813 if (sched_domains_numa_levels <= 2) {
f2cb1360
IM
1814 sched_numa_topology_type = NUMA_DIRECT;
1815 return;
1816 }
1817
0fb3978b
HY
1818 for_each_cpu_node_but(a, offline_node) {
1819 for_each_cpu_node_but(b, offline_node) {
f2cb1360
IM
1820 /* Find two nodes furthest removed from each other. */
1821 if (node_distance(a, b) < n)
1822 continue;
1823
1824 /* Is there an intermediary node between a and b? */
0fb3978b 1825 for_each_cpu_node_but(c, offline_node) {
f2cb1360
IM
1826 if (node_distance(a, c) < n &&
1827 node_distance(b, c) < n) {
1828 sched_numa_topology_type =
1829 NUMA_GLUELESS_MESH;
1830 return;
1831 }
1832 }
1833
1834 sched_numa_topology_type = NUMA_BACKPLANE;
1835 return;
1836 }
1837 }
0fb3978b
HY
1838
1839 pr_err("Failed to find a NUMA topology type, defaulting to DIRECT\n");
1840 sched_numa_topology_type = NUMA_DIRECT;
f2cb1360
IM
1841}
1842
620a6dc4
VS
1843
1844#define NR_DISTANCE_VALUES (1 << DISTANCE_BITS)
1845
0fb3978b 1846void sched_init_numa(int offline_node)
f2cb1360 1847{
f2cb1360 1848 struct sched_domain_topology_level *tl;
620a6dc4
VS
1849 unsigned long *distance_map;
1850 int nr_levels = 0;
1851 int i, j;
0fb3978b
HY
1852 int *distances;
1853 struct cpumask ***masks;
051f3ca0 1854
1855 /*
 1856 * O(nr_nodes^2) pass over the node_distance() table -- each distance is
 1857 * recorded in a bitmap so that only the unique values remain.
f2cb1360 1858 */
1859 distance_map = bitmap_alloc(NR_DISTANCE_VALUES, GFP_KERNEL);
1860 if (!distance_map)
1861 return;
1862
1863 bitmap_zero(distance_map, NR_DISTANCE_VALUES);
1864 for_each_cpu_node_but(i, offline_node) {
1865 for_each_cpu_node_but(j, offline_node) {
620a6dc4 1866 int distance = node_distance(i, j);
f2cb1360 1867
1868 if (distance < LOCAL_DISTANCE || distance >= NR_DISTANCE_VALUES) {
1869 sched_numa_warn("Invalid distance value range");
0fb3978b 1870 bitmap_free(distance_map);
620a6dc4 1871 return;
f2cb1360 1872 }
1873
1874 bitmap_set(distance_map, distance, 1);
f2cb1360 1875 }
1876 }
1877 /*
1878 * We can now figure out how many unique distance values there are and
1879 * allocate memory accordingly.
1880 */
1881 nr_levels = bitmap_weight(distance_map, NR_DISTANCE_VALUES);
f2cb1360 1882
1883 distances = kcalloc(nr_levels, sizeof(int), GFP_KERNEL);
1884 if (!distances) {
1885 bitmap_free(distance_map);
1886 return;
1887 }
1888
1889 for (i = 0, j = 0; i < nr_levels; i++, j++) {
1890 j = find_next_bit(distance_map, NR_DISTANCE_VALUES, j);
0fb3978b 1891 distances[i] = j;
620a6dc4 1892 }
0fb3978b 1893 rcu_assign_pointer(sched_domains_numa_distance, distances);
1894
1895 bitmap_free(distance_map);
1896
f2cb1360 1897 /*
620a6dc4 1898 * 'nr_levels' contains the number of unique distances
1899 *
1900 * The sched_domains_numa_distance[] array includes the actual distance
1901 * numbers.
1902 */
1903
1904 /*
1905 * Here, we should temporarily reset sched_domains_numa_levels to 0.
1906 * If it fails to allocate memory for array sched_domains_numa_masks[][],
620a6dc4 1907 * the array will contain fewer than 'nr_levels' members. This could be
1908 * dangerous when we use it to iterate array sched_domains_numa_masks[][]
1909 * in other functions.
1910 *
620a6dc4 1911 * We reset it to 'nr_levels' at the end of this function.
1912 */
1913 sched_domains_numa_levels = 0;
1914
1915 masks = kzalloc(sizeof(void *) * nr_levels, GFP_KERNEL);
1916 if (!masks)
1917 return;
1918
1919 /*
1920 * Now for each level, construct a mask per node which contains all
1921 * CPUs of nodes that are that many hops away from us.
1922 */
620a6dc4 1923 for (i = 0; i < nr_levels; i++) {
1924 masks[i] = kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
1925 if (!masks[i])
1926 return;
1927
0fb3978b 1928 for_each_cpu_node_but(j, offline_node) {
f2cb1360 1929 struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
1930 int k;
1931
1932 if (!mask)
1933 return;
1934
0fb3978b 1935 masks[i][j] = mask;
0083242c 1936
0fb3978b 1937 for_each_cpu_node_but(k, offline_node) {
1938 if (sched_debug() && (node_distance(j, k) != node_distance(k, j)))
1939 sched_numa_warn("Node-distance not symmetric");
1940
1941 if (node_distance(j, k) > sched_domains_numa_distance[i])
1942 continue;
1943
1944 cpumask_or(mask, mask, cpumask_of_node(k));
1945 }
1946 }
1947 }
0fb3978b 1948 rcu_assign_pointer(sched_domains_numa_masks, masks);
1949
1950 /* Compute default topology size */
1951 for (i = 0; sched_domain_topology[i].mask; i++);
1952
71e5f664 1953 tl = kzalloc((i + nr_levels + 1) *
1954 sizeof(struct sched_domain_topology_level), GFP_KERNEL);
1955 if (!tl)
1956 return;
1957
1958 /*
1959 * Copy the default topology bits..
1960 */
1961 for (i = 0; sched_domain_topology[i].mask; i++)
1962 tl[i] = sched_domain_topology[i];
1963
1964 /*
1965 * Add the NUMA identity distance, aka single NODE.
1966 */
1967 tl[i++] = (struct sched_domain_topology_level){
1968 .mask = sd_numa_mask,
1969 .numa_level = 0,
1970 SD_INIT_NAME(NODE)
1971 };
1972
1973 /*
1974 * .. and append 'j' levels of NUMA goodness.
1975 */
620a6dc4 1976 for (j = 1; j < nr_levels; i++, j++) {
1977 tl[i] = (struct sched_domain_topology_level){
1978 .mask = sd_numa_mask,
1979 .sd_flags = cpu_numa_flags,
1980 .flags = SDTL_OVERLAP,
1981 .numa_level = j,
1982 SD_INIT_NAME(NUMA)
1983 };
1984 }
1985
0fb3978b 1986 sched_domain_topology_saved = sched_domain_topology;
1987 sched_domain_topology = tl;
1988
620a6dc4 1989 sched_domains_numa_levels = nr_levels;
0fb3978b 1990 WRITE_ONCE(sched_max_numa_distance, sched_domains_numa_distance[nr_levels - 1]);
0083242c 1991
0fb3978b 1992 init_numa_topology_type(offline_node);
1993}
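/*
 * Illustrative, standalone sketch (not from this file): the distance
 * deduplication step of sched_init_numa(), done over a made-up 3-node
 * table with a plain bool array standing in for the bitmap. Walking the
 * array in index order yields the unique distances already sorted, which
 * is why the resulting levels run from nearest to farthest.
 */
#include <stdio.h>
#include <stdbool.h>

#define EXAMPLE_NODES		3
#define EXAMPLE_MAX_DISTANCE	256	/* stand-in for NR_DISTANCE_VALUES */

static const int example_dist[EXAMPLE_NODES][EXAMPLE_NODES] = {
	{ 10, 20, 40 },
	{ 20, 10, 40 },
	{ 40, 40, 10 },
};

int main(void)
{
	bool seen[EXAMPLE_MAX_DISTANCE] = { false };
	int i, j, nr_levels = 0;

	for (i = 0; i < EXAMPLE_NODES; i++)
		for (j = 0; j < EXAMPLE_NODES; j++)
			seen[example_dist[i][j]] = true;

	for (i = 0; i < EXAMPLE_MAX_DISTANCE; i++) {
		if (seen[i])
			printf("level %d: distance %d\n", nr_levels++, i);
	}
	printf("nr_levels = %d\n", nr_levels);	/* 3 levels: 10, 20, 40 */
	return 0;
}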
1994
0083242c 1995
1996static void sched_reset_numa(void)
1997{
1998 int nr_levels, *distances;
1999 struct cpumask ***masks;
0083242c 2000
2001 nr_levels = sched_domains_numa_levels;
2002 sched_domains_numa_levels = 0;
2003 sched_max_numa_distance = 0;
2004 sched_numa_topology_type = NUMA_DIRECT;
2005 distances = sched_domains_numa_distance;
2006 rcu_assign_pointer(sched_domains_numa_distance, NULL);
2007 masks = sched_domains_numa_masks;
2008 rcu_assign_pointer(sched_domains_numa_masks, NULL);
2009 if (distances || masks) {
2010 int i, j;
2011
2012 synchronize_rcu();
2013 kfree(distances);
2014 for (i = 0; i < nr_levels && masks; i++) {
2015 if (!masks[i])
0083242c 2016 continue;
2017 for_each_node(j)
2018 kfree(masks[i][j]);
2019 kfree(masks[i]);
0083242c 2020 }
0fb3978b 2021 kfree(masks);
0083242c 2022 }
2023 if (sched_domain_topology_saved) {
2024 kfree(sched_domain_topology);
2025 sched_domain_topology = sched_domain_topology_saved;
2026 sched_domain_topology_saved = NULL;
2027 }
2028}
2029
2030/*
2031 * Call with hotplug lock held
2032 */
2033void sched_update_numa(int cpu, bool online)
2034{
2035 int node;
0083242c 2036
0fb3978b 2037 node = cpu_to_node(cpu);
0083242c 2038 /*
2039 * Scheduler NUMA topology is updated when the first CPU of a
2040 * node is onlined or the last CPU of a node is offlined.
0083242c 2041 */
2042 if (cpumask_weight(cpumask_of_node(node)) != 1)
2043 return;
2044
2045 sched_reset_numa();
2046 sched_init_numa(online ? NUMA_NO_NODE : node);
2047}
2048
2049void sched_domains_numa_masks_set(unsigned int cpu)
2050{
2051 int node = cpu_to_node(cpu);
2052 int i, j;
2053
2054 for (i = 0; i < sched_domains_numa_levels; i++) {
2055 for (j = 0; j < nr_node_ids; j++) {
0fb3978b 2056 if (!node_state(j, N_CPU))
2057 continue;
2058
2059 /* Set ourselves in the remote node's masks */
2060 if (node_distance(j, node) <= sched_domains_numa_distance[i])
2061 cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
2062 }
2063 }
2064}
2065
2066void sched_domains_numa_masks_clear(unsigned int cpu)
2067{
2068 int i, j;
2069
2070 for (i = 0; i < sched_domains_numa_levels; i++) {
2071 for (j = 0; j < nr_node_ids; j++) {
2072 if (sched_domains_numa_masks[i][j])
2073 cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
2074 }
2075 }
2076}
2077
2078/*
2079 * sched_numa_find_closest() - given the NUMA topology, find the cpu
 2080 * closest to @cpu from @cpus.
 2081 * @cpus: cpumask to find a CPU from
 2082 * @cpu: CPU to be close to
 2083 *
 2084 * Return: CPU, or nr_cpu_ids when nothing found.
2085 */
2086int sched_numa_find_closest(const struct cpumask *cpus, int cpu)
2087{
2088 int i, j = cpu_to_node(cpu), found = nr_cpu_ids;
2089 struct cpumask ***masks;
e0e8d491 2090
2091 rcu_read_lock();
2092 masks = rcu_dereference(sched_domains_numa_masks);
2093 if (!masks)
2094 goto unlock;
e0e8d491 2095 for (i = 0; i < sched_domains_numa_levels; i++) {
2096 if (!masks[i][j])
2097 break;
2098 cpu = cpumask_any_and(cpus, masks[i][j]);
2099 if (cpu < nr_cpu_ids) {
2100 found = cpu;
2101 break;
2102 }
e0e8d491 2103 }
2104unlock:
2105 rcu_read_unlock();
2106
2107 return found;
2108}
2109
2110struct __cmp_key {
2111 const struct cpumask *cpus;
2112 struct cpumask ***masks;
2113 int node;
2114 int cpu;
2115 int w;
2116};
2117
2118static int hop_cmp(const void *a, const void *b)
2119{
01bb11ad 2120 struct cpumask **prev_hop, **cur_hop = *(struct cpumask ***)b;
2121 struct __cmp_key *k = (struct __cmp_key *)a;
2122
2123 if (cpumask_weight_and(k->cpus, cur_hop[k->node]) <= k->cpu)
2124 return 1;
2125
2126 if (b == k->masks) {
2127 k->w = 0;
2128 return 0;
2129 }
2130
2131 prev_hop = *((struct cpumask ***)b - 1);
2132 k->w = cpumask_weight_and(k->cpus, prev_hop[k->node]);
2133 if (k->w <= k->cpu)
2134 return 0;
2135
2136 return -1;
2137}
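/*
 * Illustrative, standalone sketch (not from this file): the selection that
 * hop_cmp() implements for bsearch() above, restated over a plain array of
 * cumulative CPU counts per hop (the NUMA masks are cumulative in the same
 * way). The sketch scans linearly instead of binary-searching, and the
 * counts are made up for the example.
 */
#include <stdio.h>

int main(void)
{
	/* CPUs reachable within 0, 1 and 2 hops of a node (cumulative). */
	const int cumulative[] = { 8, 24, 64 };
	const int nr_hops = 3;
	const int n = 30;	/* looking for the 30th closest CPU (0-based) */
	int hop, prev = 0;

	for (hop = 0; hop < nr_hops; hop++) {
		if (n < cumulative[hop])
			break;
		prev = cumulative[hop];
	}

	/* Prints "hop 2, index 6": CPUs 0-7 are hop 0, 8-23 hop 1, 24-63 hop 2. */
	printf("hop %d, index %d\n", hop, n - prev);
	return 0;
}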
2138
2139/**
2140 * sched_numa_find_nth_cpu() - given the NUMA topology, find the Nth closest CPU
2141 * from @cpus to @cpu, taking into account distance
2142 * from a given @node.
2143 * @cpus: cpumask to find a cpu from
 2144 * @cpu: the Nth CPU to find (an index, not a CPU id)
2145 * @node: NUMA node to order CPUs by distance
2146 *
2147 * Return: cpu, or nr_cpu_ids when nothing found.
2148 */
2149int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node)
2150{
617f2c38 2151 struct __cmp_key k = { .cpus = cpus, .cpu = cpu };
2152 struct cpumask ***hop_masks;
2153 int hop, ret = nr_cpu_ids;
2154
2155 if (node == NUMA_NO_NODE)
2156 return cpumask_nth_and(cpu, cpus, cpu_online_mask);
2157
2158 rcu_read_lock();
2159
2160 /* CPU-less node entries are uninitialized in sched_domains_numa_masks */
2161 node = numa_nearest_node(node, N_CPU);
2162 k.node = node;
2163
2164 k.masks = rcu_dereference(sched_domains_numa_masks);
2165 if (!k.masks)
2166 goto unlock;
2167
2168 hop_masks = bsearch(&k, k.masks, sched_domains_numa_levels, sizeof(k.masks[0]), hop_cmp);
2169 hop = hop_masks - k.masks;
2170
2171 ret = hop ?
2172 cpumask_nth_and_andnot(cpu - k.w, cpus, k.masks[hop][node], k.masks[hop-1][node]) :
2173 cpumask_nth_and(cpu, cpus, k.masks[0][node]);
2174unlock:
2175 rcu_read_unlock();
2176 return ret;
2177}
2178EXPORT_SYMBOL_GPL(sched_numa_find_nth_cpu);
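/*
 * Hedged usage sketch (an assumption, not part of this file): a caller that
 * wants the i-th online CPU ordered by NUMA distance from @node, for example
 * to spread interrupt vectors, could wrap the helper like this. It assumes
 * the declaration in <linux/topology.h>; error handling is kept minimal.
 */
static unsigned int example_pick_nth_close_cpu(unsigned int i, int node)
{
	int cpu;

	/* Wrap the index so any i maps onto some online CPU. */
	i %= num_online_cpus();

	cpu = sched_numa_find_nth_cpu(cpu_online_mask, i, node);
	if (cpu < 0 || cpu >= (int)nr_cpu_ids)
		cpu = cpumask_first(cpu_online_mask);

	return cpu;
}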
2179
2180/**
2181 * sched_numa_hop_mask() - Get the cpumask of CPUs at most @hops hops away from
2182 * @node
2183 * @node: The node to count hops from.
2184 * @hops: Include CPUs up to that many hops away. 0 means local node.
2185 *
2186 * Return: On success, a pointer to a cpumask of CPUs at most @hops away from
2187 * @node, an error value otherwise.
2188 *
 2189 * Requires rcu_read_lock() to be held. The returned cpumask is only valid within that
2190 * read-side section, copy it if required beyond that.
2191 *
2192 * Note that not all hops are equal in distance; see sched_init_numa() for how
2193 * distances and masks are handled.
2194 * Also note that this is a reflection of sched_domains_numa_masks, which may change
2195 * during the lifetime of the system (offline nodes are taken out of the masks).
2196 */
2197const struct cpumask *sched_numa_hop_mask(unsigned int node, unsigned int hops)
2198{
2199 struct cpumask ***masks;
2200
2201 if (node >= nr_node_ids || hops >= sched_domains_numa_levels)
2202 return ERR_PTR(-EINVAL);
2203
2204 masks = rcu_dereference(sched_domains_numa_masks);
2205 if (!masks)
2206 return ERR_PTR(-EBUSY);
2207
2208 return masks[hops][node];
2209}
2210EXPORT_SYMBOL_GPL(sched_numa_hop_mask);
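/*
 * Hedged usage sketch (an assumption, not part of this file): walking CPUs
 * in increasing hop order from @node. The whole walk must stay inside one
 * RCU read-side section, as documented above, and each successive mask is
 * cumulative (it also includes every closer hop).
 */
static void example_walk_hops(unsigned int node)
{
	const struct cpumask *mask;
	unsigned int hops, cpu;

	rcu_read_lock();
	for (hops = 0; ; hops++) {
		mask = sched_numa_hop_mask(node, hops);
		if (IS_ERR(mask))
			break;

		for_each_cpu(cpu, mask)
			pr_info("node %u: CPU %u within %u hops\n", node, cpu, hops);
	}
	rcu_read_unlock();
}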
2211
2212#endif /* CONFIG_NUMA */
2213
2214static int __sdt_alloc(const struct cpumask *cpu_map)
2215{
2216 struct sched_domain_topology_level *tl;
2217 int j;
2218
2219 for_each_sd_topology(tl) {
2220 struct sd_data *sdd = &tl->data;
2221
2222 sdd->sd = alloc_percpu(struct sched_domain *);
2223 if (!sdd->sd)
2224 return -ENOMEM;
2225
2226 sdd->sds = alloc_percpu(struct sched_domain_shared *);
2227 if (!sdd->sds)
2228 return -ENOMEM;
2229
2230 sdd->sg = alloc_percpu(struct sched_group *);
2231 if (!sdd->sg)
2232 return -ENOMEM;
2233
2234 sdd->sgc = alloc_percpu(struct sched_group_capacity *);
2235 if (!sdd->sgc)
2236 return -ENOMEM;
2237
2238 for_each_cpu(j, cpu_map) {
2239 struct sched_domain *sd;
2240 struct sched_domain_shared *sds;
2241 struct sched_group *sg;
2242 struct sched_group_capacity *sgc;
2243
2244 sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
2245 GFP_KERNEL, cpu_to_node(j));
2246 if (!sd)
2247 return -ENOMEM;
2248
2249 *per_cpu_ptr(sdd->sd, j) = sd;
2250
2251 sds = kzalloc_node(sizeof(struct sched_domain_shared),
2252 GFP_KERNEL, cpu_to_node(j));
2253 if (!sds)
2254 return -ENOMEM;
2255
2256 *per_cpu_ptr(sdd->sds, j) = sds;
2257
2258 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
2259 GFP_KERNEL, cpu_to_node(j));
2260 if (!sg)
2261 return -ENOMEM;
2262
2263 sg->next = sg;
2264
2265 *per_cpu_ptr(sdd->sg, j) = sg;
2266
2267 sgc = kzalloc_node(sizeof(struct sched_group_capacity) + cpumask_size(),
2268 GFP_KERNEL, cpu_to_node(j));
2269 if (!sgc)
2270 return -ENOMEM;
2271
2272#ifdef CONFIG_SCHED_DEBUG
2273 sgc->id = j;
2274#endif
2275
2276 *per_cpu_ptr(sdd->sgc, j) = sgc;
2277 }
2278 }
2279
2280 return 0;
2281}
2282
2283static void __sdt_free(const struct cpumask *cpu_map)
2284{
2285 struct sched_domain_topology_level *tl;
2286 int j;
2287
2288 for_each_sd_topology(tl) {
2289 struct sd_data *sdd = &tl->data;
2290
2291 for_each_cpu(j, cpu_map) {
2292 struct sched_domain *sd;
2293
2294 if (sdd->sd) {
2295 sd = *per_cpu_ptr(sdd->sd, j);
2296 if (sd && (sd->flags & SD_OVERLAP))
2297 free_sched_groups(sd->groups, 0);
2298 kfree(*per_cpu_ptr(sdd->sd, j));
2299 }
2300
2301 if (sdd->sds)
2302 kfree(*per_cpu_ptr(sdd->sds, j));
2303 if (sdd->sg)
2304 kfree(*per_cpu_ptr(sdd->sg, j));
2305 if (sdd->sgc)
2306 kfree(*per_cpu_ptr(sdd->sgc, j));
2307 }
2308 free_percpu(sdd->sd);
2309 sdd->sd = NULL;
2310 free_percpu(sdd->sds);
2311 sdd->sds = NULL;
2312 free_percpu(sdd->sg);
2313 sdd->sg = NULL;
2314 free_percpu(sdd->sgc);
2315 sdd->sgc = NULL;
2316 }
2317}
2318
181a80d1 2319static struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
f2cb1360 2320 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
c744dc4a 2321 struct sched_domain *child, int cpu)
f2cb1360 2322{
c744dc4a 2323 struct sched_domain *sd = sd_init(tl, cpu_map, child, cpu);
2324
2325 if (child) {
2326 sd->level = child->level + 1;
2327 sched_domain_level_max = max(sched_domain_level_max, sd->level);
2328 child->parent = sd;
2329
2330 if (!cpumask_subset(sched_domain_span(child),
2331 sched_domain_span(sd))) {
2332 pr_err("BUG: arch topology borken\n");
2333#ifdef CONFIG_SCHED_DEBUG
2334 pr_err(" the %s domain not a subset of the %s domain\n",
2335 child->name, sd->name);
2336#endif
97fb7a0a 2337 /* Fixup, ensure @sd has at least @child CPUs. */
2338 cpumask_or(sched_domain_span(sd),
2339 sched_domain_span(sd),
2340 sched_domain_span(child));
2341 }
2342
2343 }
2344 set_domain_attribute(sd, attr);
2345
2346 return sd;
2347}
2348
2349/*
2350 * Ensure topology masks are sane, i.e. there are no conflicts (overlaps) for
2351 * any two given CPUs at this (non-NUMA) topology level.
2352 */
2353static bool topology_span_sane(struct sched_domain_topology_level *tl,
2354 const struct cpumask *cpu_map, int cpu)
2355{
2356 int i;
2357
2358 /* NUMA levels are allowed to overlap */
2359 if (tl->flags & SDTL_OVERLAP)
2360 return true;
2361
2362 /*
2363 * Non-NUMA levels cannot partially overlap - they must be either
2364 * completely equal or completely disjoint. Otherwise we can end up
2365 * breaking the sched_group lists - i.e. a later get_group() pass
2366 * breaks the linking done for an earlier span.
2367 */
2368 for_each_cpu(i, cpu_map) {
2369 if (i == cpu)
2370 continue;
2371 /*
2372 * We should 'and' all those masks with 'cpu_map' to exactly
2373 * match the topology we're about to build, but that can only
2374 * remove CPUs, which only lessens our ability to detect
2375 * overlaps
2376 */
2377 if (!cpumask_equal(tl->mask(cpu), tl->mask(i)) &&
2378 cpumask_intersects(tl->mask(cpu), tl->mask(i)))
2379 return false;
2380 }
2381
2382 return true;
2383}
2384
2385/*
2386 * Build sched domains for a given set of CPUs and attach the sched domains
2387 * to the individual CPUs
2388 */
2389static int
2390build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr)
2391{
cd1cb335 2392 enum s_alloc alloc_state = sa_none;
2393 struct sched_domain *sd;
2394 struct s_data d;
2395 struct rq *rq = NULL;
2396 int i, ret = -ENOMEM;
df054e84 2397 bool has_asym = false;
8881e163 2398 bool has_cluster = false;
f2cb1360 2399
2400 if (WARN_ON(cpumask_empty(cpu_map)))
2401 goto error;
2402
2403 alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
2404 if (alloc_state != sa_rootdomain)
2405 goto error;
2406
2407 /* Set up domains for CPUs specified by the cpu_map: */
2408 for_each_cpu(i, cpu_map) {
2409 struct sched_domain_topology_level *tl;
2410
2411 sd = NULL;
2412 for_each_sd_topology(tl) {
05484e09 2413
2414 if (WARN_ON(!topology_span_sane(tl, cpu_map, i)))
2415 goto error;
2416
2417 sd = build_sched_domain(tl, cpu_map, attr, sd, i);
2418
2419 has_asym |= sd->flags & SD_ASYM_CPUCAPACITY;
05484e09 2420
2421 if (tl == sched_domain_topology)
2422 *per_cpu_ptr(d.sd, i) = sd;
af85596c 2423 if (tl->flags & SDTL_OVERLAP)
2424 sd->flags |= SD_OVERLAP;
2425 if (cpumask_equal(cpu_map, sched_domain_span(sd)))
2426 break;
2427 }
2428 }
2429
2430 /* Build the groups for the domains */
2431 for_each_cpu(i, cpu_map) {
2432 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
2433 sd->span_weight = cpumask_weight(sched_domain_span(sd));
2434 if (sd->flags & SD_OVERLAP) {
2435 if (build_overlap_sched_groups(sd, i))
2436 goto error;
2437 } else {
2438 if (build_sched_groups(sd, i))
2439 goto error;
2440 }
2441 }
2442 }
2443
2444 /*
2445 * Calculate an allowed NUMA imbalance such that LLCs do not get
2446 * imbalanced.
2447 */
2448 for_each_cpu(i, cpu_map) {
2449 unsigned int imb = 0;
2450 unsigned int imb_span = 1;
2451
2452 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
2453 struct sched_domain *child = sd->child;
2454
2455 if (!(sd->flags & SD_SHARE_LLC) && child &&
2456 (child->flags & SD_SHARE_LLC)) {
7f434dff 2457 struct sched_domain __rcu *top_p;
2458 unsigned int nr_llcs;
2459
2460 /*
2461 * For a single LLC per node, allow an
 2462 * imbalance up to 12.5% of the node. This is an
 2463 * arbitrary cutoff based on two factors -- SMT and
2464 * memory channels. For SMT-2, the intent is to
2465 * avoid premature sharing of HT resources but
2466 * SMT-4 or SMT-8 *may* benefit from a different
2467 * cutoff. For memory channels, this is a very
2468 * rough estimate of how many channels may be
2469 * active and is based on recent CPUs with
2470 * many cores.
2471 *
2472 * For multiple LLCs, allow an imbalance
2473 * until multiple tasks would share an LLC
2474 * on one node while LLCs on another node
2475 * remain idle. This assumes that there are
2476 * enough logical CPUs per LLC to avoid SMT
2477 * factors and that there is a correlation
2478 * between LLCs and memory channels.
2479 */
2480 nr_llcs = sd->span_weight / child->span_weight;
2481 if (nr_llcs == 1)
026b98a9 2482 imb = sd->span_weight >> 3;
2483 else
2484 imb = nr_llcs;
026b98a9 2485 imb = max(1U, imb);
2486 sd->imb_numa_nr = imb;
2487
2488 /* Set span based on the first NUMA domain. */
7f434dff 2489 top_p = sd->parent;
e496132e 2490 while (top_p && !(top_p->flags & SD_NUMA)) {
7f434dff 2491 top_p = top_p->parent;
2492 }
2493 imb_span = top_p ? top_p->span_weight : sd->span_weight;
2494 } else {
2495 int factor = max(1U, (sd->span_weight / imb_span));
2496
2497 sd->imb_numa_nr = imb * factor;
2498 }
2499 }
2500 }
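	/*
	 * Worked example for the calculation above (illustrative numbers,
	 * not from this file): with 64 CPUs per node and one LLC spanning
	 * the node, the first non-LLC domain sees nr_llcs = 64/64 = 1, so
	 * imb = 64 >> 3 = 8; imb_span becomes the span of the first NUMA
	 * parent (128 on a two-socket machine), and wider NUMA levels scale
	 * that 8 by span_weight / 128. With four LLCs of 16 CPUs per node,
	 * nr_llcs = 64/16 = 4 and imb = 4 instead.
	 */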
2501
2502 /* Calculate CPU capacity for physical packages and nodes */
2503 for (i = nr_cpumask_bits-1; i >= 0; i--) {
2504 if (!cpumask_test_cpu(i, cpu_map))
2505 continue;
2506
2507 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
2508 claim_allocations(i, sd);
2509 init_sched_groups_capacity(i, sd);
2510 }
2511 }
2512
2513 /* Attach the domains */
2514 rcu_read_lock();
2515 for_each_cpu(i, cpu_map) {
2516 rq = cpu_rq(i);
2517 sd = *per_cpu_ptr(d.sd, i);
2518
f2cb1360 2519 cpu_attach_domain(sd, d.rd, i);
2520
2521 if (lowest_flag_domain(i, SD_CLUSTER))
2522 has_cluster = true;
2523 }
2524 rcu_read_unlock();
2525
df054e84 2526 if (has_asym)
e284df70 2527 static_branch_inc_cpuslocked(&sched_asym_cpucapacity);
df054e84 2528
2529 if (has_cluster)
2530 static_branch_inc_cpuslocked(&sched_cluster_active);
2531
2532 if (rq && sched_debug_verbose)
2533 pr_info("root domain span: %*pbl\n", cpumask_pr_args(cpu_map));
2534
2535 ret = 0;
2536error:
2537 __free_domain_allocs(&d, alloc_state, cpu_map);
97fb7a0a 2538
2539 return ret;
2540}
2541
2542/* Current sched domains: */
2543static cpumask_var_t *doms_cur;
2544
2545/* Number of sched domains in 'doms_cur': */
2546static int ndoms_cur;
2547
3b03706f 2548/* Attributes of custom domains in 'doms_cur' */
2549static struct sched_domain_attr *dattr_cur;
2550
2551/*
2552 * Special case: If a kmalloc() of a doms_cur partition (array of
2553 * cpumask) fails, then fallback to a single sched domain,
2554 * as determined by the single cpumask fallback_doms.
2555 */
8d5dc512 2556static cpumask_var_t fallback_doms;
2557
2558/*
2559 * arch_update_cpu_topology lets virtualized architectures update the
2560 * CPU core maps. It is supposed to return 1 if the topology changed
2561 * or 0 if it stayed the same.
2562 */
2563int __weak arch_update_cpu_topology(void)
2564{
2565 return 0;
2566}
2567
2568cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
2569{
2570 int i;
2571 cpumask_var_t *doms;
2572
6da2ec56 2573 doms = kmalloc_array(ndoms, sizeof(*doms), GFP_KERNEL);
2574 if (!doms)
2575 return NULL;
2576 for (i = 0; i < ndoms; i++) {
2577 if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
2578 free_sched_domains(doms, i);
2579 return NULL;
2580 }
2581 }
2582 return doms;
2583}
2584
2585void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
2586{
2587 unsigned int i;
2588 for (i = 0; i < ndoms; i++)
2589 free_cpumask_var(doms[i]);
2590 kfree(doms);
2591}
2592
2593/*
2594 * Set up scheduler domains and groups. For now this just excludes isolated
2595 * CPUs, but could be used to exclude other special cases in the future.
f2cb1360 2596 */
ef90cf22 2597int __init sched_init_domains(const struct cpumask *cpu_map)
2598{
2599 int err;
2600
8d5dc512 2601 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_KERNEL);
1676330e 2602 zalloc_cpumask_var(&sched_domains_tmpmask2, GFP_KERNEL);
2603 zalloc_cpumask_var(&fallback_doms, GFP_KERNEL);
2604
f2cb1360 2605 arch_update_cpu_topology();
c744dc4a 2606 asym_cpu_capacity_scan();
2607 ndoms_cur = 1;
2608 doms_cur = alloc_sched_domains(ndoms_cur);
2609 if (!doms_cur)
2610 doms_cur = &fallback_doms;
04d4e665 2611 cpumask_and(doms_cur[0], cpu_map, housekeeping_cpumask(HK_TYPE_DOMAIN));
f2cb1360 2612 err = build_sched_domains(doms_cur[0], NULL);
2613
2614 return err;
2615}
2616
2617/*
2618 * Detach sched domains from a group of CPUs specified in cpu_map
2619 * These CPUs will now be attached to the NULL domain
2620 */
2621static void detach_destroy_domains(const struct cpumask *cpu_map)
2622{
e284df70 2623 unsigned int cpu = cpumask_any(cpu_map);
2624 int i;
2625
2626 if (rcu_access_pointer(per_cpu(sd_asym_cpucapacity, cpu)))
2627 static_branch_dec_cpuslocked(&sched_asym_cpucapacity);
2628
2629 if (static_branch_unlikely(&sched_cluster_active))
2630 static_branch_dec_cpuslocked(&sched_cluster_active);
2631
2632 rcu_read_lock();
2633 for_each_cpu(i, cpu_map)
2634 cpu_attach_domain(NULL, &def_root_domain, i);
2635 rcu_read_unlock();
2636}
2637
2638/* handle null as "default" */
2639static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
2640 struct sched_domain_attr *new, int idx_new)
2641{
2642 struct sched_domain_attr tmp;
2643
2644 /* Fast path: */
2645 if (!new && !cur)
2646 return 1;
2647
2648 tmp = SD_ATTR_INIT;
97fb7a0a 2649
2650 return !memcmp(cur ? (cur + idx_cur) : &tmp,
2651 new ? (new + idx_new) : &tmp,
2652 sizeof(struct sched_domain_attr));
2653}
2654
2655/*
2656 * Partition sched domains as specified by the 'ndoms_new'
2657 * cpumasks in the array doms_new[] of cpumasks. This compares
2658 * doms_new[] to the current sched domain partitioning, doms_cur[].
2659 * It destroys each deleted domain and builds each new domain.
2660 *
2661 * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
 2662 * The masks don't intersect (don't overlap). We should set up one
2663 * sched domain for each mask. CPUs not in any of the cpumasks will
2664 * not be load balanced. If the same cpumask appears both in the
2665 * current 'doms_cur' domains and in the new 'doms_new', we can leave
2666 * it as it is.
2667 *
2668 * The passed in 'doms_new' should be allocated using
2669 * alloc_sched_domains. This routine takes ownership of it and will
2670 * free_sched_domains it when done with it. If the caller failed the
2671 * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,
 2672 * and partition_sched_domains() will fall back to the single partition
 2673 * 'fallback_doms'; this also forces the domains to be rebuilt.
2674 *
2675 * If doms_new == NULL it will be replaced with cpu_online_mask.
2676 * ndoms_new == 0 is a special case for destroying existing domains,
2677 * and it will not create the default domain.
2678 *
c22645f4 2679 * Call with hotplug lock and sched_domains_mutex held
f2cb1360 2680 */
2681void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[],
2682 struct sched_domain_attr *dattr_new)
f2cb1360 2683{
1f74de87 2684 bool __maybe_unused has_eas = false;
2685 int i, j, n;
2686 int new_topology;
2687
c22645f4 2688 lockdep_assert_held(&sched_domains_mutex);
f2cb1360 2689
2690 /* Let the architecture update CPU core mappings: */
2691 new_topology = arch_update_cpu_topology();
2692 /* Trigger rebuilding CPU capacity asymmetry data */
2693 if (new_topology)
2694 asym_cpu_capacity_scan();
f2cb1360 2695
2696 if (!doms_new) {
2697 WARN_ON_ONCE(dattr_new);
2698 n = 0;
2699 doms_new = alloc_sched_domains(1);
2700 if (doms_new) {
2701 n = 1;
edb93821 2702 cpumask_and(doms_new[0], cpu_active_mask,
04d4e665 2703 housekeeping_cpumask(HK_TYPE_DOMAIN));
2704 }
2705 } else {
2706 n = ndoms_new;
2707 }
2708
2709 /* Destroy deleted domains: */
2710 for (i = 0; i < ndoms_cur; i++) {
2711 for (j = 0; j < n && !new_topology; j++) {
6aa140fa 2712 if (cpumask_equal(doms_cur[i], doms_new[j]) &&
2713 dattrs_equal(dattr_cur, i, dattr_new, j)) {
2714 struct root_domain *rd;
2715
2716 /*
2717 * This domain won't be destroyed and as such
2718 * its dl_bw->total_bw needs to be cleared. It
2719 * will be recomputed in function
2720 * update_tasks_root_domain().
2721 */
2722 rd = cpu_rq(cpumask_any(doms_cur[i]))->rd;
2723 dl_clear_root_domain(rd);
f2cb1360 2724 goto match1;
f9a25f77 2725 }
2726 }
2727 /* No match - a current sched domain not in new doms_new[] */
2728 detach_destroy_domains(doms_cur[i]);
2729match1:
2730 ;
2731 }
2732
2733 n = ndoms_cur;
09e0dd8e 2734 if (!doms_new) {
2735 n = 0;
2736 doms_new = &fallback_doms;
edb93821 2737 cpumask_and(doms_new[0], cpu_active_mask,
04d4e665 2738 housekeeping_cpumask(HK_TYPE_DOMAIN));
2739 }
2740
2741 /* Build new domains: */
2742 for (i = 0; i < ndoms_new; i++) {
2743 for (j = 0; j < n && !new_topology; j++) {
2744 if (cpumask_equal(doms_new[i], doms_cur[j]) &&
2745 dattrs_equal(dattr_new, i, dattr_cur, j))
2746 goto match2;
2747 }
2748 /* No match - add a new doms_new */
2749 build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
2750match2:
2751 ;
2752 }
2753
531b5c9f 2754#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
2755 /* Build perf. domains: */
2756 for (i = 0; i < ndoms_new; i++) {
531b5c9f 2757 for (j = 0; j < n && !sched_energy_update; j++) {
6aa140fa 2758 if (cpumask_equal(doms_new[i], doms_cur[j]) &&
2759 cpu_rq(cpumask_first(doms_cur[j]))->rd->pd) {
2760 has_eas = true;
6aa140fa 2761 goto match3;
1f74de87 2762 }
2763 }
2764 /* No match - add perf. domains for a new rd */
1f74de87 2765 has_eas |= build_perf_domains(doms_new[i]);
2766match3:
2767 ;
2768 }
1f74de87 2769 sched_energy_set(has_eas);
2770#endif
2771
2772 /* Remember the new sched domains: */
2773 if (doms_cur != &fallback_doms)
2774 free_sched_domains(doms_cur, ndoms_cur);
2775
2776 kfree(dattr_cur);
2777 doms_cur = doms_new;
2778 dattr_cur = dattr_new;
2779 ndoms_cur = ndoms_new;
2780
3b87f136 2781 update_sched_domain_debugfs();
c22645f4 2782}
f2cb1360 2783
2784/*
2785 * Call with hotplug lock held
2786 */
2787void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
2788 struct sched_domain_attr *dattr_new)
2789{
2790 mutex_lock(&sched_domains_mutex);
2791 partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
2792 mutex_unlock(&sched_domains_mutex);
2793}
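/*
 * Hedged usage sketch (an assumption, not part of this file): how a caller
 * such as the cpuset code is expected to hand over a new partitioning, per
 * the comment above partition_sched_domains_locked(). Ownership of the
 * array passes to the scheduler, which frees it when it is replaced. Must
 * be called with the hotplug lock held.
 */
static void example_repartition(const struct cpumask *set_a,
				const struct cpumask *set_b)
{
	cpumask_var_t *doms = alloc_sched_domains(2);

	if (!doms) {
		/* doms_new == NULL, ndoms_new == 1 falls back to one domain. */
		partition_sched_domains(1, NULL, NULL);
		return;
	}

	cpumask_copy(doms[0], set_a);
	cpumask_copy(doms[1], set_b);

	/* No custom attributes; the two masks must not intersect. */
	partition_sched_domains(2, doms, NULL);
}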