cpuset: fix race between hotplug work and later CPU offline
[linux-block.git] / kernel / sched / topology.c
CommitLineData
b2441318 1// SPDX-License-Identifier: GPL-2.0
f2cb1360
IM
2/*
3 * Scheduler topology setup/handling methods
4 */
f2cb1360
IM
5#include "sched.h"
6
7DEFINE_MUTEX(sched_domains_mutex);
8
9/* Protected by sched_domains_mutex: */
ace80310 10static cpumask_var_t sched_domains_tmpmask;
11static cpumask_var_t sched_domains_tmpmask2;
f2cb1360
IM
12
13#ifdef CONFIG_SCHED_DEBUG
14
f2cb1360
IM
15static int __init sched_debug_setup(char *str)
16{
9469eb01 17 sched_debug_enabled = true;
f2cb1360
IM
18
19 return 0;
20}
21early_param("sched_debug", sched_debug_setup);
22
23static inline bool sched_debug(void)
24{
25 return sched_debug_enabled;
26}
27
848785df
VS
28#define SD_FLAG(_name, mflags) [__##_name] = { .meta_flags = mflags, .name = #_name },
29const struct sd_flag_debug sd_flag_debug[] = {
30#include <linux/sched/sd_flags.h>
31};
32#undef SD_FLAG
33
f2cb1360
IM
34static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
35 struct cpumask *groupmask)
36{
37 struct sched_group *group = sd->groups;
65c5e253
VS
38 unsigned long flags = sd->flags;
39 unsigned int idx;
f2cb1360
IM
40
41 cpumask_clear(groupmask);
42
005f874d 43 printk(KERN_DEBUG "%*s domain-%d: ", level, "", level);
005f874d 44 printk(KERN_CONT "span=%*pbl level=%s\n",
f2cb1360
IM
45 cpumask_pr_args(sched_domain_span(sd)), sd->name);
46
47 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
97fb7a0a 48 printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu);
f2cb1360 49 }
6cd0c583 50 if (group && !cpumask_test_cpu(cpu, sched_group_span(group))) {
97fb7a0a 51 printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu);
f2cb1360
IM
52 }
53
65c5e253
VS
54 for_each_set_bit(idx, &flags, __SD_FLAG_CNT) {
55 unsigned int flag = BIT(idx);
56 unsigned int meta_flags = sd_flag_debug[idx].meta_flags;
57
58 if ((meta_flags & SDF_SHARED_CHILD) && sd->child &&
59 !(sd->child->flags & flag))
60 printk(KERN_ERR "ERROR: flag %s set here but not in child\n",
61 sd_flag_debug[idx].name);
62
63 if ((meta_flags & SDF_SHARED_PARENT) && sd->parent &&
64 !(sd->parent->flags & flag))
65 printk(KERN_ERR "ERROR: flag %s set here but not in parent\n",
66 sd_flag_debug[idx].name);
67 }
68
f2cb1360
IM
69 printk(KERN_DEBUG "%*s groups:", level + 1, "");
70 do {
71 if (!group) {
72 printk("\n");
73 printk(KERN_ERR "ERROR: group is NULL\n");
74 break;
75 }
76
ae4df9d6 77 if (!cpumask_weight(sched_group_span(group))) {
f2cb1360
IM
78 printk(KERN_CONT "\n");
79 printk(KERN_ERR "ERROR: empty group\n");
80 break;
81 }
82
83 if (!(sd->flags & SD_OVERLAP) &&
ae4df9d6 84 cpumask_intersects(groupmask, sched_group_span(group))) {
f2cb1360
IM
85 printk(KERN_CONT "\n");
86 printk(KERN_ERR "ERROR: repeated CPUs\n");
87 break;
88 }
89
ae4df9d6 90 cpumask_or(groupmask, groupmask, sched_group_span(group));
f2cb1360 91
005f874d
PZ
92 printk(KERN_CONT " %d:{ span=%*pbl",
93 group->sgc->id,
ae4df9d6 94 cpumask_pr_args(sched_group_span(group)));
b0151c25 95
af218122 96 if ((sd->flags & SD_OVERLAP) &&
ae4df9d6 97 !cpumask_equal(group_balance_mask(group), sched_group_span(group))) {
005f874d 98 printk(KERN_CONT " mask=%*pbl",
e5c14b1f 99 cpumask_pr_args(group_balance_mask(group)));
b0151c25
PZ
100 }
101
005f874d
PZ
102 if (group->sgc->capacity != SCHED_CAPACITY_SCALE)
103 printk(KERN_CONT " cap=%lu", group->sgc->capacity);
f2cb1360 104
a420b063
PZ
105 if (group == sd->groups && sd->child &&
106 !cpumask_equal(sched_domain_span(sd->child),
ae4df9d6 107 sched_group_span(group))) {
a420b063
PZ
108 printk(KERN_ERR "ERROR: domain->groups does not match domain->child\n");
109 }
110
005f874d
PZ
111 printk(KERN_CONT " }");
112
f2cb1360 113 group = group->next;
b0151c25
PZ
114
115 if (group != sd->groups)
116 printk(KERN_CONT ",");
117
f2cb1360
IM
118 } while (group != sd->groups);
119 printk(KERN_CONT "\n");
120
121 if (!cpumask_equal(sched_domain_span(sd), groupmask))
122 printk(KERN_ERR "ERROR: groups don't span domain->span\n");
123
124 if (sd->parent &&
125 !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
97fb7a0a 126 printk(KERN_ERR "ERROR: parent span is not a superset of domain->span\n");
f2cb1360
IM
127 return 0;
128}
129
130static void sched_domain_debug(struct sched_domain *sd, int cpu)
131{
132 int level = 0;
133
134 if (!sched_debug_enabled)
135 return;
136
137 if (!sd) {
138 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
139 return;
140 }
141
005f874d 142 printk(KERN_DEBUG "CPU%d attaching sched-domain(s):\n", cpu);
f2cb1360
IM
143
144 for (;;) {
145 if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
146 break;
147 level++;
148 sd = sd->parent;
149 if (!sd)
150 break;
151 }
152}
153#else /* !CONFIG_SCHED_DEBUG */
154
155# define sched_debug_enabled 0
156# define sched_domain_debug(sd, cpu) do { } while (0)
157static inline bool sched_debug(void)
158{
159 return false;
160}
161#endif /* CONFIG_SCHED_DEBUG */
162
4fc472f1
VS
163/* Generate a mask of SD flags with the SDF_NEEDS_GROUPS metaflag */
164#define SD_FLAG(name, mflags) (name * !!((mflags) & SDF_NEEDS_GROUPS)) |
165static const unsigned int SD_DEGENERATE_GROUPS_MASK =
166#include <linux/sched/sd_flags.h>
1670;
168#undef SD_FLAG
169
f2cb1360
IM
170static int sd_degenerate(struct sched_domain *sd)
171{
172 if (cpumask_weight(sched_domain_span(sd)) == 1)
173 return 1;
174
175 /* Following flags need at least 2 groups */
6f349818
VS
176 if ((sd->flags & SD_DEGENERATE_GROUPS_MASK) &&
177 (sd->groups != sd->groups->next))
178 return 0;
f2cb1360
IM
179
180 /* Following flags don't use groups */
181 if (sd->flags & (SD_WAKE_AFFINE))
182 return 0;
183
184 return 1;
185}
186
187static int
188sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
189{
190 unsigned long cflags = sd->flags, pflags = parent->flags;
191
192 if (sd_degenerate(parent))
193 return 1;
194
195 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
196 return 0;
197
198 /* Flags needing groups don't count if only 1 group in parent */
ab65afb0 199 if (parent->groups == parent->groups->next)
3a6712c7 200 pflags &= ~SD_DEGENERATE_GROUPS_MASK;
ab65afb0 201
f2cb1360
IM
202 if (~cflags & pflags)
203 return 0;
204
205 return 1;
206}
207
531b5c9f 208#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
f8a696f2 209DEFINE_STATIC_KEY_FALSE(sched_energy_present);
8d5d0cfb 210unsigned int sysctl_sched_energy_aware = 1;
531b5c9f
QP
211DEFINE_MUTEX(sched_energy_mutex);
212bool sched_energy_update;
213
8d5d0cfb
QP
214#ifdef CONFIG_PROC_SYSCTL
215int sched_energy_aware_handler(struct ctl_table *table, int write,
32927393 216 void *buffer, size_t *lenp, loff_t *ppos)
8d5d0cfb
QP
217{
218 int ret, state;
219
220 if (write && !capable(CAP_SYS_ADMIN))
221 return -EPERM;
222
223 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
224 if (!ret && write) {
225 state = static_branch_unlikely(&sched_energy_present);
226 if (state != sysctl_sched_energy_aware) {
227 mutex_lock(&sched_energy_mutex);
228 sched_energy_update = 1;
229 rebuild_sched_domains();
230 sched_energy_update = 0;
231 mutex_unlock(&sched_energy_mutex);
232 }
233 }
234
235 return ret;
236}
237#endif
238
6aa140fa
QP
239static void free_pd(struct perf_domain *pd)
240{
241 struct perf_domain *tmp;
242
243 while (pd) {
244 tmp = pd->next;
245 kfree(pd);
246 pd = tmp;
247 }
248}
249
250static struct perf_domain *find_pd(struct perf_domain *pd, int cpu)
251{
252 while (pd) {
253 if (cpumask_test_cpu(cpu, perf_domain_span(pd)))
254 return pd;
255 pd = pd->next;
256 }
257
258 return NULL;
259}
260
261static struct perf_domain *pd_init(int cpu)
262{
263 struct em_perf_domain *obj = em_cpu_get(cpu);
264 struct perf_domain *pd;
265
266 if (!obj) {
267 if (sched_debug())
268 pr_info("%s: no EM found for CPU%d\n", __func__, cpu);
269 return NULL;
270 }
271
272 pd = kzalloc(sizeof(*pd), GFP_KERNEL);
273 if (!pd)
274 return NULL;
275 pd->em_pd = obj;
276
277 return pd;
278}
279
280static void perf_domain_debug(const struct cpumask *cpu_map,
281 struct perf_domain *pd)
282{
283 if (!sched_debug() || !pd)
284 return;
285
286 printk(KERN_DEBUG "root_domain %*pbl:", cpumask_pr_args(cpu_map));
287
288 while (pd) {
521b512b 289 printk(KERN_CONT " pd%d:{ cpus=%*pbl nr_pstate=%d }",
6aa140fa
QP
290 cpumask_first(perf_domain_span(pd)),
291 cpumask_pr_args(perf_domain_span(pd)),
521b512b 292 em_pd_nr_perf_states(pd->em_pd));
6aa140fa
QP
293 pd = pd->next;
294 }
295
296 printk(KERN_CONT "\n");
297}
298
299static void destroy_perf_domain_rcu(struct rcu_head *rp)
300{
301 struct perf_domain *pd;
302
303 pd = container_of(rp, struct perf_domain, rcu);
304 free_pd(pd);
305}
306
1f74de87
QP
307static void sched_energy_set(bool has_eas)
308{
309 if (!has_eas && static_branch_unlikely(&sched_energy_present)) {
310 if (sched_debug())
311 pr_info("%s: stopping EAS\n", __func__);
312 static_branch_disable_cpuslocked(&sched_energy_present);
313 } else if (has_eas && !static_branch_unlikely(&sched_energy_present)) {
314 if (sched_debug())
315 pr_info("%s: starting EAS\n", __func__);
316 static_branch_enable_cpuslocked(&sched_energy_present);
317 }
318}
319
b68a4c0d
QP
320/*
321 * EAS can be used on a root domain if it meets all the following conditions:
322 * 1. an Energy Model (EM) is available;
323 * 2. the SD_ASYM_CPUCAPACITY flag is set in the sched_domain hierarchy.
38502ab4
VS
324 * 3. no SMT is detected.
325 * 4. the EM complexity is low enough to keep scheduling overheads low;
326 * 5. schedutil is driving the frequency of all CPUs of the rd;
b68a4c0d
QP
327 *
328 * The complexity of the Energy Model is defined as:
329 *
521b512b 330 * C = nr_pd * (nr_cpus + nr_ps)
b68a4c0d
QP
331 *
332 * with parameters defined as:
333 * - nr_pd: the number of performance domains
334 * - nr_cpus: the number of CPUs
521b512b 335 * - nr_ps: the sum of the number of performance states of all performance
b68a4c0d 336 * domains (for example, on a system with 2 performance domains,
521b512b 337 * with 10 performance states each, nr_ps = 2 * 10 = 20).
b68a4c0d
QP
338 *
339 * It is generally not a good idea to use such a model in the wake-up path on
340 * very complex platforms because of the associated scheduling overheads. The
341 * arbitrary constraint below prevents that. It makes EAS usable up to 16 CPUs
521b512b 342 * with per-CPU DVFS and less than 8 performance states each, for example.
b68a4c0d
QP
343 */
344#define EM_MAX_COMPLEXITY 2048
345
531b5c9f 346extern struct cpufreq_governor schedutil_gov;
1f74de87 347static bool build_perf_domains(const struct cpumask *cpu_map)
6aa140fa 348{
521b512b 349 int i, nr_pd = 0, nr_ps = 0, nr_cpus = cpumask_weight(cpu_map);
6aa140fa
QP
350 struct perf_domain *pd = NULL, *tmp;
351 int cpu = cpumask_first(cpu_map);
352 struct root_domain *rd = cpu_rq(cpu)->rd;
531b5c9f
QP
353 struct cpufreq_policy *policy;
354 struct cpufreq_governor *gov;
b68a4c0d 355
8d5d0cfb
QP
356 if (!sysctl_sched_energy_aware)
357 goto free;
358
b68a4c0d
QP
359 /* EAS is enabled for asymmetric CPU capacity topologies. */
360 if (!per_cpu(sd_asym_cpucapacity, cpu)) {
361 if (sched_debug()) {
362 pr_info("rd %*pbl: CPUs do not have asymmetric capacities\n",
363 cpumask_pr_args(cpu_map));
364 }
365 goto free;
366 }
6aa140fa 367
38502ab4
VS
368 /* EAS definitely does *not* handle SMT */
369 if (sched_smt_active()) {
370 pr_warn("rd %*pbl: Disabling EAS, SMT is not supported\n",
371 cpumask_pr_args(cpu_map));
372 goto free;
373 }
374
6aa140fa
QP
375 for_each_cpu(i, cpu_map) {
376 /* Skip already covered CPUs. */
377 if (find_pd(pd, i))
378 continue;
379
531b5c9f
QP
380 /* Do not attempt EAS if schedutil is not being used. */
381 policy = cpufreq_cpu_get(i);
382 if (!policy)
383 goto free;
384 gov = policy->governor;
385 cpufreq_cpu_put(policy);
386 if (gov != &schedutil_gov) {
387 if (rd->pd)
388 pr_warn("rd %*pbl: Disabling EAS, schedutil is mandatory\n",
389 cpumask_pr_args(cpu_map));
390 goto free;
391 }
392
6aa140fa
QP
393 /* Create the new pd and add it to the local list. */
394 tmp = pd_init(i);
395 if (!tmp)
396 goto free;
397 tmp->next = pd;
398 pd = tmp;
b68a4c0d
QP
399
400 /*
521b512b 401 * Count performance domains and performance states for the
b68a4c0d
QP
402 * complexity check.
403 */
404 nr_pd++;
521b512b 405 nr_ps += em_pd_nr_perf_states(pd->em_pd);
b68a4c0d
QP
406 }
407
408 /* Bail out if the Energy Model complexity is too high. */
521b512b 409 if (nr_pd * (nr_ps + nr_cpus) > EM_MAX_COMPLEXITY) {
b68a4c0d
QP
410 WARN(1, "rd %*pbl: Failed to start EAS, EM complexity is too high\n",
411 cpumask_pr_args(cpu_map));
412 goto free;
6aa140fa
QP
413 }
414
415 perf_domain_debug(cpu_map, pd);
416
417 /* Attach the new list of performance domains to the root domain. */
418 tmp = rd->pd;
419 rcu_assign_pointer(rd->pd, pd);
420 if (tmp)
421 call_rcu(&tmp->rcu, destroy_perf_domain_rcu);
422
1f74de87 423 return !!pd;
6aa140fa
QP
424
425free:
426 free_pd(pd);
427 tmp = rd->pd;
428 rcu_assign_pointer(rd->pd, NULL);
429 if (tmp)
430 call_rcu(&tmp->rcu, destroy_perf_domain_rcu);
1f74de87
QP
431
432 return false;
6aa140fa
QP
433}
434#else
435static void free_pd(struct perf_domain *pd) { }
531b5c9f 436#endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL*/
6aa140fa 437
f2cb1360
IM
438static void free_rootdomain(struct rcu_head *rcu)
439{
440 struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
441
442 cpupri_cleanup(&rd->cpupri);
443 cpudl_cleanup(&rd->cpudl);
444 free_cpumask_var(rd->dlo_mask);
445 free_cpumask_var(rd->rto_mask);
446 free_cpumask_var(rd->online);
447 free_cpumask_var(rd->span);
6aa140fa 448 free_pd(rd->pd);
f2cb1360
IM
449 kfree(rd);
450}
451
452void rq_attach_root(struct rq *rq, struct root_domain *rd)
453{
454 struct root_domain *old_rd = NULL;
455 unsigned long flags;
456
457 raw_spin_lock_irqsave(&rq->lock, flags);
458
459 if (rq->rd) {
460 old_rd = rq->rd;
461
462 if (cpumask_test_cpu(rq->cpu, old_rd->online))
463 set_rq_offline(rq);
464
465 cpumask_clear_cpu(rq->cpu, old_rd->span);
466
467 /*
468 * If we dont want to free the old_rd yet then
469 * set old_rd to NULL to skip the freeing later
470 * in this function:
471 */
472 if (!atomic_dec_and_test(&old_rd->refcount))
473 old_rd = NULL;
474 }
475
476 atomic_inc(&rd->refcount);
477 rq->rd = rd;
478
479 cpumask_set_cpu(rq->cpu, rd->span);
480 if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
481 set_rq_online(rq);
482
483 raw_spin_unlock_irqrestore(&rq->lock, flags);
484
485 if (old_rd)
337e9b07 486 call_rcu(&old_rd->rcu, free_rootdomain);
f2cb1360
IM
487}
488
364f5665
SRV
489void sched_get_rd(struct root_domain *rd)
490{
491 atomic_inc(&rd->refcount);
492}
493
494void sched_put_rd(struct root_domain *rd)
495{
496 if (!atomic_dec_and_test(&rd->refcount))
497 return;
498
337e9b07 499 call_rcu(&rd->rcu, free_rootdomain);
364f5665
SRV
500}
501
f2cb1360
IM
502static int init_rootdomain(struct root_domain *rd)
503{
f2cb1360
IM
504 if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL))
505 goto out;
506 if (!zalloc_cpumask_var(&rd->online, GFP_KERNEL))
507 goto free_span;
508 if (!zalloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL))
509 goto free_online;
510 if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
511 goto free_dlo_mask;
512
4bdced5c
SRRH
513#ifdef HAVE_RT_PUSH_IPI
514 rd->rto_cpu = -1;
515 raw_spin_lock_init(&rd->rto_lock);
516 init_irq_work(&rd->rto_push_work, rto_push_irq_work_func);
517#endif
518
26762423 519 rd->visit_gen = 0;
f2cb1360
IM
520 init_dl_bw(&rd->dl_bw);
521 if (cpudl_init(&rd->cpudl) != 0)
522 goto free_rto_mask;
523
524 if (cpupri_init(&rd->cpupri) != 0)
525 goto free_cpudl;
526 return 0;
527
528free_cpudl:
529 cpudl_cleanup(&rd->cpudl);
530free_rto_mask:
531 free_cpumask_var(rd->rto_mask);
532free_dlo_mask:
533 free_cpumask_var(rd->dlo_mask);
534free_online:
535 free_cpumask_var(rd->online);
536free_span:
537 free_cpumask_var(rd->span);
538out:
539 return -ENOMEM;
540}
541
542/*
543 * By default the system creates a single root-domain with all CPUs as
544 * members (mimicking the global state we have today).
545 */
546struct root_domain def_root_domain;
547
548void init_defrootdomain(void)
549{
550 init_rootdomain(&def_root_domain);
551
552 atomic_set(&def_root_domain.refcount, 1);
553}
554
555static struct root_domain *alloc_rootdomain(void)
556{
557 struct root_domain *rd;
558
4d13a06d 559 rd = kzalloc(sizeof(*rd), GFP_KERNEL);
f2cb1360
IM
560 if (!rd)
561 return NULL;
562
563 if (init_rootdomain(rd) != 0) {
564 kfree(rd);
565 return NULL;
566 }
567
568 return rd;
569}
570
571static void free_sched_groups(struct sched_group *sg, int free_sgc)
572{
573 struct sched_group *tmp, *first;
574
575 if (!sg)
576 return;
577
578 first = sg;
579 do {
580 tmp = sg->next;
581
582 if (free_sgc && atomic_dec_and_test(&sg->sgc->ref))
583 kfree(sg->sgc);
584
213c5a45
SW
585 if (atomic_dec_and_test(&sg->ref))
586 kfree(sg);
f2cb1360
IM
587 sg = tmp;
588 } while (sg != first);
589}
590
591static void destroy_sched_domain(struct sched_domain *sd)
592{
593 /*
a090c4f2
PZ
594 * A normal sched domain may have multiple group references, an
595 * overlapping domain, having private groups, only one. Iterate,
596 * dropping group/capacity references, freeing where none remain.
f2cb1360 597 */
213c5a45
SW
598 free_sched_groups(sd->groups, 1);
599
f2cb1360
IM
600 if (sd->shared && atomic_dec_and_test(&sd->shared->ref))
601 kfree(sd->shared);
602 kfree(sd);
603}
604
605static void destroy_sched_domains_rcu(struct rcu_head *rcu)
606{
607 struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
608
609 while (sd) {
610 struct sched_domain *parent = sd->parent;
611 destroy_sched_domain(sd);
612 sd = parent;
613 }
614}
615
616static void destroy_sched_domains(struct sched_domain *sd)
617{
618 if (sd)
619 call_rcu(&sd->rcu, destroy_sched_domains_rcu);
620}
621
622/*
623 * Keep a special pointer to the highest sched_domain that has
624 * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this
625 * allows us to avoid some pointer chasing select_idle_sibling().
626 *
627 * Also keep a unique ID per domain (we use the first CPU number in
628 * the cpumask of the domain), this allows us to quickly tell if
629 * two CPUs are in the same cache domain, see cpus_share_cache().
630 */
994aeb7a 631DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
f2cb1360
IM
632DEFINE_PER_CPU(int, sd_llc_size);
633DEFINE_PER_CPU(int, sd_llc_id);
994aeb7a
JFG
634DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
635DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa);
636DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
637DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
df054e84 638DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
f2cb1360
IM
639
640static void update_top_cache_domain(int cpu)
641{
642 struct sched_domain_shared *sds = NULL;
643 struct sched_domain *sd;
644 int id = cpu;
645 int size = 1;
646
647 sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
648 if (sd) {
649 id = cpumask_first(sched_domain_span(sd));
650 size = cpumask_weight(sched_domain_span(sd));
651 sds = sd->shared;
652 }
653
654 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
655 per_cpu(sd_llc_size, cpu) = size;
656 per_cpu(sd_llc_id, cpu) = id;
657 rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);
658
659 sd = lowest_flag_domain(cpu, SD_NUMA);
660 rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
661
662 sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
011b27bb
QP
663 rcu_assign_pointer(per_cpu(sd_asym_packing, cpu), sd);
664
665 sd = lowest_flag_domain(cpu, SD_ASYM_CPUCAPACITY);
666 rcu_assign_pointer(per_cpu(sd_asym_cpucapacity, cpu), sd);
f2cb1360
IM
667}
668
669/*
670 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
671 * hold the hotplug lock.
672 */
673static void
674cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
675{
676 struct rq *rq = cpu_rq(cpu);
677 struct sched_domain *tmp;
678
679 /* Remove the sched domains which do not contribute to scheduling. */
680 for (tmp = sd; tmp; ) {
681 struct sched_domain *parent = tmp->parent;
682 if (!parent)
683 break;
684
685 if (sd_parent_degenerate(tmp, parent)) {
686 tmp->parent = parent->parent;
687 if (parent->parent)
688 parent->parent->child = tmp;
689 /*
690 * Transfer SD_PREFER_SIBLING down in case of a
691 * degenerate parent; the spans match for this
692 * so the property transfers.
693 */
694 if (parent->flags & SD_PREFER_SIBLING)
695 tmp->flags |= SD_PREFER_SIBLING;
696 destroy_sched_domain(parent);
697 } else
698 tmp = tmp->parent;
699 }
700
701 if (sd && sd_degenerate(sd)) {
702 tmp = sd;
703 sd = sd->parent;
704 destroy_sched_domain(tmp);
705 if (sd)
706 sd->child = NULL;
707 }
708
709 sched_domain_debug(sd, cpu);
710
711 rq_attach_root(rq, rd);
712 tmp = rq->sd;
713 rcu_assign_pointer(rq->sd, sd);
bbdacdfe 714 dirty_sched_domain_sysctl(cpu);
f2cb1360
IM
715 destroy_sched_domains(tmp);
716
717 update_top_cache_domain(cpu);
718}
719
f2cb1360 720struct s_data {
99687cdb 721 struct sched_domain * __percpu *sd;
f2cb1360
IM
722 struct root_domain *rd;
723};
724
725enum s_alloc {
726 sa_rootdomain,
727 sa_sd,
728 sa_sd_storage,
729 sa_none,
730};
731
35a566e6
PZ
732/*
733 * Return the canonical balance CPU for this group, this is the first CPU
e5c14b1f 734 * of this group that's also in the balance mask.
35a566e6 735 *
e5c14b1f
PZ
736 * The balance mask are all those CPUs that could actually end up at this
737 * group. See build_balance_mask().
35a566e6
PZ
738 *
739 * Also see should_we_balance().
740 */
741int group_balance_cpu(struct sched_group *sg)
742{
e5c14b1f 743 return cpumask_first(group_balance_mask(sg));
35a566e6
PZ
744}
745
746
747/*
748 * NUMA topology (first read the regular topology blurb below)
749 *
750 * Given a node-distance table, for example:
751 *
752 * node 0 1 2 3
753 * 0: 10 20 30 20
754 * 1: 20 10 20 30
755 * 2: 30 20 10 20
756 * 3: 20 30 20 10
757 *
758 * which represents a 4 node ring topology like:
759 *
760 * 0 ----- 1
761 * | |
762 * | |
763 * | |
764 * 3 ----- 2
765 *
766 * We want to construct domains and groups to represent this. The way we go
767 * about doing this is to build the domains on 'hops'. For each NUMA level we
768 * construct the mask of all nodes reachable in @level hops.
769 *
770 * For the above NUMA topology that gives 3 levels:
771 *
772 * NUMA-2 0-3 0-3 0-3 0-3
773 * groups: {0-1,3},{1-3} {0-2},{0,2-3} {1-3},{0-1,3} {0,2-3},{0-2}
774 *
775 * NUMA-1 0-1,3 0-2 1-3 0,2-3
776 * groups: {0},{1},{3} {0},{1},{2} {1},{2},{3} {0},{2},{3}
777 *
778 * NUMA-0 0 1 2 3
779 *
780 *
781 * As can be seen; things don't nicely line up as with the regular topology.
782 * When we iterate a domain in child domain chunks some nodes can be
783 * represented multiple times -- hence the "overlap" naming for this part of
784 * the topology.
785 *
786 * In order to minimize this overlap, we only build enough groups to cover the
787 * domain. For instance Node-0 NUMA-2 would only get groups: 0-1,3 and 1-3.
788 *
789 * Because:
790 *
791 * - the first group of each domain is its child domain; this
792 * gets us the first 0-1,3
793 * - the only uncovered node is 2, who's child domain is 1-3.
794 *
795 * However, because of the overlap, computing a unique CPU for each group is
796 * more complicated. Consider for instance the groups of NODE-1 NUMA-2, both
797 * groups include the CPUs of Node-0, while those CPUs would not in fact ever
798 * end up at those groups (they would end up in group: 0-1,3).
799 *
e5c14b1f 800 * To correct this we have to introduce the group balance mask. This mask
35a566e6
PZ
801 * will contain those CPUs in the group that can reach this group given the
802 * (child) domain tree.
803 *
804 * With this we can once again compute balance_cpu and sched_group_capacity
805 * relations.
806 *
807 * XXX include words on how balance_cpu is unique and therefore can be
808 * used for sched_group_capacity links.
809 *
810 *
811 * Another 'interesting' topology is:
812 *
813 * node 0 1 2 3
814 * 0: 10 20 20 30
815 * 1: 20 10 20 20
816 * 2: 20 20 10 20
817 * 3: 30 20 20 10
818 *
819 * Which looks a little like:
820 *
821 * 0 ----- 1
822 * | / |
823 * | / |
824 * | / |
825 * 2 ----- 3
826 *
827 * This topology is asymmetric, nodes 1,2 are fully connected, but nodes 0,3
828 * are not.
829 *
830 * This leads to a few particularly weird cases where the sched_domain's are
97fb7a0a 831 * not of the same number for each CPU. Consider:
35a566e6
PZ
832 *
833 * NUMA-2 0-3 0-3
834 * groups: {0-2},{1-3} {1-3},{0-2}
835 *
836 * NUMA-1 0-2 0-3 0-3 1-3
837 *
838 * NUMA-0 0 1 2 3
839 *
840 */
841
842
f2cb1360 843/*
e5c14b1f
PZ
844 * Build the balance mask; it contains only those CPUs that can arrive at this
845 * group and should be considered to continue balancing.
35a566e6
PZ
846 *
847 * We do this during the group creation pass, therefore the group information
848 * isn't complete yet, however since each group represents a (child) domain we
849 * can fully construct this using the sched_domain bits (which are already
850 * complete).
f2cb1360 851 */
1676330e 852static void
e5c14b1f 853build_balance_mask(struct sched_domain *sd, struct sched_group *sg, struct cpumask *mask)
f2cb1360 854{
ae4df9d6 855 const struct cpumask *sg_span = sched_group_span(sg);
f2cb1360
IM
856 struct sd_data *sdd = sd->private;
857 struct sched_domain *sibling;
858 int i;
859
1676330e
PZ
860 cpumask_clear(mask);
861
f32d782e 862 for_each_cpu(i, sg_span) {
f2cb1360 863 sibling = *per_cpu_ptr(sdd->sd, i);
73bb059f
PZ
864
865 /*
866 * Can happen in the asymmetric case, where these siblings are
867 * unused. The mask will not be empty because those CPUs that
868 * do have the top domain _should_ span the domain.
869 */
870 if (!sibling->child)
871 continue;
872
873 /* If we would not end up here, we can't continue from here */
874 if (!cpumask_equal(sg_span, sched_domain_span(sibling->child)))
f2cb1360
IM
875 continue;
876
1676330e 877 cpumask_set_cpu(i, mask);
f2cb1360 878 }
73bb059f
PZ
879
880 /* We must not have empty masks here */
1676330e 881 WARN_ON_ONCE(cpumask_empty(mask));
f2cb1360
IM
882}
883
884/*
35a566e6
PZ
885 * XXX: This creates per-node group entries; since the load-balancer will
886 * immediately access remote memory to construct this group's load-balance
887 * statistics having the groups node local is of dubious benefit.
f2cb1360 888 */
8c033469
LRV
889static struct sched_group *
890build_group_from_child_sched_domain(struct sched_domain *sd, int cpu)
891{
892 struct sched_group *sg;
893 struct cpumask *sg_span;
894
895 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
896 GFP_KERNEL, cpu_to_node(cpu));
897
898 if (!sg)
899 return NULL;
900
ae4df9d6 901 sg_span = sched_group_span(sg);
8c033469
LRV
902 if (sd->child)
903 cpumask_copy(sg_span, sched_domain_span(sd->child));
904 else
905 cpumask_copy(sg_span, sched_domain_span(sd));
906
213c5a45 907 atomic_inc(&sg->ref);
8c033469
LRV
908 return sg;
909}
910
911static void init_overlap_sched_group(struct sched_domain *sd,
1676330e 912 struct sched_group *sg)
8c033469 913{
1676330e 914 struct cpumask *mask = sched_domains_tmpmask2;
8c033469
LRV
915 struct sd_data *sdd = sd->private;
916 struct cpumask *sg_span;
1676330e
PZ
917 int cpu;
918
e5c14b1f 919 build_balance_mask(sd, sg, mask);
ae4df9d6 920 cpu = cpumask_first_and(sched_group_span(sg), mask);
8c033469
LRV
921
922 sg->sgc = *per_cpu_ptr(sdd->sgc, cpu);
923 if (atomic_inc_return(&sg->sgc->ref) == 1)
e5c14b1f 924 cpumask_copy(group_balance_mask(sg), mask);
35a566e6 925 else
e5c14b1f 926 WARN_ON_ONCE(!cpumask_equal(group_balance_mask(sg), mask));
8c033469
LRV
927
928 /*
929 * Initialize sgc->capacity such that even if we mess up the
930 * domains and no possible iteration will get us here, we won't
931 * die on a /0 trap.
932 */
ae4df9d6 933 sg_span = sched_group_span(sg);
8c033469
LRV
934 sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
935 sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
e3d6d0cb 936 sg->sgc->max_capacity = SCHED_CAPACITY_SCALE;
8c033469
LRV
937}
938
f2cb1360
IM
939static int
940build_overlap_sched_groups(struct sched_domain *sd, int cpu)
941{
91eaed0d 942 struct sched_group *first = NULL, *last = NULL, *sg;
f2cb1360
IM
943 const struct cpumask *span = sched_domain_span(sd);
944 struct cpumask *covered = sched_domains_tmpmask;
945 struct sd_data *sdd = sd->private;
946 struct sched_domain *sibling;
947 int i;
948
949 cpumask_clear(covered);
950
0372dd27 951 for_each_cpu_wrap(i, span, cpu) {
f2cb1360
IM
952 struct cpumask *sg_span;
953
954 if (cpumask_test_cpu(i, covered))
955 continue;
956
957 sibling = *per_cpu_ptr(sdd->sd, i);
958
c20e1ea4
LRV
959 /*
960 * Asymmetric node setups can result in situations where the
961 * domain tree is of unequal depth, make sure to skip domains
962 * that already cover the entire range.
963 *
964 * In that case build_sched_domains() will have terminated the
965 * iteration early and our sibling sd spans will be empty.
966 * Domains should always include the CPU they're built on, so
967 * check that.
968 */
f2cb1360
IM
969 if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
970 continue;
971
8c033469 972 sg = build_group_from_child_sched_domain(sibling, cpu);
f2cb1360
IM
973 if (!sg)
974 goto fail;
975
ae4df9d6 976 sg_span = sched_group_span(sg);
f2cb1360
IM
977 cpumask_or(covered, covered, sg_span);
978
1676330e 979 init_overlap_sched_group(sd, sg);
f2cb1360 980
f2cb1360
IM
981 if (!first)
982 first = sg;
983 if (last)
984 last->next = sg;
985 last = sg;
986 last->next = first;
987 }
91eaed0d 988 sd->groups = first;
f2cb1360
IM
989
990 return 0;
991
992fail:
993 free_sched_groups(first, 0);
994
995 return -ENOMEM;
996}
997
35a566e6
PZ
998
999/*
1000 * Package topology (also see the load-balance blurb in fair.c)
1001 *
1002 * The scheduler builds a tree structure to represent a number of important
1003 * topology features. By default (default_topology[]) these include:
1004 *
1005 * - Simultaneous multithreading (SMT)
1006 * - Multi-Core Cache (MC)
1007 * - Package (DIE)
1008 *
1009 * Where the last one more or less denotes everything up to a NUMA node.
1010 *
1011 * The tree consists of 3 primary data structures:
1012 *
1013 * sched_domain -> sched_group -> sched_group_capacity
1014 * ^ ^ ^ ^
1015 * `-' `-'
1016 *
97fb7a0a 1017 * The sched_domains are per-CPU and have a two way link (parent & child) and
35a566e6
PZ
1018 * denote the ever growing mask of CPUs belonging to that level of topology.
1019 *
1020 * Each sched_domain has a circular (double) linked list of sched_group's, each
1021 * denoting the domains of the level below (or individual CPUs in case of the
1022 * first domain level). The sched_group linked by a sched_domain includes the
1023 * CPU of that sched_domain [*].
1024 *
1025 * Take for instance a 2 threaded, 2 core, 2 cache cluster part:
1026 *
1027 * CPU 0 1 2 3 4 5 6 7
1028 *
1029 * DIE [ ]
1030 * MC [ ] [ ]
1031 * SMT [ ] [ ] [ ] [ ]
1032 *
1033 * - or -
1034 *
1035 * DIE 0-7 0-7 0-7 0-7 0-7 0-7 0-7 0-7
1036 * MC 0-3 0-3 0-3 0-3 4-7 4-7 4-7 4-7
1037 * SMT 0-1 0-1 2-3 2-3 4-5 4-5 6-7 6-7
1038 *
1039 * CPU 0 1 2 3 4 5 6 7
1040 *
1041 * One way to think about it is: sched_domain moves you up and down among these
1042 * topology levels, while sched_group moves you sideways through it, at child
1043 * domain granularity.
1044 *
1045 * sched_group_capacity ensures each unique sched_group has shared storage.
1046 *
1047 * There are two related construction problems, both require a CPU that
1048 * uniquely identify each group (for a given domain):
1049 *
1050 * - The first is the balance_cpu (see should_we_balance() and the
1051 * load-balance blub in fair.c); for each group we only want 1 CPU to
1052 * continue balancing at a higher domain.
1053 *
1054 * - The second is the sched_group_capacity; we want all identical groups
1055 * to share a single sched_group_capacity.
1056 *
1057 * Since these topologies are exclusive by construction. That is, its
1058 * impossible for an SMT thread to belong to multiple cores, and cores to
1059 * be part of multiple caches. There is a very clear and unique location
1060 * for each CPU in the hierarchy.
1061 *
1062 * Therefore computing a unique CPU for each group is trivial (the iteration
1063 * mask is redundant and set all 1s; all CPUs in a group will end up at _that_
1064 * group), we can simply pick the first CPU in each group.
1065 *
1066 *
1067 * [*] in other words, the first group of each domain is its child domain.
1068 */
1069
0c0e776a 1070static struct sched_group *get_group(int cpu, struct sd_data *sdd)
f2cb1360
IM
1071{
1072 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
1073 struct sched_domain *child = sd->child;
0c0e776a 1074 struct sched_group *sg;
67d4f6ff 1075 bool already_visited;
f2cb1360
IM
1076
1077 if (child)
1078 cpu = cpumask_first(sched_domain_span(child));
1079
0c0e776a
PZ
1080 sg = *per_cpu_ptr(sdd->sg, cpu);
1081 sg->sgc = *per_cpu_ptr(sdd->sgc, cpu);
1082
67d4f6ff
VS
1083 /* Increase refcounts for claim_allocations: */
1084 already_visited = atomic_inc_return(&sg->ref) > 1;
1085 /* sgc visits should follow a similar trend as sg */
1086 WARN_ON(already_visited != (atomic_inc_return(&sg->sgc->ref) > 1));
1087
1088 /* If we have already visited that group, it's already initialized. */
1089 if (already_visited)
1090 return sg;
f2cb1360 1091
0c0e776a 1092 if (child) {
ae4df9d6
PZ
1093 cpumask_copy(sched_group_span(sg), sched_domain_span(child));
1094 cpumask_copy(group_balance_mask(sg), sched_group_span(sg));
0c0e776a 1095 } else {
ae4df9d6 1096 cpumask_set_cpu(cpu, sched_group_span(sg));
e5c14b1f 1097 cpumask_set_cpu(cpu, group_balance_mask(sg));
f2cb1360
IM
1098 }
1099
ae4df9d6 1100 sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sched_group_span(sg));
0c0e776a 1101 sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
e3d6d0cb 1102 sg->sgc->max_capacity = SCHED_CAPACITY_SCALE;
0c0e776a
PZ
1103
1104 return sg;
f2cb1360
IM
1105}
1106
1107/*
1108 * build_sched_groups will build a circular linked list of the groups
d8743230
VS
1109 * covered by the given span, will set each group's ->cpumask correctly,
1110 * and will initialize their ->sgc.
f2cb1360
IM
1111 *
1112 * Assumes the sched_domain tree is fully constructed
1113 */
1114static int
1115build_sched_groups(struct sched_domain *sd, int cpu)
1116{
1117 struct sched_group *first = NULL, *last = NULL;
1118 struct sd_data *sdd = sd->private;
1119 const struct cpumask *span = sched_domain_span(sd);
1120 struct cpumask *covered;
1121 int i;
1122
f2cb1360
IM
1123 lockdep_assert_held(&sched_domains_mutex);
1124 covered = sched_domains_tmpmask;
1125
1126 cpumask_clear(covered);
1127
0c0e776a 1128 for_each_cpu_wrap(i, span, cpu) {
f2cb1360 1129 struct sched_group *sg;
f2cb1360
IM
1130
1131 if (cpumask_test_cpu(i, covered))
1132 continue;
1133
0c0e776a 1134 sg = get_group(i, sdd);
f2cb1360 1135
ae4df9d6 1136 cpumask_or(covered, covered, sched_group_span(sg));
f2cb1360
IM
1137
1138 if (!first)
1139 first = sg;
1140 if (last)
1141 last->next = sg;
1142 last = sg;
1143 }
1144 last->next = first;
0c0e776a 1145 sd->groups = first;
f2cb1360
IM
1146
1147 return 0;
1148}
1149
1150/*
1151 * Initialize sched groups cpu_capacity.
1152 *
1153 * cpu_capacity indicates the capacity of sched group, which is used while
1154 * distributing the load between different sched groups in a sched domain.
1155 * Typically cpu_capacity for all the groups in a sched domain will be same
1156 * unless there are asymmetries in the topology. If there are asymmetries,
1157 * group having more cpu_capacity will pickup more load compared to the
1158 * group having less cpu_capacity.
1159 */
1160static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
1161{
1162 struct sched_group *sg = sd->groups;
1163
1164 WARN_ON(!sg);
1165
1166 do {
1167 int cpu, max_cpu = -1;
1168
ae4df9d6 1169 sg->group_weight = cpumask_weight(sched_group_span(sg));
f2cb1360
IM
1170
1171 if (!(sd->flags & SD_ASYM_PACKING))
1172 goto next;
1173
ae4df9d6 1174 for_each_cpu(cpu, sched_group_span(sg)) {
f2cb1360
IM
1175 if (max_cpu < 0)
1176 max_cpu = cpu;
1177 else if (sched_asym_prefer(cpu, max_cpu))
1178 max_cpu = cpu;
1179 }
1180 sg->asym_prefer_cpu = max_cpu;
1181
1182next:
1183 sg = sg->next;
1184 } while (sg != sd->groups);
1185
1186 if (cpu != group_balance_cpu(sg))
1187 return;
1188
1189 update_group_capacity(sd, cpu);
1190}
1191
1192/*
1193 * Initializers for schedule domains
1194 * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
1195 */
1196
1197static int default_relax_domain_level = -1;
1198int sched_domain_level_max;
1199
1200static int __init setup_relax_domain_level(char *str)
1201{
1202 if (kstrtoint(str, 0, &default_relax_domain_level))
1203 pr_warn("Unable to set relax_domain_level\n");
1204
1205 return 1;
1206}
1207__setup("relax_domain_level=", setup_relax_domain_level);
1208
1209static void set_domain_attribute(struct sched_domain *sd,
1210 struct sched_domain_attr *attr)
1211{
1212 int request;
1213
1214 if (!attr || attr->relax_domain_level < 0) {
1215 if (default_relax_domain_level < 0)
1216 return;
9ae7ab20 1217 request = default_relax_domain_level;
f2cb1360
IM
1218 } else
1219 request = attr->relax_domain_level;
9ae7ab20
VS
1220
1221 if (sd->level > request) {
f2cb1360
IM
1222 /* Turn off idle balance on this domain: */
1223 sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
f2cb1360
IM
1224 }
1225}
1226
1227static void __sdt_free(const struct cpumask *cpu_map);
1228static int __sdt_alloc(const struct cpumask *cpu_map);
1229
1230static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
1231 const struct cpumask *cpu_map)
1232{
1233 switch (what) {
1234 case sa_rootdomain:
1235 if (!atomic_read(&d->rd->refcount))
1236 free_rootdomain(&d->rd->rcu);
df561f66 1237 fallthrough;
f2cb1360
IM
1238 case sa_sd:
1239 free_percpu(d->sd);
df561f66 1240 fallthrough;
f2cb1360
IM
1241 case sa_sd_storage:
1242 __sdt_free(cpu_map);
df561f66 1243 fallthrough;
f2cb1360
IM
1244 case sa_none:
1245 break;
1246 }
1247}
1248
1249static enum s_alloc
1250__visit_domain_allocation_hell(struct s_data *d, const struct cpumask *cpu_map)
1251{
1252 memset(d, 0, sizeof(*d));
1253
1254 if (__sdt_alloc(cpu_map))
1255 return sa_sd_storage;
1256 d->sd = alloc_percpu(struct sched_domain *);
1257 if (!d->sd)
1258 return sa_sd_storage;
1259 d->rd = alloc_rootdomain();
1260 if (!d->rd)
1261 return sa_sd;
97fb7a0a 1262
f2cb1360
IM
1263 return sa_rootdomain;
1264}
1265
1266/*
1267 * NULL the sd_data elements we've used to build the sched_domain and
1268 * sched_group structure so that the subsequent __free_domain_allocs()
1269 * will not free the data we're using.
1270 */
1271static void claim_allocations(int cpu, struct sched_domain *sd)
1272{
1273 struct sd_data *sdd = sd->private;
1274
1275 WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
1276 *per_cpu_ptr(sdd->sd, cpu) = NULL;
1277
1278 if (atomic_read(&(*per_cpu_ptr(sdd->sds, cpu))->ref))
1279 *per_cpu_ptr(sdd->sds, cpu) = NULL;
1280
1281 if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
1282 *per_cpu_ptr(sdd->sg, cpu) = NULL;
1283
1284 if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref))
1285 *per_cpu_ptr(sdd->sgc, cpu) = NULL;
1286}
1287
1288#ifdef CONFIG_NUMA
f2cb1360 1289enum numa_topology_type sched_numa_topology_type;
97fb7a0a
IM
1290
1291static int sched_domains_numa_levels;
1292static int sched_domains_curr_level;
1293
1294int sched_max_numa_distance;
1295static int *sched_domains_numa_distance;
1296static struct cpumask ***sched_domains_numa_masks;
a55c7454 1297int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE;
f2cb1360
IM
1298#endif
1299
1300/*
1301 * SD_flags allowed in topology descriptions.
1302 *
1303 * These flags are purely descriptive of the topology and do not prescribe
1304 * behaviour. Behaviour is artificial and mapped in the below sd_init()
1305 * function:
1306 *
1307 * SD_SHARE_CPUCAPACITY - describes SMT topologies
1308 * SD_SHARE_PKG_RESOURCES - describes shared caches
1309 * SD_NUMA - describes NUMA topologies
f2cb1360
IM
1310 *
1311 * Odd one out, which beside describing the topology has a quirk also
1312 * prescribes the desired behaviour that goes along with it:
1313 *
1314 * SD_ASYM_PACKING - describes SMT quirks
1315 */
1316#define TOPOLOGY_SD_FLAGS \
97fb7a0a 1317 (SD_SHARE_CPUCAPACITY | \
f2cb1360 1318 SD_SHARE_PKG_RESOURCES | \
97fb7a0a 1319 SD_NUMA | \
cfe7ddcb 1320 SD_ASYM_PACKING)
f2cb1360
IM
1321
1322static struct sched_domain *
1323sd_init(struct sched_domain_topology_level *tl,
1324 const struct cpumask *cpu_map,
05484e09 1325 struct sched_domain *child, int dflags, int cpu)
f2cb1360
IM
1326{
1327 struct sd_data *sdd = &tl->data;
1328 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
1329 int sd_id, sd_weight, sd_flags = 0;
1330
1331#ifdef CONFIG_NUMA
1332 /*
1333 * Ugly hack to pass state to sd_numa_mask()...
1334 */
1335 sched_domains_curr_level = tl->numa_level;
1336#endif
1337
1338 sd_weight = cpumask_weight(tl->mask(cpu));
1339
1340 if (tl->sd_flags)
1341 sd_flags = (*tl->sd_flags)();
1342 if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,
1343 "wrong sd_flags in topology description\n"))
9b1b234b 1344 sd_flags &= TOPOLOGY_SD_FLAGS;
f2cb1360 1345
05484e09
MR
1346 /* Apply detected topology flags */
1347 sd_flags |= dflags;
1348
f2cb1360
IM
1349 *sd = (struct sched_domain){
1350 .min_interval = sd_weight,
1351 .max_interval = 2*sd_weight,
6e749913 1352 .busy_factor = 16,
2208cdaa 1353 .imbalance_pct = 117,
f2cb1360
IM
1354
1355 .cache_nice_tries = 0,
f2cb1360 1356
36c5bdc4 1357 .flags = 1*SD_BALANCE_NEWIDLE
f2cb1360
IM
1358 | 1*SD_BALANCE_EXEC
1359 | 1*SD_BALANCE_FORK
1360 | 0*SD_BALANCE_WAKE
1361 | 1*SD_WAKE_AFFINE
1362 | 0*SD_SHARE_CPUCAPACITY
1363 | 0*SD_SHARE_PKG_RESOURCES
1364 | 0*SD_SERIALIZE
9c63e84d 1365 | 1*SD_PREFER_SIBLING
f2cb1360
IM
1366 | 0*SD_NUMA
1367 | sd_flags
1368 ,
1369
1370 .last_balance = jiffies,
1371 .balance_interval = sd_weight,
f2cb1360
IM
1372 .max_newidle_lb_cost = 0,
1373 .next_decay_max_lb_cost = jiffies,
1374 .child = child,
1375#ifdef CONFIG_SCHED_DEBUG
1376 .name = tl->name,
1377#endif
1378 };
1379
1380 cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
1381 sd_id = cpumask_first(sched_domain_span(sd));
1382
1383 /*
1384 * Convert topological properties into behaviour.
1385 */
1386
a526d466
MR
1387 /* Don't attempt to spread across CPUs of different capacities. */
1388 if ((sd->flags & SD_ASYM_CPUCAPACITY) && sd->child)
1389 sd->child->flags &= ~SD_PREFER_SIBLING;
f2cb1360
IM
1390
1391 if (sd->flags & SD_SHARE_CPUCAPACITY) {
f2cb1360 1392 sd->imbalance_pct = 110;
f2cb1360
IM
1393
1394 } else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
1395 sd->imbalance_pct = 117;
1396 sd->cache_nice_tries = 1;
f2cb1360
IM
1397
1398#ifdef CONFIG_NUMA
1399 } else if (sd->flags & SD_NUMA) {
1400 sd->cache_nice_tries = 2;
f2cb1360 1401
9c63e84d 1402 sd->flags &= ~SD_PREFER_SIBLING;
f2cb1360 1403 sd->flags |= SD_SERIALIZE;
a55c7454 1404 if (sched_domains_numa_distance[tl->numa_level] > node_reclaim_distance) {
f2cb1360
IM
1405 sd->flags &= ~(SD_BALANCE_EXEC |
1406 SD_BALANCE_FORK |
1407 SD_WAKE_AFFINE);
1408 }
1409
1410#endif
1411 } else {
f2cb1360 1412 sd->cache_nice_tries = 1;
f2cb1360
IM
1413 }
1414
1415 /*
1416 * For all levels sharing cache; connect a sched_domain_shared
1417 * instance.
1418 */
1419 if (sd->flags & SD_SHARE_PKG_RESOURCES) {
1420 sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
1421 atomic_inc(&sd->shared->ref);
1422 atomic_set(&sd->shared->nr_busy_cpus, sd_weight);
1423 }
1424
1425 sd->private = sdd;
1426
1427 return sd;
1428}
1429
1430/*
1431 * Topology list, bottom-up.
1432 */
1433static struct sched_domain_topology_level default_topology[] = {
1434#ifdef CONFIG_SCHED_SMT
1435 { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
1436#endif
1437#ifdef CONFIG_SCHED_MC
1438 { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
1439#endif
1440 { cpu_cpu_mask, SD_INIT_NAME(DIE) },
1441 { NULL, },
1442};
1443
1444static struct sched_domain_topology_level *sched_domain_topology =
1445 default_topology;
1446
1447#define for_each_sd_topology(tl) \
1448 for (tl = sched_domain_topology; tl->mask; tl++)
1449
1450void set_sched_topology(struct sched_domain_topology_level *tl)
1451{
1452 if (WARN_ON_ONCE(sched_smp_initialized))
1453 return;
1454
1455 sched_domain_topology = tl;
1456}
1457
1458#ifdef CONFIG_NUMA
1459
1460static const struct cpumask *sd_numa_mask(int cpu)
1461{
1462 return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
1463}
1464
1465static void sched_numa_warn(const char *str)
1466{
1467 static int done = false;
1468 int i,j;
1469
1470 if (done)
1471 return;
1472
1473 done = true;
1474
1475 printk(KERN_WARNING "ERROR: %s\n\n", str);
1476
1477 for (i = 0; i < nr_node_ids; i++) {
1478 printk(KERN_WARNING " ");
1479 for (j = 0; j < nr_node_ids; j++)
1480 printk(KERN_CONT "%02d ", node_distance(i,j));
1481 printk(KERN_CONT "\n");
1482 }
1483 printk(KERN_WARNING "\n");
1484}
1485
1486bool find_numa_distance(int distance)
1487{
1488 int i;
1489
1490 if (distance == node_distance(0, 0))
1491 return true;
1492
1493 for (i = 0; i < sched_domains_numa_levels; i++) {
1494 if (sched_domains_numa_distance[i] == distance)
1495 return true;
1496 }
1497
1498 return false;
1499}
1500
1501/*
1502 * A system can have three types of NUMA topology:
1503 * NUMA_DIRECT: all nodes are directly connected, or not a NUMA system
1504 * NUMA_GLUELESS_MESH: some nodes reachable through intermediary nodes
1505 * NUMA_BACKPLANE: nodes can reach other nodes through a backplane
1506 *
1507 * The difference between a glueless mesh topology and a backplane
1508 * topology lies in whether communication between not directly
1509 * connected nodes goes through intermediary nodes (where programs
1510 * could run), or through backplane controllers. This affects
1511 * placement of programs.
1512 *
1513 * The type of topology can be discerned with the following tests:
1514 * - If the maximum distance between any nodes is 1 hop, the system
1515 * is directly connected.
1516 * - If for two nodes A and B, located N > 1 hops away from each other,
1517 * there is an intermediary node C, which is < N hops away from both
1518 * nodes A and B, the system is a glueless mesh.
1519 */
1520static void init_numa_topology_type(void)
1521{
1522 int a, b, c, n;
1523
1524 n = sched_max_numa_distance;
1525
e5e96faf 1526 if (sched_domains_numa_levels <= 2) {
f2cb1360
IM
1527 sched_numa_topology_type = NUMA_DIRECT;
1528 return;
1529 }
1530
1531 for_each_online_node(a) {
1532 for_each_online_node(b) {
1533 /* Find two nodes furthest removed from each other. */
1534 if (node_distance(a, b) < n)
1535 continue;
1536
1537 /* Is there an intermediary node between a and b? */
1538 for_each_online_node(c) {
1539 if (node_distance(a, c) < n &&
1540 node_distance(b, c) < n) {
1541 sched_numa_topology_type =
1542 NUMA_GLUELESS_MESH;
1543 return;
1544 }
1545 }
1546
1547 sched_numa_topology_type = NUMA_BACKPLANE;
1548 return;
1549 }
1550 }
1551}
1552
1553void sched_init_numa(void)
1554{
1555 int next_distance, curr_distance = node_distance(0, 0);
1556 struct sched_domain_topology_level *tl;
1557 int level = 0;
1558 int i, j, k;
1559
993f0b05 1560 sched_domains_numa_distance = kzalloc(sizeof(int) * (nr_node_ids + 1), GFP_KERNEL);
f2cb1360
IM
1561 if (!sched_domains_numa_distance)
1562 return;
1563
051f3ca0
SS
1564 /* Includes NUMA identity node at level 0. */
1565 sched_domains_numa_distance[level++] = curr_distance;
1566 sched_domains_numa_levels = level;
1567
f2cb1360
IM
1568 /*
1569 * O(nr_nodes^2) deduplicating selection sort -- in order to find the
1570 * unique distances in the node_distance() table.
1571 *
1572 * Assumes node_distance(0,j) includes all distances in
1573 * node_distance(i,j) in order to avoid cubic time.
1574 */
1575 next_distance = curr_distance;
1576 for (i = 0; i < nr_node_ids; i++) {
1577 for (j = 0; j < nr_node_ids; j++) {
1578 for (k = 0; k < nr_node_ids; k++) {
1579 int distance = node_distance(i, k);
1580
1581 if (distance > curr_distance &&
1582 (distance < next_distance ||
1583 next_distance == curr_distance))
1584 next_distance = distance;
1585
1586 /*
1587 * While not a strong assumption it would be nice to know
1588 * about cases where if node A is connected to B, B is not
1589 * equally connected to A.
1590 */
1591 if (sched_debug() && node_distance(k, i) != distance)
1592 sched_numa_warn("Node-distance not symmetric");
1593
1594 if (sched_debug() && i && !find_numa_distance(distance))
1595 sched_numa_warn("Node-0 not representative");
1596 }
1597 if (next_distance != curr_distance) {
1598 sched_domains_numa_distance[level++] = next_distance;
1599 sched_domains_numa_levels = level;
1600 curr_distance = next_distance;
1601 } else break;
1602 }
1603
1604 /*
1605 * In case of sched_debug() we verify the above assumption.
1606 */
1607 if (!sched_debug())
1608 break;
1609 }
1610
f2cb1360 1611 /*
051f3ca0 1612 * 'level' contains the number of unique distances
f2cb1360
IM
1613 *
1614 * The sched_domains_numa_distance[] array includes the actual distance
1615 * numbers.
1616 */
1617
1618 /*
1619 * Here, we should temporarily reset sched_domains_numa_levels to 0.
1620 * If it fails to allocate memory for array sched_domains_numa_masks[][],
1621 * the array will contain less then 'level' members. This could be
1622 * dangerous when we use it to iterate array sched_domains_numa_masks[][]
1623 * in other functions.
1624 *
1625 * We reset it to 'level' at the end of this function.
1626 */
1627 sched_domains_numa_levels = 0;
1628
1629 sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
1630 if (!sched_domains_numa_masks)
1631 return;
1632
1633 /*
1634 * Now for each level, construct a mask per node which contains all
1635 * CPUs of nodes that are that many hops away from us.
1636 */
1637 for (i = 0; i < level; i++) {
1638 sched_domains_numa_masks[i] =
1639 kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
1640 if (!sched_domains_numa_masks[i])
1641 return;
1642
1643 for (j = 0; j < nr_node_ids; j++) {
1644 struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
1645 if (!mask)
1646 return;
1647
1648 sched_domains_numa_masks[i][j] = mask;
1649
1650 for_each_node(k) {
1651 if (node_distance(j, k) > sched_domains_numa_distance[i])
1652 continue;
1653
1654 cpumask_or(mask, mask, cpumask_of_node(k));
1655 }
1656 }
1657 }
1658
1659 /* Compute default topology size */
1660 for (i = 0; sched_domain_topology[i].mask; i++);
1661
1662 tl = kzalloc((i + level + 1) *
1663 sizeof(struct sched_domain_topology_level), GFP_KERNEL);
1664 if (!tl)
1665 return;
1666
1667 /*
1668 * Copy the default topology bits..
1669 */
1670 for (i = 0; sched_domain_topology[i].mask; i++)
1671 tl[i] = sched_domain_topology[i];
1672
051f3ca0
SS
1673 /*
1674 * Add the NUMA identity distance, aka single NODE.
1675 */
1676 tl[i++] = (struct sched_domain_topology_level){
1677 .mask = sd_numa_mask,
1678 .numa_level = 0,
1679 SD_INIT_NAME(NODE)
1680 };
1681
f2cb1360
IM
1682 /*
1683 * .. and append 'j' levels of NUMA goodness.
1684 */
051f3ca0 1685 for (j = 1; j < level; i++, j++) {
f2cb1360
IM
1686 tl[i] = (struct sched_domain_topology_level){
1687 .mask = sd_numa_mask,
1688 .sd_flags = cpu_numa_flags,
1689 .flags = SDTL_OVERLAP,
1690 .numa_level = j,
1691 SD_INIT_NAME(NUMA)
1692 };
1693 }
1694
1695 sched_domain_topology = tl;
1696
1697 sched_domains_numa_levels = level;
1698 sched_max_numa_distance = sched_domains_numa_distance[level - 1];
1699
1700 init_numa_topology_type();
1701}
1702
1703void sched_domains_numa_masks_set(unsigned int cpu)
1704{
1705 int node = cpu_to_node(cpu);
1706 int i, j;
1707
1708 for (i = 0; i < sched_domains_numa_levels; i++) {
1709 for (j = 0; j < nr_node_ids; j++) {
1710 if (node_distance(j, node) <= sched_domains_numa_distance[i])
1711 cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
1712 }
1713 }
1714}
1715
1716void sched_domains_numa_masks_clear(unsigned int cpu)
1717{
1718 int i, j;
1719
1720 for (i = 0; i < sched_domains_numa_levels; i++) {
1721 for (j = 0; j < nr_node_ids; j++)
1722 cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
1723 }
1724}
1725
e0e8d491
WL
1726/*
1727 * sched_numa_find_closest() - given the NUMA topology, find the cpu
1728 * closest to @cpu from @cpumask.
1729 * cpumask: cpumask to find a cpu from
1730 * cpu: cpu to be close to
1731 *
1732 * returns: cpu, or nr_cpu_ids when nothing found.
1733 */
1734int sched_numa_find_closest(const struct cpumask *cpus, int cpu)
1735{
1736 int i, j = cpu_to_node(cpu);
1737
1738 for (i = 0; i < sched_domains_numa_levels; i++) {
1739 cpu = cpumask_any_and(cpus, sched_domains_numa_masks[i][j]);
1740 if (cpu < nr_cpu_ids)
1741 return cpu;
1742 }
1743 return nr_cpu_ids;
1744}
1745
f2cb1360
IM
1746#endif /* CONFIG_NUMA */
1747
1748static int __sdt_alloc(const struct cpumask *cpu_map)
1749{
1750 struct sched_domain_topology_level *tl;
1751 int j;
1752
1753 for_each_sd_topology(tl) {
1754 struct sd_data *sdd = &tl->data;
1755
1756 sdd->sd = alloc_percpu(struct sched_domain *);
1757 if (!sdd->sd)
1758 return -ENOMEM;
1759
1760 sdd->sds = alloc_percpu(struct sched_domain_shared *);
1761 if (!sdd->sds)
1762 return -ENOMEM;
1763
1764 sdd->sg = alloc_percpu(struct sched_group *);
1765 if (!sdd->sg)
1766 return -ENOMEM;
1767
1768 sdd->sgc = alloc_percpu(struct sched_group_capacity *);
1769 if (!sdd->sgc)
1770 return -ENOMEM;
1771
1772 for_each_cpu(j, cpu_map) {
1773 struct sched_domain *sd;
1774 struct sched_domain_shared *sds;
1775 struct sched_group *sg;
1776 struct sched_group_capacity *sgc;
1777
1778 sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
1779 GFP_KERNEL, cpu_to_node(j));
1780 if (!sd)
1781 return -ENOMEM;
1782
1783 *per_cpu_ptr(sdd->sd, j) = sd;
1784
1785 sds = kzalloc_node(sizeof(struct sched_domain_shared),
1786 GFP_KERNEL, cpu_to_node(j));
1787 if (!sds)
1788 return -ENOMEM;
1789
1790 *per_cpu_ptr(sdd->sds, j) = sds;
1791
1792 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
1793 GFP_KERNEL, cpu_to_node(j));
1794 if (!sg)
1795 return -ENOMEM;
1796
1797 sg->next = sg;
1798
1799 *per_cpu_ptr(sdd->sg, j) = sg;
1800
1801 sgc = kzalloc_node(sizeof(struct sched_group_capacity) + cpumask_size(),
1802 GFP_KERNEL, cpu_to_node(j));
1803 if (!sgc)
1804 return -ENOMEM;
1805
005f874d
PZ
1806#ifdef CONFIG_SCHED_DEBUG
1807 sgc->id = j;
1808#endif
1809
f2cb1360
IM
1810 *per_cpu_ptr(sdd->sgc, j) = sgc;
1811 }
1812 }
1813
1814 return 0;
1815}
1816
1817static void __sdt_free(const struct cpumask *cpu_map)
1818{
1819 struct sched_domain_topology_level *tl;
1820 int j;
1821
1822 for_each_sd_topology(tl) {
1823 struct sd_data *sdd = &tl->data;
1824
1825 for_each_cpu(j, cpu_map) {
1826 struct sched_domain *sd;
1827
1828 if (sdd->sd) {
1829 sd = *per_cpu_ptr(sdd->sd, j);
1830 if (sd && (sd->flags & SD_OVERLAP))
1831 free_sched_groups(sd->groups, 0);
1832 kfree(*per_cpu_ptr(sdd->sd, j));
1833 }
1834
1835 if (sdd->sds)
1836 kfree(*per_cpu_ptr(sdd->sds, j));
1837 if (sdd->sg)
1838 kfree(*per_cpu_ptr(sdd->sg, j));
1839 if (sdd->sgc)
1840 kfree(*per_cpu_ptr(sdd->sgc, j));
1841 }
1842 free_percpu(sdd->sd);
1843 sdd->sd = NULL;
1844 free_percpu(sdd->sds);
1845 sdd->sds = NULL;
1846 free_percpu(sdd->sg);
1847 sdd->sg = NULL;
1848 free_percpu(sdd->sgc);
1849 sdd->sgc = NULL;
1850 }
1851}
1852
181a80d1 1853static struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
f2cb1360 1854 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
05484e09 1855 struct sched_domain *child, int dflags, int cpu)
f2cb1360 1856{
05484e09 1857 struct sched_domain *sd = sd_init(tl, cpu_map, child, dflags, cpu);
f2cb1360
IM
1858
1859 if (child) {
1860 sd->level = child->level + 1;
1861 sched_domain_level_max = max(sched_domain_level_max, sd->level);
1862 child->parent = sd;
1863
1864 if (!cpumask_subset(sched_domain_span(child),
1865 sched_domain_span(sd))) {
1866 pr_err("BUG: arch topology borken\n");
1867#ifdef CONFIG_SCHED_DEBUG
1868 pr_err(" the %s domain not a subset of the %s domain\n",
1869 child->name, sd->name);
1870#endif
97fb7a0a 1871 /* Fixup, ensure @sd has at least @child CPUs. */
f2cb1360
IM
1872 cpumask_or(sched_domain_span(sd),
1873 sched_domain_span(sd),
1874 sched_domain_span(child));
1875 }
1876
1877 }
1878 set_domain_attribute(sd, attr);
1879
1880 return sd;
1881}
1882
ccf74128
VS
1883/*
1884 * Ensure topology masks are sane, i.e. there are no conflicts (overlaps) for
1885 * any two given CPUs at this (non-NUMA) topology level.
1886 */
1887static bool topology_span_sane(struct sched_domain_topology_level *tl,
1888 const struct cpumask *cpu_map, int cpu)
1889{
1890 int i;
1891
1892 /* NUMA levels are allowed to overlap */
1893 if (tl->flags & SDTL_OVERLAP)
1894 return true;
1895
1896 /*
1897 * Non-NUMA levels cannot partially overlap - they must be either
1898 * completely equal or completely disjoint. Otherwise we can end up
1899 * breaking the sched_group lists - i.e. a later get_group() pass
1900 * breaks the linking done for an earlier span.
1901 */
1902 for_each_cpu(i, cpu_map) {
1903 if (i == cpu)
1904 continue;
1905 /*
1906 * We should 'and' all those masks with 'cpu_map' to exactly
1907 * match the topology we're about to build, but that can only
1908 * remove CPUs, which only lessens our ability to detect
1909 * overlaps
1910 */
1911 if (!cpumask_equal(tl->mask(cpu), tl->mask(i)) &&
1912 cpumask_intersects(tl->mask(cpu), tl->mask(i)))
1913 return false;
1914 }
1915
1916 return true;
1917}
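/*
 * A userspace sketch of the rule topology_span_sane() enforces for
 * non-NUMA levels: the masks of any two CPUs must be either identical or
 * disjoint. CPU masks are modelled here as 64-bit words, one bit per CPU;
 * the sample masks are made up.
 */
#include <stdbool.h>
#include <stdio.h>

static bool span_sane(const unsigned long *mask, int nr_cpus)
{
        for (int a = 0; a < nr_cpus; a++)
                for (int b = a + 1; b < nr_cpus; b++)
                        if (mask[a] != mask[b] && (mask[a] & mask[b]))
                                return false;   /* partial overlap */
        return true;
}

int main(void)
{
        unsigned long ok[4]  = { 0x3, 0x3, 0xc, 0xc };  /* {0,1} {0,1} {2,3} {2,3} */
        unsigned long bad[4] = { 0x3, 0x6, 0xc, 0xc };  /* CPU1 claims {1,2} */

        printf("ok: %d, bad: %d\n", span_sane(ok, 4), span_sane(bad, 4));
        return 0;
}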
1918
1919/*
1920 * Find the sched_domain_topology_level where all CPU capacities are visible
1921 * for all CPUs.
1922 */
1923static struct sched_domain_topology_level
1924*asym_cpu_capacity_level(const struct cpumask *cpu_map)
1925{
1926 int i, j, asym_level = 0;
1927 bool asym = false;
1928 struct sched_domain_topology_level *tl, *asym_tl = NULL;
1929 unsigned long cap;
1930
1931 /* Is there any asymmetry? */
8ec59c0f 1932 cap = arch_scale_cpu_capacity(cpumask_first(cpu_map));
1933
1934 for_each_cpu(i, cpu_map) {
8ec59c0f 1935 if (arch_scale_cpu_capacity(i) != cap) {
1936 asym = true;
1937 break;
1938 }
1939 }
1940
1941 if (!asym)
1942 return NULL;
1943
1944 /*
1945 * Examine the topology from each CPU's point of view to detect the lowest
1946 * sched_domain_topology_level at which a highest-capacity CPU is visible
1947 * to everyone.
1948 */
1949 for_each_cpu(i, cpu_map) {
8ec59c0f 1950 unsigned long max_capacity = arch_scale_cpu_capacity(i);
1951 int tl_id = 0;
1952
1953 for_each_sd_topology(tl) {
1954 if (tl_id < asym_level)
1955 goto next_level;
1956
1957 for_each_cpu_and(j, tl->mask(i), cpu_map) {
1958 unsigned long capacity;
1959
8ec59c0f 1960 capacity = arch_scale_cpu_capacity(j);
1961
1962 if (capacity <= max_capacity)
1963 continue;
1964
1965 max_capacity = capacity;
1966 asym_level = tl_id;
1967 asym_tl = tl;
1968 }
1969next_level:
1970 tl_id++;
1971 }
1972 }
1973
1974 return asym_tl;
1975}
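/*
 * A worked userspace example of the search above, for a hypothetical
 * 4 little + 4 big system where each CPU sees only its own cluster at
 * level 0 ("MC") and the whole package at level 1 ("DIE"): the big CPUs
 * only become visible to the little ones at level 1, so that is the
 * lowest level at which SD_ASYM_CPUCAPACITY gets set. All capacities and
 * masks below are made up for illustration.
 */
#include <stdio.h>

#define NR_CPUS   8
#define NR_LEVELS 2     /* 0: per-cluster ("MC"), 1: whole package ("DIE") */

static const unsigned long capacity[NR_CPUS] = {
        430, 430, 430, 430, 1024, 1024, 1024, 1024
};

/* tl_mask[level][cpu]: CPUs visible to 'cpu' at that topology level. */
static const unsigned long tl_mask[NR_LEVELS][NR_CPUS] = {
        { 0x0f, 0x0f, 0x0f, 0x0f, 0xf0, 0xf0, 0xf0, 0xf0 },
        { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
};

int main(void)
{
        int asym_level = -1;    /* -1: no asymmetric level found yet */

        for (int i = 0; i < NR_CPUS; i++) {
                unsigned long max_cap = capacity[i];

                for (int level = 0; level < NR_LEVELS; level++) {
                        if (level < asym_level)
                                continue;       /* lower levels already ruled out */
                        for (int j = 0; j < NR_CPUS; j++) {
                                if (!(tl_mask[level][i] & (1UL << j)))
                                        continue;
                                if (capacity[j] <= max_cap)
                                        continue;
                                max_cap = capacity[j];
                                asym_level = level;
                        }
                }
        }
        printf("asym capacity level: %d\n", asym_level);        /* prints 1 */
        return 0;
}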
1976
1977
1978/*
1979 * Build sched domains for a given set of CPUs and attach the sched domains
1980 * to the individual CPUs
1981 */
1982static int
1983build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr)
1984{
cd1cb335 1985 enum s_alloc alloc_state = sa_none;
1986 struct sched_domain *sd;
1987 struct s_data d;
1988 struct rq *rq = NULL;
1989 int i, ret = -ENOMEM;
05484e09 1990 struct sched_domain_topology_level *tl_asym;
df054e84 1991 bool has_asym = false;
f2cb1360 1992
1993 if (WARN_ON(cpumask_empty(cpu_map)))
1994 goto error;
1995
1996 alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
1997 if (alloc_state != sa_rootdomain)
1998 goto error;
1999
2000 tl_asym = asym_cpu_capacity_level(cpu_map);
2001
2002 /* Set up domains for CPUs specified by the cpu_map: */
2003 for_each_cpu(i, cpu_map) {
2004 struct sched_domain_topology_level *tl;
c200191d 2005 int dflags = 0;
2006
2007 sd = NULL;
2008 for_each_sd_topology(tl) {
df054e84 2009 if (tl == tl_asym) {
05484e09 2010 dflags |= SD_ASYM_CPUCAPACITY;
2011 has_asym = true;
2012 }
05484e09 2013
2014 if (WARN_ON(!topology_span_sane(tl, cpu_map, i)))
2015 goto error;
2016
2017 sd = build_sched_domain(tl, cpu_map, attr, sd, dflags, i);
2018
2019 if (tl == sched_domain_topology)
2020 *per_cpu_ptr(d.sd, i) = sd;
af85596c 2021 if (tl->flags & SDTL_OVERLAP)
2022 sd->flags |= SD_OVERLAP;
2023 if (cpumask_equal(cpu_map, sched_domain_span(sd)))
2024 break;
2025 }
2026 }
2027
2028 /* Build the groups for the domains */
2029 for_each_cpu(i, cpu_map) {
2030 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
2031 sd->span_weight = cpumask_weight(sched_domain_span(sd));
2032 if (sd->flags & SD_OVERLAP) {
2033 if (build_overlap_sched_groups(sd, i))
2034 goto error;
2035 } else {
2036 if (build_sched_groups(sd, i))
2037 goto error;
2038 }
2039 }
2040 }
2041
2042 /* Calculate CPU capacity for physical packages and nodes */
2043 for (i = nr_cpumask_bits-1; i >= 0; i--) {
2044 if (!cpumask_test_cpu(i, cpu_map))
2045 continue;
2046
2047 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
2048 claim_allocations(i, sd);
2049 init_sched_groups_capacity(i, sd);
2050 }
2051 }
2052
2053 /* Attach the domains */
2054 rcu_read_lock();
2055 for_each_cpu(i, cpu_map) {
2056 rq = cpu_rq(i);
2057 sd = *per_cpu_ptr(d.sd, i);
2058
2059 /* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */
2060 if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity))
2061 WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig);
2062
2063 cpu_attach_domain(sd, d.rd, i);
2064 }
2065 rcu_read_unlock();
2066
df054e84 2067 if (has_asym)
e284df70 2068 static_branch_inc_cpuslocked(&sched_asym_cpucapacity);
df054e84 2069
f2cb1360 2070 if (rq && sched_debug_enabled) {
bf5015a5 2071 pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n",
2072 cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity);
2073 }
2074
2075 ret = 0;
2076error:
2077 __free_domain_allocs(&d, alloc_state, cpu_map);
97fb7a0a 2078
2079 return ret;
2080}
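/*
 * A minimal userspace stand-in for the READ_ONCE()/WRITE_ONCE() usage in
 * the "Attach the domains" loop above: the volatile accesses force single,
 * whole-word loads and stores so a concurrent reader of max_cpu_capacity
 * never observes a torn value. The macro definitions below are simplified
 * approximations relying on the GCC-style __typeof__ extension, not the
 * kernel's real macros, and the capacity values are made up.
 */
#include <stdio.h>

#define READ_ONCE(x)            (*(volatile __typeof__(x) *)&(x))
#define WRITE_ONCE(x, val)      (*(volatile __typeof__(x) *)&(x) = (val))

static unsigned long max_cpu_capacity;

static void note_capacity(unsigned long cap)
{
        if (cap > READ_ONCE(max_cpu_capacity))
                WRITE_ONCE(max_cpu_capacity, cap);
}

int main(void)
{
        const unsigned long caps[] = { 446, 1024, 871 };

        for (unsigned int i = 0; i < sizeof(caps) / sizeof(caps[0]); i++)
                note_capacity(caps[i]);
        printf("max_cpu_capacity = %lu\n", max_cpu_capacity);   /* 1024 */
        return 0;
}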
2081
2082/* Current sched domains: */
2083static cpumask_var_t *doms_cur;
2084
2085/* Number of sched domains in 'doms_cur': */
2086static int ndoms_cur;
2087
2088/* Attributes of custom domains in 'doms_cur' */
2089static struct sched_domain_attr *dattr_cur;
2090
2091/*
2092 * Special case: If a kmalloc() of a doms_cur partition (array of
2093 * cpumask) fails, then fall back to a single sched domain,
2094 * as determined by the single cpumask fallback_doms.
2095 */
8d5dc512 2096static cpumask_var_t fallback_doms;
2097
2098/*
2099 * arch_update_cpu_topology lets virtualized architectures update the
2100 * CPU core maps. It is supposed to return 1 if the topology changed
2101 * or 0 if it stayed the same.
2102 */
2103int __weak arch_update_cpu_topology(void)
2104{
2105 return 0;
2106}
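/*
 * A small illustration of the __weak mechanism used above: the scheduler
 * provides a default arch_update_cpu_topology() that an architecture can
 * override simply by defining a strong symbol of the same name elsewhere.
 * Compiled on its own (e.g. with gcc), the weak default is used; the
 * rebuild_core_maps() helper named in the comment is made up.
 */
#include <stdio.h>

int __attribute__((weak)) arch_update_cpu_topology(void)
{
        return 0;       /* default: topology unchanged */
}

/*
 * Another object file could supply a strong definition that wins at link
 * time, for example:
 *
 *      int arch_update_cpu_topology(void)
 *      {
 *              return rebuild_core_maps() ? 1 : 0;
 *      }
 */
int main(void)
{
        printf("topology changed: %d\n", arch_update_cpu_topology());
        return 0;
}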
2107
2108cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
2109{
2110 int i;
2111 cpumask_var_t *doms;
2112
6da2ec56 2113 doms = kmalloc_array(ndoms, sizeof(*doms), GFP_KERNEL);
2114 if (!doms)
2115 return NULL;
2116 for (i = 0; i < ndoms; i++) {
2117 if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
2118 free_sched_domains(doms, i);
2119 return NULL;
2120 }
2121 }
2122 return doms;
2123}
2124
2125void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
2126{
2127 unsigned int i;
2128 for (i = 0; i < ndoms; i++)
2129 free_cpumask_var(doms[i]);
2130 kfree(doms);
2131}
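/*
 * A userspace analogy of the alloc_sched_domains()/free_sched_domains()
 * pair above: allocate an array of 'ndoms' masks and, if allocation i
 * fails, free only the 0..i-1 entries that were actually allocated before
 * releasing the array itself. Types and names are illustrative, not the
 * kernel API.
 */
#include <stdlib.h>

typedef unsigned long mask_t;

static void free_masks(mask_t **doms, unsigned int ndoms)
{
        for (unsigned int i = 0; i < ndoms; i++)
                free(doms[i]);
        free(doms);
}

static mask_t **alloc_masks(unsigned int ndoms)
{
        mask_t **doms = calloc(ndoms, sizeof(*doms));

        if (!doms)
                return NULL;
        for (unsigned int i = 0; i < ndoms; i++) {
                doms[i] = calloc(1, sizeof(mask_t));
                if (!doms[i]) {
                        free_masks(doms, i);    /* unwind the partial array */
                        return NULL;
                }
        }
        return doms;
}

int main(void)
{
        mask_t **doms = alloc_masks(4);

        if (doms)
                free_masks(doms, 4);
        return 0;
}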
2132
2133/*
2134 * Set up scheduler domains and groups. For now this just excludes isolated
2135 * CPUs, but could be used to exclude other special cases in the future.
f2cb1360 2136 */
8d5dc512 2137int sched_init_domains(const struct cpumask *cpu_map)
2138{
2139 int err;
2140
8d5dc512 2141 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_KERNEL);
1676330e 2142 zalloc_cpumask_var(&sched_domains_tmpmask2, GFP_KERNEL);
2143 zalloc_cpumask_var(&fallback_doms, GFP_KERNEL);
2144
2145 arch_update_cpu_topology();
2146 ndoms_cur = 1;
2147 doms_cur = alloc_sched_domains(ndoms_cur);
2148 if (!doms_cur)
2149 doms_cur = &fallback_doms;
edb93821 2150 cpumask_and(doms_cur[0], cpu_map, housekeeping_cpumask(HK_FLAG_DOMAIN));
2151 err = build_sched_domains(doms_cur[0], NULL);
2152 register_sched_domain_sysctl();
2153
2154 return err;
2155}
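/*
 * A sketch of the fallback_doms idea used by sched_init_domains() above
 * and by partition_sched_domains() below: if allocating the domain array
 * fails, fall back to a single statically allocated mask so there is
 * always at least one partition to attach CPUs to. The mask value here is
 * made up.
 */
#include <stdio.h>
#include <stdlib.h>

static unsigned long fallback_doms[1];

int main(void)
{
        unsigned long *doms = malloc(sizeof(*doms));

        if (!doms)
                doms = fallback_doms;   /* static storage never fails */

        doms[0] = 0xffUL;               /* e.g. eight housekeeping CPUs */
        printf("doms[0] = %#lx\n", doms[0]);

        if (doms != fallback_doms)
                free(doms);
        return 0;
}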
2156
2157/*
2158 * Detach sched domains from a group of CPUs specified in cpu_map;
2159 * these CPUs will now be attached to the NULL domain.
2160 */
2161static void detach_destroy_domains(const struct cpumask *cpu_map)
2162{
e284df70 2163 unsigned int cpu = cpumask_any(cpu_map);
2164 int i;
2165
2166 if (rcu_access_pointer(per_cpu(sd_asym_cpucapacity, cpu)))
2167 static_branch_dec_cpuslocked(&sched_asym_cpucapacity);
2168
2169 rcu_read_lock();
2170 for_each_cpu(i, cpu_map)
2171 cpu_attach_domain(NULL, &def_root_domain, i);
2172 rcu_read_unlock();
2173}
2174
2175/* handle null as "default" */
2176static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
2177 struct sched_domain_attr *new, int idx_new)
2178{
2179 struct sched_domain_attr tmp;
2180
2181 /* Fast path: */
2182 if (!new && !cur)
2183 return 1;
2184
2185 tmp = SD_ATTR_INIT;
97fb7a0a 2186
2187 return !memcmp(cur ? (cur + idx_cur) : &tmp,
2188 new ? (new + idx_new) : &tmp,
2189 sizeof(struct sched_domain_attr));
2190}
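/*
 * A userspace sketch of the "NULL means all defaults" comparison used by
 * dattrs_equal() above: when either attribute array is NULL, compare
 * against a default-initialized temporary instead. 'struct attr' and
 * ATTR_INIT are stand-ins for the kernel's sched_domain_attr and
 * SD_ATTR_INIT, chosen only for illustration.
 */
#include <stdio.h>
#include <string.h>

struct attr {
        int relax_domain_level;
};

#define ATTR_INIT       ((struct attr){ .relax_domain_level = -1 })

static int attrs_equal(const struct attr *cur, int idx_cur,
                       const struct attr *new, int idx_new)
{
        struct attr tmp = ATTR_INIT;

        if (!new && !cur)
                return 1;

        return !memcmp(cur ? (cur + idx_cur) : &tmp,
                       new ? (new + idx_new) : &tmp,
                       sizeof(struct attr));
}

int main(void)
{
        struct attr a[1] = { { .relax_domain_level = -1 } };

        /* NULL on one side compares equal to an all-default entry. */
        printf("%d %d\n", attrs_equal(NULL, 0, a, 0),
               attrs_equal(NULL, 0, NULL, 0));
        return 0;
}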
2191
2192/*
2193 * Partition sched domains as specified by the 'ndoms_new'
2194 * cpumasks in the array doms_new[]. This compares
2195 * doms_new[] to the current sched domain partitioning, doms_cur[].
2196 * It destroys each deleted domain and builds each new domain.
2197 *
2198 * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
2199 * The masks don't intersect (don't overlap). We should set up one
2200 * sched domain for each mask. CPUs not in any of the cpumasks will
2201 * not be load balanced. If the same cpumask appears both in the
2202 * current 'doms_cur' domains and in the new 'doms_new', we can leave
2203 * it as it is.
2204 *
2205 * The passed-in 'doms_new' should be allocated using
2206 * alloc_sched_domains(). This routine takes ownership of it and will
2207 * free_sched_domains() it when done with it. If the caller's alloc call
2208 * failed, it can pass in doms_new == NULL && ndoms_new == 1, and
2209 * partition_sched_domains() will fall back to the single partition
2210 * 'fallback_doms'; this also forces the domains to be rebuilt.
2211 *
2212 * If doms_new == NULL it will be replaced with the housekeeping subset of cpu_active_mask.
2213 * ndoms_new == 0 is a special case for destroying existing domains,
2214 * and it will not create the default domain.
2215 *
c22645f4 2216 * Call with hotplug lock and sched_domains_mutex held
f2cb1360 2217 */
2218void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[],
2219 struct sched_domain_attr *dattr_new)
f2cb1360 2220{
1f74de87 2221 bool __maybe_unused has_eas = false;
2222 int i, j, n;
2223 int new_topology;
2224
c22645f4 2225 lockdep_assert_held(&sched_domains_mutex);
2226
2227 /* Always unregister in case we don't destroy any domains: */
2228 unregister_sched_domain_sysctl();
2229
2230 /* Let the architecture update CPU core mappings: */
2231 new_topology = arch_update_cpu_topology();
2232
2233 if (!doms_new) {
2234 WARN_ON_ONCE(dattr_new);
2235 n = 0;
2236 doms_new = alloc_sched_domains(1);
2237 if (doms_new) {
2238 n = 1;
2239 cpumask_and(doms_new[0], cpu_active_mask,
2240 housekeeping_cpumask(HK_FLAG_DOMAIN));
2241 }
2242 } else {
2243 n = ndoms_new;
2244 }
2245
2246 /* Destroy deleted domains: */
2247 for (i = 0; i < ndoms_cur; i++) {
2248 for (j = 0; j < n && !new_topology; j++) {
6aa140fa 2249 if (cpumask_equal(doms_cur[i], doms_new[j]) &&
2250 dattrs_equal(dattr_cur, i, dattr_new, j)) {
2251 struct root_domain *rd;
2252
2253 /*
2254 * This domain won't be destroyed and as such
2255 * its dl_bw->total_bw needs to be cleared. It
2256 * will be recomputed in function
2257 * update_tasks_root_domain().
2258 */
2259 rd = cpu_rq(cpumask_any(doms_cur[i]))->rd;
2260 dl_clear_root_domain(rd);
f2cb1360 2261 goto match1;
f9a25f77 2262 }
2263 }
2264 /* No match - a current sched domain not in new doms_new[] */
2265 detach_destroy_domains(doms_cur[i]);
2266match1:
2267 ;
2268 }
2269
2270 n = ndoms_cur;
09e0dd8e 2271 if (!doms_new) {
2272 n = 0;
2273 doms_new = &fallback_doms;
2274 cpumask_and(doms_new[0], cpu_active_mask,
2275 housekeeping_cpumask(HK_FLAG_DOMAIN));
2276 }
2277
2278 /* Build new domains: */
2279 for (i = 0; i < ndoms_new; i++) {
2280 for (j = 0; j < n && !new_topology; j++) {
2281 if (cpumask_equal(doms_new[i], doms_cur[j]) &&
2282 dattrs_equal(dattr_new, i, dattr_cur, j))
2283 goto match2;
2284 }
2285 /* No match - add a new doms_new */
2286 build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
2287match2:
2288 ;
2289 }
2290
531b5c9f 2291#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
2292 /* Build perf. domains: */
2293 for (i = 0; i < ndoms_new; i++) {
531b5c9f 2294 for (j = 0; j < n && !sched_energy_update; j++) {
6aa140fa 2295 if (cpumask_equal(doms_new[i], doms_cur[j]) &&
2296 cpu_rq(cpumask_first(doms_cur[j]))->rd->pd) {
2297 has_eas = true;
6aa140fa 2298 goto match3;
1f74de87 2299 }
2300 }
2301 /* No match - add perf. domains for a new rd */
1f74de87 2302 has_eas |= build_perf_domains(doms_new[i]);
2303match3:
2304 ;
2305 }
1f74de87 2306 sched_energy_set(has_eas);
2307#endif
2308
2309 /* Remember the new sched domains: */
2310 if (doms_cur != &fallback_doms)
2311 free_sched_domains(doms_cur, ndoms_cur);
2312
2313 kfree(dattr_cur);
2314 doms_cur = doms_new;
2315 dattr_cur = dattr_new;
2316 ndoms_cur = ndoms_new;
2317
2318 register_sched_domain_sysctl();
c22645f4 2319}
f2cb1360 2320
2321/*
2322 * Call with hotplug lock held
2323 */
2324void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
2325 struct sched_domain_attr *dattr_new)
2326{
2327 mutex_lock(&sched_domains_mutex);
2328 partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
2329 mutex_unlock(&sched_domains_mutex);
2330}
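/*
 * A userspace sketch of the match1/match2 passes in
 * partition_sched_domains_locked() above: compare the current partition
 * against the new one, tear down domains that disappeared, build domains
 * that are genuinely new, and leave identical ones untouched. Masks are
 * 64-bit words with made-up values, and the printf() calls stand in for
 * the real detach/build work.
 */
#include <stdbool.h>
#include <stdio.h>

#define NDOMS_CUR 2
#define NDOMS_NEW 2

static const unsigned long doms_cur[NDOMS_CUR] = { 0x0f, 0xf0 };
static const unsigned long doms_new[NDOMS_NEW] = { 0x0f, 0xc0 };

int main(void)
{
        /* Destroy deleted domains (match1). */
        for (int i = 0; i < NDOMS_CUR; i++) {
                bool found = false;

                for (int j = 0; j < NDOMS_NEW; j++)
                        if (doms_cur[i] == doms_new[j])
                                found = true;
                if (!found)
                        printf("detach_destroy_domains(%#lx)\n", doms_cur[i]);
        }

        /* Build new domains (match2). */
        for (int i = 0; i < NDOMS_NEW; i++) {
                bool found = false;

                for (int j = 0; j < NDOMS_CUR; j++)
                        if (doms_new[i] == doms_cur[j])
                                found = true;
                if (!found)
                        printf("build_sched_domains(%#lx)\n", doms_new[i]);
        }
        return 0;
}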