sched/isolation: Move isolcpus= handling to the housekeeping code
[linux-2.6-block.git] / kernel / sched / topology.c
1 /*
2  * Scheduler topology setup/handling methods
3  */
4 #include <linux/sched.h>
5 #include <linux/mutex.h>
6 #include <linux/sched/isolation.h>
7
8 #include "sched.h"
9
10 DEFINE_MUTEX(sched_domains_mutex);
11
12 /* Protected by sched_domains_mutex: */
13 cpumask_var_t sched_domains_tmpmask;
14 cpumask_var_t sched_domains_tmpmask2;
15
16 #ifdef CONFIG_SCHED_DEBUG
17
18 static int __init sched_debug_setup(char *str)
19 {
20         sched_debug_enabled = true;
21
22         return 0;
23 }
24 early_param("sched_debug", sched_debug_setup);
25
26 static inline bool sched_debug(void)
27 {
28         return sched_debug_enabled;
29 }
30
31 static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
32                                   struct cpumask *groupmask)
33 {
34         struct sched_group *group = sd->groups;
35
36         cpumask_clear(groupmask);
37
38         printk(KERN_DEBUG "%*s domain-%d: ", level, "", level);
39
40         if (!(sd->flags & SD_LOAD_BALANCE)) {
41                 printk("does not load-balance\n");
42                 if (sd->parent)
43                         printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
44                                         " has parent");
45                 return -1;
46         }
47
48         printk(KERN_CONT "span=%*pbl level=%s\n",
49                cpumask_pr_args(sched_domain_span(sd)), sd->name);
50
51         if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
52                 printk(KERN_ERR "ERROR: domain->span does not contain "
53                                 "CPU%d\n", cpu);
54         }
55         if (!cpumask_test_cpu(cpu, sched_group_span(group))) {
56                 printk(KERN_ERR "ERROR: domain->groups does not contain"
57                                 " CPU%d\n", cpu);
58         }
59
60         printk(KERN_DEBUG "%*s groups:", level + 1, "");
61         do {
62                 if (!group) {
63                         printk("\n");
64                         printk(KERN_ERR "ERROR: group is NULL\n");
65                         break;
66                 }
67
68                 if (!cpumask_weight(sched_group_span(group))) {
69                         printk(KERN_CONT "\n");
70                         printk(KERN_ERR "ERROR: empty group\n");
71                         break;
72                 }
73
74                 if (!(sd->flags & SD_OVERLAP) &&
75                     cpumask_intersects(groupmask, sched_group_span(group))) {
76                         printk(KERN_CONT "\n");
77                         printk(KERN_ERR "ERROR: repeated CPUs\n");
78                         break;
79                 }
80
81                 cpumask_or(groupmask, groupmask, sched_group_span(group));
82
83                 printk(KERN_CONT " %d:{ span=%*pbl",
84                                 group->sgc->id,
85                                 cpumask_pr_args(sched_group_span(group)));
86
87                 if ((sd->flags & SD_OVERLAP) &&
88                     !cpumask_equal(group_balance_mask(group), sched_group_span(group))) {
89                         printk(KERN_CONT " mask=%*pbl",
90                                 cpumask_pr_args(group_balance_mask(group)));
91                 }
92
93                 if (group->sgc->capacity != SCHED_CAPACITY_SCALE)
94                         printk(KERN_CONT " cap=%lu", group->sgc->capacity);
95
96                 if (group == sd->groups && sd->child &&
97                     !cpumask_equal(sched_domain_span(sd->child),
98                                    sched_group_span(group))) {
99                         printk(KERN_ERR "ERROR: domain->groups does not match domain->child\n");
100                 }
101
102                 printk(KERN_CONT " }");
103
104                 group = group->next;
105
106                 if (group != sd->groups)
107                         printk(KERN_CONT ",");
108
109         } while (group != sd->groups);
110         printk(KERN_CONT "\n");
111
112         if (!cpumask_equal(sched_domain_span(sd), groupmask))
113                 printk(KERN_ERR "ERROR: groups don't span domain->span\n");
114
115         if (sd->parent &&
116             !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
117                 printk(KERN_ERR "ERROR: parent span is not a superset "
118                         "of domain->span\n");
119         return 0;
120 }
121
122 static void sched_domain_debug(struct sched_domain *sd, int cpu)
123 {
124         int level = 0;
125
126         if (!sched_debug_enabled)
127                 return;
128
129         if (!sd) {
130                 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
131                 return;
132         }
133
134         printk(KERN_DEBUG "CPU%d attaching sched-domain(s):\n", cpu);
135
136         for (;;) {
137                 if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
138                         break;
139                 level++;
140                 sd = sd->parent;
141                 if (!sd)
142                         break;
143         }
144 }
145 #else /* !CONFIG_SCHED_DEBUG */
146
147 # define sched_debug_enabled 0
148 # define sched_domain_debug(sd, cpu) do { } while (0)
149 static inline bool sched_debug(void)
150 {
151         return false;
152 }
153 #endif /* CONFIG_SCHED_DEBUG */
154
155 static int sd_degenerate(struct sched_domain *sd)
156 {
157         if (cpumask_weight(sched_domain_span(sd)) == 1)
158                 return 1;
159
160         /* Following flags need at least 2 groups */
161         if (sd->flags & (SD_LOAD_BALANCE |
162                          SD_BALANCE_NEWIDLE |
163                          SD_BALANCE_FORK |
164                          SD_BALANCE_EXEC |
165                          SD_SHARE_CPUCAPACITY |
166                          SD_ASYM_CPUCAPACITY |
167                          SD_SHARE_PKG_RESOURCES |
168                          SD_SHARE_POWERDOMAIN)) {
169                 if (sd->groups != sd->groups->next)
170                         return 0;
171         }
172
173         /* Following flags don't use groups */
174         if (sd->flags & (SD_WAKE_AFFINE))
175                 return 0;
176
177         return 1;
178 }
179
180 static int
181 sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
182 {
183         unsigned long cflags = sd->flags, pflags = parent->flags;
184
185         if (sd_degenerate(parent))
186                 return 1;
187
188         if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
189                 return 0;
190
191         /* Flags needing groups don't count if only 1 group in parent */
192         if (parent->groups == parent->groups->next) {
193                 pflags &= ~(SD_LOAD_BALANCE |
194                                 SD_BALANCE_NEWIDLE |
195                                 SD_BALANCE_FORK |
196                                 SD_BALANCE_EXEC |
197                                 SD_ASYM_CPUCAPACITY |
198                                 SD_SHARE_CPUCAPACITY |
199                                 SD_SHARE_PKG_RESOURCES |
200                                 SD_PREFER_SIBLING |
201                                 SD_SHARE_POWERDOMAIN);
202                 if (nr_node_ids == 1)
203                         pflags &= ~SD_SERIALIZE;
204         }
205         if (~cflags & pflags)
206                 return 0;
207
208         return 1;
209 }
210
211 static void free_rootdomain(struct rcu_head *rcu)
212 {
213         struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
214
215         cpupri_cleanup(&rd->cpupri);
216         cpudl_cleanup(&rd->cpudl);
217         free_cpumask_var(rd->dlo_mask);
218         free_cpumask_var(rd->rto_mask);
219         free_cpumask_var(rd->online);
220         free_cpumask_var(rd->span);
221         kfree(rd);
222 }
223
224 void rq_attach_root(struct rq *rq, struct root_domain *rd)
225 {
226         struct root_domain *old_rd = NULL;
227         unsigned long flags;
228
229         raw_spin_lock_irqsave(&rq->lock, flags);
230
231         if (rq->rd) {
232                 old_rd = rq->rd;
233
234                 if (cpumask_test_cpu(rq->cpu, old_rd->online))
235                         set_rq_offline(rq);
236
237                 cpumask_clear_cpu(rq->cpu, old_rd->span);
238
239                 /*
240                  * If we dont want to free the old_rd yet then
241                  * set old_rd to NULL to skip the freeing later
242                  * in this function:
243                  */
244                 if (!atomic_dec_and_test(&old_rd->refcount))
245                         old_rd = NULL;
246         }
247
248         atomic_inc(&rd->refcount);
249         rq->rd = rd;
250
251         cpumask_set_cpu(rq->cpu, rd->span);
252         if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
253                 set_rq_online(rq);
254
255         raw_spin_unlock_irqrestore(&rq->lock, flags);
256
257         if (old_rd)
258                 call_rcu_sched(&old_rd->rcu, free_rootdomain);
259 }
260
261 static int init_rootdomain(struct root_domain *rd)
262 {
263         if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL))
264                 goto out;
265         if (!zalloc_cpumask_var(&rd->online, GFP_KERNEL))
266                 goto free_span;
267         if (!zalloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL))
268                 goto free_online;
269         if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
270                 goto free_dlo_mask;
271
272 #ifdef HAVE_RT_PUSH_IPI
273         rd->rto_cpu = -1;
274         raw_spin_lock_init(&rd->rto_lock);
275         init_irq_work(&rd->rto_push_work, rto_push_irq_work_func);
276 #endif
277
278         init_dl_bw(&rd->dl_bw);
279         if (cpudl_init(&rd->cpudl) != 0)
280                 goto free_rto_mask;
281
282         if (cpupri_init(&rd->cpupri) != 0)
283                 goto free_cpudl;
284         return 0;
285
286 free_cpudl:
287         cpudl_cleanup(&rd->cpudl);
288 free_rto_mask:
289         free_cpumask_var(rd->rto_mask);
290 free_dlo_mask:
291         free_cpumask_var(rd->dlo_mask);
292 free_online:
293         free_cpumask_var(rd->online);
294 free_span:
295         free_cpumask_var(rd->span);
296 out:
297         return -ENOMEM;
298 }
299
300 /*
301  * By default the system creates a single root-domain with all CPUs as
302  * members (mimicking the global state we have today).
303  */
304 struct root_domain def_root_domain;
305
306 void init_defrootdomain(void)
307 {
308         init_rootdomain(&def_root_domain);
309
310         atomic_set(&def_root_domain.refcount, 1);
311 }
312
313 static struct root_domain *alloc_rootdomain(void)
314 {
315         struct root_domain *rd;
316
317         rd = kzalloc(sizeof(*rd), GFP_KERNEL);
318         if (!rd)
319                 return NULL;
320
321         if (init_rootdomain(rd) != 0) {
322                 kfree(rd);
323                 return NULL;
324         }
325
326         return rd;
327 }
328
329 static void free_sched_groups(struct sched_group *sg, int free_sgc)
330 {
331         struct sched_group *tmp, *first;
332
333         if (!sg)
334                 return;
335
336         first = sg;
337         do {
338                 tmp = sg->next;
339
340                 if (free_sgc && atomic_dec_and_test(&sg->sgc->ref))
341                         kfree(sg->sgc);
342
343                 if (atomic_dec_and_test(&sg->ref))
344                         kfree(sg);
345                 sg = tmp;
346         } while (sg != first);
347 }
348
349 static void destroy_sched_domain(struct sched_domain *sd)
350 {
351         /*
352          * A normal sched domain may have multiple group references, an
353          * overlapping domain, having private groups, only one.  Iterate,
354          * dropping group/capacity references, freeing where none remain.
355          */
356         free_sched_groups(sd->groups, 1);
357
358         if (sd->shared && atomic_dec_and_test(&sd->shared->ref))
359                 kfree(sd->shared);
360         kfree(sd);
361 }
362
363 static void destroy_sched_domains_rcu(struct rcu_head *rcu)
364 {
365         struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
366
367         while (sd) {
368                 struct sched_domain *parent = sd->parent;
369                 destroy_sched_domain(sd);
370                 sd = parent;
371         }
372 }
373
374 static void destroy_sched_domains(struct sched_domain *sd)
375 {
376         if (sd)
377                 call_rcu(&sd->rcu, destroy_sched_domains_rcu);
378 }
379
380 /*
381  * Keep a special pointer to the highest sched_domain that has
382  * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this
383  * allows us to avoid some pointer chasing select_idle_sibling().
384  *
385  * Also keep a unique ID per domain (we use the first CPU number in
386  * the cpumask of the domain), this allows us to quickly tell if
387  * two CPUs are in the same cache domain, see cpus_share_cache().
388  */
389 DEFINE_PER_CPU(struct sched_domain *, sd_llc);
390 DEFINE_PER_CPU(int, sd_llc_size);
391 DEFINE_PER_CPU(int, sd_llc_id);
392 DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared);
393 DEFINE_PER_CPU(struct sched_domain *, sd_numa);
394 DEFINE_PER_CPU(struct sched_domain *, sd_asym);
395
396 static void update_top_cache_domain(int cpu)
397 {
398         struct sched_domain_shared *sds = NULL;
399         struct sched_domain *sd;
400         int id = cpu;
401         int size = 1;
402
403         sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
404         if (sd) {
405                 id = cpumask_first(sched_domain_span(sd));
406                 size = cpumask_weight(sched_domain_span(sd));
407                 sds = sd->shared;
408         }
409
410         rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
411         per_cpu(sd_llc_size, cpu) = size;
412         per_cpu(sd_llc_id, cpu) = id;
413         rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);
414
415         sd = lowest_flag_domain(cpu, SD_NUMA);
416         rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
417
418         sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
419         rcu_assign_pointer(per_cpu(sd_asym, cpu), sd);
420 }
421
422 /*
423  * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
424  * hold the hotplug lock.
425  */
426 static void
427 cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
428 {
429         struct rq *rq = cpu_rq(cpu);
430         struct sched_domain *tmp;
431
432         /* Remove the sched domains which do not contribute to scheduling. */
433         for (tmp = sd; tmp; ) {
434                 struct sched_domain *parent = tmp->parent;
435                 if (!parent)
436                         break;
437
438                 if (sd_parent_degenerate(tmp, parent)) {
439                         tmp->parent = parent->parent;
440                         if (parent->parent)
441                                 parent->parent->child = tmp;
442                         /*
443                          * Transfer SD_PREFER_SIBLING down in case of a
444                          * degenerate parent; the spans match for this
445                          * so the property transfers.
446                          */
447                         if (parent->flags & SD_PREFER_SIBLING)
448                                 tmp->flags |= SD_PREFER_SIBLING;
449                         destroy_sched_domain(parent);
450                 } else
451                         tmp = tmp->parent;
452         }
453
454         if (sd && sd_degenerate(sd)) {
455                 tmp = sd;
456                 sd = sd->parent;
457                 destroy_sched_domain(tmp);
458                 if (sd)
459                         sd->child = NULL;
460         }
461
462         sched_domain_debug(sd, cpu);
463
464         rq_attach_root(rq, rd);
465         tmp = rq->sd;
466         rcu_assign_pointer(rq->sd, sd);
467         dirty_sched_domain_sysctl(cpu);
468         destroy_sched_domains(tmp);
469
470         update_top_cache_domain(cpu);
471 }
472
473 struct s_data {
474         struct sched_domain ** __percpu sd;
475         struct root_domain      *rd;
476 };
477
478 enum s_alloc {
479         sa_rootdomain,
480         sa_sd,
481         sa_sd_storage,
482         sa_none,
483 };
484
485 /*
486  * Return the canonical balance CPU for this group, this is the first CPU
487  * of this group that's also in the balance mask.
488  *
489  * The balance mask are all those CPUs that could actually end up at this
490  * group. See build_balance_mask().
491  *
492  * Also see should_we_balance().
493  */
494 int group_balance_cpu(struct sched_group *sg)
495 {
496         return cpumask_first(group_balance_mask(sg));
497 }
498
499
500 /*
501  * NUMA topology (first read the regular topology blurb below)
502  *
503  * Given a node-distance table, for example:
504  *
505  *   node   0   1   2   3
506  *     0:  10  20  30  20
507  *     1:  20  10  20  30
508  *     2:  30  20  10  20
509  *     3:  20  30  20  10
510  *
511  * which represents a 4 node ring topology like:
512  *
513  *   0 ----- 1
514  *   |       |
515  *   |       |
516  *   |       |
517  *   3 ----- 2
518  *
519  * We want to construct domains and groups to represent this. The way we go
520  * about doing this is to build the domains on 'hops'. For each NUMA level we
521  * construct the mask of all nodes reachable in @level hops.
522  *
523  * For the above NUMA topology that gives 3 levels:
524  *
525  * NUMA-2       0-3             0-3             0-3             0-3
526  *  groups:     {0-1,3},{1-3}   {0-2},{0,2-3}   {1-3},{0-1,3}   {0,2-3},{0-2}
527  *
528  * NUMA-1       0-1,3           0-2             1-3             0,2-3
529  *  groups:     {0},{1},{3}     {0},{1},{2}     {1},{2},{3}     {0},{2},{3}
530  *
531  * NUMA-0       0               1               2               3
532  *
533  *
534  * As can be seen; things don't nicely line up as with the regular topology.
535  * When we iterate a domain in child domain chunks some nodes can be
536  * represented multiple times -- hence the "overlap" naming for this part of
537  * the topology.
538  *
539  * In order to minimize this overlap, we only build enough groups to cover the
540  * domain. For instance Node-0 NUMA-2 would only get groups: 0-1,3 and 1-3.
541  *
542  * Because:
543  *
544  *  - the first group of each domain is its child domain; this
545  *    gets us the first 0-1,3
546  *  - the only uncovered node is 2, who's child domain is 1-3.
547  *
548  * However, because of the overlap, computing a unique CPU for each group is
549  * more complicated. Consider for instance the groups of NODE-1 NUMA-2, both
550  * groups include the CPUs of Node-0, while those CPUs would not in fact ever
551  * end up at those groups (they would end up in group: 0-1,3).
552  *
553  * To correct this we have to introduce the group balance mask. This mask
554  * will contain those CPUs in the group that can reach this group given the
555  * (child) domain tree.
556  *
557  * With this we can once again compute balance_cpu and sched_group_capacity
558  * relations.
559  *
560  * XXX include words on how balance_cpu is unique and therefore can be
561  * used for sched_group_capacity links.
562  *
563  *
564  * Another 'interesting' topology is:
565  *
566  *   node   0   1   2   3
567  *     0:  10  20  20  30
568  *     1:  20  10  20  20
569  *     2:  20  20  10  20
570  *     3:  30  20  20  10
571  *
572  * Which looks a little like:
573  *
574  *   0 ----- 1
575  *   |     / |
576  *   |   /   |
577  *   | /     |
578  *   2 ----- 3
579  *
580  * This topology is asymmetric, nodes 1,2 are fully connected, but nodes 0,3
581  * are not.
582  *
583  * This leads to a few particularly weird cases where the sched_domain's are
584  * not of the same number for each cpu. Consider:
585  *
586  * NUMA-2       0-3                                             0-3
587  *  groups:     {0-2},{1-3}                                     {1-3},{0-2}
588  *
589  * NUMA-1       0-2             0-3             0-3             1-3
590  *
591  * NUMA-0       0               1               2               3
592  *
593  */
594
595
596 /*
597  * Build the balance mask; it contains only those CPUs that can arrive at this
598  * group and should be considered to continue balancing.
599  *
600  * We do this during the group creation pass, therefore the group information
601  * isn't complete yet, however since each group represents a (child) domain we
602  * can fully construct this using the sched_domain bits (which are already
603  * complete).
604  */
605 static void
606 build_balance_mask(struct sched_domain *sd, struct sched_group *sg, struct cpumask *mask)
607 {
608         const struct cpumask *sg_span = sched_group_span(sg);
609         struct sd_data *sdd = sd->private;
610         struct sched_domain *sibling;
611         int i;
612
613         cpumask_clear(mask);
614
615         for_each_cpu(i, sg_span) {
616                 sibling = *per_cpu_ptr(sdd->sd, i);
617
618                 /*
619                  * Can happen in the asymmetric case, where these siblings are
620                  * unused. The mask will not be empty because those CPUs that
621                  * do have the top domain _should_ span the domain.
622                  */
623                 if (!sibling->child)
624                         continue;
625
626                 /* If we would not end up here, we can't continue from here */
627                 if (!cpumask_equal(sg_span, sched_domain_span(sibling->child)))
628                         continue;
629
630                 cpumask_set_cpu(i, mask);
631         }
632
633         /* We must not have empty masks here */
634         WARN_ON_ONCE(cpumask_empty(mask));
635 }
636
637 /*
638  * XXX: This creates per-node group entries; since the load-balancer will
639  * immediately access remote memory to construct this group's load-balance
640  * statistics having the groups node local is of dubious benefit.
641  */
642 static struct sched_group *
643 build_group_from_child_sched_domain(struct sched_domain *sd, int cpu)
644 {
645         struct sched_group *sg;
646         struct cpumask *sg_span;
647
648         sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
649                         GFP_KERNEL, cpu_to_node(cpu));
650
651         if (!sg)
652                 return NULL;
653
654         sg_span = sched_group_span(sg);
655         if (sd->child)
656                 cpumask_copy(sg_span, sched_domain_span(sd->child));
657         else
658                 cpumask_copy(sg_span, sched_domain_span(sd));
659
660         atomic_inc(&sg->ref);
661         return sg;
662 }
663
664 static void init_overlap_sched_group(struct sched_domain *sd,
665                                      struct sched_group *sg)
666 {
667         struct cpumask *mask = sched_domains_tmpmask2;
668         struct sd_data *sdd = sd->private;
669         struct cpumask *sg_span;
670         int cpu;
671
672         build_balance_mask(sd, sg, mask);
673         cpu = cpumask_first_and(sched_group_span(sg), mask);
674
675         sg->sgc = *per_cpu_ptr(sdd->sgc, cpu);
676         if (atomic_inc_return(&sg->sgc->ref) == 1)
677                 cpumask_copy(group_balance_mask(sg), mask);
678         else
679                 WARN_ON_ONCE(!cpumask_equal(group_balance_mask(sg), mask));
680
681         /*
682          * Initialize sgc->capacity such that even if we mess up the
683          * domains and no possible iteration will get us here, we won't
684          * die on a /0 trap.
685          */
686         sg_span = sched_group_span(sg);
687         sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
688         sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
689 }
690
691 static int
692 build_overlap_sched_groups(struct sched_domain *sd, int cpu)
693 {
694         struct sched_group *first = NULL, *last = NULL, *sg;
695         const struct cpumask *span = sched_domain_span(sd);
696         struct cpumask *covered = sched_domains_tmpmask;
697         struct sd_data *sdd = sd->private;
698         struct sched_domain *sibling;
699         int i;
700
701         cpumask_clear(covered);
702
703         for_each_cpu_wrap(i, span, cpu) {
704                 struct cpumask *sg_span;
705
706                 if (cpumask_test_cpu(i, covered))
707                         continue;
708
709                 sibling = *per_cpu_ptr(sdd->sd, i);
710
711                 /*
712                  * Asymmetric node setups can result in situations where the
713                  * domain tree is of unequal depth, make sure to skip domains
714                  * that already cover the entire range.
715                  *
716                  * In that case build_sched_domains() will have terminated the
717                  * iteration early and our sibling sd spans will be empty.
718                  * Domains should always include the CPU they're built on, so
719                  * check that.
720                  */
721                 if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
722                         continue;
723
724                 sg = build_group_from_child_sched_domain(sibling, cpu);
725                 if (!sg)
726                         goto fail;
727
728                 sg_span = sched_group_span(sg);
729                 cpumask_or(covered, covered, sg_span);
730
731                 init_overlap_sched_group(sd, sg);
732
733                 if (!first)
734                         first = sg;
735                 if (last)
736                         last->next = sg;
737                 last = sg;
738                 last->next = first;
739         }
740         sd->groups = first;
741
742         return 0;
743
744 fail:
745         free_sched_groups(first, 0);
746
747         return -ENOMEM;
748 }
749
750
751 /*
752  * Package topology (also see the load-balance blurb in fair.c)
753  *
754  * The scheduler builds a tree structure to represent a number of important
755  * topology features. By default (default_topology[]) these include:
756  *
757  *  - Simultaneous multithreading (SMT)
758  *  - Multi-Core Cache (MC)
759  *  - Package (DIE)
760  *
761  * Where the last one more or less denotes everything up to a NUMA node.
762  *
763  * The tree consists of 3 primary data structures:
764  *
765  *      sched_domain -> sched_group -> sched_group_capacity
766  *          ^ ^             ^ ^
767  *          `-'             `-'
768  *
769  * The sched_domains are per-cpu and have a two way link (parent & child) and
770  * denote the ever growing mask of CPUs belonging to that level of topology.
771  *
772  * Each sched_domain has a circular (double) linked list of sched_group's, each
773  * denoting the domains of the level below (or individual CPUs in case of the
774  * first domain level). The sched_group linked by a sched_domain includes the
775  * CPU of that sched_domain [*].
776  *
777  * Take for instance a 2 threaded, 2 core, 2 cache cluster part:
778  *
779  * CPU   0   1   2   3   4   5   6   7
780  *
781  * DIE  [                             ]
782  * MC   [             ] [             ]
783  * SMT  [     ] [     ] [     ] [     ]
784  *
785  *  - or -
786  *
787  * DIE  0-7 0-7 0-7 0-7 0-7 0-7 0-7 0-7
788  * MC   0-3 0-3 0-3 0-3 4-7 4-7 4-7 4-7
789  * SMT  0-1 0-1 2-3 2-3 4-5 4-5 6-7 6-7
790  *
791  * CPU   0   1   2   3   4   5   6   7
792  *
793  * One way to think about it is: sched_domain moves you up and down among these
794  * topology levels, while sched_group moves you sideways through it, at child
795  * domain granularity.
796  *
797  * sched_group_capacity ensures each unique sched_group has shared storage.
798  *
799  * There are two related construction problems, both require a CPU that
800  * uniquely identify each group (for a given domain):
801  *
802  *  - The first is the balance_cpu (see should_we_balance() and the
803  *    load-balance blub in fair.c); for each group we only want 1 CPU to
804  *    continue balancing at a higher domain.
805  *
806  *  - The second is the sched_group_capacity; we want all identical groups
807  *    to share a single sched_group_capacity.
808  *
809  * Since these topologies are exclusive by construction. That is, its
810  * impossible for an SMT thread to belong to multiple cores, and cores to
811  * be part of multiple caches. There is a very clear and unique location
812  * for each CPU in the hierarchy.
813  *
814  * Therefore computing a unique CPU for each group is trivial (the iteration
815  * mask is redundant and set all 1s; all CPUs in a group will end up at _that_
816  * group), we can simply pick the first CPU in each group.
817  *
818  *
819  * [*] in other words, the first group of each domain is its child domain.
820  */
821
822 static struct sched_group *get_group(int cpu, struct sd_data *sdd)
823 {
824         struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
825         struct sched_domain *child = sd->child;
826         struct sched_group *sg;
827
828         if (child)
829                 cpu = cpumask_first(sched_domain_span(child));
830
831         sg = *per_cpu_ptr(sdd->sg, cpu);
832         sg->sgc = *per_cpu_ptr(sdd->sgc, cpu);
833
834         /* For claim_allocations: */
835         atomic_inc(&sg->ref);
836         atomic_inc(&sg->sgc->ref);
837
838         if (child) {
839                 cpumask_copy(sched_group_span(sg), sched_domain_span(child));
840                 cpumask_copy(group_balance_mask(sg), sched_group_span(sg));
841         } else {
842                 cpumask_set_cpu(cpu, sched_group_span(sg));
843                 cpumask_set_cpu(cpu, group_balance_mask(sg));
844         }
845
846         sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sched_group_span(sg));
847         sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
848
849         return sg;
850 }
851
852 /*
853  * build_sched_groups will build a circular linked list of the groups
854  * covered by the given span, and will set each group's ->cpumask correctly,
855  * and ->cpu_capacity to 0.
856  *
857  * Assumes the sched_domain tree is fully constructed
858  */
859 static int
860 build_sched_groups(struct sched_domain *sd, int cpu)
861 {
862         struct sched_group *first = NULL, *last = NULL;
863         struct sd_data *sdd = sd->private;
864         const struct cpumask *span = sched_domain_span(sd);
865         struct cpumask *covered;
866         int i;
867
868         lockdep_assert_held(&sched_domains_mutex);
869         covered = sched_domains_tmpmask;
870
871         cpumask_clear(covered);
872
873         for_each_cpu_wrap(i, span, cpu) {
874                 struct sched_group *sg;
875
876                 if (cpumask_test_cpu(i, covered))
877                         continue;
878
879                 sg = get_group(i, sdd);
880
881                 cpumask_or(covered, covered, sched_group_span(sg));
882
883                 if (!first)
884                         first = sg;
885                 if (last)
886                         last->next = sg;
887                 last = sg;
888         }
889         last->next = first;
890         sd->groups = first;
891
892         return 0;
893 }
894
895 /*
896  * Initialize sched groups cpu_capacity.
897  *
898  * cpu_capacity indicates the capacity of sched group, which is used while
899  * distributing the load between different sched groups in a sched domain.
900  * Typically cpu_capacity for all the groups in a sched domain will be same
901  * unless there are asymmetries in the topology. If there are asymmetries,
902  * group having more cpu_capacity will pickup more load compared to the
903  * group having less cpu_capacity.
904  */
905 static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
906 {
907         struct sched_group *sg = sd->groups;
908
909         WARN_ON(!sg);
910
911         do {
912                 int cpu, max_cpu = -1;
913
914                 sg->group_weight = cpumask_weight(sched_group_span(sg));
915
916                 if (!(sd->flags & SD_ASYM_PACKING))
917                         goto next;
918
919                 for_each_cpu(cpu, sched_group_span(sg)) {
920                         if (max_cpu < 0)
921                                 max_cpu = cpu;
922                         else if (sched_asym_prefer(cpu, max_cpu))
923                                 max_cpu = cpu;
924                 }
925                 sg->asym_prefer_cpu = max_cpu;
926
927 next:
928                 sg = sg->next;
929         } while (sg != sd->groups);
930
931         if (cpu != group_balance_cpu(sg))
932                 return;
933
934         update_group_capacity(sd, cpu);
935 }
936
937 /*
938  * Initializers for schedule domains
939  * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
940  */
941
942 static int default_relax_domain_level = -1;
943 int sched_domain_level_max;
944
945 static int __init setup_relax_domain_level(char *str)
946 {
947         if (kstrtoint(str, 0, &default_relax_domain_level))
948                 pr_warn("Unable to set relax_domain_level\n");
949
950         return 1;
951 }
952 __setup("relax_domain_level=", setup_relax_domain_level);
953
954 static void set_domain_attribute(struct sched_domain *sd,
955                                  struct sched_domain_attr *attr)
956 {
957         int request;
958
959         if (!attr || attr->relax_domain_level < 0) {
960                 if (default_relax_domain_level < 0)
961                         return;
962                 else
963                         request = default_relax_domain_level;
964         } else
965                 request = attr->relax_domain_level;
966         if (request < sd->level) {
967                 /* Turn off idle balance on this domain: */
968                 sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
969         } else {
970                 /* Turn on idle balance on this domain: */
971                 sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
972         }
973 }
974
975 static void __sdt_free(const struct cpumask *cpu_map);
976 static int __sdt_alloc(const struct cpumask *cpu_map);
977
978 static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
979                                  const struct cpumask *cpu_map)
980 {
981         switch (what) {
982         case sa_rootdomain:
983                 if (!atomic_read(&d->rd->refcount))
984                         free_rootdomain(&d->rd->rcu);
985                 /* Fall through */
986         case sa_sd:
987                 free_percpu(d->sd);
988                 /* Fall through */
989         case sa_sd_storage:
990                 __sdt_free(cpu_map);
991                 /* Fall through */
992         case sa_none:
993                 break;
994         }
995 }
996
997 static enum s_alloc
998 __visit_domain_allocation_hell(struct s_data *d, const struct cpumask *cpu_map)
999 {
1000         memset(d, 0, sizeof(*d));
1001
1002         if (__sdt_alloc(cpu_map))
1003                 return sa_sd_storage;
1004         d->sd = alloc_percpu(struct sched_domain *);
1005         if (!d->sd)
1006                 return sa_sd_storage;
1007         d->rd = alloc_rootdomain();
1008         if (!d->rd)
1009                 return sa_sd;
1010         return sa_rootdomain;
1011 }
1012
1013 /*
1014  * NULL the sd_data elements we've used to build the sched_domain and
1015  * sched_group structure so that the subsequent __free_domain_allocs()
1016  * will not free the data we're using.
1017  */
1018 static void claim_allocations(int cpu, struct sched_domain *sd)
1019 {
1020         struct sd_data *sdd = sd->private;
1021
1022         WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
1023         *per_cpu_ptr(sdd->sd, cpu) = NULL;
1024
1025         if (atomic_read(&(*per_cpu_ptr(sdd->sds, cpu))->ref))
1026                 *per_cpu_ptr(sdd->sds, cpu) = NULL;
1027
1028         if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
1029                 *per_cpu_ptr(sdd->sg, cpu) = NULL;
1030
1031         if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref))
1032                 *per_cpu_ptr(sdd->sgc, cpu) = NULL;
1033 }
1034
1035 #ifdef CONFIG_NUMA
1036 static int sched_domains_numa_levels;
1037 enum numa_topology_type sched_numa_topology_type;
1038 static int *sched_domains_numa_distance;
1039 int sched_max_numa_distance;
1040 static struct cpumask ***sched_domains_numa_masks;
1041 static int sched_domains_curr_level;
1042 #endif
1043
1044 /*
1045  * SD_flags allowed in topology descriptions.
1046  *
1047  * These flags are purely descriptive of the topology and do not prescribe
1048  * behaviour. Behaviour is artificial and mapped in the below sd_init()
1049  * function:
1050  *
1051  *   SD_SHARE_CPUCAPACITY   - describes SMT topologies
1052  *   SD_SHARE_PKG_RESOURCES - describes shared caches
1053  *   SD_NUMA                - describes NUMA topologies
1054  *   SD_SHARE_POWERDOMAIN   - describes shared power domain
1055  *   SD_ASYM_CPUCAPACITY    - describes mixed capacity topologies
1056  *
1057  * Odd one out, which beside describing the topology has a quirk also
1058  * prescribes the desired behaviour that goes along with it:
1059  *
1060  *   SD_ASYM_PACKING        - describes SMT quirks
1061  */
1062 #define TOPOLOGY_SD_FLAGS               \
1063         (SD_SHARE_CPUCAPACITY |         \
1064          SD_SHARE_PKG_RESOURCES |       \
1065          SD_NUMA |                      \
1066          SD_ASYM_PACKING |              \
1067          SD_ASYM_CPUCAPACITY |          \
1068          SD_SHARE_POWERDOMAIN)
1069
1070 static struct sched_domain *
1071 sd_init(struct sched_domain_topology_level *tl,
1072         const struct cpumask *cpu_map,
1073         struct sched_domain *child, int cpu)
1074 {
1075         struct sd_data *sdd = &tl->data;
1076         struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
1077         int sd_id, sd_weight, sd_flags = 0;
1078
1079 #ifdef CONFIG_NUMA
1080         /*
1081          * Ugly hack to pass state to sd_numa_mask()...
1082          */
1083         sched_domains_curr_level = tl->numa_level;
1084 #endif
1085
1086         sd_weight = cpumask_weight(tl->mask(cpu));
1087
1088         if (tl->sd_flags)
1089                 sd_flags = (*tl->sd_flags)();
1090         if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,
1091                         "wrong sd_flags in topology description\n"))
1092                 sd_flags &= ~TOPOLOGY_SD_FLAGS;
1093
1094         *sd = (struct sched_domain){
1095                 .min_interval           = sd_weight,
1096                 .max_interval           = 2*sd_weight,
1097                 .busy_factor            = 32,
1098                 .imbalance_pct          = 125,
1099
1100                 .cache_nice_tries       = 0,
1101                 .busy_idx               = 0,
1102                 .idle_idx               = 0,
1103                 .newidle_idx            = 0,
1104                 .wake_idx               = 0,
1105                 .forkexec_idx           = 0,
1106
1107                 .flags                  = 1*SD_LOAD_BALANCE
1108                                         | 1*SD_BALANCE_NEWIDLE
1109                                         | 1*SD_BALANCE_EXEC
1110                                         | 1*SD_BALANCE_FORK
1111                                         | 0*SD_BALANCE_WAKE
1112                                         | 1*SD_WAKE_AFFINE
1113                                         | 0*SD_SHARE_CPUCAPACITY
1114                                         | 0*SD_SHARE_PKG_RESOURCES
1115                                         | 0*SD_SERIALIZE
1116                                         | 0*SD_PREFER_SIBLING
1117                                         | 0*SD_NUMA
1118                                         | sd_flags
1119                                         ,
1120
1121                 .last_balance           = jiffies,
1122                 .balance_interval       = sd_weight,
1123                 .smt_gain               = 0,
1124                 .max_newidle_lb_cost    = 0,
1125                 .next_decay_max_lb_cost = jiffies,
1126                 .child                  = child,
1127 #ifdef CONFIG_SCHED_DEBUG
1128                 .name                   = tl->name,
1129 #endif
1130         };
1131
1132         cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
1133         sd_id = cpumask_first(sched_domain_span(sd));
1134
1135         /*
1136          * Convert topological properties into behaviour.
1137          */
1138
1139         if (sd->flags & SD_ASYM_CPUCAPACITY) {
1140                 struct sched_domain *t = sd;
1141
1142                 for_each_lower_domain(t)
1143                         t->flags |= SD_BALANCE_WAKE;
1144         }
1145
1146         if (sd->flags & SD_SHARE_CPUCAPACITY) {
1147                 sd->flags |= SD_PREFER_SIBLING;
1148                 sd->imbalance_pct = 110;
1149                 sd->smt_gain = 1178; /* ~15% */
1150
1151         } else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
1152                 sd->flags |= SD_PREFER_SIBLING;
1153                 sd->imbalance_pct = 117;
1154                 sd->cache_nice_tries = 1;
1155                 sd->busy_idx = 2;
1156
1157 #ifdef CONFIG_NUMA
1158         } else if (sd->flags & SD_NUMA) {
1159                 sd->cache_nice_tries = 2;
1160                 sd->busy_idx = 3;
1161                 sd->idle_idx = 2;
1162
1163                 sd->flags |= SD_SERIALIZE;
1164                 if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) {
1165                         sd->flags &= ~(SD_BALANCE_EXEC |
1166                                        SD_BALANCE_FORK |
1167                                        SD_WAKE_AFFINE);
1168                 }
1169
1170 #endif
1171         } else {
1172                 sd->flags |= SD_PREFER_SIBLING;
1173                 sd->cache_nice_tries = 1;
1174                 sd->busy_idx = 2;
1175                 sd->idle_idx = 1;
1176         }
1177
1178         /*
1179          * For all levels sharing cache; connect a sched_domain_shared
1180          * instance.
1181          */
1182         if (sd->flags & SD_SHARE_PKG_RESOURCES) {
1183                 sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
1184                 atomic_inc(&sd->shared->ref);
1185                 atomic_set(&sd->shared->nr_busy_cpus, sd_weight);
1186         }
1187
1188         sd->private = sdd;
1189
1190         return sd;
1191 }
1192
1193 /*
1194  * Topology list, bottom-up.
1195  */
1196 static struct sched_domain_topology_level default_topology[] = {
1197 #ifdef CONFIG_SCHED_SMT
1198         { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
1199 #endif
1200 #ifdef CONFIG_SCHED_MC
1201         { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
1202 #endif
1203         { cpu_cpu_mask, SD_INIT_NAME(DIE) },
1204         { NULL, },
1205 };
1206
1207 static struct sched_domain_topology_level *sched_domain_topology =
1208         default_topology;
1209
1210 #define for_each_sd_topology(tl)                        \
1211         for (tl = sched_domain_topology; tl->mask; tl++)
1212
1213 void set_sched_topology(struct sched_domain_topology_level *tl)
1214 {
1215         if (WARN_ON_ONCE(sched_smp_initialized))
1216                 return;
1217
1218         sched_domain_topology = tl;
1219 }
1220
1221 #ifdef CONFIG_NUMA
1222
1223 static const struct cpumask *sd_numa_mask(int cpu)
1224 {
1225         return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
1226 }
1227
1228 static void sched_numa_warn(const char *str)
1229 {
1230         static int done = false;
1231         int i,j;
1232
1233         if (done)
1234                 return;
1235
1236         done = true;
1237
1238         printk(KERN_WARNING "ERROR: %s\n\n", str);
1239
1240         for (i = 0; i < nr_node_ids; i++) {
1241                 printk(KERN_WARNING "  ");
1242                 for (j = 0; j < nr_node_ids; j++)
1243                         printk(KERN_CONT "%02d ", node_distance(i,j));
1244                 printk(KERN_CONT "\n");
1245         }
1246         printk(KERN_WARNING "\n");
1247 }
1248
1249 bool find_numa_distance(int distance)
1250 {
1251         int i;
1252
1253         if (distance == node_distance(0, 0))
1254                 return true;
1255
1256         for (i = 0; i < sched_domains_numa_levels; i++) {
1257                 if (sched_domains_numa_distance[i] == distance)
1258                         return true;
1259         }
1260
1261         return false;
1262 }
1263
1264 /*
1265  * A system can have three types of NUMA topology:
1266  * NUMA_DIRECT: all nodes are directly connected, or not a NUMA system
1267  * NUMA_GLUELESS_MESH: some nodes reachable through intermediary nodes
1268  * NUMA_BACKPLANE: nodes can reach other nodes through a backplane
1269  *
1270  * The difference between a glueless mesh topology and a backplane
1271  * topology lies in whether communication between not directly
1272  * connected nodes goes through intermediary nodes (where programs
1273  * could run), or through backplane controllers. This affects
1274  * placement of programs.
1275  *
1276  * The type of topology can be discerned with the following tests:
1277  * - If the maximum distance between any nodes is 1 hop, the system
1278  *   is directly connected.
1279  * - If for two nodes A and B, located N > 1 hops away from each other,
1280  *   there is an intermediary node C, which is < N hops away from both
1281  *   nodes A and B, the system is a glueless mesh.
1282  */
1283 static void init_numa_topology_type(void)
1284 {
1285         int a, b, c, n;
1286
1287         n = sched_max_numa_distance;
1288
1289         if (sched_domains_numa_levels <= 1) {
1290                 sched_numa_topology_type = NUMA_DIRECT;
1291                 return;
1292         }
1293
1294         for_each_online_node(a) {
1295                 for_each_online_node(b) {
1296                         /* Find two nodes furthest removed from each other. */
1297                         if (node_distance(a, b) < n)
1298                                 continue;
1299
1300                         /* Is there an intermediary node between a and b? */
1301                         for_each_online_node(c) {
1302                                 if (node_distance(a, c) < n &&
1303                                     node_distance(b, c) < n) {
1304                                         sched_numa_topology_type =
1305                                                         NUMA_GLUELESS_MESH;
1306                                         return;
1307                                 }
1308                         }
1309
1310                         sched_numa_topology_type = NUMA_BACKPLANE;
1311                         return;
1312                 }
1313         }
1314 }
1315
1316 void sched_init_numa(void)
1317 {
1318         int next_distance, curr_distance = node_distance(0, 0);
1319         struct sched_domain_topology_level *tl;
1320         int level = 0;
1321         int i, j, k;
1322
1323         sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
1324         if (!sched_domains_numa_distance)
1325                 return;
1326
1327         /* Includes NUMA identity node at level 0. */
1328         sched_domains_numa_distance[level++] = curr_distance;
1329         sched_domains_numa_levels = level;
1330
1331         /*
1332          * O(nr_nodes^2) deduplicating selection sort -- in order to find the
1333          * unique distances in the node_distance() table.
1334          *
1335          * Assumes node_distance(0,j) includes all distances in
1336          * node_distance(i,j) in order to avoid cubic time.
1337          */
1338         next_distance = curr_distance;
1339         for (i = 0; i < nr_node_ids; i++) {
1340                 for (j = 0; j < nr_node_ids; j++) {
1341                         for (k = 0; k < nr_node_ids; k++) {
1342                                 int distance = node_distance(i, k);
1343
1344                                 if (distance > curr_distance &&
1345                                     (distance < next_distance ||
1346                                      next_distance == curr_distance))
1347                                         next_distance = distance;
1348
1349                                 /*
1350                                  * While not a strong assumption it would be nice to know
1351                                  * about cases where if node A is connected to B, B is not
1352                                  * equally connected to A.
1353                                  */
1354                                 if (sched_debug() && node_distance(k, i) != distance)
1355                                         sched_numa_warn("Node-distance not symmetric");
1356
1357                                 if (sched_debug() && i && !find_numa_distance(distance))
1358                                         sched_numa_warn("Node-0 not representative");
1359                         }
1360                         if (next_distance != curr_distance) {
1361                                 sched_domains_numa_distance[level++] = next_distance;
1362                                 sched_domains_numa_levels = level;
1363                                 curr_distance = next_distance;
1364                         } else break;
1365                 }
1366
1367                 /*
1368                  * In case of sched_debug() we verify the above assumption.
1369                  */
1370                 if (!sched_debug())
1371                         break;
1372         }
1373
1374         if (!level)
1375                 return;
1376
1377         /*
1378          * 'level' contains the number of unique distances
1379          *
1380          * The sched_domains_numa_distance[] array includes the actual distance
1381          * numbers.
1382          */
1383
1384         /*
1385          * Here, we should temporarily reset sched_domains_numa_levels to 0.
1386          * If it fails to allocate memory for array sched_domains_numa_masks[][],
1387          * the array will contain less then 'level' members. This could be
1388          * dangerous when we use it to iterate array sched_domains_numa_masks[][]
1389          * in other functions.
1390          *
1391          * We reset it to 'level' at the end of this function.
1392          */
1393         sched_domains_numa_levels = 0;
1394
1395         sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
1396         if (!sched_domains_numa_masks)
1397                 return;
1398
1399         /*
1400          * Now for each level, construct a mask per node which contains all
1401          * CPUs of nodes that are that many hops away from us.
1402          */
1403         for (i = 0; i < level; i++) {
1404                 sched_domains_numa_masks[i] =
1405                         kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
1406                 if (!sched_domains_numa_masks[i])
1407                         return;
1408
1409                 for (j = 0; j < nr_node_ids; j++) {
1410                         struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
1411                         if (!mask)
1412                                 return;
1413
1414                         sched_domains_numa_masks[i][j] = mask;
1415
1416                         for_each_node(k) {
1417                                 if (node_distance(j, k) > sched_domains_numa_distance[i])
1418                                         continue;
1419
1420                                 cpumask_or(mask, mask, cpumask_of_node(k));
1421                         }
1422                 }
1423         }
1424
1425         /* Compute default topology size */
1426         for (i = 0; sched_domain_topology[i].mask; i++);
1427
1428         tl = kzalloc((i + level + 1) *
1429                         sizeof(struct sched_domain_topology_level), GFP_KERNEL);
1430         if (!tl)
1431                 return;
1432
1433         /*
1434          * Copy the default topology bits..
1435          */
1436         for (i = 0; sched_domain_topology[i].mask; i++)
1437                 tl[i] = sched_domain_topology[i];
1438
1439         /*
1440          * Add the NUMA identity distance, aka single NODE.
1441          */
1442         tl[i++] = (struct sched_domain_topology_level){
1443                 .mask = sd_numa_mask,
1444                 .numa_level = 0,
1445                 SD_INIT_NAME(NODE)
1446         };
1447
1448         /*
1449          * .. and append 'j' levels of NUMA goodness.
1450          */
1451         for (j = 1; j < level; i++, j++) {
1452                 tl[i] = (struct sched_domain_topology_level){
1453                         .mask = sd_numa_mask,
1454                         .sd_flags = cpu_numa_flags,
1455                         .flags = SDTL_OVERLAP,
1456                         .numa_level = j,
1457                         SD_INIT_NAME(NUMA)
1458                 };
1459         }
1460
1461         sched_domain_topology = tl;
1462
1463         sched_domains_numa_levels = level;
1464         sched_max_numa_distance = sched_domains_numa_distance[level - 1];
1465
1466         init_numa_topology_type();
1467 }
1468
1469 void sched_domains_numa_masks_set(unsigned int cpu)
1470 {
1471         int node = cpu_to_node(cpu);
1472         int i, j;
1473
1474         for (i = 0; i < sched_domains_numa_levels; i++) {
1475                 for (j = 0; j < nr_node_ids; j++) {
1476                         if (node_distance(j, node) <= sched_domains_numa_distance[i])
1477                                 cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
1478                 }
1479         }
1480 }
1481
1482 void sched_domains_numa_masks_clear(unsigned int cpu)
1483 {
1484         int i, j;
1485
1486         for (i = 0; i < sched_domains_numa_levels; i++) {
1487                 for (j = 0; j < nr_node_ids; j++)
1488                         cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
1489         }
1490 }
1491
1492 #endif /* CONFIG_NUMA */
1493
1494 static int __sdt_alloc(const struct cpumask *cpu_map)
1495 {
1496         struct sched_domain_topology_level *tl;
1497         int j;
1498
1499         for_each_sd_topology(tl) {
1500                 struct sd_data *sdd = &tl->data;
1501
1502                 sdd->sd = alloc_percpu(struct sched_domain *);
1503                 if (!sdd->sd)
1504                         return -ENOMEM;
1505
1506                 sdd->sds = alloc_percpu(struct sched_domain_shared *);
1507                 if (!sdd->sds)
1508                         return -ENOMEM;
1509
1510                 sdd->sg = alloc_percpu(struct sched_group *);
1511                 if (!sdd->sg)
1512                         return -ENOMEM;
1513
1514                 sdd->sgc = alloc_percpu(struct sched_group_capacity *);
1515                 if (!sdd->sgc)
1516                         return -ENOMEM;
1517
1518                 for_each_cpu(j, cpu_map) {
1519                         struct sched_domain *sd;
1520                         struct sched_domain_shared *sds;
1521                         struct sched_group *sg;
1522                         struct sched_group_capacity *sgc;
1523
1524                         sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
1525                                         GFP_KERNEL, cpu_to_node(j));
1526                         if (!sd)
1527                                 return -ENOMEM;
1528
1529                         *per_cpu_ptr(sdd->sd, j) = sd;
1530
1531                         sds = kzalloc_node(sizeof(struct sched_domain_shared),
1532                                         GFP_KERNEL, cpu_to_node(j));
1533                         if (!sds)
1534                                 return -ENOMEM;
1535
1536                         *per_cpu_ptr(sdd->sds, j) = sds;
1537
1538                         sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
1539                                         GFP_KERNEL, cpu_to_node(j));
1540                         if (!sg)
1541                                 return -ENOMEM;
1542
1543                         sg->next = sg;
1544
1545                         *per_cpu_ptr(sdd->sg, j) = sg;
1546
1547                         sgc = kzalloc_node(sizeof(struct sched_group_capacity) + cpumask_size(),
1548                                         GFP_KERNEL, cpu_to_node(j));
1549                         if (!sgc)
1550                                 return -ENOMEM;
1551
1552 #ifdef CONFIG_SCHED_DEBUG
1553                         sgc->id = j;
1554 #endif
1555
1556                         *per_cpu_ptr(sdd->sgc, j) = sgc;
1557                 }
1558         }
1559
1560         return 0;
1561 }
1562
1563 static void __sdt_free(const struct cpumask *cpu_map)
1564 {
1565         struct sched_domain_topology_level *tl;
1566         int j;
1567
1568         for_each_sd_topology(tl) {
1569                 struct sd_data *sdd = &tl->data;
1570
1571                 for_each_cpu(j, cpu_map) {
1572                         struct sched_domain *sd;
1573
1574                         if (sdd->sd) {
1575                                 sd = *per_cpu_ptr(sdd->sd, j);
1576                                 if (sd && (sd->flags & SD_OVERLAP))
1577                                         free_sched_groups(sd->groups, 0);
1578                                 kfree(*per_cpu_ptr(sdd->sd, j));
1579                         }
1580
1581                         if (sdd->sds)
1582                                 kfree(*per_cpu_ptr(sdd->sds, j));
1583                         if (sdd->sg)
1584                                 kfree(*per_cpu_ptr(sdd->sg, j));
1585                         if (sdd->sgc)
1586                                 kfree(*per_cpu_ptr(sdd->sgc, j));
1587                 }
1588                 free_percpu(sdd->sd);
1589                 sdd->sd = NULL;
1590                 free_percpu(sdd->sds);
1591                 sdd->sds = NULL;
1592                 free_percpu(sdd->sg);
1593                 sdd->sg = NULL;
1594                 free_percpu(sdd->sgc);
1595                 sdd->sgc = NULL;
1596         }
1597 }
1598
1599 static struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
1600                 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
1601                 struct sched_domain *child, int cpu)
1602 {
1603         struct sched_domain *sd = sd_init(tl, cpu_map, child, cpu);
1604
1605         if (child) {
1606                 sd->level = child->level + 1;
1607                 sched_domain_level_max = max(sched_domain_level_max, sd->level);
1608                 child->parent = sd;
1609
1610                 if (!cpumask_subset(sched_domain_span(child),
1611                                     sched_domain_span(sd))) {
1612                         pr_err("BUG: arch topology borken\n");
1613 #ifdef CONFIG_SCHED_DEBUG
1614                         pr_err("     the %s domain not a subset of the %s domain\n",
1615                                         child->name, sd->name);
1616 #endif
1617                         /* Fixup, ensure @sd has at least @child cpus. */
1618                         cpumask_or(sched_domain_span(sd),
1619                                    sched_domain_span(sd),
1620                                    sched_domain_span(child));
1621                 }
1622
1623         }
1624         set_domain_attribute(sd, attr);
1625
1626         return sd;
1627 }
1628
1629 /*
1630  * Build sched domains for a given set of CPUs and attach the sched domains
1631  * to the individual CPUs
1632  */
1633 static int
1634 build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr)
1635 {
1636         enum s_alloc alloc_state;
1637         struct sched_domain *sd;
1638         struct s_data d;
1639         struct rq *rq = NULL;
1640         int i, ret = -ENOMEM;
1641
1642         alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
1643         if (alloc_state != sa_rootdomain)
1644                 goto error;
1645
1646         /* Set up domains for CPUs specified by the cpu_map: */
1647         for_each_cpu(i, cpu_map) {
1648                 struct sched_domain_topology_level *tl;
1649
1650                 sd = NULL;
1651                 for_each_sd_topology(tl) {
1652                         sd = build_sched_domain(tl, cpu_map, attr, sd, i);
1653                         if (tl == sched_domain_topology)
1654                                 *per_cpu_ptr(d.sd, i) = sd;
1655                         if (tl->flags & SDTL_OVERLAP)
1656                                 sd->flags |= SD_OVERLAP;
1657                         if (cpumask_equal(cpu_map, sched_domain_span(sd)))
1658                                 break;
1659                 }
1660         }
1661
1662         /* Build the groups for the domains */
1663         for_each_cpu(i, cpu_map) {
1664                 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
1665                         sd->span_weight = cpumask_weight(sched_domain_span(sd));
1666                         if (sd->flags & SD_OVERLAP) {
1667                                 if (build_overlap_sched_groups(sd, i))
1668                                         goto error;
1669                         } else {
1670                                 if (build_sched_groups(sd, i))
1671                                         goto error;
1672                         }
1673                 }
1674         }
1675
1676         /* Calculate CPU capacity for physical packages and nodes */
1677         for (i = nr_cpumask_bits-1; i >= 0; i--) {
1678                 if (!cpumask_test_cpu(i, cpu_map))
1679                         continue;
1680
1681                 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
1682                         claim_allocations(i, sd);
1683                         init_sched_groups_capacity(i, sd);
1684                 }
1685         }
1686
1687         /* Attach the domains */
1688         rcu_read_lock();
1689         for_each_cpu(i, cpu_map) {
1690                 rq = cpu_rq(i);
1691                 sd = *per_cpu_ptr(d.sd, i);
1692
1693                 /* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */
1694                 if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity))
1695                         WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig);
1696
1697                 cpu_attach_domain(sd, d.rd, i);
1698         }
1699         rcu_read_unlock();
1700
1701         if (rq && sched_debug_enabled) {
1702                 pr_info("span: %*pbl (max cpu_capacity = %lu)\n",
1703                         cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity);
1704         }
1705
1706         ret = 0;
1707 error:
1708         __free_domain_allocs(&d, alloc_state, cpu_map);
1709         return ret;
1710 }
1711
1712 /* Current sched domains: */
1713 static cpumask_var_t                    *doms_cur;
1714
1715 /* Number of sched domains in 'doms_cur': */
1716 static int                              ndoms_cur;
1717
1718 /* Attribues of custom domains in 'doms_cur' */
1719 static struct sched_domain_attr         *dattr_cur;
1720
1721 /*
1722  * Special case: If a kmalloc() of a doms_cur partition (array of
1723  * cpumask) fails, then fallback to a single sched domain,
1724  * as determined by the single cpumask fallback_doms.
1725  */
1726 static cpumask_var_t                    fallback_doms;
1727
1728 /*
1729  * arch_update_cpu_topology lets virtualized architectures update the
1730  * CPU core maps. It is supposed to return 1 if the topology changed
1731  * or 0 if it stayed the same.
1732  */
1733 int __weak arch_update_cpu_topology(void)
1734 {
1735         return 0;
1736 }
1737
1738 cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
1739 {
1740         int i;
1741         cpumask_var_t *doms;
1742
1743         doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL);
1744         if (!doms)
1745                 return NULL;
1746         for (i = 0; i < ndoms; i++) {
1747                 if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
1748                         free_sched_domains(doms, i);
1749                         return NULL;
1750                 }
1751         }
1752         return doms;
1753 }
1754
1755 void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
1756 {
1757         unsigned int i;
1758         for (i = 0; i < ndoms; i++)
1759                 free_cpumask_var(doms[i]);
1760         kfree(doms);
1761 }
1762
1763 /*
1764  * Set up scheduler domains and groups. Callers must hold the hotplug lock.
1765  * For now this just excludes isolated CPUs, but could be used to
1766  * exclude other special cases in the future.
1767  */
1768 int sched_init_domains(const struct cpumask *cpu_map)
1769 {
1770         int err;
1771
1772         zalloc_cpumask_var(&sched_domains_tmpmask, GFP_KERNEL);
1773         zalloc_cpumask_var(&sched_domains_tmpmask2, GFP_KERNEL);
1774         zalloc_cpumask_var(&fallback_doms, GFP_KERNEL);
1775
1776         arch_update_cpu_topology();
1777         ndoms_cur = 1;
1778         doms_cur = alloc_sched_domains(ndoms_cur);
1779         if (!doms_cur)
1780                 doms_cur = &fallback_doms;
1781         cpumask_and(doms_cur[0], cpu_map, housekeeping_cpumask(HK_FLAG_DOMAIN));
1782         err = build_sched_domains(doms_cur[0], NULL);
1783         register_sched_domain_sysctl();
1784
1785         return err;
1786 }
1787
1788 /*
1789  * Detach sched domains from a group of CPUs specified in cpu_map
1790  * These CPUs will now be attached to the NULL domain
1791  */
1792 static void detach_destroy_domains(const struct cpumask *cpu_map)
1793 {
1794         int i;
1795
1796         rcu_read_lock();
1797         for_each_cpu(i, cpu_map)
1798                 cpu_attach_domain(NULL, &def_root_domain, i);
1799         rcu_read_unlock();
1800 }
1801
1802 /* handle null as "default" */
1803 static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
1804                         struct sched_domain_attr *new, int idx_new)
1805 {
1806         struct sched_domain_attr tmp;
1807
1808         /* Fast path: */
1809         if (!new && !cur)
1810                 return 1;
1811
1812         tmp = SD_ATTR_INIT;
1813         return !memcmp(cur ? (cur + idx_cur) : &tmp,
1814                         new ? (new + idx_new) : &tmp,
1815                         sizeof(struct sched_domain_attr));
1816 }
1817
1818 /*
1819  * Partition sched domains as specified by the 'ndoms_new'
1820  * cpumasks in the array doms_new[] of cpumasks. This compares
1821  * doms_new[] to the current sched domain partitioning, doms_cur[].
1822  * It destroys each deleted domain and builds each new domain.
1823  *
1824  * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
1825  * The masks don't intersect (don't overlap.) We should setup one
1826  * sched domain for each mask. CPUs not in any of the cpumasks will
1827  * not be load balanced. If the same cpumask appears both in the
1828  * current 'doms_cur' domains and in the new 'doms_new', we can leave
1829  * it as it is.
1830  *
1831  * The passed in 'doms_new' should be allocated using
1832  * alloc_sched_domains.  This routine takes ownership of it and will
1833  * free_sched_domains it when done with it. If the caller failed the
1834  * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,
1835  * and partition_sched_domains() will fallback to the single partition
1836  * 'fallback_doms', it also forces the domains to be rebuilt.
1837  *
1838  * If doms_new == NULL it will be replaced with cpu_online_mask.
1839  * ndoms_new == 0 is a special case for destroying existing domains,
1840  * and it will not create the default domain.
1841  *
1842  * Call with hotplug lock held
1843  */
1844 void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
1845                              struct sched_domain_attr *dattr_new)
1846 {
1847         int i, j, n;
1848         int new_topology;
1849
1850         mutex_lock(&sched_domains_mutex);
1851
1852         /* Always unregister in case we don't destroy any domains: */
1853         unregister_sched_domain_sysctl();
1854
1855         /* Let the architecture update CPU core mappings: */
1856         new_topology = arch_update_cpu_topology();
1857
1858         if (!doms_new) {
1859                 WARN_ON_ONCE(dattr_new);
1860                 n = 0;
1861                 doms_new = alloc_sched_domains(1);
1862                 if (doms_new) {
1863                         n = 1;
1864                         cpumask_and(doms_new[0], cpu_active_mask,
1865                                     housekeeping_cpumask(HK_FLAG_DOMAIN));
1866                 }
1867         } else {
1868                 n = ndoms_new;
1869         }
1870
1871         /* Destroy deleted domains: */
1872         for (i = 0; i < ndoms_cur; i++) {
1873                 for (j = 0; j < n && !new_topology; j++) {
1874                         if (cpumask_equal(doms_cur[i], doms_new[j])
1875                             && dattrs_equal(dattr_cur, i, dattr_new, j))
1876                                 goto match1;
1877                 }
1878                 /* No match - a current sched domain not in new doms_new[] */
1879                 detach_destroy_domains(doms_cur[i]);
1880 match1:
1881                 ;
1882         }
1883
1884         n = ndoms_cur;
1885         if (!doms_new) {
1886                 n = 0;
1887                 doms_new = &fallback_doms;
1888                 cpumask_and(doms_new[0], cpu_active_mask,
1889                             housekeeping_cpumask(HK_FLAG_DOMAIN));
1890         }
1891
1892         /* Build new domains: */
1893         for (i = 0; i < ndoms_new; i++) {
1894                 for (j = 0; j < n && !new_topology; j++) {
1895                         if (cpumask_equal(doms_new[i], doms_cur[j])
1896                             && dattrs_equal(dattr_new, i, dattr_cur, j))
1897                                 goto match2;
1898                 }
1899                 /* No match - add a new doms_new */
1900                 build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
1901 match2:
1902                 ;
1903         }
1904
1905         /* Remember the new sched domains: */
1906         if (doms_cur != &fallback_doms)
1907                 free_sched_domains(doms_cur, ndoms_cur);
1908
1909         kfree(dattr_cur);
1910         doms_cur = doms_new;
1911         dattr_cur = dattr_new;
1912         ndoms_cur = ndoms_new;
1913
1914         register_sched_domain_sysctl();
1915
1916         mutex_unlock(&sched_domains_mutex);
1917 }
1918