sched/isolation: Move isolcpus= handling to the housekeeping code
[linux-2.6-block.git] / kernel / sched / topology.c
1 /*
2  * Scheduler topology setup/handling methods
3  */
4 #include <linux/sched.h>
5 #include <linux/mutex.h>
6 #include <linux/sched/isolation.h>
7
8 #include "sched.h"
9
10 DEFINE_MUTEX(sched_domains_mutex);
11
12 /* Protected by sched_domains_mutex: */
13 cpumask_var_t sched_domains_tmpmask;
14 cpumask_var_t sched_domains_tmpmask2;
15
16 #ifdef CONFIG_SCHED_DEBUG
17
18 static int __init sched_debug_setup(char *str)
19 {
20         sched_debug_enabled = true;
21
22         return 0;
23 }
24 early_param("sched_debug", sched_debug_setup);
25
26 static inline bool sched_debug(void)
27 {
28         return sched_debug_enabled;
29 }
30
31 static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
32                                   struct cpumask *groupmask)
33 {
34         struct sched_group *group = sd->groups;
35
36         cpumask_clear(groupmask);
37
38         printk(KERN_DEBUG "%*s domain-%d: ", level, "", level);
39
40         if (!(sd->flags & SD_LOAD_BALANCE)) {
41                 printk("does not load-balance\n");
42                 if (sd->parent)
43                         printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
44                                         " has parent");
45                 return -1;
46         }
47
48         printk(KERN_CONT "span=%*pbl level=%s\n",
49                cpumask_pr_args(sched_domain_span(sd)), sd->name);
50
51         if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
52                 printk(KERN_ERR "ERROR: domain->span does not contain "
53                                 "CPU%d\n", cpu);
54         }
55         if (!cpumask_test_cpu(cpu, sched_group_span(group))) {
56                 printk(KERN_ERR "ERROR: domain->groups does not contain"
57                                 " CPU%d\n", cpu);
58         }
59
60         printk(KERN_DEBUG "%*s groups:", level + 1, "");
61         do {
62                 if (!group) {
63                         printk("\n");
64                         printk(KERN_ERR "ERROR: group is NULL\n");
65                         break;
66                 }
67
68                 if (!cpumask_weight(sched_group_span(group))) {
69                         printk(KERN_CONT "\n");
70                         printk(KERN_ERR "ERROR: empty group\n");
71                         break;
72                 }
73
74                 if (!(sd->flags & SD_OVERLAP) &&
75                     cpumask_intersects(groupmask, sched_group_span(group))) {
76                         printk(KERN_CONT "\n");
77                         printk(KERN_ERR "ERROR: repeated CPUs\n");
78                         break;
79                 }
80
81                 cpumask_or(groupmask, groupmask, sched_group_span(group));
82
83                 printk(KERN_CONT " %d:{ span=%*pbl",
84                                 group->sgc->id,
85                                 cpumask_pr_args(sched_group_span(group)));
86
87                 if ((sd->flags & SD_OVERLAP) &&
88                     !cpumask_equal(group_balance_mask(group), sched_group_span(group))) {
89                         printk(KERN_CONT " mask=%*pbl",
90                                 cpumask_pr_args(group_balance_mask(group)));
91                 }
92
93                 if (group->sgc->capacity != SCHED_CAPACITY_SCALE)
94                         printk(KERN_CONT " cap=%lu", group->sgc->capacity);
95
96                 if (group == sd->groups && sd->child &&
97                     !cpumask_equal(sched_domain_span(sd->child),
98                                    sched_group_span(group))) {
99                         printk(KERN_ERR "ERROR: domain->groups does not match domain->child\n");
100                 }
101
102                 printk(KERN_CONT " }");
103
104                 group = group->next;
105
106                 if (group != sd->groups)
107                         printk(KERN_CONT ",");
108
109         } while (group != sd->groups);
110         printk(KERN_CONT "\n");
111
112         if (!cpumask_equal(sched_domain_span(sd), groupmask))
113                 printk(KERN_ERR "ERROR: groups don't span domain->span\n");
114
115         if (sd->parent &&
116             !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
117                 printk(KERN_ERR "ERROR: parent span is not a superset "
118                         "of domain->span\n");
119         return 0;
120 }
121
122 static void sched_domain_debug(struct sched_domain *sd, int cpu)
123 {
124         int level = 0;
125
126         if (!sched_debug_enabled)
127                 return;
128
129         if (!sd) {
130                 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
131                 return;
132         }
133
134         printk(KERN_DEBUG "CPU%d attaching sched-domain(s):\n", cpu);
135
136         for (;;) {
137                 if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
138                         break;
139                 level++;
140                 sd = sd->parent;
141                 if (!sd)
142                         break;
143         }
144 }
145 #else /* !CONFIG_SCHED_DEBUG */
146
147 # define sched_debug_enabled 0
148 # define sched_domain_debug(sd, cpu) do { } while (0)
149 static inline bool sched_debug(void)
150 {
151         return false;
152 }
153 #endif /* CONFIG_SCHED_DEBUG */
154
155 static int sd_degenerate(struct sched_domain *sd)
156 {
157         if (cpumask_weight(sched_domain_span(sd)) == 1)
158                 return 1;
159
160         /* Following flags need at least 2 groups */
161         if (sd->flags & (SD_LOAD_BALANCE |
162                          SD_BALANCE_NEWIDLE |
163                          SD_BALANCE_FORK |
164                          SD_BALANCE_EXEC |
165                          SD_SHARE_CPUCAPACITY |
166                          SD_ASYM_CPUCAPACITY |
167                          SD_SHARE_PKG_RESOURCES |
168                          SD_SHARE_POWERDOMAIN)) {
169                 if (sd->groups != sd->groups->next)
170                         return 0;
171         }
172
173         /* Following flags don't use groups */
174         if (sd->flags & (SD_WAKE_AFFINE))
175                 return 0;
176
177         return 1;
178 }
179
180 static int
181 sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
182 {
183         unsigned long cflags = sd->flags, pflags = parent->flags;
184
185         if (sd_degenerate(parent))
186                 return 1;
187
188         if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
189                 return 0;
190
191         /* Flags needing groups don't count if only 1 group in parent */
192         if (parent->groups == parent->groups->next) {
193                 pflags &= ~(SD_LOAD_BALANCE |
194                                 SD_BALANCE_NEWIDLE |
195                                 SD_BALANCE_FORK |
196                                 SD_BALANCE_EXEC |
197                                 SD_ASYM_CPUCAPACITY |
198                                 SD_SHARE_CPUCAPACITY |
199                                 SD_SHARE_PKG_RESOURCES |
200                                 SD_PREFER_SIBLING |
201                                 SD_SHARE_POWERDOMAIN);
202                 if (nr_node_ids == 1)
203                         pflags &= ~SD_SERIALIZE;
204         }
205         if (~cflags & pflags)
206                 return 0;
207
208         return 1;
209 }
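
/*
 * Illustrative, standalone userspace sketch (editor's addition, not part of
 * this file): the "~cflags & pflags" test above is a bitwise "does the parent
 * still have any flag the child lacks?" check.  The EX_* values below are
 * made up for the example; only the bit arithmetic mirrors the code above.
 */
#include <assert.h>

#define EX_LOAD_BALANCE         0x01UL
#define EX_WAKE_AFFINE          0x02UL
#define EX_SERIALIZE            0x04UL

static int ex_parent_redundant(unsigned long cflags, unsigned long pflags)
{
        /* "~cflags & pflags" is non-zero iff the parent has a flag the child lacks. */
        return (~cflags & pflags) == 0;
}

int main(void)
{
        /* Parent only balances, child balances and is wake-affine: redundant. */
        assert(ex_parent_redundant(EX_LOAD_BALANCE | EX_WAKE_AFFINE, EX_LOAD_BALANCE));
        /* Parent additionally serializes: it still contributes, not redundant. */
        assert(!ex_parent_redundant(EX_LOAD_BALANCE, EX_LOAD_BALANCE | EX_SERIALIZE));
        return 0;
}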
210
211 static void free_rootdomain(struct rcu_head *rcu)
212 {
213         struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
214
215         cpupri_cleanup(&rd->cpupri);
216         cpudl_cleanup(&rd->cpudl);
217         free_cpumask_var(rd->dlo_mask);
218         free_cpumask_var(rd->rto_mask);
219         free_cpumask_var(rd->online);
220         free_cpumask_var(rd->span);
221         kfree(rd);
222 }
223
224 void rq_attach_root(struct rq *rq, struct root_domain *rd)
225 {
226         struct root_domain *old_rd = NULL;
227         unsigned long flags;
228
229         raw_spin_lock_irqsave(&rq->lock, flags);
230
231         if (rq->rd) {
232                 old_rd = rq->rd;
233
234                 if (cpumask_test_cpu(rq->cpu, old_rd->online))
235                         set_rq_offline(rq);
236
237                 cpumask_clear_cpu(rq->cpu, old_rd->span);
238
239                 /*
240                  * If we don't want to free the old_rd yet then
241                  * set old_rd to NULL to skip the freeing later
242                  * in this function:
243                  */
244                 if (!atomic_dec_and_test(&old_rd->refcount))
245                         old_rd = NULL;
246         }
247
248         atomic_inc(&rd->refcount);
249         rq->rd = rd;
250
251         cpumask_set_cpu(rq->cpu, rd->span);
252         if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
253                 set_rq_online(rq);
254
255         raw_spin_unlock_irqrestore(&rq->lock, flags);
256
257         if (old_rd)
258                 call_rcu_sched(&old_rd->rcu, free_rootdomain);
259 }
260
261 static int init_rootdomain(struct root_domain *rd)
262 {
263         if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL))
264                 goto out;
265         if (!zalloc_cpumask_var(&rd->online, GFP_KERNEL))
266                 goto free_span;
267         if (!zalloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL))
268                 goto free_online;
269         if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
270                 goto free_dlo_mask;
271
272 #ifdef HAVE_RT_PUSH_IPI
273         rd->rto_cpu = -1;
274         raw_spin_lock_init(&rd->rto_lock);
275         init_irq_work(&rd->rto_push_work, rto_push_irq_work_func);
276 #endif
277
278         init_dl_bw(&rd->dl_bw);
279         if (cpudl_init(&rd->cpudl) != 0)
280                 goto free_rto_mask;
281
282         if (cpupri_init(&rd->cpupri) != 0)
283                 goto free_cpudl;
284         return 0;
285
286 free_cpudl:
287         cpudl_cleanup(&rd->cpudl);
288 free_rto_mask:
289         free_cpumask_var(rd->rto_mask);
290 free_dlo_mask:
291         free_cpumask_var(rd->dlo_mask);
292 free_online:
293         free_cpumask_var(rd->online);
294 free_span:
295         free_cpumask_var(rd->span);
296 out:
297         return -ENOMEM;
298 }
299
300 /*
301  * By default the system creates a single root-domain with all CPUs as
302  * members (mimicking the global state we have today).
303  */
304 struct root_domain def_root_domain;
305
306 void init_defrootdomain(void)
307 {
308         init_rootdomain(&def_root_domain);
309
310         atomic_set(&def_root_domain.refcount, 1);
311 }
312
313 static struct root_domain *alloc_rootdomain(void)
314 {
315         struct root_domain *rd;
316
317         rd = kzalloc(sizeof(*rd), GFP_KERNEL);
318         if (!rd)
319                 return NULL;
320
321         if (init_rootdomain(rd) != 0) {
322                 kfree(rd);
323                 return NULL;
324         }
325
326         return rd;
327 }
328
329 static void free_sched_groups(struct sched_group *sg, int free_sgc)
330 {
331         struct sched_group *tmp, *first;
332
333         if (!sg)
334                 return;
335
336         first = sg;
337         do {
338                 tmp = sg->next;
339
340                 if (free_sgc && atomic_dec_and_test(&sg->sgc->ref))
341                         kfree(sg->sgc);
342
343                 if (atomic_dec_and_test(&sg->ref))
344                         kfree(sg);
345                 sg = tmp;
346         } while (sg != first);
347 }
348
349 static void destroy_sched_domain(struct sched_domain *sd)
350 {
351         /*
352          * A normal sched domain may have multiple group references; an
353          * overlapping domain, having private groups, has only one.  Iterate,
354          * dropping group/capacity references, freeing where none remain.
355          */
356         free_sched_groups(sd->groups, 1);
357
358         if (sd->shared && atomic_dec_and_test(&sd->shared->ref))
359                 kfree(sd->shared);
360         kfree(sd);
361 }
362
363 static void destroy_sched_domains_rcu(struct rcu_head *rcu)
364 {
365         struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
366
367         while (sd) {
368                 struct sched_domain *parent = sd->parent;
369                 destroy_sched_domain(sd);
370                 sd = parent;
371         }
372 }
373
374 static void destroy_sched_domains(struct sched_domain *sd)
375 {
376         if (sd)
377                 call_rcu(&sd->rcu, destroy_sched_domains_rcu);
378 }
379
380 /*
381  * Keep a special pointer to the highest sched_domain that has
382  * SD_SHARE_PKG_RESOURCES set (Last Level Cache Domain); this
383  * allows us to avoid some pointer chasing in select_idle_sibling().
384  *
385  * Also keep a unique ID per domain (we use the first CPU number in
386  * the cpumask of the domain), this allows us to quickly tell if
387  * two CPUs are in the same cache domain, see cpus_share_cache().
388  */
389 DEFINE_PER_CPU(struct sched_domain *, sd_llc);
390 DEFINE_PER_CPU(int, sd_llc_size);
391 DEFINE_PER_CPU(int, sd_llc_id);
392 DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared);
393 DEFINE_PER_CPU(struct sched_domain *, sd_numa);
394 DEFINE_PER_CPU(struct sched_domain *, sd_asym);
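
/*
 * For reference (editor's hedged sketch; the authoritative definition lives
 * in kernel/sched/core.c): with sd_llc_id cached per CPU, the "do these two
 * CPUs share a cache?" question reduces to a plain id comparison, roughly:
 *
 *      bool cpus_share_cache(int this_cpu, int that_cpu)
 *      {
 *              return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
 *      }
 */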
395
396 static void update_top_cache_domain(int cpu)
397 {
398         struct sched_domain_shared *sds = NULL;
399         struct sched_domain *sd;
400         int id = cpu;
401         int size = 1;
402
403         sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
404         if (sd) {
405                 id = cpumask_first(sched_domain_span(sd));
406                 size = cpumask_weight(sched_domain_span(sd));
407                 sds = sd->shared;
408         }
409
410         rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
411         per_cpu(sd_llc_size, cpu) = size;
412         per_cpu(sd_llc_id, cpu) = id;
413         rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);
414
415         sd = lowest_flag_domain(cpu, SD_NUMA);
416         rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
417
418         sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
419         rcu_assign_pointer(per_cpu(sd_asym, cpu), sd);
420 }
421
422 /*
423  * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
424  * hold the hotplug lock.
425  */
426 static void
427 cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
428 {
429         struct rq *rq = cpu_rq(cpu);
430         struct sched_domain *tmp;
431
432         /* Remove the sched domains which do not contribute to scheduling. */
433         for (tmp = sd; tmp; ) {
434                 struct sched_domain *parent = tmp->parent;
435                 if (!parent)
436                         break;
437
438                 if (sd_parent_degenerate(tmp, parent)) {
439                         tmp->parent = parent->parent;
440                         if (parent->parent)
441                                 parent->parent->child = tmp;
442                         /*
443                          * Transfer SD_PREFER_SIBLING down in case of a
444                          * degenerate parent; the spans match for this
445                          * so the property transfers.
446                          */
447                         if (parent->flags & SD_PREFER_SIBLING)
448                                 tmp->flags |= SD_PREFER_SIBLING;
449                         destroy_sched_domain(parent);
450                 } else
451                         tmp = tmp->parent;
452         }
453
454         if (sd && sd_degenerate(sd)) {
455                 tmp = sd;
456                 sd = sd->parent;
457                 destroy_sched_domain(tmp);
458                 if (sd)
459                         sd->child = NULL;
460         }
461
462         sched_domain_debug(sd, cpu);
463
464         rq_attach_root(rq, rd);
465         tmp = rq->sd;
466         rcu_assign_pointer(rq->sd, sd);
467         dirty_sched_domain_sysctl(cpu);
468         destroy_sched_domains(tmp);
469
470         update_top_cache_domain(cpu);
471 }
472
473 struct s_data {
474         struct sched_domain ** __percpu sd;
475         struct root_domain      *rd;
476 };
477
478 enum s_alloc {
479         sa_rootdomain,
480         sa_sd,
481         sa_sd_storage,
482         sa_none,
483 };
484
485 /*
486  * Return the canonical balance CPU for this group; this is the first CPU
487  * of this group that's also in the balance mask.
488  *
489  * The balance mask contains all those CPUs that could actually end up at this
490  * group. See build_balance_mask().
491  *
492  * Also see should_we_balance().
493  */
494 int group_balance_cpu(struct sched_group *sg)
495 {
496         return cpumask_first(group_balance_mask(sg));
497 }
498
499
500 /*
501  * NUMA topology (first read the regular topology blurb below)
502  *
503  * Given a node-distance table, for example:
504  *
505  *   node   0   1   2   3
506  *     0:  10  20  30  20
507  *     1:  20  10  20  30
508  *     2:  30  20  10  20
509  *     3:  20  30  20  10
510  *
511  * which represents a 4 node ring topology like:
512  *
513  *   0 ----- 1
514  *   |       |
515  *   |       |
516  *   |       |
517  *   3 ----- 2
518  *
519  * We want to construct domains and groups to represent this. The way we go
520  * about doing this is to build the domains on 'hops'. For each NUMA level we
521  * construct the mask of all nodes reachable in @level hops.
522  *
523  * For the above NUMA topology that gives 3 levels:
524  *
525  * NUMA-2       0-3             0-3             0-3             0-3
526  *  groups:     {0-1,3},{1-3}   {0-2},{0,2-3}   {1-3},{0-1,3}   {0,2-3},{0-2}
527  *
528  * NUMA-1       0-1,3           0-2             1-3             0,2-3
529  *  groups:     {0},{1},{3}     {0},{1},{2}     {1},{2},{3}     {0},{2},{3}
530  *
531  * NUMA-0       0               1               2               3
532  *
533  *
534  * As can be seen, things don't nicely line up as with the regular topology.
535  * When we iterate a domain in child domain chunks some nodes can be
536  * represented multiple times -- hence the "overlap" naming for this part of
537  * the topology.
538  *
539  * In order to minimize this overlap, we only build enough groups to cover the
540  * domain. For instance Node-0 NUMA-2 would only get groups: 0-1,3 and 1-3.
541  *
542  * Because:
543  *
544  *  - the first group of each domain is its child domain; this
545  *    gets us the first 0-1,3
546  *  - the only uncovered node is 2, whose child domain is 1-3.
547  *
548  * However, because of the overlap, computing a unique CPU for each group is
549  * more complicated. Consider for instance the groups of Node-1's NUMA-2 domain:
550  * both groups include the CPUs of Node-0, while those CPUs would never in fact
551  * end up at those groups (they would end up in group: 0-1,3).
552  *
553  * To correct this we have to introduce the group balance mask. This mask
554  * will contain those CPUs in the group that can reach this group given the
555  * (child) domain tree.
556  *
557  * With this we can once again compute balance_cpu and sched_group_capacity
558  * relations.
559  *
560  * XXX include words on how balance_cpu is unique and therefore can be
561  * used for sched_group_capacity links.
562  *
563  *
564  * Another 'interesting' topology is:
565  *
566  *   node   0   1   2   3
567  *     0:  10  20  20  30
568  *     1:  20  10  20  20
569  *     2:  20  20  10  20
570  *     3:  30  20  20  10
571  *
572  * Which looks a little like:
573  *
574  *   0 ----- 1
575  *   |     / |
576  *   |   /   |
577  *   | /     |
578  *   2 ----- 3
579  *
580  * This topology is asymmetric, nodes 1,2 are fully connected, but nodes 0,3
581  * are not.
582  *
583  * This leads to a few particularly weird cases where the number of sched_domains
584  * is not the same for each CPU. Consider:
585  *
586  * NUMA-2       0-3                                             0-3
587  *  groups:     {0-2},{1-3}                                     {1-3},{0-2}
588  *
589  * NUMA-1       0-2             0-3             0-3             1-3
590  *
591  * NUMA-0       0               1               2               3
592  *
593  */
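
/*
 * Illustrative, standalone userspace sketch (editor's addition, not part of
 * this file): derive the per-level node masks for the 4-node ring distance
 * table above the same way sched_init_numa() below does -- node k belongs to
 * node j's level-i mask iff dist[j][k] <= the i-th unique distance.  The
 * output reproduces the NUMA-0/1/2 spans listed in the comment.
 */
#include <stdio.h>

int main(void)
{
        static const int dist[4][4] = {
                { 10, 20, 30, 20 },
                { 20, 10, 20, 30 },
                { 30, 20, 10, 20 },
                { 20, 30, 20, 10 },
        };
        static const int level_dist[] = { 10, 20, 30 };        /* unique distances */
        int i, j, k;

        for (i = 0; i < 3; i++) {
                for (j = 0; j < 4; j++) {
                        printf("NUMA-%d node%d: {", i, j);
                        for (k = 0; k < 4; k++) {
                                if (dist[j][k] <= level_dist[i])
                                        printf(" %d", k);
                        }
                        printf(" }\n");
                }
        }
        return 0;
}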
594
595
596 /*
597  * Build the balance mask; it contains only those CPUs that can arrive at this
598  * group and should be considered to continue balancing.
599  *
600  * We do this during the group creation pass; therefore the group information
601  * isn't complete yet. However, since each group represents a (child) domain we
602  * can fully construct this using the sched_domain bits (which are already
603  * complete).
604  */
605 static void
606 build_balance_mask(struct sched_domain *sd, struct sched_group *sg, struct cpumask *mask)
607 {
608         const struct cpumask *sg_span = sched_group_span(sg);
609         struct sd_data *sdd = sd->private;
610         struct sched_domain *sibling;
611         int i;
612
613         cpumask_clear(mask);
614
615         for_each_cpu(i, sg_span) {
616                 sibling = *per_cpu_ptr(sdd->sd, i);
617
618                 /*
619                  * Can happen in the asymmetric case, where these siblings are
620                  * unused. The mask will not be empty because those CPUs that
621                  * do have the top domain _should_ span the domain.
622                  */
623                 if (!sibling->child)
624                         continue;
625
626                 /* If we would not end up here, we can't continue from here */
627                 if (!cpumask_equal(sg_span, sched_domain_span(sibling->child)))
628                         continue;
629
630                 cpumask_set_cpu(i, mask);
631         }
632
633         /* We must not have empty masks here */
634         WARN_ON_ONCE(cpumask_empty(mask));
635 }
636
637 /*
638  * XXX: This creates per-node group entries; since the load-balancer will
639  * immediately access remote memory to construct this group's load-balance
640  * statistics, having the groups node-local is of dubious benefit.
641  */
642 static struct sched_group *
643 build_group_from_child_sched_domain(struct sched_domain *sd, int cpu)
644 {
645         struct sched_group *sg;
646         struct cpumask *sg_span;
647
648         sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
649                         GFP_KERNEL, cpu_to_node(cpu));
650
651         if (!sg)
652                 return NULL;
653
654         sg_span = sched_group_span(sg);
655         if (sd->child)
656                 cpumask_copy(sg_span, sched_domain_span(sd->child));
657         else
658                 cpumask_copy(sg_span, sched_domain_span(sd));
659
660         atomic_inc(&sg->ref);
661         return sg;
662 }
663
664 static void init_overlap_sched_group(struct sched_domain *sd,
665                                      struct sched_group *sg)
666 {
667         struct cpumask *mask = sched_domains_tmpmask2;
668         struct sd_data *sdd = sd->private;
669         struct cpumask *sg_span;
670         int cpu;
671
672         build_balance_mask(sd, sg, mask);
673         cpu = cpumask_first_and(sched_group_span(sg), mask);
674
675         sg->sgc = *per_cpu_ptr(sdd->sgc, cpu);
676         if (atomic_inc_return(&sg->sgc->ref) == 1)
677                 cpumask_copy(group_balance_mask(sg), mask);
678         else
679                 WARN_ON_ONCE(!cpumask_equal(group_balance_mask(sg), mask));
680
681         /*
682          * Initialize sgc->capacity such that even if we mess up the
683          * domains and no possible iteration will get us here, we won't
684          * die on a /0 trap.
685          */
686         sg_span = sched_group_span(sg);
687         sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
688         sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
689 }
690
691 static int
692 build_overlap_sched_groups(struct sched_domain *sd, int cpu)
693 {
694         struct sched_group *first = NULL, *last = NULL, *sg;
695         const struct cpumask *span = sched_domain_span(sd);
696         struct cpumask *covered = sched_domains_tmpmask;
697         struct sd_data *sdd = sd->private;
698         struct sched_domain *sibling;
699         int i;
700
701         cpumask_clear(covered);
702
703         for_each_cpu_wrap(i, span, cpu) {
704                 struct cpumask *sg_span;
705
706                 if (cpumask_test_cpu(i, covered))
707                         continue;
708
709                 sibling = *per_cpu_ptr(sdd->sd, i);
710
711                 /*
712                  * Asymmetric node setups can result in situations where the
713                  * domain tree is of unequal depth, make sure to skip domains
714                  * that already cover the entire range.
715                  *
716                  * In that case build_sched_domains() will have terminated the
717                  * iteration early and our sibling sd spans will be empty.
718                  * Domains should always include the CPU they're built on, so
719                  * check that.
720                  */
721                 if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
722                         continue;
723
724                 sg = build_group_from_child_sched_domain(sibling, cpu);
725                 if (!sg)
726                         goto fail;
727
728                 sg_span = sched_group_span(sg);
729                 cpumask_or(covered, covered, sg_span);
730
731                 init_overlap_sched_group(sd, sg);
732
733                 if (!first)
734                         first = sg;
735                 if (last)
736                         last->next = sg;
737                 last = sg;
738                 last->next = first;
739         }
740         sd->groups = first;
741
742         return 0;
743
744 fail:
745         free_sched_groups(first, 0);
746
747         return -ENOMEM;
748 }
749
750
751 /*
752  * Package topology (also see the load-balance blurb in fair.c)
753  *
754  * The scheduler builds a tree structure to represent a number of important
755  * topology features. By default (default_topology[]) these include:
756  *
757  *  - Simultaneous multithreading (SMT)
758  *  - Multi-Core Cache (MC)
759  *  - Package (DIE)
760  *
761  * Where the last one more or less denotes everything up to a NUMA node.
762  *
763  * The tree consists of 3 primary data structures:
764  *
765  *      sched_domain -> sched_group -> sched_group_capacity
766  *          ^ ^             ^ ^
767  *          `-'             `-'
768  *
769  * The sched_domains are per-CPU and have a two-way link (parent & child) and
770  * denote the ever-growing mask of CPUs belonging to that level of topology.
771  *
772  * Each sched_domain has a circular (double) linked list of sched_group's, each
773  * denoting the domains of the level below (or individual CPUs in case of the
774  * first domain level). The sched_group linked by a sched_domain includes the
775  * CPU of that sched_domain [*].
776  *
777  * Take for instance a 2 threaded, 2 core, 2 cache cluster part:
778  *
779  * CPU   0   1   2   3   4   5   6   7
780  *
781  * DIE  [                             ]
782  * MC   [             ] [             ]
783  * SMT  [     ] [     ] [     ] [     ]
784  *
785  *  - or -
786  *
787  * DIE  0-7 0-7 0-7 0-7 0-7 0-7 0-7 0-7
788  * MC   0-3 0-3 0-3 0-3 4-7 4-7 4-7 4-7
789  * SMT  0-1 0-1 2-3 2-3 4-5 4-5 6-7 6-7
790  *
791  * CPU   0   1   2   3   4   5   6   7
792  *
793  * One way to think about it is: sched_domain moves you up and down among these
794  * topology levels, while sched_group moves you sideways through it, at child
795  * domain granularity.
796  *
797  * sched_group_capacity ensures each unique sched_group has shared storage.
798  *
799  * There are two related construction problems, both of which require a CPU that
800  * uniquely identifies each group (for a given domain):
801  *
802  *  - The first is the balance_cpu (see should_we_balance() and the
803  *    load-balance blurb in fair.c); for each group we only want 1 CPU to
804  *    continue balancing at a higher domain.
805  *
806  *  - The second is the sched_group_capacity; we want all identical groups
807  *    to share a single sched_group_capacity.
808  *
809  * These topologies are exclusive by construction. That is, it's
810  * impossible for an SMT thread to belong to multiple cores, or for a core to
811  * be part of multiple caches. There is a very clear and unique location
812  * for each CPU in the hierarchy.
813  *
814  * Therefore computing a unique CPU for each group is trivial (the iteration
815  * mask is redundant and set to all 1s; all CPUs in a group will end up at _that_
816  * group); we can simply pick the first CPU in each group.
817  *
818  *
819  * [*] in other words, the first group of each domain is its child domain.
820  */
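
/*
 * Illustrative, standalone userspace sketch (editor's addition, not part of
 * this file): for the 2-thread / 2-core / 2-cache example above, print one
 * CPU's domain stack and the groups at each level, showing that every group
 * is simply a child-level span (single CPUs at the bottom).  The level sizes
 * are hard-coded from the table above; the real code derives spans from the
 * topology level's ->mask() callback.
 */
#include <stdio.h>

int main(void)
{
        static const struct { const char *name; int span; } levels[] = {
                { "SMT", 2 }, { "MC", 4 }, { "DIE", 8 },
        };
        int cpu = 3, i, g;

        for (i = 0; i < 3; i++) {
                int size = levels[i].span;
                int base = cpu / size * size;                   /* first CPU in the span */
                int child = i ? levels[i - 1].span : 1;         /* group size = child span */

                printf("%s: span %d-%d, groups:", levels[i].name, base, base + size - 1);
                for (g = base; g < base + size; g += child) {
                        if (child == 1)
                                printf(" {%d}", g);
                        else
                                printf(" {%d-%d}", g, g + child - 1);
                }
                printf("\n");
        }
        return 0;
}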
821
822 static struct sched_group *get_group(int cpu, struct sd_data *sdd)
823 {
824         struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
825         struct sched_domain *child = sd->child;
826         struct sched_group *sg;
827
828         if (child)
829                 cpu = cpumask_first(sched_domain_span(child));
830
831         sg = *per_cpu_ptr(sdd->sg, cpu);
832         sg->sgc = *per_cpu_ptr(sdd->sgc, cpu);
833
834         /* For claim_allocations: */
835         atomic_inc(&sg->ref);
836         atomic_inc(&sg->sgc->ref);
837
838         if (child) {
839                 cpumask_copy(sched_group_span(sg), sched_domain_span(child));
840                 cpumask_copy(group_balance_mask(sg), sched_group_span(sg));
841         } else {
842                 cpumask_set_cpu(cpu, sched_group_span(sg));
843                 cpumask_set_cpu(cpu, group_balance_mask(sg));
844         }
845
846         sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sched_group_span(sg));
847         sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
848
849         return sg;
850 }
851
852 /*
853  * build_sched_groups will build a circular linked list of the groups
854  * covered by the given span, and will set each group's ->cpumask correctly,
855  * and ->cpu_capacity to 0.
856  *
857  * Assumes the sched_domain tree is fully constructed
858  */
859 static int
860 build_sched_groups(struct sched_domain *sd, int cpu)
861 {
862         struct sched_group *first = NULL, *last = NULL;
863         struct sd_data *sdd = sd->private;
864         const struct cpumask *span = sched_domain_span(sd);
865         struct cpumask *covered;
866         int i;
867
868         lockdep_assert_held(&sched_domains_mutex);
869         covered = sched_domains_tmpmask;
870
871         cpumask_clear(covered);
872
873         for_each_cpu_wrap(i, span, cpu) {
874                 struct sched_group *sg;
875
876                 if (cpumask_test_cpu(i, covered))
877                         continue;
878
879                 sg = get_group(i, sdd);
880
881                 cpumask_or(covered, covered, sched_group_span(sg));
882
883                 if (!first)
884                         first = sg;
885                 if (last)
886                         last->next = sg;
887                 last = sg;
888         }
889         last->next = first;
890         sd->groups = first;
891
892         return 0;
893 }
894
895 /*
896  * Initialize sched groups cpu_capacity.
897  *
898  * cpu_capacity indicates the capacity of sched group, which is used while
899  * cpu_capacity indicates the capacity of a sched group, which is used while
900  * distributing the load between different sched groups in a sched domain.
901  * Typically cpu_capacity for all the groups in a sched domain will be the same
902  * unless there are asymmetries in the topology. If there are asymmetries,
903  * the group having more cpu_capacity will pick up more load compared to the
904  * group having less cpu_capacity.
905 static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
906 {
907         struct sched_group *sg = sd->groups;
908
909         WARN_ON(!sg);
910
911         do {
912                 int cpu, max_cpu = -1;
913
914                 sg->group_weight = cpumask_weight(sched_group_span(sg));
915
916                 if (!(sd->flags & SD_ASYM_PACKING))
917                         goto next;
918
919                 for_each_cpu(cpu, sched_group_span(sg)) {
920                         if (max_cpu < 0)
921                                 max_cpu = cpu;
922                         else if (sched_asym_prefer(cpu, max_cpu))
923                                 max_cpu = cpu;
924                 }
925                 sg->asym_prefer_cpu = max_cpu;
926
927 next:
928                 sg = sg->next;
929         } while (sg != sd->groups);
930
931         if (cpu != group_balance_cpu(sg))
932                 return;
933
934         update_group_capacity(sd, cpu);
935 }
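
/*
 * Minimal arithmetic sketch (editor's addition, not the exact fair.c math):
 * with asymmetric capacities, a group's share of the load is roughly
 * proportional to its cpu_capacity, which is the point of the comment above.
 */
#include <stdio.h>

int main(void)
{
        const unsigned long total_load = 3000;
        const unsigned long cap[2] = { 1024, 2048 };    /* e.g. little vs. big cluster */
        const unsigned long total_cap = cap[0] + cap[1];
        int i;

        for (i = 0; i < 2; i++)
                printf("group%d: capacity %lu -> ~%lu of the load\n",
                       i, cap[i], total_load * cap[i] / total_cap);
        return 0;
}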
936
937 /*
938  * Initializers for schedule domains
939  * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
940  */
941
942 static int default_relax_domain_level = -1;
943 int sched_domain_level_max;
944
945 static int __init setup_relax_domain_level(char *str)
946 {
947         if (kstrtoint(str, 0, &default_relax_domain_level))
948                 pr_warn("Unable to set relax_domain_level\n");
949
950         return 1;
951 }
952 __setup("relax_domain_level=", setup_relax_domain_level);
953
954 static void set_domain_attribute(struct sched_domain *sd,
955                                  struct sched_domain_attr *attr)
956 {
957         int request;
958
959         if (!attr || attr->relax_domain_level < 0) {
960                 if (default_relax_domain_level < 0)
961                         return;
962                 else
963                         request = default_relax_domain_level;
964         } else
965                 request = attr->relax_domain_level;
966         if (request < sd->level) {
967                 /* Turn off idle balance on this domain: */
968                 sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
969         } else {
970                 /* Turn on idle balance on this domain: */
971                 sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
972         }
973 }
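
/*
 * Worked example (editor's addition, illustrative): booting with
 * "relax_domain_level=1" makes request = 1, so every domain with level > 1
 * (with the default stack: DIE and above) gets SD_BALANCE_WAKE and
 * SD_BALANCE_NEWIDLE cleared, while level 0 and level 1 domains (SMT and MC
 * when both are configured) get them set.
 */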
974
975 static void __sdt_free(const struct cpumask *cpu_map);
976 static int __sdt_alloc(const struct cpumask *cpu_map);
977
978 static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
979                                  const struct cpumask *cpu_map)
980 {
981         switch (what) {
982         case sa_rootdomain:
983                 if (!atomic_read(&d->rd->refcount))
984                         free_rootdomain(&d->rd->rcu);
985                 /* Fall through */
986         case sa_sd:
987                 free_percpu(d->sd);
988                 /* Fall through */
989         case sa_sd_storage:
990                 __sdt_free(cpu_map);
991                 /* Fall through */
992         case sa_none:
993                 break;
994         }
995 }
996
997 static enum s_alloc
998 __visit_domain_allocation_hell(struct s_data *d, const struct cpumask *cpu_map)
999 {
1000         memset(d, 0, sizeof(*d));
1001
1002         if (__sdt_alloc(cpu_map))
1003                 return sa_sd_storage;
1004         d->sd = alloc_percpu(struct sched_domain *);
1005         if (!d->sd)
1006                 return sa_sd_storage;
1007         d->rd = alloc_rootdomain();
1008         if (!d->rd)
1009                 return sa_sd;
1010         return sa_rootdomain;
1011 }
1012
1013 /*
1014  * NULL the sd_data elements we've used to build the sched_domain and
1015  * sched_group structure so that the subsequent __free_domain_allocs()
1016  * will not free the data we're using.
1017  */
1018 static void claim_allocations(int cpu, struct sched_domain *sd)
1019 {
1020         struct sd_data *sdd = sd->private;
1021
1022         WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
1023         *per_cpu_ptr(sdd->sd, cpu) = NULL;
1024
1025         if (atomic_read(&(*per_cpu_ptr(sdd->sds, cpu))->ref))
1026                 *per_cpu_ptr(sdd->sds, cpu) = NULL;
1027
1028         if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
1029                 *per_cpu_ptr(sdd->sg, cpu) = NULL;
1030
1031         if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref))
1032                 *per_cpu_ptr(sdd->sgc, cpu) = NULL;
1033 }
1034
1035 #ifdef CONFIG_NUMA
1036 static int sched_domains_numa_levels;
1037 enum numa_topology_type sched_numa_topology_type;
1038 static int *sched_domains_numa_distance;
1039 int sched_max_numa_distance;
1040 static struct cpumask ***sched_domains_numa_masks;
1041 static int sched_domains_curr_level;
1042 #endif
1043
1044 /*
1045  * SD_flags allowed in topology descriptions.
1046  *
1047  * These flags are purely descriptive of the topology and do not prescribe
1048  * behaviour. Behaviour is artificial and mapped in the below sd_init()
1049  * function:
1050  *
1051  *   SD_SHARE_CPUCAPACITY   - describes SMT topologies
1052  *   SD_SHARE_PKG_RESOURCES - describes shared caches
1053  *   SD_NUMA                - describes NUMA topologies
1054  *   SD_SHARE_POWERDOMAIN   - describes shared power domain
1055  *   SD_ASYM_CPUCAPACITY    - describes mixed capacity topologies
1056  *
1057  * The odd one out, which besides describing the topology has a quirk: it also
1058  * prescribes the desired behaviour that goes along with it:
1059  *
1060  *   SD_ASYM_PACKING        - describes SMT quirks
1061  */
1062 #define TOPOLOGY_SD_FLAGS               \
1063         (SD_SHARE_CPUCAPACITY |         \
1064          SD_SHARE_PKG_RESOURCES |       \
1065          SD_NUMA |                      \
1066          SD_ASYM_PACKING |              \
1067          SD_ASYM_CPUCAPACITY |          \
1068          SD_SHARE_POWERDOMAIN)
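
/*
 * Illustrative sketch (editor's addition, hypothetical callback, not part of
 * this file): a topology level's ->sd_flags() callback may only return bits
 * from the whitelist above; sd_init() below warns about and strips anything
 * else.
 */
#if 0   /* example only */
static int my_cluster_flags(void)
{
        return SD_SHARE_PKG_RESOURCES;  /* CPUs in the cluster share a cache */
}
#endif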
1069
1070 static struct sched_domain *
1071 sd_init(struct sched_domain_topology_level *tl,
1072         const struct cpumask *cpu_map,
1073         struct sched_domain *child, int cpu)
1074 {
1075         struct sd_data *sdd = &tl->data;
1076         struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
1077         int sd_id, sd_weight, sd_flags = 0;
1078
1079 #ifdef CONFIG_NUMA
1080         /*
1081          * Ugly hack to pass state to sd_numa_mask()...
1082          */
1083         sched_domains_curr_level = tl->numa_level;
1084 #endif
1085
1086         sd_weight = cpumask_weight(tl->mask(cpu));
1087
1088         if (tl->sd_flags)
1089                 sd_flags = (*tl->sd_flags)();
1090         if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,
1091                         "wrong sd_flags in topology description\n"))
1092                 sd_flags &= ~TOPOLOGY_SD_FLAGS;
1093
1094         *sd = (struct sched_domain){
1095                 .min_interval           = sd_weight,
1096                 .max_interval           = 2*sd_weight,
1097                 .busy_factor            = 32,
1098                 .imbalance_pct          = 125,
1099
1100                 .cache_nice_tries       = 0,
1101                 .busy_idx               = 0,
1102                 .idle_idx               = 0,
1103                 .newidle_idx            = 0,
1104                 .wake_idx               = 0,
1105                 .forkexec_idx           = 0,
1106
1107                 .flags                  = 1*SD_LOAD_BALANCE
1108                                         | 1*SD_BALANCE_NEWIDLE
1109                                         | 1*SD_BALANCE_EXEC
1110                                         | 1*SD_BALANCE_FORK
1111                                         | 0*SD_BALANCE_WAKE
1112                                         | 1*SD_WAKE_AFFINE
1113                                         | 0*SD_SHARE_CPUCAPACITY
1114                                         | 0*SD_SHARE_PKG_RESOURCES
1115                                         | 0*SD_SERIALIZE
1116                                         | 0*SD_PREFER_SIBLING
1117                                         | 0*SD_NUMA
1118                                         | sd_flags
1119                                         ,
1120
1121                 .last_balance           = jiffies,
1122                 .balance_interval       = sd_weight,
1123                 .smt_gain               = 0,
1124                 .max_newidle_lb_cost    = 0,
1125                 .next_decay_max_lb_cost = jiffies,
1126                 .child                  = child,
1127 #ifdef CONFIG_SCHED_DEBUG
1128                 .name                   = tl->name,
1129 #endif
1130         };
1131
1132         cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
1133         sd_id = cpumask_first(sched_domain_span(sd));
1134
1135         /*
1136          * Convert topological properties into behaviour.
1137          */
1138
1139         if (sd->flags & SD_ASYM_CPUCAPACITY) {
1140                 struct sched_domain *t = sd;
1141
1142                 for_each_lower_domain(t)
1143                         t->flags |= SD_BALANCE_WAKE;
1144         }
1145
1146         if (sd->flags & SD_SHARE_CPUCAPACITY) {
1147                 sd->flags |= SD_PREFER_SIBLING;
1148                 sd->imbalance_pct = 110;
1149                 sd->smt_gain = 1178; /* ~15% */
1150
1151         } else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
1152                 sd->flags |= SD_PREFER_SIBLING;
1153                 sd->imbalance_pct = 117;
1154                 sd->cache_nice_tries = 1;
1155                 sd->busy_idx = 2;
1156
1157 #ifdef CONFIG_NUMA
1158         } else if (sd->flags & SD_NUMA) {
1159                 sd->cache_nice_tries = 2;
1160                 sd->busy_idx = 3;
1161                 sd->idle_idx = 2;
1162
1163                 sd->flags |= SD_SERIALIZE;
1164                 if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) {
1165                         sd->flags &= ~(SD_BALANCE_EXEC |
1166                                        SD_BALANCE_FORK |
1167                                        SD_WAKE_AFFINE);
1168                 }
1169
1170 #endif
1171         } else {
1172                 sd->flags |= SD_PREFER_SIBLING;
1173                 sd->cache_nice_tries = 1;
1174                 sd->busy_idx = 2;
1175                 sd->idle_idx = 1;
1176         }
1177
1178         /*
1179          * For all levels sharing cache, connect a sched_domain_shared
1180          * instance.
1181          */
1182         if (sd->flags & SD_SHARE_PKG_RESOURCES) {
1183                 sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
1184                 atomic_inc(&sd->shared->ref);
1185                 atomic_set(&sd->shared->nr_busy_cpus, sd_weight);
1186         }
1187
1188         sd->private = sdd;
1189
1190         return sd;
1191 }
1192
1193 /*
1194  * Topology list, bottom-up.
1195  */
1196 static struct sched_domain_topology_level default_topology[] = {
1197 #ifdef CONFIG_SCHED_SMT
1198         { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
1199 #endif
1200 #ifdef CONFIG_SCHED_MC
1201         { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
1202 #endif
1203         { cpu_cpu_mask, SD_INIT_NAME(DIE) },
1204         { NULL, },
1205 };
1206
1207 static struct sched_domain_topology_level *sched_domain_topology =
1208         default_topology;
1209
1210 #define for_each_sd_topology(tl)                        \
1211         for (tl = sched_domain_topology; tl->mask; tl++)
1212
1213 void set_sched_topology(struct sched_domain_topology_level *tl)
1214 {
1215         if (WARN_ON_ONCE(sched_smp_initialized))
1216                 return;
1217
1218         sched_domain_topology = tl;
1219 }
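
/*
 * Illustrative sketch (editor's addition, hypothetical architecture code):
 * an arch can install its own bottom-up topology table early in boot, before
 * SMP bring-up, via set_sched_topology().  The my_arch_* mask and flags
 * helpers below are assumed to exist for the example; cpu_smt_mask(),
 * cpu_smt_flags() and cpu_cpu_mask() are the stock helpers used above.
 */
#if 0   /* example only */
static struct sched_domain_topology_level my_arch_topology[] = {
#ifdef CONFIG_SCHED_SMT
        { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
#endif
        { my_arch_cluster_mask, my_arch_cluster_flags, SD_INIT_NAME(CLS) },
        { cpu_cpu_mask, SD_INIT_NAME(DIE) },
        { NULL, },
};

void __init my_arch_init_sched_topology(void)
{
        set_sched_topology(my_arch_topology);
}
#endif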
1220
1221 #ifdef CONFIG_NUMA
1222
1223 static const struct cpumask *sd_numa_mask(int cpu)
1224 {
1225         return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
1226 }
1227
1228 static void sched_numa_warn(const char *str)
1229 {
1230         static int done = false;
1231         int i,j;
1232
1233         if (done)
1234                 return;
1235
1236         done = true;
1237
1238         printk(KERN_WARNING "ERROR: %s\n\n", str);
1239
1240         for (i = 0; i < nr_node_ids; i++) {
1241                 printk(KERN_WARNING "  ");
1242                 for (j = 0; j < nr_node_ids; j++)
1243                         printk(KERN_CONT "%02d ", node_distance(i,j));
1244                 printk(KERN_CONT "\n");
1245         }
1246         printk(KERN_WARNING "\n");
1247 }
1248
1249 bool find_numa_distance(int distance)
1250 {
1251         int i;
1252
1253         if (distance == node_distance(0, 0))
1254                 return true;
1255
1256         for (i = 0; i < sched_domains_numa_levels; i++) {
1257                 if (sched_domains_numa_distance[i] == distance)
1258                         return true;
1259         }
1260
1261         return false;
1262 }
1263
1264 /*
1265  * A system can have three types of NUMA topology:
1266  * NUMA_DIRECT: all nodes are directly connected, or not a NUMA system
1267  * NUMA_GLUELESS_MESH: some nodes reachable through intermediary nodes
1268  * NUMA_BACKPLANE: nodes can reach other nodes through a backplane
1269  *
1270  * The difference between a glueless mesh topology and a backplane
1271  * topology lies in whether communication between not directly
1272  * connected nodes goes through intermediary nodes (where programs
1273  * could run), or through backplane controllers. This affects
1274  * placement of programs.
1275  *
1276  * The type of topology can be discerned with the following tests:
1277  * - If the maximum distance between any nodes is 1 hop, the system
1278  *   is directly connected.
1279  * - If for two nodes A and B, located N > 1 hops away from each other,
1280  *   there is an intermediary node C, which is < N hops away from both
1281  *   nodes A and B, the system is a glueless mesh.
1282  */
1283 static void init_numa_topology_type(void)
1284 {
1285         int a, b, c, n;
1286
1287         n = sched_max_numa_distance;
1288
1289         if (sched_domains_numa_levels <= 1) {
1290                 sched_numa_topology_type = NUMA_DIRECT;
1291                 return;
1292         }
1293
1294         for_each_online_node(a) {
1295                 for_each_online_node(b) {
1296                         /* Find two nodes furthest removed from each other. */
1297                         if (node_distance(a, b) < n)
1298                                 continue;
1299
1300                         /* Is there an intermediary node between a and b? */
1301                         for_each_online_node(c) {
1302                                 if (node_distance(a, c) < n &&
1303                                     node_distance(b, c) < n) {
1304                                         sched_numa_topology_type =
1305                                                         NUMA_GLUELESS_MESH;
1306                                         return;
1307                                 }
1308                         }
1309
1310                         sched_numa_topology_type = NUMA_BACKPLANE;
1311                         return;
1312                 }
1313         }
1314 }
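
/*
 * Illustrative, standalone userspace sketch (editor's addition, not part of
 * this file): apply the classification rules from the comment above to the
 * 4-node ring table earlier in this file.  It comes out as a glueless mesh:
 * nodes 0 and 2 are a furthest pair, and node 1 is strictly closer to both.
 */
#include <stdio.h>

static const char *ex_classify(const int d[4][4], int nodes, int levels)
{
        int a, b, c, max = 0;

        for (a = 0; a < nodes; a++)
                for (b = 0; b < nodes; b++)
                        if (d[a][b] > max)
                                max = d[a][b];

        if (levels <= 1)
                return "NUMA_DIRECT";

        for (a = 0; a < nodes; a++) {
                for (b = 0; b < nodes; b++) {
                        if (d[a][b] < max)
                                continue;
                        /* (a, b) is a furthest pair; look for an intermediary. */
                        for (c = 0; c < nodes; c++) {
                                if (d[a][c] < max && d[b][c] < max)
                                        return "NUMA_GLUELESS_MESH";
                        }
                        return "NUMA_BACKPLANE";
                }
        }
        return "NUMA_DIRECT";
}

int main(void)
{
        static const int ring[4][4] = {
                { 10, 20, 30, 20 }, { 20, 10, 20, 30 },
                { 30, 20, 10, 20 }, { 20, 30, 20, 10 },
        };

        /* Three unique distances (10, 20, 30) -> three NUMA levels. */
        printf("ring topology: %s\n", ex_classify(ring, 4, 3));
        return 0;
}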
1315
1316 void sched_init_numa(void)
1317 {
1318         int next_distance, curr_distance = node_distance(0, 0);
1319         struct sched_domain_topology_level *tl;
1320         int level = 0;
1321         int i, j, k;
1322
1323         sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
1324         if (!sched_domains_numa_distance)
1325                 return;
1326
1327         /* Includes NUMA identity node at level 0. */
1328         sched_domains_numa_distance[level++] = curr_distance;
1329         sched_domains_numa_levels = level;
1330
1331         /*
1332          * O(nr_nodes^2) deduplicating selection sort -- in order to find the
1333          * unique distances in the node_distance() table.
1334          *
1335          * Assumes node_distance(0,j) includes all distances in
1336          * node_distance(i,j) in order to avoid cubic time.
1337          */
1338         next_distance = curr_distance;
1339         for (i = 0; i < nr_node_ids; i++) {
1340                 for (j = 0; j < nr_node_ids; j++) {
1341                         for (k = 0; k < nr_node_ids; k++) {
1342                                 int distance = node_distance(i, k);
1343
1344                                 if (distance > curr_distance &&
1345                                     (distance < next_distance ||
1346                                      next_distance == curr_distance))
1347                                         next_distance = distance;
1348
1349                                 /*
1350                                  * While not a strong assumption, it would be nice to know
1351                                  * about cases where node A is connected to B but B is not
1352                                  * equally connected to A.
1353                                  */
1354                                 if (sched_debug() && node_distance(k, i) != distance)
1355                                         sched_numa_warn("Node-distance not symmetric");
1356
1357                                 if (sched_debug() && i && !find_numa_distance(distance))
1358                                         sched_numa_warn("Node-0 not representative");
1359                         }
1360                         if (next_distance != curr_distance) {
1361                                 sched_domains_numa_distance[level++] = next_distance;
1362                                 sched_domains_numa_levels = level;
1363                                 curr_distance = next_distance;
1364                         } else break;
1365                 }
1366
1367                 /*
1368                  * In case of sched_debug() we verify the above assumption.
1369                  */
1370                 if (!sched_debug())
1371                         break;
1372         }
1373
1374         if (!level)
1375                 return;
1376
1377         /*
1378          * 'level' contains the number of unique distances
1379          *
1380          * The sched_domains_numa_distance[] array includes the actual distance
1381          * numbers.
1382          */
1383
1384         /*
1385          * Here, we should temporarily reset sched_domains_numa_levels to 0.
1386          * If it fails to allocate memory for the array sched_domains_numa_masks[][],
1387          * the array will contain fewer than 'level' members. This could be
1388          * dangerous when we use it to iterate the array sched_domains_numa_masks[][]
1389          * in other functions.
1390          *
1391          * We reset it to 'level' at the end of this function.
1392          */
1393         sched_domains_numa_levels = 0;
1394
1395         sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
1396         if (!sched_domains_numa_masks)
1397                 return;
1398
1399         /*
1400          * Now for each level, construct a mask per node which contains all
1401          * CPUs of nodes that are that many hops away from us.
1402          */
1403         for (i = 0; i < level; i++) {
1404                 sched_domains_numa_masks[i] =
1405                         kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
1406                 if (!sched_domains_numa_masks[i])
1407                         return;
1408
1409                 for (j = 0; j < nr_node_ids; j++) {
1410                         struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
1411                         if (!mask)
1412                                 return;
1413
1414                         sched_domains_numa_masks[i][j] = mask;
1415
1416                         for_each_node(k) {
1417                                 if (node_distance(j, k) > sched_domains_numa_distance[i])
1418                                         continue;
1419
1420                                 cpumask_or(mask, mask, cpumask_of_node(k));
1421                         }
1422                 }
1423         }
1424
1425         /* Compute default topology size */
1426         for (i = 0; sched_domain_topology[i].mask; i++);
1427
1428         tl = kzalloc((i + level + 1) *
1429                         sizeof(struct sched_domain_topology_level), GFP_KERNEL);
1430         if (!tl)
1431                 return;
1432
1433         /*
1434          * Copy the default topology bits..
1435          */
1436         for (i = 0; sched_domain_topology[i].mask; i++)
1437                 tl[i] = sched_domain_topology[i];
1438
1439         /*
1440          * Add the NUMA identity distance, aka single NODE.
1441          */
1442         tl[i++] = (struct sched_domain_topology_level){
1443                 .mask = sd_numa_mask,
1444                 .numa_level = 0,
1445                 SD_INIT_NAME(NODE)
1446         };
1447
1448         /*
1449          * .. and append 'j' levels of NUMA goodness.
1450          */
1451         for (j = 1; j < level; i++, j++) {
1452                 tl[i] = (struct sched_domain_topology_level){
1453                         .mask = sd_numa_mask,
1454                         .sd_flags = cpu_numa_flags,
1455                         .flags = SDTL_OVERLAP,
1456                         .numa_level = j,
1457                         SD_INIT_NAME(NUMA)
1458                 };
1459         }
1460
1461         sched_domain_topology = tl;
1462
1463         sched_domains_numa_levels = level;
1464         sched_max_numa_distance = sched_domains_numa_distance[level - 1];
1465
1466         init_numa_topology_type();
1467 }
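
/*
 * Worked example (editor's addition, illustrative): for the 4-node ring
 * above, with SMT and MC configured, the table built here ends up, bottom-up,
 * roughly as:
 *
 *      SMT, MC, DIE,
 *      NODE (numa_level 0, identity distance 10),
 *      NUMA (numa_level 1, distance 20, SDTL_OVERLAP),
 *      NUMA (numa_level 2, distance 30, SDTL_OVERLAP)
 *
 * i.e. the default levels, then the NUMA identity level, then one overlapping
 * level per remaining unique distance.
 */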
1468
1469 void sched_domains_numa_masks_set(unsigned int cpu)
1470 {
1471         int node = cpu_to_node(cpu);
1472         int i, j;
1473
1474         for (i = 0; i < sched_domains_numa_levels; i++) {
1475                 for (j = 0; j < nr_node_ids; j++) {
1476                         if (node_distance(j, node) <= sched_domains_numa_distance[i])
1477                                 cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
1478                 }
1479         }
1480 }
1481
1482 void sched_domains_numa_masks_clear(unsigned int cpu)
1483 {
1484         int i, j;
1485
1486         for (i = 0; i < sched_domains_numa_levels; i++) {
1487                 for (j = 0; j < nr_node_ids; j++)
1488                         cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
1489         }
1490 }
1491
1492 #endif /* CONFIG_NUMA */
1493
1494 static int __sdt_alloc(const struct cpumask *cpu_map)
1495 {
1496         struct sched_domain_topology_level *tl;
1497         int j;
1498
1499         for_each_sd_topology(tl) {
1500                 struct sd_data *sdd = &tl->data;
1501
1502                 sdd->sd = alloc_percpu(struct sched_domain *);
1503                 if (!sdd->sd)
1504                         return -ENOMEM;
1505
1506                 sdd->sds = alloc_percpu(struct sched_domain_shared *);
1507                 if (!sdd->sds)
1508                         return -ENOMEM;
1509
1510                 sdd->sg = alloc_percpu(struct sched_group *);
1511                 if (!sdd->sg)
1512                         return -ENOMEM;
1513
1514                 sdd->sgc = alloc_percpu(struct sched_group_capacity *);
1515                 if (!sdd->sgc)
1516                         return -ENOMEM;
1517
1518                 for_each_cpu(j, cpu_map) {
1519                         struct sched_domain *sd;
1520                         struct sched_domain_shared *sds;
1521                         struct sched_group *sg;
1522                         struct sched_group_capacity *sgc;
1523
1524                         sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
1525                                         GFP_KERNEL, cpu_to_node(j));
1526                         if (!sd)
1527                                 return -ENOMEM;
1528
1529                         *per_cpu_ptr(sdd->sd, j) = sd;
1530
1531                         sds = kzalloc_node(sizeof(struct sched_domain_shared),
1532                                         GFP_KERNEL, cpu_to_node(j));
1533                         if (!sds)
1534                                 return -ENOMEM;
1535
1536                         *per_cpu_ptr(sdd->sds, j) = sds;
1537
1538                         sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
1539                                         GFP_KERNEL, cpu_to_node(j));
1540                         if (!sg)
1541                                 return -ENOMEM;
1542
1543                         sg->next = sg;
1544
1545                         *per_cpu_ptr(sdd->sg, j) = sg;
1546
1547                         sgc = kzalloc_node(sizeof(struct sched_group_capacity) + cpumask_size(),
1548                                         GFP_KERNEL, cpu_to_node(j));
1549                         if (!sgc)
1550                                 return -ENOMEM;
1551
1552 #ifdef CONFIG_SCHED_DEBUG
1553                         sgc->id = j;
1554 #endif
1555
1556                         *per_cpu_ptr(sdd->sgc, j) = sgc;
1557                 }
1558         }
1559
1560         return 0;
1561 }
1562
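/*
 * Free whatever __sdt_alloc() allocated and the live topology did not claim:
 * claim_allocations() NULLs the claimed per-CPU pointers, so the kfree()
 * calls below only hit the leftovers, while the per-CPU arrays themselves
 * are always released.
 */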
1563 static void __sdt_free(const struct cpumask *cpu_map)
1564 {
1565         struct sched_domain_topology_level *tl;
1566         int j;
1567
1568         for_each_sd_topology(tl) {
1569                 struct sd_data *sdd = &tl->data;
1570
1571                 for_each_cpu(j, cpu_map) {
1572                         struct sched_domain *sd;
1573
1574                         if (sdd->sd) {
1575                                 sd = *per_cpu_ptr(sdd->sd, j);
1576                                 if (sd && (sd->flags & SD_OVERLAP))
1577                                         free_sched_groups(sd->groups, 0);
1578                                 kfree(*per_cpu_ptr(sdd->sd, j));
1579                         }
1580
1581                         if (sdd->sds)
1582                                 kfree(*per_cpu_ptr(sdd->sds, j));
1583                         if (sdd->sg)
1584                                 kfree(*per_cpu_ptr(sdd->sg, j));
1585                         if (sdd->sgc)
1586                                 kfree(*per_cpu_ptr(sdd->sgc, j));
1587                 }
1588                 free_percpu(sdd->sd);
1589                 sdd->sd = NULL;
1590                 free_percpu(sdd->sds);
1591                 sdd->sds = NULL;
1592                 free_percpu(sdd->sg);
1593                 sdd->sg = NULL;
1594                 free_percpu(sdd->sgc);
1595                 sdd->sgc = NULL;
1596         }
1597 }
1598
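/*
 * Set up one topology level for @cpu and link it on top of @child: bump the
 * domain level, record the parent/child pointers, apply @attr and, if a
 * broken arch topology left part of the child span uncovered, widen the
 * parent span to compensate.
 */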
1599 static struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
1600                 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
1601                 struct sched_domain *child, int cpu)
1602 {
1603         struct sched_domain *sd = sd_init(tl, cpu_map, child, cpu);
1604
1605         if (child) {
1606                 sd->level = child->level + 1;
1607                 sched_domain_level_max = max(sched_domain_level_max, sd->level);
1608                 child->parent = sd;
1609
1610                 if (!cpumask_subset(sched_domain_span(child),
1611                                     sched_domain_span(sd))) {
1612                         pr_err("BUG: arch topology borken\n");
1613 #ifdef CONFIG_SCHED_DEBUG
1614                         pr_err("     the %s domain not a subset of the %s domain\n",
1615                                         child->name, sd->name);
1616 #endif
1617                         /* Fixup, ensure @sd has at least @child cpus. */
1618                         cpumask_or(sched_domain_span(sd),
1619                                    sched_domain_span(sd),
1620                                    sched_domain_span(child));
1621                 }
1622
1623         }
1624         set_domain_attribute(sd, attr);
1625
1626         return sd;
1627 }
1628
1629 /*
1630  * Build sched domains for a given set of CPUs and attach them to the
1631  * individual CPUs.
1632  */
1633 static int
1634 build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr)
1635 {
1636         enum s_alloc alloc_state;
1637         struct sched_domain *sd;
1638         struct s_data d;
1639         struct rq *rq = NULL;
1640         int i, ret = -ENOMEM;
1641
1642         alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
1643         if (alloc_state != sa_rootdomain)
1644                 goto error;
1645
1646         /* Set up domains for CPUs specified by the cpu_map: */
1647         for_each_cpu(i, cpu_map) {
1648                 struct sched_domain_topology_level *tl;
1649
1650                 sd = NULL;
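                /*
                 * Walk the topology from the lowest level upwards, chaining
                 * each new domain on top of the previous one; stop once a
                 * level already spans the whole cpu_map.
                 */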
1651                 for_each_sd_topology(tl) {
1652                         sd = build_sched_domain(tl, cpu_map, attr, sd, i);
1653                         if (tl == sched_domain_topology)
1654                                 *per_cpu_ptr(d.sd, i) = sd;
1655                         if (tl->flags & SDTL_OVERLAP)
1656                                 sd->flags |= SD_OVERLAP;
1657                         if (cpumask_equal(cpu_map, sched_domain_span(sd)))
1658                                 break;
1659                 }
1660         }
1661
1662         /* Build the groups for the domains */
1663         for_each_cpu(i, cpu_map) {
1664                 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
1665                         sd->span_weight = cpumask_weight(sched_domain_span(sd));
1666                         if (sd->flags & SD_OVERLAP) {
1667                                 if (build_overlap_sched_groups(sd, i))
1668                                         goto error;
1669                         } else {
1670                                 if (build_sched_groups(sd, i))
1671                                         goto error;
1672                         }
1673                 }
1674         }
1675
1676         /* Calculate CPU capacity for physical packages and nodes */
1677         for (i = nr_cpumask_bits-1; i >= 0; i--) {
1678                 if (!cpumask_test_cpu(i, cpu_map))
1679                         continue;
1680
1681                 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
1682                         claim_allocations(i, sd);
1683                         init_sched_groups_capacity(i, sd);
1684                 }
1685         }
1686
1687         /* Attach the domains */
1688         rcu_read_lock();
1689         for_each_cpu(i, cpu_map) {
1690                 rq = cpu_rq(i);
1691                 sd = *per_cpu_ptr(d.sd, i);
1692
1693                 /* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */
1694                 if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity))
1695                         WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig);
1696
1697                 cpu_attach_domain(sd, d.rd, i);
1698         }
1699         rcu_read_unlock();
1700
1701         if (rq && sched_debug_enabled) {
1702                 pr_info("span: %*pbl (max cpu_capacity = %lu)\n",
1703                         cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity);
1704         }
1705
1706         ret = 0;
1707 error:
1708         __free_domain_allocs(&d, alloc_state, cpu_map);
1709         return ret;
1710 }
1711
1712 /* Current sched domains: */
1713 static cpumask_var_t                    *doms_cur;
1714
1715 /* Number of sched domains in 'doms_cur': */
1716 static int                              ndoms_cur;
1717
1718 /* Attributes of custom domains in 'doms_cur' */
1719 static struct sched_domain_attr         *dattr_cur;
1720
1721 /*
1722  * Special case: If a kmalloc() of a doms_cur partition (array of
1723  * cpumask) fails, then fall back to a single sched domain,
1724  * as determined by the single cpumask fallback_doms.
1725  */
1726 static cpumask_var_t                    fallback_doms;
1727
1728 /*
1729  * arch_update_cpu_topology lets virtualized architectures update the
1730  * CPU core maps. It is supposed to return 1 if the topology changed
1731  * or 0 if it stayed the same.
1732  */
1733 int __weak arch_update_cpu_topology(void)
1734 {
1735         return 0;
1736 }
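
/*
 * Illustrative override (a sketch, not taken from any in-tree architecture):
 * a hypervisor-aware port could implement the contract above roughly as
 * follows, where hv_topology_generation() is a made-up helper:
 *
 *	int arch_update_cpu_topology(void)
 *	{
 *		static unsigned long last_gen;
 *		unsigned long gen = hv_topology_generation();
 *
 *		if (gen == last_gen)
 *			return 0;
 *		last_gen = gen;
 *		return 1;
 *	}
 */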
1737
1738 cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
1739 {
1740         int i;
1741         cpumask_var_t *doms;
1742
1743         doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL);
1744         if (!doms)
1745                 return NULL;
1746         for (i = 0; i < ndoms; i++) {
1747                 if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
1748                         free_sched_domains(doms, i);
1749                         return NULL;
1750                 }
1751         }
1752         return doms;
1753 }
1754
1755 void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
1756 {
1757         unsigned int i;
1758         for (i = 0; i < ndoms; i++)
1759                 free_cpumask_var(doms[i]);
1760         kfree(doms);
1761 }
1762
1763 /*
1764  * Set up scheduler domains and groups. Callers must hold the hotplug lock.
1765  * For now this just excludes isolated CPUs, but could be used to
1766  * exclude other special cases in the future.
1767  */
1768 int sched_init_domains(const struct cpumask *cpu_map)
1769 {
1770         int err;
1771
1772         zalloc_cpumask_var(&sched_domains_tmpmask, GFP_KERNEL);
1773         zalloc_cpumask_var(&sched_domains_tmpmask2, GFP_KERNEL);
1774         zalloc_cpumask_var(&fallback_doms, GFP_KERNEL);
1775
1776         arch_update_cpu_topology();
1777         ndoms_cur = 1;
1778         doms_cur = alloc_sched_domains(ndoms_cur);
1779         if (!doms_cur)
1780                 doms_cur = &fallback_doms;
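        /*
         * Exclude CPUs that were isolated from domain handling at boot,
         * e.g. a (hypothetical) "isolcpus=2,3" clears CPUs 2-3 from the
         * HK_FLAG_DOMAIN housekeeping mask, so they end up in no sched
         * domain and are never load-balanced.
         */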
1781         cpumask_and(doms_cur[0], cpu_map, housekeeping_cpumask(HK_FLAG_DOMAIN));
1782         err = build_sched_domains(doms_cur[0], NULL);
1783         register_sched_domain_sysctl();
1784
1785         return err;
1786 }
1787
1788 /*
1789  * Detach sched domains from a group of CPUs specified in cpu_map.
1790  * These CPUs will now be attached to the NULL domain.
1791  */
1792 static void detach_destroy_domains(const struct cpumask *cpu_map)
1793 {
1794         int i;
1795
1796         rcu_read_lock();
1797         for_each_cpu(i, cpu_map)
1798                 cpu_attach_domain(NULL, &def_root_domain, i);
1799         rcu_read_unlock();
1800 }
1801
1802 /* Handle NULL as "default" */
1803 static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
1804                         struct sched_domain_attr *new, int idx_new)
1805 {
1806         struct sched_domain_attr tmp;
1807
1808         /* Fast path: */
1809         if (!new && !cur)
1810                 return 1;
1811
1812         tmp = SD_ATTR_INIT;
1813         return !memcmp(cur ? (cur + idx_cur) : &tmp,
1814                         new ? (new + idx_new) : &tmp,
1815                         sizeof(struct sched_domain_attr));
1816 }
1817
1818 /*
1819  * Partition sched domains as specified by the 'ndoms_new'
1820  * cpumasks in the array doms_new[] of cpumasks. This compares
1821  * doms_new[] to the current sched domain partitioning, doms_cur[].
1822  * It destroys each deleted domain and builds each new domain.
1823  *
1824  * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
1825  * The masks don't intersect (don't overlap). We should set up one
1826  * sched domain for each mask. CPUs not in any of the cpumasks will
1827  * not be load balanced. If the same cpumask appears both in the
1828  * current 'doms_cur' domains and in the new 'doms_new', we can leave
1829  * it as it is.
1830  *
1831  * The passed-in 'doms_new' should be allocated using
1832  * alloc_sched_domains(); this routine takes ownership of it and frees
1833  * it via free_sched_domains() when done. If the caller failed the
1834  * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,
1835  * and partition_sched_domains() will fall back to the single partition
1836  * 'fallback_doms'; this also forces the domains to be rebuilt.
1837  *
1838  * If doms_new == NULL it will be replaced with the active HK_FLAG_DOMAIN CPUs.
1839  * ndoms_new == 0 is a special case for destroying existing domains,
1840  * and it will not create the default domain.
1841  *
1842  * Call with hotplug lock held
1843  */
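/*
 * Illustrative call sequence (a sketch only, not lifted from an in-tree
 * caller; mask_a and mask_b stand for hypothetical, non-intersecting
 * cpumasks):
 *
 *	cpumask_var_t *doms = alloc_sched_domains(2);
 *
 *	if (doms) {
 *		cpumask_copy(doms[0], mask_a);
 *		cpumask_copy(doms[1], mask_b);
 *	}
 *	get_online_cpus();
 *	partition_sched_domains(doms ? 2 : 1, doms, NULL);
 *	put_online_cpus();
 *
 * On allocation failure this degenerates into the doms_new == NULL,
 * ndoms_new == 1 fallback described above; otherwise ownership of 'doms'
 * passes to partition_sched_domains(), which will free it later.
 */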
1844 void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
1845                              struct sched_domain_attr *dattr_new)
1846 {
1847         int i, j, n;
1848         int new_topology;
1849
1850         mutex_lock(&sched_domains_mutex);
1851
1852         /* Always unregister in case we don't destroy any domains: */
1853         unregister_sched_domain_sysctl();
1854
1855         /* Let the architecture update CPU core mappings: */
1856         new_topology = arch_update_cpu_topology();
1857
1858         if (!doms_new) {
1859                 WARN_ON_ONCE(dattr_new);
1860                 n = 0;
1861                 doms_new = alloc_sched_domains(1);
1862                 if (doms_new) {
1863                         n = 1;
1864                         cpumask_and(doms_new[0], cpu_active_mask,
1865                                     housekeeping_cpumask(HK_FLAG_DOMAIN));
1866                 }
1867         } else {
1868                 n = ndoms_new;
1869         }
1870
1871         /* Destroy deleted domains: */
1872         for (i = 0; i < ndoms_cur; i++) {
1873                 for (j = 0; j < n && !new_topology; j++) {
1874                         if (cpumask_equal(doms_cur[i], doms_new[j])
1875                             && dattrs_equal(dattr_cur, i, dattr_new, j))
1876                                 goto match1;
1877                 }
1878                 /* No match - a current sched domain not in new doms_new[] */
1879                 detach_destroy_domains(doms_cur[i]);
1880 match1:
1881                 ;
1882         }
1883
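        /*
         * If the allocation above failed, doms_new is still NULL: fall back
         * to the static fallback_doms so at least one housekeeping-only
         * domain gets rebuilt. 'n' bounds the doms_cur[] comparison in the
         * loop below.
         */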
1884         n = ndoms_cur;
1885         if (!doms_new) {
1886                 n = 0;
1887                 doms_new = &fallback_doms;
1888                 cpumask_and(doms_new[0], cpu_active_mask,
1889                             housekeeping_cpumask(HK_FLAG_DOMAIN));
1890         }
1891
1892         /* Build new domains: */
1893         for (i = 0; i < ndoms_new; i++) {
1894                 for (j = 0; j < n && !new_topology; j++) {
1895                         if (cpumask_equal(doms_new[i], doms_cur[j])
1896                             && dattrs_equal(dattr_new, i, dattr_cur, j))
1897                                 goto match2;
1898                 }
1899                 /* No match - add a new doms_new */
1900                 build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
1901 match2:
1902                 ;
1903         }
1904
1905         /* Remember the new sched domains: */
1906         if (doms_cur != &fallback_doms)
1907                 free_sched_domains(doms_cur, ndoms_cur);
1908
1909         kfree(dattr_cur);
1910         doms_cur = doms_new;
1911         dattr_cur = dattr_new;
1912         ndoms_cur = ndoms_new;
1913
1914         register_sched_domain_sysctl();
1915
1916         mutex_unlock(&sched_domains_mutex);
1917 }
1918