percpu: reduce the number of cpu distance comparisons
authorWonhyuk Yang <vvghjk1234@gmail.com>
Fri, 30 Oct 2020 01:38:20 +0000 (10:38 +0900)
committerDennis Zhou <dennis@kernel.org>
Sun, 14 Feb 2021 17:34:05 +0000 (17:34 +0000)
To build group_map[] and group_cnt[], we find out which group
CPUs belong to by comparing the distances between CPUs. However,
this includes cases where comparisons are not required.

This patch uses a bitmap to record the CPUs that have not yet been
classified into a group. Once we know which group a CPU belongs to,
it is cleared from the bitmap. As a result, we can reduce the number
of unnecessary comparisons.

Signed-off-by: Wonhyuk Yang <vvghjk1234@gmail.com>
Signed-off-by: Dennis Zhou <dennis@kernel.org>
[Dennis: added cpumask_clear() call and #include cpumask.h.]

mm/percpu.c

index ad7a37ee74ef5f2ed8ead98c966a08f7d4e2384a..80f8f885a9901c6e8ec400304f4bb30703e8bc76 100644 (file)
@@ -69,6 +69,7 @@
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
 #include <linux/bitmap.h>
+#include <linux/cpumask.h>
 #include <linux/memblock.h>
 #include <linux/err.h>
 #include <linux/lcm.h>
@@ -2669,6 +2670,7 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
 {
        static int group_map[NR_CPUS] __initdata;
        static int group_cnt[NR_CPUS] __initdata;
+       static struct cpumask mask __initdata;
        const size_t static_size = __per_cpu_end - __per_cpu_start;
        int nr_groups = 1, nr_units = 0;
        size_t size_sum, min_unit_size, alloc_size;
@@ -2681,6 +2683,7 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
        /* this function may be called multiple times */
        memset(group_map, 0, sizeof(group_map));
        memset(group_cnt, 0, sizeof(group_cnt));
+       cpumask_clear(&mask);
 
        /* calculate size_sum and ensure dyn_size is enough for early alloc */
        size_sum = PFN_ALIGN(static_size + reserved_size +
@@ -2702,24 +2705,27 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
                upa--;
        max_upa = upa;
 
+       cpumask_copy(&mask, cpu_possible_mask);
+
        /* group cpus according to their proximity */
-       for_each_possible_cpu(cpu) {
-               group = 0;
-       next_group:
-               for_each_possible_cpu(tcpu) {
-                       if (cpu == tcpu)
-                               break;
-                       if (group_map[tcpu] == group && cpu_distance_fn &&
-                           (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE ||
-                            cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) {
-                               group++;
-                               nr_groups = max(nr_groups, group + 1);
-                               goto next_group;
-                       }
-               }
+       for (group = 0; !cpumask_empty(&mask); group++) {
+               /* pop the group's first cpu */
+               cpu = cpumask_first(&mask);
                group_map[cpu] = group;
                group_cnt[group]++;
+               cpumask_clear_cpu(cpu, &mask);
+
+               for_each_cpu(tcpu, &mask) {
+                       if (!cpu_distance_fn ||
+                           (cpu_distance_fn(cpu, tcpu) == LOCAL_DISTANCE &&
+                            cpu_distance_fn(tcpu, cpu) == LOCAL_DISTANCE)) {
+                               group_map[tcpu] = group;
+                               group_cnt[group]++;
+                               cpumask_clear_cpu(tcpu, &mask);
+                       }
+               }
        }
+       nr_groups = group;
 
        /*
         * Wasted space is caused by a ratio imbalance of upa to group_cnt.