mm/mempolicy: Weighted Interleave Auto-tuning
author Joshua Hahn <joshua.hahnjy@gmail.com>
Mon, 5 May 2025 18:23:28 +0000 (11:23 -0700)
committer Andrew Morton <akpm@linux-foundation.org>
Wed, 21 May 2025 16:55:15 +0000 (09:55 -0700)
On machines with multiple memory nodes, interleaving page allocations
across nodes allows for better utilization of each node's bandwidth.
Previous work by Gregory Price [1] introduced weighted interleave, which
allows pages to be allocated across nodes according to user-set ratios.

Ideally, these weights should be proportional to each node's bandwidth, so
that under bandwidth pressure each node operates at its maximum efficient
bandwidth and latency is kept from rising sharply.

Previously, weighted interleave's default weights were all 1, which is
equivalent to the (unweighted) interleave mempolicy: nodes are walked in a
round-robin fashion and bandwidth information is ignored.

This patch has two main goals.  First, it makes weighted interleave easier
to use for users who wish to relieve bandwidth pressure when using nodes
with varying bandwidth (CXL).  By providing a set of "real" default weights
that just work out of the box, users who are unable (or do not wish) to
experiment to find the optimal weights for their system can still take
advantage of bandwidth-informed weighted interleave.

Second, it allows weighted interleave to adjust dynamically to hotplugged
memory that brings new bandwidth information.  Instead of requiring node
weights to be updated by hand every time bandwidth information is reported
or removed, weighted interleave recalculates its default weights whenever
the bandwidth information changes.

To meet these goals, this patch introduces an auto-configuration mode for
the interleave weights, which provides a reasonable set of default weights
calculated from the bandwidth data reported by the system.  In auto mode,
the weights are adjusted dynamically whenever the reported bandwidth
information changes, including in response to hotplug events.
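
For illustration, here is a small userspace model of that calculation (the
bandwidth numbers are hypothetical; in the kernel the same math is done by
reduce_interleave_weights() over node_bw_table):

  #include <stdio.h>

  /* Euclid's algorithm, standing in for the kernel's gcd() helper */
  static unsigned int gcd(unsigned int a, unsigned int b)
  {
          while (b) {
                  unsigned int t = a % b;
                  a = b;
                  b = t;
          }
          return a;
  }

  int main(void)
  {
          const unsigned int weightiness = 32;    /* same constant as the patch */
          unsigned int bw[2] = { 256000, 64000 }; /* e.g. a DRAM and a CXL node */
          unsigned long long sum_bw = 0;
          unsigned int iw[2], iw_gcd = 0;

          for (int i = 0; i < 2; i++)
                  sum_bw += bw[i];

          /* Scale each node's share of total bandwidth into [1, weightiness] */
          for (int i = 0; i < 2; i++) {
                  unsigned long long s = (unsigned long long)weightiness * bw[i];

                  iw[i] = (unsigned int)(s / sum_bw);
                  if (!iw[i])
                          iw[i] = 1;
                  iw_gcd = iw_gcd ? gcd(iw_gcd, iw[i]) : iw[i];
          }

          /* Reduce by the GCD: 24:8 becomes 3:1, while 25:6 stays 25:6 */
          for (int i = 0; i < 2; i++)
                  printf("node%d weight = %u\n", i, iw[i] / iw_gcd);
          return 0;
  }

With these example numbers the result is 25:6, i.e. roughly 4/5 of
interleaved pages land on the faster node.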

Users can still write weights manually to the nodeN sysfs interfaces; doing
so switches the system into manual mode.  In manual mode, the system stops
dynamically updating any of the node weights, even during hotplug events
that shift the optimal weight distribution.

A new sysfs interface "auto" is introduced, which allows users to switch
between the auto (writing 1 or Y) and manual (writing 0 or N) modes.  The
system also automatically enters manual mode when a nodeN interface is
manually written to.

This patch makes one functional change to the existing weighted_interleave
ABI: previously, writing 0 directly to a nodeN interface was documented to
reset the weight to the system default.  Since that default was 1 for every
node, writing 0 and writing 1 were functionally equivalent.  With this
patch, writing 0 is invalid and returns -EINVAL.
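
As a rough sketch of the resulting workflow from userspace (paths come from
the ABI file this patch updates; the weight value is only an example and
error handling is trimmed):

  #include <fcntl.h>
  #include <stdio.h>
  #include <string.h>
  #include <unistd.h>

  #define WI_DIR "/sys/kernel/mm/mempolicy/weighted_interleave/"

  static int wi_write(const char *file, const char *val)
  {
          char path[256];
          ssize_t ret;
          int fd;

          snprintf(path, sizeof(path), WI_DIR "%s", file);
          fd = open(path, O_WRONLY);
          if (fd < 0)
                  return -1;
          ret = write(fd, val, strlen(val));
          close(fd);
          return ret < 0 ? -1 : 0;
  }

  int main(void)
  {
          /* Use the bandwidth-derived defaults; weights now track hotplug */
          wi_write("auto", "1");

          /* Any direct nodeN write drops the system back into manual mode */
          wi_write("node0", "7");

          /* Writing 0 is no longer a reset; the kernel rejects it */
          if (wi_write("node0", "0") < 0)
                  fprintf(stderr, "node0 = 0 rejected, as expected\n");

          return 0;
  }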

Link: https://lkml.kernel.org/r/20250520141236.2987309-1-joshua.hahnjy@gmail.com
[joshua.hahnjy@gmail.com: wordsmithing changes, simplification, fixes]
Link: https://lkml.kernel.org/r/20250511025840.2410154-1-joshua.hahnjy@gmail.com
[joshua.hahnjy@gmail.com: remove auto_kobj_attr field from struct sysfs_wi_group]
Link: https://lkml.kernel.org/r/20250512142511.3959833-1-joshua.hahnjy@gmail.com
Link: https://lore.kernel.org/linux-mm/20240202170238.90004-1-gregory.price@memverge.com/ [1]
Link: https://lkml.kernel.org/r/20250505182328.4148265-1-joshua.hahnjy@gmail.com
Co-developed-by: Gregory Price <gourry@gourry.net>
Signed-off-by: Gregory Price <gourry@gourry.net>
Signed-off-by: Joshua Hahn <joshua.hahnjy@gmail.com>
Suggested-by: Yunjeong Mun <yunjeong.mun@sk.com>
Suggested-by: Oscar Salvador <osalvador@suse.de>
Suggested-by: Ying Huang <ying.huang@linux.alibaba.com>
Suggested-by: Harry Yoo <harry.yoo@oracle.com>
Reviewed-by: Harry Yoo <harry.yoo@oracle.com>
Reviewed-by: Huang Ying <ying.huang@linux.alibaba.com>
Reviewed-by: Honggyu Kim <honggyu.kim@sk.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Len Brown <lenb@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Documentation/ABI/testing/sysfs-kernel-mm-mempolicy-weighted-interleave
drivers/base/node.c
include/linux/mempolicy.h
mm/mempolicy.c

Documentation/ABI/testing/sysfs-kernel-mm-mempolicy-weighted-interleave
index 0b7972de04e9392b0a56bbb5d1e0a8b963fa3d6f..649c0e9b895c4a8ee3dd2156d051ba25c9b9fdf8 100644
@@ -20,6 +20,35 @@ Description: Weight configuration interface for nodeN
                Minimum weight: 1
                Maximum weight: 255
 
-               Writing an empty string or `0` will reset the weight to the
-               system default. The system default may be set by the kernel
-               or drivers at boot or during hotplug events.
+               Writing invalid values (i.e. any values not in [1,255],
+               empty string, ...) will return -EINVAL.
+
+               Changing the weight to a valid value will automatically
+               switch the system to manual mode as well.
+
+What:          /sys/kernel/mm/mempolicy/weighted_interleave/auto
+Date:          May 2025
+Contact:       Linux memory management mailing list <linux-mm@kvack.org>
+Description:   Auto-weighting configuration interface
+
+               Configuration mode for weighted interleave. 'true' indicates
+               that the system is in auto mode, and 'false' indicates that
+               the system is in manual mode.
+
+               In auto mode, all node weights are re-calculated and overwritten
+               (visible via the nodeN interfaces) whenever new bandwidth data
+               is made available during either boot or hotplug events.
+
+               In manual mode, node weights can only be updated by the user.
+               Note that nodes that are onlined with previously set weights
+               will reuse those weights. If they were not previously set or
+               are onlined with missing bandwidth data, the weights will use
+               a default weight of 1.
+
+               Writing any true value string (e.g. Y or 1) will enable auto
+               mode, while writing any false value string (e.g. N or 0) will
+               enable manual mode. Writing any other string is rejected and
+               will return -EINVAL.
+
+               Writing a new weight to a node directly via the nodeN interface
+               will also automatically switch the system to manual mode.
drivers/base/node.c
index cd13ef2870119fd2d9f6b6d3f84ad7797ce76732..25ab9ec14eb8d46308bd32869624d01583df5a69 100644
@@ -7,6 +7,7 @@
 #include <linux/init.h>
 #include <linux/mm.h>
 #include <linux/memory.h>
+#include <linux/mempolicy.h>
 #include <linux/vmstat.h>
 #include <linux/notifier.h>
 #include <linux/node.h>
@@ -214,6 +215,14 @@ void node_set_perf_attrs(unsigned int nid, struct access_coordinate *coord,
                        break;
                }
        }
+
+       /* When setting CPU access coordinates, update mempolicy */
+       if (access == ACCESS_COORDINATE_CPU) {
+               if (mempolicy_set_node_perf(nid, coord)) {
+                       pr_info("failed to set mempolicy attrs for node %d\n",
+                               nid);
+               }
+       }
 }
 EXPORT_SYMBOL_GPL(node_set_perf_attrs);
 
include/linux/mempolicy.h
index ce9885e0178adfa37a7cb466102d4955a54bf587..0fe96f3ab3ef02e902e1676e750c2006ecd6147f 100644
@@ -11,6 +11,7 @@
 #include <linux/slab.h>
 #include <linux/rbtree.h>
 #include <linux/spinlock.h>
+#include <linux/node.h>
 #include <linux/nodemask.h>
 #include <linux/pagemap.h>
 #include <uapi/linux/mempolicy.h>
@@ -178,6 +179,9 @@ static inline bool mpol_is_preferred_many(struct mempolicy *pol)
 
 extern bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone);
 
+extern int mempolicy_set_node_perf(unsigned int node,
+                                  struct access_coordinate *coords);
+
 #else
 
 struct mempolicy {};
mm/mempolicy.c
index 9a2b4b36f55818c3230d67b0ac6e9f0557023465..72fd72e156b1a5dc8706126115c7f137358b6d01 100644
 #include <linux/mmu_notifier.h>
 #include <linux/printk.h>
 #include <linux/swapops.h>
+#include <linux/gcd.h>
 
 #include <asm/tlbflush.h>
 #include <asm/tlb.h>
@@ -140,31 +141,138 @@ static struct mempolicy default_policy = {
 static struct mempolicy preferred_node_policy[MAX_NUMNODES];
 
 /*
- * iw_table is the sysfs-set interleave weight table, a value of 0 denotes
- * system-default value should be used. A NULL iw_table also denotes that
- * system-default values should be used. Until the system-default table
- * is implemented, the system-default is always 1.
- *
- * iw_table is RCU protected
+ * weightiness balances the tradeoff between small weights (cycles through nodes
+ * faster, more fair/even distribution) and large weights (smaller errors
+ * between actual bandwidth ratios and weight ratios). 32 is a number that has
+ * been found to perform at a reasonable compromise between the two goals.
+ */
+static const int weightiness = 32;
+
+/*
+ * A null weighted_interleave_state is interpreted as having .mode="auto",
+ * and .iw_table is interpreted as an array of 1s with length nr_node_ids.
+ */
+struct weighted_interleave_state {
+       bool mode_auto;
+       u8 iw_table[];
+};
+static struct weighted_interleave_state __rcu *wi_state;
+static unsigned int *node_bw_table;
+
+/*
+ * wi_state_lock protects both wi_state and node_bw_table.
+ * node_bw_table is only used by writers to update wi_state.
  */
-static u8 __rcu *iw_table;
-static DEFINE_MUTEX(iw_table_lock);
+static DEFINE_MUTEX(wi_state_lock);
 
 static u8 get_il_weight(int node)
 {
-       u8 *table;
-       u8 weight;
+       struct weighted_interleave_state *state;
+       u8 weight = 1;
 
        rcu_read_lock();
-       table = rcu_dereference(iw_table);
-       /* if no iw_table, use system default */
-       weight = table ? table[node] : 1;
-       /* if value in iw_table is 0, use system default */
-       weight = weight ? weight : 1;
+       state = rcu_dereference(wi_state);
+       if (state)
+               weight = state->iw_table[node];
        rcu_read_unlock();
        return weight;
 }
 
+/*
+ * Convert bandwidth values into weighted interleave weights.
+ * Call with wi_state_lock held.
+ */
+static void reduce_interleave_weights(unsigned int *bw, u8 *new_iw)
+{
+       u64 sum_bw = 0;
+       unsigned int cast_sum_bw, scaling_factor = 1, iw_gcd = 0;
+       int nid;
+
+       for_each_node_state(nid, N_MEMORY)
+               sum_bw += bw[nid];
+
+       /* Scale bandwidths to whole numbers in the range [1, weightiness] */
+       for_each_node_state(nid, N_MEMORY) {
+               /*
+                * Try not to perform 64-bit division.
+                * If sum_bw < scaling_factor, then sum_bw < U32_MAX.
+                * If sum_bw > scaling_factor, then round the weight up to 1.
+                */
+               scaling_factor = weightiness * bw[nid];
+               if (bw[nid] && sum_bw < scaling_factor) {
+                       cast_sum_bw = (unsigned int)sum_bw;
+                       new_iw[nid] = scaling_factor / cast_sum_bw;
+               } else {
+                       new_iw[nid] = 1;
+               }
+               if (!iw_gcd)
+                       iw_gcd = new_iw[nid];
+               iw_gcd = gcd(iw_gcd, new_iw[nid]);
+       }
+
+       /* 1:2 is strictly better than 16:32. Reduce by the weights' GCD. */
+       for_each_node_state(nid, N_MEMORY)
+               new_iw[nid] /= iw_gcd;
+}
+
+int mempolicy_set_node_perf(unsigned int node, struct access_coordinate *coords)
+{
+       struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL;
+       unsigned int *old_bw, *new_bw;
+       unsigned int bw_val;
+       int i;
+
+       bw_val = min(coords->read_bandwidth, coords->write_bandwidth);
+       new_bw = kcalloc(nr_node_ids, sizeof(unsigned int), GFP_KERNEL);
+       if (!new_bw)
+               return -ENOMEM;
+
+       new_wi_state = kmalloc(struct_size(new_wi_state, iw_table, nr_node_ids),
+                              GFP_KERNEL);
+       if (!new_wi_state) {
+               kfree(new_bw);
+               return -ENOMEM;
+       }
+       new_wi_state->mode_auto = true;
+       for (i = 0; i < nr_node_ids; i++)
+               new_wi_state->iw_table[i] = 1;
+
+       /*
+        * Update bandwidth info, even in manual mode. That way, when switching
+        * to auto mode in the future, iw_table can be overwritten using
+        * accurate bw data.
+        */
+       mutex_lock(&wi_state_lock);
+
+       old_bw = node_bw_table;
+       if (old_bw)
+               memcpy(new_bw, old_bw, nr_node_ids * sizeof(*old_bw));
+       new_bw[node] = bw_val;
+       node_bw_table = new_bw;
+
+       old_wi_state = rcu_dereference_protected(wi_state,
+                                       lockdep_is_held(&wi_state_lock));
+       if (old_wi_state && !old_wi_state->mode_auto) {
+               /* Manual mode; skip reducing weights and updating wi_state */
+               mutex_unlock(&wi_state_lock);
+               kfree(new_wi_state);
+               goto out;
+       }
+
+       /* NULL wi_state assumes auto=true; reduce weights and update wi_state*/
+       reduce_interleave_weights(new_bw, new_wi_state->iw_table);
+       rcu_assign_pointer(wi_state, new_wi_state);
+
+       mutex_unlock(&wi_state_lock);
+       if (old_wi_state) {
+               synchronize_rcu();
+               kfree(old_wi_state);
+       }
+out:
+       kfree(old_bw);
+       return 0;
+}
+
 /**
  * numa_nearest_node - Find nearest node by state
  * @node: Node id to start the search
@@ -2023,26 +2131,28 @@ static unsigned int read_once_policy_nodemask(struct mempolicy *pol,
 
 static unsigned int weighted_interleave_nid(struct mempolicy *pol, pgoff_t ilx)
 {
+       struct weighted_interleave_state *state;
        nodemask_t nodemask;
        unsigned int target, nr_nodes;
-       u8 *table;
+       u8 *table = NULL;
        unsigned int weight_total = 0;
        u8 weight;
-       int nid;
+       int nid = 0;
 
        nr_nodes = read_once_policy_nodemask(pol, &nodemask);
        if (!nr_nodes)
                return numa_node_id();
 
        rcu_read_lock();
-       table = rcu_dereference(iw_table);
+
+       state = rcu_dereference(wi_state);
+       /* Uninitialized wi_state means we should assume all weights are 1 */
+       if (state)
+               table = state->iw_table;
+
        /* calculate the total weight */
-       for_each_node_mask(nid, nodemask) {
-               /* detect system default usage */
-               weight = table ? table[nid] : 1;
-               weight = weight ? weight : 1;
-               weight_total += weight;
-       }
+       for_each_node_mask(nid, nodemask)
+               weight_total += table ? table[nid] : 1;
 
        /* Calculate the node offset based on totals */
        target = ilx % weight_total;
@@ -2050,7 +2160,6 @@ static unsigned int weighted_interleave_nid(struct mempolicy *pol, pgoff_t ilx)
        while (target) {
                /* detect system default usage */
                weight = table ? table[nid] : 1;
-               weight = weight ? weight : 1;
                if (target < weight)
                        break;
                target -= weight;
@@ -2451,13 +2560,14 @@ static unsigned long alloc_pages_bulk_weighted_interleave(gfp_t gfp,
                struct mempolicy *pol, unsigned long nr_pages,
                struct page **page_array)
 {
+       struct weighted_interleave_state *state;
        struct task_struct *me = current;
        unsigned int cpuset_mems_cookie;
        unsigned long total_allocated = 0;
        unsigned long nr_allocated = 0;
        unsigned long rounds;
        unsigned long node_pages, delta;
-       u8 *table, *weights, weight;
+       u8 *weights, weight;
        unsigned int weight_total = 0;
        unsigned long rem_pages = nr_pages;
        nodemask_t nodes;
@@ -2507,17 +2617,19 @@ static unsigned long alloc_pages_bulk_weighted_interleave(gfp_t gfp,
                return total_allocated;
 
        rcu_read_lock();
-       table = rcu_dereference(iw_table);
-       if (table)
-               memcpy(weights, table, nr_node_ids);
-       rcu_read_unlock();
+       state = rcu_dereference(wi_state);
+       if (state) {
+               memcpy(weights, state->iw_table, nr_node_ids * sizeof(u8));
+               rcu_read_unlock();
+       } else {
+               rcu_read_unlock();
+               for (i = 0; i < nr_node_ids; i++)
+                       weights[i] = 1;
+       }
 
        /* calculate total, detect system default usage */
-       for_each_node_mask(node, nodes) {
-               if (!weights[node])
-                       weights[node] = 1;
+       for_each_node_mask(node, nodes)
                weight_total += weights[node];
-       }
 
        /*
         * Calculate rounds/partial rounds to minimize __alloc_pages_bulk calls.
@@ -3450,31 +3562,109 @@ static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr,
 static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr,
                          const char *buf, size_t count)
 {
+       struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL;
        struct iw_node_attr *node_attr;
-       u8 *new;
-       u8 *old;
        u8 weight = 0;
+       int i;
 
        node_attr = container_of(attr, struct iw_node_attr, kobj_attr);
-       if (count == 0 || sysfs_streq(buf, ""))
-               weight = 0;
-       else if (kstrtou8(buf, 0, &weight))
+       if (count == 0 || sysfs_streq(buf, "") ||
+           kstrtou8(buf, 0, &weight) || weight == 0)
                return -EINVAL;
 
-       new = kzalloc(nr_node_ids, GFP_KERNEL);
-       if (!new)
+       new_wi_state = kzalloc(struct_size(new_wi_state, iw_table, nr_node_ids),
+                              GFP_KERNEL);
+       if (!new_wi_state)
                return -ENOMEM;
 
-       mutex_lock(&iw_table_lock);
-       old = rcu_dereference_protected(iw_table,
-                                       lockdep_is_held(&iw_table_lock));
-       if (old)
-               memcpy(new, old, nr_node_ids);
-       new[node_attr->nid] = weight;
-       rcu_assign_pointer(iw_table, new);
-       mutex_unlock(&iw_table_lock);
-       synchronize_rcu();
-       kfree(old);
+       mutex_lock(&wi_state_lock);
+       old_wi_state = rcu_dereference_protected(wi_state,
+                                       lockdep_is_held(&wi_state_lock));
+       if (old_wi_state) {
+               memcpy(new_wi_state->iw_table, old_wi_state->iw_table,
+                                       nr_node_ids * sizeof(u8));
+       } else {
+               for (i = 0; i < nr_node_ids; i++)
+                       new_wi_state->iw_table[i] = 1;
+       }
+       new_wi_state->iw_table[node_attr->nid] = weight;
+       new_wi_state->mode_auto = false;
+
+       rcu_assign_pointer(wi_state, new_wi_state);
+       mutex_unlock(&wi_state_lock);
+       if (old_wi_state) {
+               synchronize_rcu();
+               kfree(old_wi_state);
+       }
+       return count;
+}
+
+static ssize_t weighted_interleave_auto_show(struct kobject *kobj,
+               struct kobj_attribute *attr, char *buf)
+{
+       struct weighted_interleave_state *state;
+       bool wi_auto = true;
+
+       rcu_read_lock();
+       state = rcu_dereference(wi_state);
+       if (state)
+               wi_auto = state->mode_auto;
+       rcu_read_unlock();
+
+       return sysfs_emit(buf, "%s\n", str_true_false(wi_auto));
+}
+
+static ssize_t weighted_interleave_auto_store(struct kobject *kobj,
+               struct kobj_attribute *attr, const char *buf, size_t count)
+{
+       struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL;
+       unsigned int *bw;
+       bool input;
+       int i;
+
+       if (kstrtobool(buf, &input))
+               return -EINVAL;
+
+       new_wi_state = kzalloc(struct_size(new_wi_state, iw_table, nr_node_ids),
+                              GFP_KERNEL);
+       if (!new_wi_state)
+               return -ENOMEM;
+       for (i = 0; i < nr_node_ids; i++)
+               new_wi_state->iw_table[i] = 1;
+
+       mutex_lock(&wi_state_lock);
+       if (!input) {
+               old_wi_state = rcu_dereference_protected(wi_state,
+                                       lockdep_is_held(&wi_state_lock));
+               if (!old_wi_state)
+                       goto update_wi_state;
+               if (input == old_wi_state->mode_auto) {
+                       mutex_unlock(&wi_state_lock);
+                       return count;
+               }
+
+               memcpy(new_wi_state->iw_table, old_wi_state->iw_table,
+                                              nr_node_ids * sizeof(u8));
+               goto update_wi_state;
+       }
+
+       bw = node_bw_table;
+       if (!bw) {
+               mutex_unlock(&wi_state_lock);
+               kfree(new_wi_state);
+               return -ENODEV;
+       }
+
+       new_wi_state->mode_auto = true;
+       reduce_interleave_weights(bw, new_wi_state->iw_table);
+
+update_wi_state:
+       rcu_assign_pointer(wi_state, new_wi_state);
+       mutex_unlock(&wi_state_lock);
+       if (old_wi_state) {
+               synchronize_rcu();
+               kfree(old_wi_state);
+       }
        return count;
 }
 
@@ -3508,23 +3698,35 @@ static void sysfs_wi_node_delete_all(void)
                sysfs_wi_node_delete(nid);
 }
 
-static void iw_table_free(void)
+static void wi_state_free(void)
 {
-       u8 *old;
+       struct weighted_interleave_state *old_wi_state;
 
-       mutex_lock(&iw_table_lock);
-       old = rcu_dereference_protected(iw_table,
-                                       lockdep_is_held(&iw_table_lock));
-       rcu_assign_pointer(iw_table, NULL);
-       mutex_unlock(&iw_table_lock);
+       mutex_lock(&wi_state_lock);
+
+       old_wi_state = rcu_dereference_protected(wi_state,
+                       lockdep_is_held(&wi_state_lock));
+       if (!old_wi_state) {
+               mutex_unlock(&wi_state_lock);
+               goto out;
+       }
 
+       rcu_assign_pointer(wi_state, NULL);
+       mutex_unlock(&wi_state_lock);
        synchronize_rcu();
-       kfree(old);
+       kfree(old_wi_state);
+out:
+       kfree(&wi_group->wi_kobj);
 }
 
+static struct kobj_attribute wi_auto_attr =
+       __ATTR(auto, 0664, weighted_interleave_auto_show,
+                          weighted_interleave_auto_store);
+
 static void wi_cleanup(void) {
+       sysfs_remove_file(&wi_group->wi_kobj, &wi_auto_attr.attr);
        sysfs_wi_node_delete_all();
-       iw_table_free();
+       wi_state_free();
 }
 
 static void wi_kobj_release(struct kobject *wi_kobj)
@@ -3627,6 +3829,10 @@ static int __init add_weighted_interleave_group(struct kobject *mempolicy_kobj)
        if (err)
                goto err_put_kobj;
 
+       err = sysfs_create_file(&wi_group->wi_kobj, &wi_auto_attr.attr);
+       if (err)
+               goto err_put_kobj;
+
        for_each_online_node(nid) {
                if (!node_state(nid, N_MEMORY))
                        continue;