atomic_t nr_rotate_swap = ATOMIC_INIT(0);
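+/*
+ * Percpu cache of the last used cluster for each allocation order,
+ * tracked as the next offset to scan plus the swap device it belongs
+ * to. Access is serialized by the local lock and it serves as the
+ * swap allocation fast path.
+ */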
+struct percpu_swap_cluster {
+ struct swap_info_struct *si[SWAP_NR_ORDERS];
+ unsigned long offset[SWAP_NR_ORDERS];
+ local_lock_t lock;
+};
+
+static DEFINE_PER_CPU(struct percpu_swap_cluster, percpu_swap_cluster) = {
+ .si = { NULL },
+ .offset = { SWAP_ENTRY_INVALID },
+ .lock = INIT_LOCAL_LOCK(),
+};
+
static struct swap_info_struct *swap_type_to_swap_info(int type)
{
if (type >= MAX_SWAPFILES)
ci = list_first_entry(&si->discard_clusters, struct swap_cluster_info, list);
/*
* Delete the cluster from list to prepare for discard, but keep
- * the CLUSTER_FLAG_DISCARD flag, there could be percpu_cluster
+ * the CLUSTER_FLAG_DISCARD flag, as percpu_swap_cluster could be
* pointing to it, or ran into by relocate_cluster.
*/
list_del(&ci->list);
out:
relocate_cluster(si, ci);
unlock_cluster(ci);
- if (si->flags & SWP_SOLIDSTATE)
- __this_cpu_write(si->percpu_cluster->next[order], next);
- else
+ if (si->flags & SWP_SOLIDSTATE) {
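+ /*
+ * Cache the device along with the next offset so the fast
+ * path can tell which device the cached offset belongs to.
+ */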
+ this_cpu_write(percpu_swap_cluster.offset[order], next);
+ this_cpu_write(percpu_swap_cluster.si[order], si);
+ } else {
si->global_cluster->next[order] = next;
+ }
return found;
}
}
/*
- * Try to get swap entries with specified order from current cpu's swap entry
- * pool (a cluster). This might involve allocating a new cluster for current CPU
- * too.
+ * Try to allocate swap entries with the specified order, and try to
+ * set a new cluster for the current CPU too.
*/
static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int order,
unsigned char usage)
{
struct swap_cluster_info *ci;
- unsigned int offset, found = 0;
+ unsigned int offset = SWAP_ENTRY_INVALID, found = SWAP_ENTRY_INVALID;
if (si->flags & SWP_SOLIDSTATE) {
- /* Fast path using per CPU cluster */
- local_lock(&si->percpu_cluster->lock);
- offset = __this_cpu_read(si->percpu_cluster->next[order]);
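+ /* Reuse the cached offset only if it was cached for this device. */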
+ if (si == this_cpu_read(percpu_swap_cluster.si[order]))
+ offset = this_cpu_read(percpu_swap_cluster.offset[order]);
} else {
/* Serialize HDD SWAP allocation for each device. */
spin_lock(&si->global_cluster_lock);
}
}
done:
- if (si->flags & SWP_SOLIDSTATE)
- local_unlock(&si->percpu_cluster->lock);
- else
+ if (!(si->flags & SWP_SOLIDSTATE))
spin_unlock(&si->global_cluster_lock);
return found;
}
return true;
}
+/*
+ * Fast path: try to get swap entries with the specified order from the
+ * current CPU's swap entry pool (a cluster).
+ */
+static int swap_alloc_fast(swp_entry_t entries[],
+ unsigned char usage,
+ int order, int n_goal)
+{
+ struct swap_cluster_info *ci;
+ struct swap_info_struct *si;
+ unsigned int offset, found;
+ int n_ret = 0;
+
+ n_goal = min(n_goal, SWAP_BATCH);
+
+ /*
+ * Once allocated, swap_info_struct will never be completely freed,
+ * so checking its liveness with get_swap_device_info() is enough.
+ */
+ si = this_cpu_read(percpu_swap_cluster.si[order]);
+ offset = this_cpu_read(percpu_swap_cluster.offset[order]);
+ if (!si || !offset || !get_swap_device_info(si))
+ return 0;
+
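+ /* Keep scanning the cached cluster while it stays usable for this order. */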
+ while (offset) {
+ ci = lock_cluster(si, offset);
+ if (!cluster_is_usable(ci, order)) {
+ unlock_cluster(ci);
+ break;
+ }
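+ /* If the cluster went empty, restart scanning from its base offset. */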
+ if (cluster_is_empty(ci))
+ offset = cluster_offset(si, ci);
+ found = alloc_swap_scan_cluster(si, ci, offset, order, usage);
+ if (!found)
+ break;
+ entries[n_ret++] = swp_entry(si->type, found);
+ if (n_ret == n_goal)
+ break;
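+ /* alloc_swap_scan_cluster updated the percpu offset; reload it. */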
+ offset = this_cpu_read(percpu_swap_cluster.offset[order]);
+ }
+
+ put_swap_device(si);
+ return n_ret;
+}
+
int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order)
{
int order = swap_entry_order(entry_order);
int n_ret = 0;
int node;
+ /* Fast path using percpu cluster */
+ local_lock(&percpu_swap_cluster.lock);
+ n_ret = swap_alloc_fast(swp_entries,
+ SWAP_HAS_CACHE,
+ order, n_goal);
+ if (n_ret == n_goal)
+ goto out;
+
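+ /* Cap the remaining request at SWAP_BATCH for the slow path scan below. */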
+ n_goal = min_t(int, n_goal - n_ret, SWAP_BATCH);
+ /* Rotate the device and switch to a new cluster */
spin_lock(&swap_avail_lock);
start_over:
node = numa_node_id();
plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) {
- /* requeue si to after same-priority siblings */
plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]);
spin_unlock(&swap_avail_lock);
if (get_swap_device_info(si)) {
- n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
- n_goal, swp_entries, order);
+ /*
+ * For order 0 allocation, try our best to fill the request,
+ * as it is used by the slot cache.
+ *
+ * For mTHP allocation, n_goal is always 1, and a failed
+ * mTHP swapin will just make the caller fall back to
+ * order 0 allocation, so just bail out.
+ */
+ n_ret += scan_swap_map_slots(si, SWAP_HAS_CACHE, n_goal,
+ swp_entries + n_ret, order);
put_swap_device(si);
if (n_ret || size > 1)
- goto check_out;
+ goto out;
}
spin_lock(&swap_avail_lock);
if (plist_node_empty(&next->avail_lists[node]))
goto start_over;
}
-
spin_unlock(&swap_avail_lock);
-
-check_out:
+out:
+ local_unlock(&percpu_swap_cluster.lock);
atomic_long_sub(n_ret * size, &nr_swap_pages);
-
return n_ret;
}
}
}
+/*
+ * Called after the swap device's reference count is dead, so
+ * neither scanning nor allocation will use it.
+ */
+static void flush_percpu_swap_cluster(struct swap_info_struct *si)
+{
+ int cpu, i;
+ struct swap_info_struct **pcp_si;
+
+ for_each_possible_cpu(cpu) {
+ pcp_si = per_cpu_ptr(percpu_swap_cluster.si, cpu);
+ /*
+ * Invalidate the percpu swap cluster cache: si->users is
+ * dead, so no new user will point to it; just flush any
+ * existing user.
+ */
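+ /* cmpxchg so that only entries still pointing to this device get cleared. */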
+ for (i = 0; i < SWAP_NR_ORDERS; i++)
+ cmpxchg(&pcp_si[i], si, NULL);
+ }
+}
+
SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
{
struct swap_info_struct *p = NULL;
flush_work(&p->discard_work);
flush_work(&p->reclaim_work);
+ flush_percpu_swap_cluster(p);
destroy_swap_extents(p);
if (p->flags & SWP_CONTINUED)
arch_swap_invalidate_area(p->type);
zswap_swapoff(p->type);
mutex_unlock(&swapon_mutex);
- free_percpu(p->percpu_cluster);
- p->percpu_cluster = NULL;
kfree(p->global_cluster);
p->global_cluster = NULL;
vfree(swap_map);
unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
struct swap_cluster_info *cluster_info;
unsigned long i, j, idx;
- int cpu, err = -ENOMEM;
+ int err = -ENOMEM;
cluster_info = kvcalloc(nr_clusters, sizeof(*cluster_info), GFP_KERNEL);
if (!cluster_info)
for (i = 0; i < nr_clusters; i++)
spin_lock_init(&cluster_info[i].lock);
- if (si->flags & SWP_SOLIDSTATE) {
- si->percpu_cluster = alloc_percpu(struct percpu_cluster);
- if (!si->percpu_cluster)
- goto err_free;
-
- for_each_possible_cpu(cpu) {
- struct percpu_cluster *cluster;
-
- cluster = per_cpu_ptr(si->percpu_cluster, cpu);
- for (i = 0; i < SWAP_NR_ORDERS; i++)
- cluster->next[i] = SWAP_ENTRY_INVALID;
- local_lock_init(&cluster->lock);
- }
- } else {
+ if (!(si->flags & SWP_SOLIDSTATE)) {
si->global_cluster = kmalloc(sizeof(*si->global_cluster),
GFP_KERNEL);
if (!si->global_cluster)
bad_swap_unlock_inode:
inode_unlock(inode);
bad_swap:
- free_percpu(si->percpu_cluster);
- si->percpu_cluster = NULL;
kfree(si->global_cluster);
si->global_cluster = NULL;
inode = NULL;