atomic_t nr_rotate_swap = ATOMIC_INIT(0);
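+/*
+ * Percpu cache of the last used cluster for each allocation order,
+ * tracked as the next offset to scan plus the swap device it belongs
+ * to. Access is serialized by the local lock and it serves as the
+ * swap allocation fast path.
+ */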
+struct percpu_swap_cluster {
+ struct swap_info_struct *si[SWAP_NR_ORDERS];
+ unsigned long offset[SWAP_NR_ORDERS];
+ local_lock_t lock;
+};
+
+static DEFINE_PER_CPU(struct percpu_swap_cluster, percpu_swap_cluster) = {
+ .si = { NULL },
+ .offset = { SWAP_ENTRY_INVALID },
+ .lock = INIT_LOCAL_LOCK(),
+};
+
static struct swap_info_struct *swap_type_to_swap_info(int type)
{
if (type >= MAX_SWAPFILES)
ci = list_first_entry(&si->discard_clusters, struct swap_cluster_info, list);
/*
* Delete the cluster from list to prepare for discard, but keep
- * the CLUSTER_FLAG_DISCARD flag, there could be percpu_cluster
+ * the CLUSTER_FLAG_DISCARD flag, as percpu_swap_cluster could be
* pointing to it, or ran into by relocate_cluster.
*/
list_del(&ci->list);
out:
relocate_cluster(si, ci);
unlock_cluster(ci);
- if (si->flags & SWP_SOLIDSTATE)
- __this_cpu_write(si->percpu_cluster->next[order], next);
- else
+ if (si->flags & SWP_SOLIDSTATE) {
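+ /*
+ * Cache the device along with the next offset so the fast
+ * path can tell which device the cached offset belongs to.
+ */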
+ this_cpu_write(percpu_swap_cluster.offset[order], next);
+ this_cpu_write(percpu_swap_cluster.si[order], si);
+ } else {
si->global_cluster->next[order] = next;
+ }
return found;
}
}
/*
- * Try to get swap entries with specified order from current cpu's swap entry
- * pool (a cluster). This might involve allocating a new cluster for current CPU
- * too.
+ * Try to allocate swap entries with the specified order, and try to
+ * set a new cluster for the current CPU too.
*/
static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int order,
unsigned char usage)
{
struct swap_cluster_info *ci;
- unsigned int offset, found = 0;
+ unsigned int offset = SWAP_ENTRY_INVALID, found = SWAP_ENTRY_INVALID;
if (si->flags & SWP_SOLIDSTATE) {
- /* Fast path using per CPU cluster */
- local_lock(&si->percpu_cluster->lock);
- offset = __this_cpu_read(si->percpu_cluster->next[order]);
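+ /* Reuse the cached offset only if it was cached for this device. */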
+ if (si == this_cpu_read(percpu_swap_cluster.si[order]))
+ offset = this_cpu_read(percpu_swap_cluster.offset[order]);
} else {
/* Serialize HDD SWAP allocation for each device. */
spin_lock(&si->global_cluster_lock);
}
}
done:
- if (si->flags & SWP_SOLIDSTATE)
- local_unlock(&si->percpu_cluster->lock);
- else
+ if (!(si->flags & SWP_SOLIDSTATE))
spin_unlock(&si->global_cluster_lock);
return found;
}
return true;
}
+/*
+ * Fast path: try to get swap entries with the specified order from the
+ * current CPU's swap entry pool (a cluster).
+ */
+static int swap_alloc_fast(swp_entry_t entries[],
+ unsigned char usage,
+ int order, int n_goal)
+{
+ struct swap_cluster_info *ci;
+ struct swap_info_struct *si;
+ unsigned int offset, found;
+ int n_ret = 0;
+
+ n_goal = min(n_goal, SWAP_BATCH);
+
+ /*
+ * Once allocated, swap_info_struct will never be completely freed,
+ * so checking its liveness with get_swap_device_info() is enough.
+ */
+ si = this_cpu_read(percpu_swap_cluster.si[order]);
+ offset = this_cpu_read(percpu_swap_cluster.offset[order]);
+ if (!si || !offset || !get_swap_device_info(si))
+ return 0;
+
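+ /* Keep scanning the cached cluster while it stays usable for this order. */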
+ while (offset) {
+ ci = lock_cluster(si, offset);
+ if (!cluster_is_usable(ci, order)) {
+ unlock_cluster(ci);
+ break;
+ }
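+ /* If the cluster went empty, restart scanning from its base offset. */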
+ if (cluster_is_empty(ci))
+ offset = cluster_offset(si, ci);
+ found = alloc_swap_scan_cluster(si, ci, offset, order, usage);
+ if (!found)
+ break;
+ entries[n_ret++] = swp_entry(si->type, found);
+ if (n_ret == n_goal)
+ break;
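+ /* alloc_swap_scan_cluster updated the percpu offset; reload it. */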
+ offset = this_cpu_read(percpu_swap_cluster.offset[order]);
+ }
+
+ put_swap_device(si);
+ return n_ret;
+}
+
int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order)
{
int order = swap_entry_order(entry_order);
int n_ret = 0;
int node;
+ /* Fast path using percpu cluster */
+ local_lock(&percpu_swap_cluster.lock);
+ n_ret = swap_alloc_fast(swp_entries,
+ SWAP_HAS_CACHE,
+ order, n_goal);
+ if (n_ret == n_goal)
+ goto out;
+
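+ /* Cap the remaining request at SWAP_BATCH for the slow path scan below. */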
+ n_goal = min_t(int, n_goal - n_ret, SWAP_BATCH);
+ /* Rotate the device and switch to a new cluster */
spin_lock(&swap_avail_lock);
start_over:
node = numa_node_id();
plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) {
- /* requeue si to after same-priority siblings */
plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]);
spin_unlock(&swap_avail_lock);
if (get_swap_device_info(si)) {
- n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
- n_goal, swp_entries, order);
+ /*
+ * For order 0 allocation, try our best to fill the request,
+ * as it is used by the slot cache.
+ *
+ * For mTHP allocation, n_goal is always 1, and a failed
+ * mTHP swapin will just make the caller fall back to
+ * order 0 allocation, so just bail out.
+ */
+ n_ret += scan_swap_map_slots(si, SWAP_HAS_CACHE, n_goal,
+ swp_entries + n_ret, order);
put_swap_device(si);
if (n_ret || size > 1)
- goto check_out;
+ goto out;
}
spin_lock(&swap_avail_lock);
if (plist_node_empty(&next->avail_lists[node]))
goto start_over;
}
-
spin_unlock(&swap_avail_lock);
-
-check_out:
+out:
+ local_unlock(&percpu_swap_cluster.lock);
atomic_long_sub(n_ret * size, &nr_swap_pages);
-
return n_ret;
}
}
}
+/*
+ * Called after the swap device's reference count is dead, so
+ * neither scanning nor allocation will use it.
+ */
+static void flush_percpu_swap_cluster(struct swap_info_struct *si)
+{
+ int cpu, i;
+ struct swap_info_struct **pcp_si;
+
+ for_each_possible_cpu(cpu) {
+ pcp_si = per_cpu_ptr(percpu_swap_cluster.si, cpu);
+ /*
+ * Invalidate the percpu swap cluster cache: si->users is
+ * dead, so no new user will point to it; just flush any
+ * existing user.
+ */
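+ /* cmpxchg so that only entries still pointing to this device get cleared. */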
+ for (i = 0; i < SWAP_NR_ORDERS; i++)
+ cmpxchg(&pcp_si[i], si, NULL);
+ }
+}
+
SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
{
struct swap_info_struct *p = NULL;
flush_work(&p->discard_work);
flush_work(&p->reclaim_work);
+ flush_percpu_swap_cluster(p);
destroy_swap_extents(p);
if (p->flags & SWP_CONTINUED)
arch_swap_invalidate_area(p->type);
zswap_swapoff(p->type);
mutex_unlock(&swapon_mutex);
- free_percpu(p->percpu_cluster);
- p->percpu_cluster = NULL;
kfree(p->global_cluster);
p->global_cluster = NULL;
vfree(swap_map);
unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
struct swap_cluster_info *cluster_info;
unsigned long i, j, idx;
- int cpu, err = -ENOMEM;
+ int err = -ENOMEM;
cluster_info = kvcalloc(nr_clusters, sizeof(*cluster_info), GFP_KERNEL);
if (!cluster_info)
for (i = 0; i < nr_clusters; i++)
spin_lock_init(&cluster_info[i].lock);
- if (si->flags & SWP_SOLIDSTATE) {
- si->percpu_cluster = alloc_percpu(struct percpu_cluster);
- if (!si->percpu_cluster)
- goto err_free;
-
- for_each_possible_cpu(cpu) {
- struct percpu_cluster *cluster;
-
- cluster = per_cpu_ptr(si->percpu_cluster, cpu);
- for (i = 0; i < SWAP_NR_ORDERS; i++)
- cluster->next[i] = SWAP_ENTRY_INVALID;
- local_lock_init(&cluster->lock);
- }
- } else {
+ if (!(si->flags & SWP_SOLIDSTATE)) {
si->global_cluster = kmalloc(sizeof(*si->global_cluster),
GFP_KERNEL);
if (!si->global_cluster)
bad_swap_unlock_inode:
inode_unlock(inode);
bad_swap:
- free_percpu(si->percpu_cluster);
- si->percpu_cluster = NULL;
kfree(si->global_cluster);
si->global_cluster = NULL;
inode = NULL;