// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * zswap.c - zswap driver file
 *
 * zswap is a cache that takes pages that are in the process
 * of being swapped out and attempts to compress and store them in a
 * RAM-based memory pool. This can result in a significant I/O reduction on
 * the swap device and, in the case where decompressing from RAM is faster
 * than reading from the swap device, can also improve workload performance.
 *
 * Copyright (C) 2012 Seth Jennings <sjenning@linux.vnet.ibm.com>
*/

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/cpu.h>
#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/types.h>
#include <linux/atomic.h>
#include <linux/rbtree.h>
#include <linux/swap.h>
#include <linux/crypto.h>
#include <linux/scatterlist.h>
#include <linux/mempool.h>
#include <linux/zpool.h>
#include <crypto/acompress.h>
#include <linux/zswap.h>
#include <linux/mm_types.h>
#include <linux/page-flags.h>
#include <linux/swapops.h>
#include <linux/writeback.h>
#include <linux/pagemap.h>
#include <linux/workqueue.h>

#include "swap.h"
#include "internal.h"

/*********************************
* statistics
**********************************/
/* Total bytes used by the compressed storage */
u64 zswap_pool_total_size;
/* The number of compressed pages currently stored in zswap */
atomic_t zswap_stored_pages = ATOMIC_INIT(0);
/* The number of same-value filled pages currently stored in zswap */
static atomic_t zswap_same_filled_pages = ATOMIC_INIT(0);

/*
 * The statistics below are not protected from concurrent access for
 * performance reasons so they may not be 100% accurate. However,
 * they do provide useful information on roughly how many times a
 * certain event is occurring.
*/

/* Pool limit was hit (see zswap_max_pool_percent) */
static u64 zswap_pool_limit_hit;
/* Pages written back when pool limit was reached */
static u64 zswap_written_back_pages;
/* Store failed due to a reclaim failure after pool limit was reached */
static u64 zswap_reject_reclaim_fail;
/* Compressed page was too big for the allocator to (optimally) store */
static u64 zswap_reject_compress_poor;
/* Store failed because underlying allocator could not get memory */
static u64 zswap_reject_alloc_fail;
/* Store failed because the entry metadata could not be allocated (rare) */
static u64 zswap_reject_kmemcache_fail;
/* Duplicate store was encountered (rare) */
static u64 zswap_duplicate_entry;

/* Shrinker work queue */
static struct workqueue_struct *shrink_wq;
/* Pool limit was hit, we need to calm down */
static bool zswap_pool_reached_full;

/*********************************
* tunables
**********************************/

#define ZSWAP_PARAM_UNSET ""

static int zswap_setup(void);

/* Enable/disable zswap */
static bool zswap_enabled = IS_ENABLED(CONFIG_ZSWAP_DEFAULT_ON);
static int zswap_enabled_param_set(const char *,
				   const struct kernel_param *);
static const struct kernel_param_ops zswap_enabled_param_ops = {
	.set = zswap_enabled_param_set,
	.get = param_get_bool,
};
module_param_cb(enabled, &zswap_enabled_param_ops, &zswap_enabled, 0644);
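
/*
 * Like any module_param_cb() parameter, "enabled" can be flipped at
 * runtime through sysfs (illustrative shell usage, not part of this
 * file):
 *
 *	echo Y > /sys/module/zswap/parameters/enabled
 */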

/* Crypto compressor to use */
static char *zswap_compressor = CONFIG_ZSWAP_COMPRESSOR_DEFAULT;
static int zswap_compressor_param_set(const char *,
				      const struct kernel_param *);
static const struct kernel_param_ops zswap_compressor_param_ops = {
	.set = zswap_compressor_param_set,
	.get = param_get_charp,
	.free = param_free_charp,
};
module_param_cb(compressor, &zswap_compressor_param_ops,
		&zswap_compressor, 0644);

/* Compressed storage zpool to use */
static char *zswap_zpool_type = CONFIG_ZSWAP_ZPOOL_DEFAULT;
static int zswap_zpool_param_set(const char *, const struct kernel_param *);
static const struct kernel_param_ops zswap_zpool_param_ops = {
	.set = zswap_zpool_param_set,
	.get = param_get_charp,
	.free = param_free_charp,
};
module_param_cb(zpool, &zswap_zpool_param_ops, &zswap_zpool_type, 0644);

/* The maximum percentage of memory that the compressed pool can occupy */
static unsigned int zswap_max_pool_percent = 20;
module_param_named(max_pool_percent, zswap_max_pool_percent, uint, 0644);

/* The threshold for accepting new pages after the max_pool_percent was hit */
static unsigned int zswap_accept_thr_percent = 90; /* of max pool size */
module_param_named(accept_threshold_percent, zswap_accept_thr_percent,
		   uint, 0644);

/*
 * Enable/disable handling same-value filled pages (enabled by default).
 * If disabled every page is considered non-same-value filled.
 */
static bool zswap_same_filled_pages_enabled = true;
module_param_named(same_filled_pages_enabled, zswap_same_filled_pages_enabled,
		   bool, 0644);

/* Enable/disable handling non-same-value filled pages (enabled by default) */
static bool zswap_non_same_filled_pages_enabled = true;
module_param_named(non_same_filled_pages_enabled, zswap_non_same_filled_pages_enabled,
		   bool, 0644);

static bool zswap_exclusive_loads_enabled = IS_ENABLED(
		CONFIG_ZSWAP_EXCLUSIVE_LOADS_DEFAULT_ON);
module_param_named(exclusive_loads, zswap_exclusive_loads_enabled, bool, 0644);

/* Number of zpools in zswap_pool (empirically determined for scalability) */
#define ZSWAP_NR_ZPOOLS 32

/*********************************
* data structures
**********************************/

struct crypto_acomp_ctx {
	struct crypto_acomp *acomp;
	struct acomp_req *req;
	struct crypto_wait wait;
	u8 *dstmem;
	struct mutex *mutex;
};

/*
 * The lock ordering is zswap_tree.lock -> zswap_pool.lru_lock.
 * The only case where lru_lock is not acquired while holding tree.lock is
 * when a zswap_entry is taken off the lru for writeback, in that case it
 * needs to be verified that it's still valid in the tree.
 */
struct zswap_pool {
	struct zpool *zpools[ZSWAP_NR_ZPOOLS];
	struct crypto_acomp_ctx __percpu *acomp_ctx;
	struct kref kref;
	struct list_head list;
	struct work_struct release_work;
	struct work_struct shrink_work;
	struct hlist_node node;
	char tfm_name[CRYPTO_MAX_ALG_NAME];
	struct list_head lru;
	spinlock_t lru_lock;
};

/*
 * struct zswap_entry
 *
 * This structure contains the metadata for tracking a single compressed
 * page within zswap.
 *
 * rbnode - links the entry into red-black tree for the appropriate swap type
 * swpentry - the swap entry for this page; its offset indexes the red-black tree
 * refcount - the number of outstanding references to the entry. This is needed
 *            to protect against premature freeing of the entry by concurrent
 *            calls to load, invalidate, and writeback. The lock
 *            for the zswap_tree structure that contains the entry must
 *            be held while changing the refcount. Since the lock must
 *            be held, there is no reason to also make refcount atomic.
 * length - the length in bytes of the compressed page data. Needed during
 *          decompression. For a same value filled page length is 0, and both
 *          pool and lru are invalid and must be ignored.
 * pool - the zswap_pool the entry's data is in
 * handle - zpool allocation handle that stores the compressed page data
 * value - value of the same-value filled pages which have same content
 * objcg - the obj_cgroup that the compressed memory is charged to
 * lru - handle to the pool's lru used to evict pages.
 */
struct zswap_entry {
	struct rb_node rbnode;
	swp_entry_t swpentry;
	int refcount;
	unsigned int length;
	struct zswap_pool *pool;
	union {
		unsigned long handle;
		unsigned long value;
	};
	struct obj_cgroup *objcg;
	struct list_head lru;
};

/*
 * The tree lock in the zswap_tree struct protects a few things:
 * - the rbtree
 * - the refcount field of each entry in the tree
 */
struct zswap_tree {
	struct rb_root rbroot;
	spinlock_t lock;
};

static struct zswap_tree *zswap_trees[MAX_SWAPFILES];

/* RCU-protected iteration */
static LIST_HEAD(zswap_pools);
/* protects zswap_pools list modification */
static DEFINE_SPINLOCK(zswap_pools_lock);
/* pool counter to provide unique names to zpool */
static atomic_t zswap_pools_count = ATOMIC_INIT(0);

enum zswap_init_type {
	ZSWAP_UNINIT,
	ZSWAP_INIT_SUCCEED,
	ZSWAP_INIT_FAILED
};

static enum zswap_init_type zswap_init_state;

/* used to ensure the integrity of initialization */
static DEFINE_MUTEX(zswap_init_lock);

/* init completed, but couldn't create the initial pool */
static bool zswap_has_pool;

/*********************************
* helpers and fwd declarations
**********************************/

#define zswap_pool_debug(msg, p)				\
	pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name,		\
		 zpool_get_type((p)->zpools[0]))

static int zswap_writeback_entry(struct zswap_entry *entry,
				 struct zswap_tree *tree);
static int zswap_pool_get(struct zswap_pool *pool);
static void zswap_pool_put(struct zswap_pool *pool);

static bool zswap_is_full(void)
{
	return totalram_pages() * zswap_max_pool_percent / 100 <
			DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
}

static bool zswap_can_accept(void)
{
	return totalram_pages() * zswap_accept_thr_percent / 100 *
				zswap_max_pool_percent / 100 >
			DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
}
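
/*
 * Worked example for the two thresholds above, with assumed numbers (not
 * taken from this file): if totalram_pages() covers 16 GiB and
 * max_pool_percent = 20, zswap_is_full() reports full once the pool
 * exceeds ~3.2 GiB. With accept_thr_percent = 90, zswap_can_accept()
 * only starts admitting pages again after the pool shrinks below
 * ~2.88 GiB (90% of the 3.2 GiB cap).
 */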

static void zswap_update_total_size(void)
{
	struct zswap_pool *pool;
	u64 total = 0;
	int i;

	rcu_read_lock();

	list_for_each_entry_rcu(pool, &zswap_pools, list)
		for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
			total += zpool_get_total_size(pool->zpools[i]);

	rcu_read_unlock();

	zswap_pool_total_size = total;
}

/*********************************
* zswap entry functions
**********************************/
static struct kmem_cache *zswap_entry_cache;

static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp)
{
	struct zswap_entry *entry;
	entry = kmem_cache_alloc(zswap_entry_cache, gfp);
	if (!entry)
		return NULL;
	entry->refcount = 1;
	RB_CLEAR_NODE(&entry->rbnode);
	return entry;
}

static void zswap_entry_cache_free(struct zswap_entry *entry)
{
	kmem_cache_free(zswap_entry_cache, entry);
}

/*********************************
* rbtree functions
**********************************/
static struct zswap_entry *zswap_rb_search(struct rb_root *root, pgoff_t offset)
{
	struct rb_node *node = root->rb_node;
	struct zswap_entry *entry;
	pgoff_t entry_offset;

	while (node) {
		entry = rb_entry(node, struct zswap_entry, rbnode);
		entry_offset = swp_offset(entry->swpentry);
		if (entry_offset > offset)
			node = node->rb_left;
		else if (entry_offset < offset)
			node = node->rb_right;
		else
			return entry;
	}
	return NULL;
}

/*
 * In the case that an entry with the same offset is found, a pointer to
 * the existing entry is stored in dupentry and the function returns -EEXIST
 */
static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry,
			struct zswap_entry **dupentry)
{
	struct rb_node **link = &root->rb_node, *parent = NULL;
	struct zswap_entry *myentry;
	pgoff_t myentry_offset, entry_offset = swp_offset(entry->swpentry);

	while (*link) {
		parent = *link;
		myentry = rb_entry(parent, struct zswap_entry, rbnode);
		myentry_offset = swp_offset(myentry->swpentry);
		if (myentry_offset > entry_offset)
			link = &(*link)->rb_left;
		else if (myentry_offset < entry_offset)
			link = &(*link)->rb_right;
		else {
			*dupentry = myentry;
			return -EEXIST;
		}
	}
	rb_link_node(&entry->rbnode, parent, link);
	rb_insert_color(&entry->rbnode, root);
	return 0;
}

static bool zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry)
{
	if (!RB_EMPTY_NODE(&entry->rbnode)) {
		rb_erase(&entry->rbnode, root);
		RB_CLEAR_NODE(&entry->rbnode);
		return true;
	}
	return false;
}

static struct zpool *zswap_find_zpool(struct zswap_entry *entry)
{
	int i = 0;

	if (ZSWAP_NR_ZPOOLS > 1)
		i = hash_ptr(entry, ilog2(ZSWAP_NR_ZPOOLS));

	return entry->pool->zpools[i];
}
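
/*
 * Design note: entries are spread across ZSWAP_NR_ZPOOLS zpools by
 * hashing the entry pointer, which is presumably meant to reduce
 * contention on any one zpool's internal locks. The hash is stable for
 * the entry's lifetime, so load and free find the same zpool again.
 */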

/*
 * Carries out the common pattern of freeing an entry's zpool allocation,
 * freeing the entry itself, and decrementing the number of stored pages.
 */
static void zswap_free_entry(struct zswap_entry *entry)
{
	if (entry->objcg) {
		obj_cgroup_uncharge_zswap(entry->objcg, entry->length);
		obj_cgroup_put(entry->objcg);
	}
	if (!entry->length)
		atomic_dec(&zswap_same_filled_pages);
	else {
		spin_lock(&entry->pool->lru_lock);
		list_del(&entry->lru);
		spin_unlock(&entry->pool->lru_lock);
		zpool_free(zswap_find_zpool(entry), entry->handle);
		zswap_pool_put(entry->pool);
	}
	zswap_entry_cache_free(entry);
	atomic_dec(&zswap_stored_pages);
	zswap_update_total_size();
}

/* caller must hold the tree lock */
static void zswap_entry_get(struct zswap_entry *entry)
{
	entry->refcount++;
}

/* caller must hold the tree lock
 * remove from the tree and free it, if nobody references the entry
 */
static void zswap_entry_put(struct zswap_tree *tree,
			    struct zswap_entry *entry)
{
	int refcount = --entry->refcount;

	WARN_ON_ONCE(refcount < 0);
	if (refcount == 0) {
		WARN_ON_ONCE(!RB_EMPTY_NODE(&entry->rbnode));
		zswap_free_entry(entry);
	}
}
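
/*
 * Reference lifecycle, as implemented above: the tree owns the initial
 * reference set in zswap_entry_cache_alloc(); load, invalidation, and
 * writeback take short-lived extra references under the tree lock.
 * An entry is only freed when the last reference is dropped, which is
 * why zswap_rb_erase() on its own never frees anything.
 */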

/* caller must hold the tree lock */
static struct zswap_entry *zswap_entry_find_get(struct rb_root *root,
				pgoff_t offset)
{
	struct zswap_entry *entry;

	entry = zswap_rb_search(root, offset);
	if (entry)
		zswap_entry_get(entry);

	return entry;
}

/*********************************
* per-cpu code
**********************************/
static DEFINE_PER_CPU(u8 *, zswap_dstmem);
/*
 * If users dynamically change the zpool type and compressor at runtime, i.e.
 * zswap is running, zswap can have more than one zpool on one cpu, but they
 * are sharing dstmem. So we need this mutex to be per-cpu.
 */
static DEFINE_PER_CPU(struct mutex *, zswap_mutex);

static int zswap_dstmem_prepare(unsigned int cpu)
{
	struct mutex *mutex;
	u8 *dst;

	dst = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
	if (!dst)
		return -ENOMEM;

	mutex = kmalloc_node(sizeof(*mutex), GFP_KERNEL, cpu_to_node(cpu));
	if (!mutex) {
		kfree(dst);
		return -ENOMEM;
	}

	mutex_init(mutex);
	per_cpu(zswap_dstmem, cpu) = dst;
	per_cpu(zswap_mutex, cpu) = mutex;
	return 0;
}

static int zswap_dstmem_dead(unsigned int cpu)
{
	struct mutex *mutex;
	u8 *dst;

	mutex = per_cpu(zswap_mutex, cpu);
	kfree(mutex);
	per_cpu(zswap_mutex, cpu) = NULL;

	dst = per_cpu(zswap_dstmem, cpu);
	kfree(dst);
	per_cpu(zswap_dstmem, cpu) = NULL;

	return 0;
}

static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
{
	struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
	struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
	struct crypto_acomp *acomp;
	struct acomp_req *req;

	acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu));
	if (IS_ERR(acomp)) {
		pr_err("could not alloc crypto acomp %s : %ld\n",
		       pool->tfm_name, PTR_ERR(acomp));
		return PTR_ERR(acomp);
	}
	acomp_ctx->acomp = acomp;

	req = acomp_request_alloc(acomp_ctx->acomp);
	if (!req) {
		pr_err("could not alloc crypto acomp_request %s\n",
		       pool->tfm_name);
		crypto_free_acomp(acomp_ctx->acomp);
		return -ENOMEM;
	}
	acomp_ctx->req = req;

	crypto_init_wait(&acomp_ctx->wait);
	/*
	 * if the backend of acomp is async zip, crypto_req_done() will wakeup
	 * crypto_wait_req(); if the backend of acomp is scomp, the callback
	 * won't be called, crypto_wait_req() will return without blocking.
	 */
	acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
				   crypto_req_done, &acomp_ctx->wait);

	acomp_ctx->mutex = per_cpu(zswap_mutex, cpu);
	acomp_ctx->dstmem = per_cpu(zswap_dstmem, cpu);

	return 0;
}

static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node)
{
	struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
	struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);

	if (!IS_ERR_OR_NULL(acomp_ctx)) {
		if (!IS_ERR_OR_NULL(acomp_ctx->req))
			acomp_request_free(acomp_ctx->req);
		if (!IS_ERR_OR_NULL(acomp_ctx->acomp))
			crypto_free_acomp(acomp_ctx->acomp);
	}

	return 0;
}

/*********************************
* pool functions
**********************************/

static struct zswap_pool *__zswap_pool_current(void)
{
	struct zswap_pool *pool;

	pool = list_first_or_null_rcu(&zswap_pools, typeof(*pool), list);
	WARN_ONCE(!pool && zswap_has_pool,
		  "%s: no page storage pool!\n", __func__);

	return pool;
}

static struct zswap_pool *zswap_pool_current(void)
{
	assert_spin_locked(&zswap_pools_lock);

	return __zswap_pool_current();
}

static struct zswap_pool *zswap_pool_current_get(void)
{
	struct zswap_pool *pool;

	rcu_read_lock();

	pool = __zswap_pool_current();
	if (!zswap_pool_get(pool))
		pool = NULL;

	rcu_read_unlock();

	return pool;
}

static struct zswap_pool *zswap_pool_last_get(void)
{
	struct zswap_pool *pool, *last = NULL;

	rcu_read_lock();

	list_for_each_entry_rcu(pool, &zswap_pools, list)
		last = pool;
	WARN_ONCE(!last && zswap_has_pool,
		  "%s: no page storage pool!\n", __func__);
	if (!zswap_pool_get(last))
		last = NULL;

	rcu_read_unlock();

	return last;
}

/* type and compressor must be null-terminated */
static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
{
	struct zswap_pool *pool;

	assert_spin_locked(&zswap_pools_lock);

	list_for_each_entry_rcu(pool, &zswap_pools, list) {
		if (strcmp(pool->tfm_name, compressor))
			continue;
		/* all zpools share the same type */
		if (strcmp(zpool_get_type(pool->zpools[0]), type))
			continue;
		/* if we can't get it, it's about to be destroyed */
		if (!zswap_pool_get(pool))
			continue;
		return pool;
	}

	return NULL;
}

/*
 * If the entry is still valid in the tree, drop the initial ref and remove it
 * from the tree. This function must be called with an additional ref held,
 * otherwise it may race with another invalidation freeing the entry.
 */
static void zswap_invalidate_entry(struct zswap_tree *tree,
				   struct zswap_entry *entry)
{
	if (zswap_rb_erase(&tree->rbroot, entry))
		zswap_entry_put(tree, entry);
}

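/*
 * LRU reclaim protocol, as implemented below: the victim's swap offset
 * is copied to the stack before the lru lock is dropped, the entry is
 * re-looked-up under the tree lock, and an extra reference is taken
 * before writeback so that a concurrent invalidation cannot free the
 * entry mid-flight.
 */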
static int zswap_reclaim_entry(struct zswap_pool *pool)
{
	struct zswap_entry *entry;
	struct zswap_tree *tree;
	pgoff_t swpoffset;
	int ret;

	/* Get an entry off the LRU */
	spin_lock(&pool->lru_lock);
	if (list_empty(&pool->lru)) {
		spin_unlock(&pool->lru_lock);
		return -EINVAL;
	}
	entry = list_last_entry(&pool->lru, struct zswap_entry, lru);
	list_del_init(&entry->lru);
	/*
	 * Once the lru lock is dropped, the entry might get freed. The
	 * swpoffset is copied to the stack, and entry isn't deref'd again
	 * until the entry is verified to still be alive in the tree.
	 */
	swpoffset = swp_offset(entry->swpentry);
	tree = zswap_trees[swp_type(entry->swpentry)];
	spin_unlock(&pool->lru_lock);

	/* Check for invalidate() race */
	spin_lock(&tree->lock);
	if (entry != zswap_rb_search(&tree->rbroot, swpoffset)) {
		ret = -EAGAIN;
		goto unlock;
	}
	/* Hold a reference to prevent a free during writeback */
	zswap_entry_get(entry);
	spin_unlock(&tree->lock);

	ret = zswap_writeback_entry(entry, tree);

	spin_lock(&tree->lock);
	if (ret) {
		/* Writeback failed, put entry back on LRU */
		spin_lock(&pool->lru_lock);
		list_move(&entry->lru, &pool->lru);
		spin_unlock(&pool->lru_lock);
		goto put_unlock;
	}

	/*
	 * Writeback started successfully, the page now belongs to the
	 * swapcache. Drop the entry from zswap - unless invalidate already
	 * took it out while we had the tree->lock released for IO.
	 */
	zswap_invalidate_entry(tree, entry);

put_unlock:
	/* Drop local reference */
	zswap_entry_put(tree, entry);
unlock:
	spin_unlock(&tree->lock);
	return ret ? -EAGAIN : 0;
}

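/*
 * Shrink worker, queued by zswap_store() when the pool limit is hit. It
 * runs with the pool reference taken via zswap_pool_last_get() in the
 * store path and drops that reference when reclaim stops.
 */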
static void shrink_worker(struct work_struct *w)
{
	struct zswap_pool *pool = container_of(w, typeof(*pool),
						shrink_work);
	int ret, failures = 0;

	do {
		ret = zswap_reclaim_entry(pool);
		if (ret) {
			zswap_reject_reclaim_fail++;
			if (ret != -EAGAIN)
				break;
			if (++failures == MAX_RECLAIM_RETRIES)
				break;
		}
		cond_resched();
	} while (!zswap_can_accept());
	zswap_pool_put(pool);
}

static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
{
	int i;
	struct zswap_pool *pool;
	char name[38]; /* 'zswap' + 32 char (max) num + \0 */
	gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
	int ret;

	if (!zswap_has_pool) {
		/* if either is unset, pool initialization failed, and we
		 * need both params to be set correctly before trying to
		 * create a pool.
		 */
		if (!strcmp(type, ZSWAP_PARAM_UNSET))
			return NULL;
		if (!strcmp(compressor, ZSWAP_PARAM_UNSET))
			return NULL;
	}

	pool = kzalloc(sizeof(*pool), GFP_KERNEL);
	if (!pool)
		return NULL;

	for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) {
		/* unique name for each pool specifically required by zsmalloc */
		snprintf(name, 38, "zswap%x",
			 atomic_inc_return(&zswap_pools_count));

		pool->zpools[i] = zpool_create_pool(type, name, gfp);
		if (!pool->zpools[i]) {
			pr_err("%s zpool not available\n", type);
			goto error;
		}
	}
	pr_debug("using %s zpool\n", zpool_get_type(pool->zpools[0]));

	strscpy(pool->tfm_name, compressor, sizeof(pool->tfm_name));

	pool->acomp_ctx = alloc_percpu(*pool->acomp_ctx);
	if (!pool->acomp_ctx) {
		pr_err("percpu alloc failed\n");
		goto error;
	}

	ret = cpuhp_state_add_instance(CPUHP_MM_ZSWP_POOL_PREPARE,
				       &pool->node);
	if (ret)
		goto error;
	pr_debug("using %s compressor\n", pool->tfm_name);

	/* being the current pool takes 1 ref; this func expects the
	 * caller to always add the new pool as the current pool
	 */
	kref_init(&pool->kref);
	INIT_LIST_HEAD(&pool->list);
	INIT_LIST_HEAD(&pool->lru);
	spin_lock_init(&pool->lru_lock);
	INIT_WORK(&pool->shrink_work, shrink_worker);

	zswap_pool_debug("created", pool);

	return pool;

error:
	if (pool->acomp_ctx)
		free_percpu(pool->acomp_ctx);
	while (i--)
		zpool_destroy_pool(pool->zpools[i]);
	kfree(pool);
	return NULL;
}

static struct zswap_pool *__zswap_pool_create_fallback(void)
{
	bool has_comp, has_zpool;

	has_comp = crypto_has_acomp(zswap_compressor, 0, 0);
	if (!has_comp && strcmp(zswap_compressor,
				CONFIG_ZSWAP_COMPRESSOR_DEFAULT)) {
		pr_err("compressor %s not available, using default %s\n",
		       zswap_compressor, CONFIG_ZSWAP_COMPRESSOR_DEFAULT);
		param_free_charp(&zswap_compressor);
		zswap_compressor = CONFIG_ZSWAP_COMPRESSOR_DEFAULT;
		has_comp = crypto_has_acomp(zswap_compressor, 0, 0);
	}
	if (!has_comp) {
		pr_err("default compressor %s not available\n",
		       zswap_compressor);
		param_free_charp(&zswap_compressor);
		zswap_compressor = ZSWAP_PARAM_UNSET;
	}

	has_zpool = zpool_has_pool(zswap_zpool_type);
	if (!has_zpool && strcmp(zswap_zpool_type,
				 CONFIG_ZSWAP_ZPOOL_DEFAULT)) {
		pr_err("zpool %s not available, using default %s\n",
		       zswap_zpool_type, CONFIG_ZSWAP_ZPOOL_DEFAULT);
		param_free_charp(&zswap_zpool_type);
		zswap_zpool_type = CONFIG_ZSWAP_ZPOOL_DEFAULT;
		has_zpool = zpool_has_pool(zswap_zpool_type);
	}
	if (!has_zpool) {
		pr_err("default zpool %s not available\n",
		       zswap_zpool_type);
		param_free_charp(&zswap_zpool_type);
		zswap_zpool_type = ZSWAP_PARAM_UNSET;
	}

	if (!has_comp || !has_zpool)
		return NULL;

	return zswap_pool_create(zswap_zpool_type, zswap_compressor);
}

static void zswap_pool_destroy(struct zswap_pool *pool)
{
	int i;

	zswap_pool_debug("destroying", pool);

	cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
	free_percpu(pool->acomp_ctx);
	for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
		zpool_destroy_pool(pool->zpools[i]);
	kfree(pool);
}

static int __must_check zswap_pool_get(struct zswap_pool *pool)
{
	if (!pool)
		return 0;

	return kref_get_unless_zero(&pool->kref);
}

static void __zswap_pool_release(struct work_struct *work)
{
	struct zswap_pool *pool = container_of(work, typeof(*pool),
						release_work);

	synchronize_rcu();

	/* nobody should have been able to get a kref... */
	WARN_ON(kref_get_unless_zero(&pool->kref));

	/* pool is now off zswap_pools list and has no references. */
	zswap_pool_destroy(pool);
}

static void __zswap_pool_empty(struct kref *kref)
{
	struct zswap_pool *pool;

	pool = container_of(kref, typeof(*pool), kref);

	spin_lock(&zswap_pools_lock);

	WARN_ON(pool == zswap_pool_current());

	list_del_rcu(&pool->list);

	INIT_WORK(&pool->release_work, __zswap_pool_release);
	schedule_work(&pool->release_work);

	spin_unlock(&zswap_pools_lock);
}

static void zswap_pool_put(struct zswap_pool *pool)
{
	kref_put(&pool->kref, __zswap_pool_empty);
}

/*********************************
* param callbacks
**********************************/

static bool zswap_pool_changed(const char *s, const struct kernel_param *kp)
{
	/* no change required */
	if (!strcmp(s, *(char **)kp->arg) && zswap_has_pool)
		return false;
	return true;
}

/* val must be a null-terminated string */
static int __zswap_param_set(const char *val, const struct kernel_param *kp,
			     char *type, char *compressor)
{
	struct zswap_pool *pool, *put_pool = NULL;
	char *s = strstrip((char *)val);
	int ret = 0;
	bool new_pool = false;

	mutex_lock(&zswap_init_lock);
	switch (zswap_init_state) {
	case ZSWAP_UNINIT:
		/* if this is load-time (pre-init) param setting,
		 * don't create a pool; that's done during init.
		 */
		ret = param_set_charp(s, kp);
		break;
	case ZSWAP_INIT_SUCCEED:
		new_pool = zswap_pool_changed(s, kp);
		break;
	case ZSWAP_INIT_FAILED:
		pr_err("can't set param, initialization failed\n");
		ret = -ENODEV;
	}
	mutex_unlock(&zswap_init_lock);

	/* no need to create a new pool, return directly */
	if (!new_pool)
		return ret;

	if (!type) {
		if (!zpool_has_pool(s)) {
			pr_err("zpool %s not available\n", s);
			return -ENOENT;
		}
		type = s;
	} else if (!compressor) {
		if (!crypto_has_acomp(s, 0, 0)) {
			pr_err("compressor %s not available\n", s);
			return -ENOENT;
		}
		compressor = s;
	} else {
		WARN_ON(1);
		return -EINVAL;
	}

	spin_lock(&zswap_pools_lock);

	pool = zswap_pool_find_get(type, compressor);
	if (pool) {
		zswap_pool_debug("using existing", pool);
		WARN_ON(pool == zswap_pool_current());
		list_del_rcu(&pool->list);
	}

	spin_unlock(&zswap_pools_lock);

	if (!pool)
		pool = zswap_pool_create(type, compressor);

	if (pool)
		ret = param_set_charp(s, kp);
	else
		ret = -EINVAL;

	spin_lock(&zswap_pools_lock);

	if (!ret) {
		put_pool = zswap_pool_current();
		list_add_rcu(&pool->list, &zswap_pools);
		zswap_has_pool = true;
	} else if (pool) {
		/* add the possibly pre-existing pool to the end of the pools
		 * list; if it's new (and empty) then it'll be removed and
		 * destroyed by the put after we drop the lock
		 */
		list_add_tail_rcu(&pool->list, &zswap_pools);
		put_pool = pool;
	}

	spin_unlock(&zswap_pools_lock);

	if (!zswap_has_pool && !pool) {
		/* if initial pool creation failed, and this pool creation also
		 * failed, maybe both compressor and zpool params were bad.
		 * Allow changing this param, so pool creation will succeed
		 * when the other param is changed. We already verified this
		 * param is ok in the zpool_has_pool() or crypto_has_acomp()
		 * checks above.
		 */
		ret = param_set_charp(s, kp);
	}

	/* drop the ref from either the old current pool,
	 * or the new pool we failed to add
	 */
	if (put_pool)
		zswap_pool_put(put_pool);

	return ret;
}

static int zswap_compressor_param_set(const char *val,
				      const struct kernel_param *kp)
{
	return __zswap_param_set(val, kp, zswap_zpool_type, NULL);
}

static int zswap_zpool_param_set(const char *val,
				 const struct kernel_param *kp)
{
	return __zswap_param_set(val, kp, NULL, zswap_compressor);
}
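
/*
 * The compressor and zpool can be changed at runtime through the
 * callbacks above (illustrative shell usage, not part of this file):
 *
 *	echo zstd > /sys/module/zswap/parameters/compressor
 *	echo zsmalloc > /sys/module/zswap/parameters/zpool
 *
 * A new pool becomes the current one; the old pool stays on the list
 * until the entries referencing it are gone.
 */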

static int zswap_enabled_param_set(const char *val,
				   const struct kernel_param *kp)
{
	int ret = -ENODEV;

	/* if this is load-time (pre-init) param setting, only set param. */
	if (system_state != SYSTEM_RUNNING)
		return param_set_bool(val, kp);

	mutex_lock(&zswap_init_lock);
	switch (zswap_init_state) {
	case ZSWAP_UNINIT:
		if (zswap_setup())
			break;
		fallthrough;
	case ZSWAP_INIT_SUCCEED:
		if (!zswap_has_pool)
			pr_err("can't enable, no pool configured\n");
		else
			ret = param_set_bool(val, kp);
		break;
	case ZSWAP_INIT_FAILED:
		pr_err("can't enable, initialization failed\n");
	}
	mutex_unlock(&zswap_init_lock);

	return ret;
}

/*********************************
* writeback code
**********************************/
/* return enum for zswap_get_swap_cache_page */
enum zswap_get_swap_ret {
	ZSWAP_SWAPCACHE_NEW,
	ZSWAP_SWAPCACHE_EXIST,
	ZSWAP_SWAPCACHE_FAIL,
};

/*
 * zswap_get_swap_cache_page
 *
 * This is an adaptation of read_swap_cache_async()
 *
 * This function tries to find a page with the given swap entry
 * in the swapper_space address space (the swap cache). If the page
 * is found, it is returned in retpage. Otherwise, a page is allocated,
 * added to the swap cache, and returned in retpage.
 *
 * If success, the swap cache page is returned in retpage
 * Returns ZSWAP_SWAPCACHE_EXIST if page was already in the swap cache
 * Returns ZSWAP_SWAPCACHE_NEW if the new page needs to be populated,
 * the new page is added to swapcache and locked
 * Returns ZSWAP_SWAPCACHE_FAIL on error
 */
static int zswap_get_swap_cache_page(swp_entry_t entry,
				     struct page **retpage)
{
	bool page_was_allocated;

	*retpage = __read_swap_cache_async(entry, GFP_KERNEL,
					   NULL, 0, &page_was_allocated);
	if (page_was_allocated)
		return ZSWAP_SWAPCACHE_NEW;
	if (!*retpage)
		return ZSWAP_SWAPCACHE_FAIL;
	return ZSWAP_SWAPCACHE_EXIST;
}

/*
 * Attempts to free an entry by adding a page to the swap cache,
 * decompressing the entry data into the page, and issuing a
 * bio write to write the page back to the swap device.
 *
 * This can be thought of as a "resumed writeback" of the page
 * to the swap device. We are basically resuming the same swap
 * writeback path that was intercepted with the zswap_store()
 * in the first place. After the page has been decompressed into
 * the swap cache, the compressed version stored by zswap can be
 * freed.
 */
static int zswap_writeback_entry(struct zswap_entry *entry,
				 struct zswap_tree *tree)
{
	swp_entry_t swpentry = entry->swpentry;
	struct page *page;
	struct scatterlist input, output;
	struct crypto_acomp_ctx *acomp_ctx;
	struct zpool *pool = zswap_find_zpool(entry);

	u8 *src, *tmp = NULL;
	unsigned int dlen;
	int ret;
	struct writeback_control wbc = {
		.sync_mode = WB_SYNC_NONE,
	};

	if (!zpool_can_sleep_mapped(pool)) {
		tmp = kmalloc(PAGE_SIZE, GFP_KERNEL);
		if (!tmp)
			return -ENOMEM;
	}

	/* try to allocate swap cache page */
	switch (zswap_get_swap_cache_page(swpentry, &page)) {
	case ZSWAP_SWAPCACHE_FAIL: /* no memory or invalidate happened */
		ret = -ENOMEM;
		goto fail;

	case ZSWAP_SWAPCACHE_EXIST:
		/* page is already in the swap cache, ignore for now */
		put_page(page);
		ret = -EEXIST;
		goto fail;

	case ZSWAP_SWAPCACHE_NEW: /* page is locked */
		/*
		 * Having a local reference to the zswap entry doesn't exclude
		 * swapping from invalidating and recycling the swap slot. Once
		 * the swapcache is secured against concurrent swapping to and
		 * from the slot, recheck that the entry is still current before
		 * writing.
		 */
		spin_lock(&tree->lock);
		if (zswap_rb_search(&tree->rbroot, swp_offset(entry->swpentry)) != entry) {
			spin_unlock(&tree->lock);
			delete_from_swap_cache(page_folio(page));
			ret = -ENOMEM;
			goto fail;
		}
		spin_unlock(&tree->lock);

		/* decompress */
		acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
		dlen = PAGE_SIZE;

		src = zpool_map_handle(pool, entry->handle, ZPOOL_MM_RO);
		if (!zpool_can_sleep_mapped(pool)) {
			memcpy(tmp, src, entry->length);
			src = tmp;
			zpool_unmap_handle(pool, entry->handle);
		}

		mutex_lock(acomp_ctx->mutex);
		sg_init_one(&input, src, entry->length);
		sg_init_table(&output, 1);
		sg_set_page(&output, page, PAGE_SIZE, 0);
		acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, dlen);
		ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait);
		dlen = acomp_ctx->req->dlen;
		mutex_unlock(acomp_ctx->mutex);

		if (!zpool_can_sleep_mapped(pool))
			kfree(tmp);
		else
			zpool_unmap_handle(pool, entry->handle);

		BUG_ON(ret);
		BUG_ON(dlen != PAGE_SIZE);

		/* page is up to date */
		SetPageUptodate(page);
	}

	/* move it to the tail of the inactive list after end_writeback */
	SetPageReclaim(page);

	/* start writeback */
	__swap_writepage(page, &wbc);
	put_page(page);
	zswap_written_back_pages++;

	return ret;

fail:
	if (!zpool_can_sleep_mapped(pool))
		kfree(tmp);

	/*
	 * If we get here due to ZSWAP_SWAPCACHE_EXIST, a load may be
	 * happening concurrently. It is safe and okay to not free the
	 * entry. It is also okay to return !0.
	 */
	return ret;
}

static int zswap_is_page_same_filled(void *ptr, unsigned long *value)
{
	unsigned long *page;
	unsigned long val;
	unsigned int pos, last_pos = PAGE_SIZE / sizeof(*page) - 1;

	page = (unsigned long *)ptr;
	val = page[0];

	if (val != page[last_pos])
		return 0;

	for (pos = 1; pos < last_pos; pos++) {
		if (val != page[pos])
			return 0;
	}

	*value = val;

	return 1;
}

static void zswap_fill_page(void *ptr, unsigned long value)
{
	unsigned long *page;

	page = (unsigned long *)ptr;
	memset_l(page, value, PAGE_SIZE / sizeof(unsigned long));
}
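
/*
 * Example of the same-filled path: a zeroed page, or one whose every
 * machine word is e.g. 0x0101010101010101, is stored as just
 * entry->value with length 0 and no zpool allocation, and is
 * reconstituted by zswap_fill_page() on load. A page with even one
 * differing word takes the normal compression path.
 */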

bool zswap_store(struct folio *folio)
{
	swp_entry_t swp = folio_swap_entry(folio);
	int type = swp_type(swp);
	pgoff_t offset = swp_offset(swp);
	struct page *page = &folio->page;
	struct zswap_tree *tree = zswap_trees[type];
	struct zswap_entry *entry, *dupentry;
	struct scatterlist input, output;
	struct crypto_acomp_ctx *acomp_ctx;
	struct obj_cgroup *objcg = NULL;
	struct zswap_pool *pool;
	struct zpool *zpool;
	unsigned int dlen = PAGE_SIZE;
	unsigned long handle, value;
	char *buf;
	u8 *src, *dst;
	gfp_t gfp;
	int ret;

	VM_WARN_ON_ONCE(!folio_test_locked(folio));
	VM_WARN_ON_ONCE(!folio_test_swapcache(folio));

	/* Large folios aren't supported */
	if (folio_test_large(folio))
		return false;

	if (!zswap_enabled || !tree)
		return false;

	/*
	 * XXX: zswap reclaim does not work with cgroups yet. Without a
	 * cgroup-aware entry LRU, we will push out entries system-wide based on
	 * local cgroup limits.
	 */
	objcg = get_obj_cgroup_from_folio(folio);
	if (objcg && !obj_cgroup_may_zswap(objcg))
		goto reject;

	/* reclaim space if needed */
	if (zswap_is_full()) {
		zswap_pool_limit_hit++;
		zswap_pool_reached_full = true;
		goto shrink;
	}

	if (zswap_pool_reached_full) {
		if (!zswap_can_accept())
			goto shrink;
		else
			zswap_pool_reached_full = false;
	}

	/* allocate entry */
	entry = zswap_entry_cache_alloc(GFP_KERNEL);
	if (!entry) {
		zswap_reject_kmemcache_fail++;
		goto reject;
	}

	if (zswap_same_filled_pages_enabled) {
		src = kmap_atomic(page);
		if (zswap_is_page_same_filled(src, &value)) {
			kunmap_atomic(src);
			entry->swpentry = swp_entry(type, offset);
			entry->length = 0;
			entry->value = value;
			atomic_inc(&zswap_same_filled_pages);
			goto insert_entry;
		}
		kunmap_atomic(src);
	}

	if (!zswap_non_same_filled_pages_enabled)
		goto freepage;

	/* if entry is successfully added, it keeps the reference */
	entry->pool = zswap_pool_current_get();
	if (!entry->pool)
		goto freepage;

	/* compress */
	acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);

	mutex_lock(acomp_ctx->mutex);

	dst = acomp_ctx->dstmem;
	sg_init_table(&input, 1);
	sg_set_page(&input, page, PAGE_SIZE, 0);

	/* zswap_dstmem is of size (PAGE_SIZE * 2). Reflect same in sg_list */
	sg_init_one(&output, dst, PAGE_SIZE * 2);
	acomp_request_set_params(acomp_ctx->req, &input, &output, PAGE_SIZE, dlen);
	/*
	 * It may look a little silly that we send an asynchronous request and
	 * then wait for its completion synchronously, making the process
	 * synchronous in fact.
	 * Theoretically, acomp supports users sending multiple acomp requests
	 * in one acomp instance, then getting those requests done
	 * simultaneously. But in this case, zswap actually does store and
	 * load page by page; there is no existing method to send the second
	 * page before the first page is done in one thread doing zswap.
	 * But in different threads running on different cpus, we have
	 * different acomp instances, so multiple threads can do
	 * (de)compression in parallel.
	 */
	ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait);
	dlen = acomp_ctx->req->dlen;

	if (ret)
		goto put_dstmem;

	/* store */
	zpool = zswap_find_zpool(entry);
	gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
	if (zpool_malloc_support_movable(zpool))
		gfp |= __GFP_HIGHMEM | __GFP_MOVABLE;
	ret = zpool_malloc(zpool, dlen, gfp, &handle);
	if (ret == -ENOSPC) {
		zswap_reject_compress_poor++;
		goto put_dstmem;
	}
	if (ret) {
		zswap_reject_alloc_fail++;
		goto put_dstmem;
	}
	buf = zpool_map_handle(zpool, handle, ZPOOL_MM_WO);
	memcpy(buf, dst, dlen);
	zpool_unmap_handle(zpool, handle);
	mutex_unlock(acomp_ctx->mutex);

	/* populate entry */
	entry->swpentry = swp_entry(type, offset);
	entry->handle = handle;
	entry->length = dlen;

insert_entry:
	entry->objcg = objcg;
	if (objcg) {
		obj_cgroup_charge_zswap(objcg, entry->length);
		/* Account before objcg ref is moved to tree */
		count_objcg_event(objcg, ZSWPOUT);
	}

	/* map */
	spin_lock(&tree->lock);
	while (zswap_rb_insert(&tree->rbroot, entry, &dupentry) == -EEXIST) {
		zswap_duplicate_entry++;
		zswap_invalidate_entry(tree, dupentry);
	}
	if (entry->length) {
		spin_lock(&entry->pool->lru_lock);
		list_add(&entry->lru, &entry->pool->lru);
		spin_unlock(&entry->pool->lru_lock);
	}
	spin_unlock(&tree->lock);

	/* update stats */
	atomic_inc(&zswap_stored_pages);
	zswap_update_total_size();
	count_vm_event(ZSWPOUT);

	return true;

put_dstmem:
	mutex_unlock(acomp_ctx->mutex);
	zswap_pool_put(entry->pool);
freepage:
	zswap_entry_cache_free(entry);
reject:
	if (objcg)
		obj_cgroup_put(objcg);
	return false;

shrink:
	pool = zswap_pool_last_get();
	if (pool)
		queue_work(shrink_wq, &pool->shrink_work);
	goto reject;
}

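/*
 * Returns true if a compressed (or same-filled) copy was found and the
 * folio was populated from it. With exclusive_loads enabled, a
 * successful load also invalidates the zswap entry and marks the folio
 * dirty, so the page is written out again if it is reclaimed later.
 */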
bool zswap_load(struct folio *folio)
{
	swp_entry_t swp = folio_swap_entry(folio);
	int type = swp_type(swp);
	pgoff_t offset = swp_offset(swp);
	struct page *page = &folio->page;
	struct zswap_tree *tree = zswap_trees[type];
	struct zswap_entry *entry;
	struct scatterlist input, output;
	struct crypto_acomp_ctx *acomp_ctx;
	u8 *src, *dst, *tmp;
	struct zpool *zpool;
	unsigned int dlen;
	bool ret;

	VM_WARN_ON_ONCE(!folio_test_locked(folio));

	/* find */
	spin_lock(&tree->lock);
	entry = zswap_entry_find_get(&tree->rbroot, offset);
	if (!entry) {
		spin_unlock(&tree->lock);
		return false;
	}
	spin_unlock(&tree->lock);

	if (!entry->length) {
		dst = kmap_atomic(page);
		zswap_fill_page(dst, entry->value);
		kunmap_atomic(dst);
		ret = true;
		goto stats;
	}

	zpool = zswap_find_zpool(entry);
	if (!zpool_can_sleep_mapped(zpool)) {
		tmp = kmalloc(entry->length, GFP_KERNEL);
		if (!tmp) {
			ret = false;
			goto freeentry;
		}
	}

	/* decompress */
	dlen = PAGE_SIZE;
	src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO);

	if (!zpool_can_sleep_mapped(zpool)) {
		memcpy(tmp, src, entry->length);
		src = tmp;
		zpool_unmap_handle(zpool, entry->handle);
	}

	acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
	mutex_lock(acomp_ctx->mutex);
	sg_init_one(&input, src, entry->length);
	sg_init_table(&output, 1);
	sg_set_page(&output, page, PAGE_SIZE, 0);
	acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, dlen);
	if (crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait))
		WARN_ON(1);
	mutex_unlock(acomp_ctx->mutex);

	if (zpool_can_sleep_mapped(zpool))
		zpool_unmap_handle(zpool, entry->handle);
	else
		kfree(tmp);

	ret = true;
stats:
	count_vm_event(ZSWPIN);
	if (entry->objcg)
		count_objcg_event(entry->objcg, ZSWPIN);
freeentry:
	spin_lock(&tree->lock);
	if (ret && zswap_exclusive_loads_enabled) {
		zswap_invalidate_entry(tree, entry);
		folio_mark_dirty(folio);
	} else if (entry->length) {
		spin_lock(&entry->pool->lru_lock);
		list_move(&entry->lru, &entry->pool->lru);
		spin_unlock(&entry->pool->lru_lock);
	}
	zswap_entry_put(tree, entry);
	spin_unlock(&tree->lock);

	return ret;
}

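/*
 * Drop zswap's copy for the given swap slot, if any. This is called from
 * the swap slot free path, so the compressed copy does not outlive the
 * slot itself.
 */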
void zswap_invalidate(int type, pgoff_t offset)
{
	struct zswap_tree *tree = zswap_trees[type];
	struct zswap_entry *entry;

	/* find */
	spin_lock(&tree->lock);
	entry = zswap_rb_search(&tree->rbroot, offset);
	if (!entry) {
		/* entry was written back */
		spin_unlock(&tree->lock);
		return;
	}
	zswap_invalidate_entry(tree, entry);
	spin_unlock(&tree->lock);
}

void zswap_swapon(int type)
{
	struct zswap_tree *tree;

	tree = kzalloc(sizeof(*tree), GFP_KERNEL);
	if (!tree) {
		pr_err("alloc failed, zswap disabled for swap type %d\n", type);
		return;
	}

	tree->rbroot = RB_ROOT;
	spin_lock_init(&tree->lock);
	zswap_trees[type] = tree;
}

void zswap_swapoff(int type)
{
	struct zswap_tree *tree = zswap_trees[type];
	struct zswap_entry *entry, *n;

	if (!tree)
		return;

	/* walk the tree and free everything */
	spin_lock(&tree->lock);
	rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode)
		zswap_free_entry(entry);
	tree->rbroot = RB_ROOT;
	spin_unlock(&tree->lock);
	kfree(tree);
	zswap_trees[type] = NULL;
}

/*********************************
* debugfs functions
**********************************/
#ifdef CONFIG_DEBUG_FS
#include <linux/debugfs.h>

static struct dentry *zswap_debugfs_root;

static int zswap_debugfs_init(void)
{
	if (!debugfs_initialized())
		return -ENODEV;

	zswap_debugfs_root = debugfs_create_dir("zswap", NULL);

	debugfs_create_u64("pool_limit_hit", 0444,
			   zswap_debugfs_root, &zswap_pool_limit_hit);
	debugfs_create_u64("reject_reclaim_fail", 0444,
			   zswap_debugfs_root, &zswap_reject_reclaim_fail);
	debugfs_create_u64("reject_alloc_fail", 0444,
			   zswap_debugfs_root, &zswap_reject_alloc_fail);
	debugfs_create_u64("reject_kmemcache_fail", 0444,
			   zswap_debugfs_root, &zswap_reject_kmemcache_fail);
	debugfs_create_u64("reject_compress_poor", 0444,
			   zswap_debugfs_root, &zswap_reject_compress_poor);
	debugfs_create_u64("written_back_pages", 0444,
			   zswap_debugfs_root, &zswap_written_back_pages);
	debugfs_create_u64("duplicate_entry", 0444,
			   zswap_debugfs_root, &zswap_duplicate_entry);
	debugfs_create_u64("pool_total_size", 0444,
			   zswap_debugfs_root, &zswap_pool_total_size);
	debugfs_create_atomic_t("stored_pages", 0444,
				zswap_debugfs_root, &zswap_stored_pages);
	debugfs_create_atomic_t("same_filled_pages", 0444,
				zswap_debugfs_root, &zswap_same_filled_pages);

	return 0;
}
#else
static int zswap_debugfs_init(void)
{
	return 0;
}
#endif
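
/*
 * With CONFIG_DEBUG_FS, the counters registered above are readable at
 * runtime (illustrative shell usage, not part of this file):
 *
 *	cat /sys/kernel/debug/zswap/pool_total_size
 *	cat /sys/kernel/debug/zswap/stored_pages
 */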

/*********************************
* module init and exit
**********************************/
static int zswap_setup(void)
{
	struct zswap_pool *pool;
	int ret;

	zswap_entry_cache = KMEM_CACHE(zswap_entry, 0);
	if (!zswap_entry_cache) {
		pr_err("entry cache creation failed\n");
		goto cache_fail;
	}

	ret = cpuhp_setup_state(CPUHP_MM_ZSWP_MEM_PREPARE, "mm/zswap:prepare",
				zswap_dstmem_prepare, zswap_dstmem_dead);
	if (ret) {
		pr_err("dstmem alloc failed\n");
		goto dstmem_fail;
	}

	ret = cpuhp_setup_state_multi(CPUHP_MM_ZSWP_POOL_PREPARE,
				      "mm/zswap_pool:prepare",
				      zswap_cpu_comp_prepare,
				      zswap_cpu_comp_dead);
	if (ret)
		goto hp_fail;

	pool = __zswap_pool_create_fallback();
	if (pool) {
		pr_info("loaded using pool %s/%s\n", pool->tfm_name,
			zpool_get_type(pool->zpools[0]));
		list_add(&pool->list, &zswap_pools);
		zswap_has_pool = true;
	} else {
		pr_err("pool creation failed\n");
		zswap_enabled = false;
	}

	shrink_wq = create_workqueue("zswap-shrink");
	if (!shrink_wq)
		goto fallback_fail;

	if (zswap_debugfs_init())
		pr_warn("debugfs initialization failed\n");
	zswap_init_state = ZSWAP_INIT_SUCCEED;
	return 0;

fallback_fail:
	if (pool)
		zswap_pool_destroy(pool);
hp_fail:
	cpuhp_remove_state(CPUHP_MM_ZSWP_MEM_PREPARE);
dstmem_fail:
	kmem_cache_destroy(zswap_entry_cache);
cache_fail:
	/* if built-in, we aren't unloaded on failure; don't allow use */
	zswap_init_state = ZSWAP_INIT_FAILED;
	zswap_enabled = false;
	return -ENOMEM;
}

static int __init zswap_init(void)
{
	if (!zswap_enabled)
		return 0;
	return zswap_setup();
}
/* must be late so crypto has time to come up */
late_initcall(zswap_init);

MODULE_AUTHOR("Seth Jennings <sjennings@variantweb.net>");
MODULE_DESCRIPTION("Compressed cache for swap pages");