// SPDX-License-Identifier: GPL-2.0-or-later

/*
 * zsmalloc memory allocator
 *
 * Copyright (C) 2011 Nitin Gupta
 * Copyright (C) 2012, 2013 Minchan Kim
 *
 * This code is released using a dual license strategy: BSD/GPL
 * You can choose the license that better fits your requirements.
 *
 * Released under the terms of 3-clause BSD License
 * Released under the terms of GNU General Public License Version 2.0
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

/*
 * lock ordering:
 *	page_lock
 *	pool->lock
 *	class->lock
 *	zspage->lock
 */

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/errno.h>
#include <linux/highmem.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/sprintf.h>
#include <linux/shrinker.h>
#include <linux/types.h>
#include <linux/debugfs.h>
#include <linux/zsmalloc.h>
#include <linux/zpool.h>
#include <linux/fs.h>
#include <linux/workqueue.h>
#include "zpdesc.h"

#define ZSPAGE_MAGIC	0x58

/*
 * This must be a power of 2 and greater than or equal to sizeof(link_free).
 * These two conditions ensure that any 'struct link_free' itself doesn't
 * span more than 1 page, which avoids the complex case of mapping 2 pages
 * simply to restore link_free pointer values.
 */
#define ZS_ALIGN	8

#define ZS_HANDLE_SIZE (sizeof(unsigned long))

/*
 * Object location (<PFN>, <obj_idx>) is encoded as
 * a single (unsigned long) handle value.
 *
 * Note that object index <obj_idx> starts from 0.
 *
 * This is made more complicated by various memory models and PAE.
 */

#ifndef MAX_POSSIBLE_PHYSMEM_BITS
#ifdef MAX_PHYSMEM_BITS
#define MAX_POSSIBLE_PHYSMEM_BITS MAX_PHYSMEM_BITS
#else
/*
 * If this definition of MAX_PHYSMEM_BITS is used, OBJ_INDEX_BITS will just
 * be PAGE_SHIFT
 */
#define MAX_POSSIBLE_PHYSMEM_BITS BITS_PER_LONG
#endif
#endif

#define _PFN_BITS	(MAX_POSSIBLE_PHYSMEM_BITS - PAGE_SHIFT)

/*
 * The head in an allocated object should have OBJ_ALLOCATED_TAG set
 * to identify whether the object was allocated or not.
 * It's okay to add this status bit in the least significant bit because
 * the header keeps a handle, which is a 4-byte-aligned address, so we
 * have room for at least two bits.
 */
#define OBJ_ALLOCATED_TAG 1

#define OBJ_TAG_BITS	1
#define OBJ_TAG_MASK	OBJ_ALLOCATED_TAG

#define OBJ_INDEX_BITS	(BITS_PER_LONG - _PFN_BITS)
#define OBJ_INDEX_MASK	((_AC(1, UL) << OBJ_INDEX_BITS) - 1)

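/*
 * Worked example (illustrative only, not a new definition): on a 64-bit
 * system where MAX_POSSIBLE_PHYSMEM_BITS is 46 and PAGE_SHIFT is 12,
 * _PFN_BITS is 34 and OBJ_INDEX_BITS is 30, so an encoded object value
 * is laid out as:
 *
 *	obj = (pfn << OBJ_INDEX_BITS) | (obj_idx & OBJ_INDEX_MASK)
 *
 * i.e. the upper 34 bits carry the PFN and the lower 30 bits the object
 * index, matching obj_to_location()/location_to_obj() below.
 */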
#define HUGE_BITS	1
#define FULLNESS_BITS	4
#define CLASS_BITS	8
#define MAGIC_VAL_BITS	8

#define ZS_MAX_PAGES_PER_ZSPAGE	(_AC(CONFIG_ZSMALLOC_CHAIN_SIZE, UL))

/* ZS_MIN_ALLOC_SIZE must be multiple of ZS_ALIGN */
#define ZS_MIN_ALLOC_SIZE \
	MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS))
/* each chunk includes extra space to keep handle */
#define ZS_MAX_ALLOC_SIZE	PAGE_SIZE

/*
 * On systems with 4K page size, this gives 255 size classes! There is a
 * trade-off here:
 *  - A large number of size classes is potentially wasteful, as free pages
 *    are spread across these classes
 *  - A small number of size classes causes large internal fragmentation
 *  - It is probably better to use specific size classes (empirically
 *    determined). NOTE: all those class sizes must be set as multiple of
 *    ZS_ALIGN to make sure link_free itself never has to span 2 pages.
 *
 * ZS_MIN_ALLOC_SIZE and ZS_SIZE_CLASS_DELTA must be multiple of ZS_ALIGN
 * (reason above)
 */
#define ZS_SIZE_CLASS_DELTA	(PAGE_SIZE >> CLASS_BITS)
#define ZS_SIZE_CLASSES	(DIV_ROUND_UP(ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE, \
				      ZS_SIZE_CLASS_DELTA) + 1)

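/*
 * Worked arithmetic (illustrative, assuming 4K pages): with CLASS_BITS == 8,
 * ZS_SIZE_CLASS_DELTA is 4096 >> 8 == 16 bytes and ZS_MIN_ALLOC_SIZE is
 * 32 bytes, so ZS_SIZE_CLASSES is DIV_ROUND_UP(4096 - 32, 16) + 1 == 255,
 * covering class sizes 32, 48, 64, ..., 4096.
 */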
/*
 * Pages are distinguished by the ratio of used memory (that is the ratio
 * of ->inuse objects to all objects that page can store). For example,
 * INUSE_RATIO_10 means that the ratio of used objects is > 0% and <= 10%.
 *
 * The number of fullness groups is not random. It allows us to keep
 * the difference between the least busy page in the group (minimum
 * permitted number of ->inuse objects) and the most busy page (maximum
 * permitted number of ->inuse objects) at a reasonable value.
 */
enum fullness_group {
	ZS_INUSE_RATIO_0,
	ZS_INUSE_RATIO_10,
	/* NOTE: 8 more fullness groups here */
	ZS_INUSE_RATIO_99	= 10,
	ZS_INUSE_RATIO_100,
	NR_FULLNESS_GROUPS,
};

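/*
 * Mapping sketch (illustrative): a zspage with 34 of 64 objects in use
 * has a usage ratio of 53%, so get_fullness_group() below returns
 * 53 / 10 + 1 == 6, i.e. ZS_INUSE_RATIO_60 (> 50% and <= 60%).
 */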
enum class_stat_type {
	/* NOTE: stats for 12 fullness groups here: from inuse 0 to 100 */
	ZS_OBJS_ALLOCATED	= NR_FULLNESS_GROUPS,
	ZS_OBJS_INUSE,
	NR_CLASS_STAT_TYPES,
};

struct zs_size_stat {
	unsigned long objs[NR_CLASS_STAT_TYPES];
};

#ifdef CONFIG_ZSMALLOC_STAT
static struct dentry *zs_stat_root;
#endif

static size_t huge_class_size;

struct size_class {
	spinlock_t lock;
	struct list_head fullness_list[NR_FULLNESS_GROUPS];
	/*
	 * Size of objects stored in this class. Must be multiple
	 * of ZS_ALIGN.
	 */
	int size;
	int objs_per_zspage;
	/* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */
	int pages_per_zspage;

	unsigned int index;
	struct zs_size_stat stats;
};

/*
 * Placed within free objects to form a singly linked list.
 * For every zspage, zspage->freeobj gives head of this list.
 *
 * This must be power of 2 and less than or equal to ZS_ALIGN
 */
struct link_free {
	union {
		/*
		 * Free object index;
		 * It's valid for non-allocated object
		 */
		unsigned long next;
		/*
		 * Handle of allocated object.
		 */
		unsigned long handle;
	};
};

struct zs_pool {
	const char *name;

	struct size_class *size_class[ZS_SIZE_CLASSES];
	struct kmem_cache *handle_cachep;
	struct kmem_cache *zspage_cachep;

	atomic_long_t pages_allocated;

	struct zs_pool_stats stats;

	/* Compact classes */
	struct shrinker *shrinker;

#ifdef CONFIG_ZSMALLOC_STAT
	struct dentry *stat_dentry;
#endif
#ifdef CONFIG_COMPACTION
	struct work_struct free_work;
#endif
	/* protect zspage migration/compaction */
	rwlock_t lock;
	atomic_t compaction_in_progress;
};

static inline void zpdesc_set_first(struct zpdesc *zpdesc)
{
	SetPagePrivate(zpdesc_page(zpdesc));
}

static inline void zpdesc_inc_zone_page_state(struct zpdesc *zpdesc)
{
	inc_zone_page_state(zpdesc_page(zpdesc), NR_ZSPAGES);
}

static inline void zpdesc_dec_zone_page_state(struct zpdesc *zpdesc)
{
	dec_zone_page_state(zpdesc_page(zpdesc), NR_ZSPAGES);
}

static inline struct zpdesc *alloc_zpdesc(gfp_t gfp, const int nid)
{
	struct page *page = alloc_pages_node(nid, gfp, 0);

	return page_zpdesc(page);
}

static inline void free_zpdesc(struct zpdesc *zpdesc)
{
	struct page *page = zpdesc_page(zpdesc);

	__free_page(page);
}

#define ZS_PAGE_UNLOCKED	0
#define ZS_PAGE_WRLOCKED	-1

struct zspage_lock {
	spinlock_t lock;
	int cnt;
	struct lockdep_map dep_map;
};

struct zspage {
	struct {
		unsigned int huge:HUGE_BITS;
		unsigned int fullness:FULLNESS_BITS;
		unsigned int class:CLASS_BITS + 1;
		unsigned int magic:MAGIC_VAL_BITS;
	};
	unsigned int inuse;
	unsigned int freeobj;
	struct zpdesc *first_zpdesc;
	struct list_head list; /* fullness list */
	struct zs_pool *pool;
	struct zspage_lock zsl;
};

static void zspage_lock_init(struct zspage *zspage)
{
	static struct lock_class_key __key;
	struct zspage_lock *zsl = &zspage->zsl;

	lockdep_init_map(&zsl->dep_map, "zspage->lock", &__key, 0);
	spin_lock_init(&zsl->lock);
	zsl->cnt = ZS_PAGE_UNLOCKED;
}

/*
 * The zspage lock can be held from atomic contexts, but it needs to remain
 * preemptible when held for reading because it remains held outside of those
 * atomic contexts, otherwise we unnecessarily lose preemptibility.
 *
 * To achieve this, the following rules are enforced on readers and writers:
 *
 * - Writers are blocked by both writers and readers, while readers are only
 *   blocked by writers (i.e. normal rwlock semantics).
 *
 * - Writers are always atomic (to allow readers to spin waiting for them).
 *
 * - Writers always use trylock (as the lock may be held by sleeping readers).
 *
 * - Readers may spin on the lock (as they can only wait for atomic writers).
 *
 * - Readers may sleep while holding the lock (as writes only use trylock).
 */
static void zspage_read_lock(struct zspage *zspage)
{
	struct zspage_lock *zsl = &zspage->zsl;

	rwsem_acquire_read(&zsl->dep_map, 0, 0, _RET_IP_);

	spin_lock(&zsl->lock);
	zsl->cnt++;
	spin_unlock(&zsl->lock);

	lock_acquired(&zsl->dep_map, _RET_IP_);
}

static void zspage_read_unlock(struct zspage *zspage)
{
	struct zspage_lock *zsl = &zspage->zsl;

	rwsem_release(&zsl->dep_map, _RET_IP_);

	spin_lock(&zsl->lock);
	zsl->cnt--;
	spin_unlock(&zsl->lock);
}

static __must_check bool zspage_write_trylock(struct zspage *zspage)
{
	struct zspage_lock *zsl = &zspage->zsl;

	spin_lock(&zsl->lock);
	if (zsl->cnt == ZS_PAGE_UNLOCKED) {
		zsl->cnt = ZS_PAGE_WRLOCKED;
		rwsem_acquire(&zsl->dep_map, 0, 1, _RET_IP_);
		lock_acquired(&zsl->dep_map, _RET_IP_);
		return true;
	}

	spin_unlock(&zsl->lock);
	return false;
}

static void zspage_write_unlock(struct zspage *zspage)
{
	struct zspage_lock *zsl = &zspage->zsl;

	rwsem_release(&zsl->dep_map, _RET_IP_);

	zsl->cnt = ZS_PAGE_UNLOCKED;
	spin_unlock(&zsl->lock);
}

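/*
 * Usage sketch (illustrative): a reader pins the zspage across a
 * possibly-sleeping copy, while a writer (e.g. migration) must take the
 * write side with trylock and back off on failure:
 *
 *	zspage_read_lock(zspage);
 *	... memcpy_from_page(...) ...		(may sleep)
 *	zspage_read_unlock(zspage);
 *
 *	if (!zspage_write_trylock(zspage))
 *		return -EBUSY;			(caller retries later)
 *	... move pages ...
 *	zspage_write_unlock(zspage);
 */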
/* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */
static void SetZsHugePage(struct zspage *zspage)
{
	zspage->huge = 1;
}

static bool ZsHugePage(struct zspage *zspage)
{
	return zspage->huge;
}

#ifdef CONFIG_COMPACTION
static void kick_deferred_free(struct zs_pool *pool);
static void init_deferred_free(struct zs_pool *pool);
static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage);
#else
static void kick_deferred_free(struct zs_pool *pool) {}
static void init_deferred_free(struct zs_pool *pool) {}
static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage) {}
#endif

static int create_cache(struct zs_pool *pool)
{
	char *name;

	name = kasprintf(GFP_KERNEL, "zs_handle-%s", pool->name);
	if (!name)
		return -ENOMEM;
	pool->handle_cachep = kmem_cache_create(name, ZS_HANDLE_SIZE,
						0, 0, NULL);
	kfree(name);
	if (!pool->handle_cachep)
		return -EINVAL;

	name = kasprintf(GFP_KERNEL, "zspage-%s", pool->name);
	if (!name)
		return -ENOMEM;
	pool->zspage_cachep = kmem_cache_create(name, sizeof(struct zspage),
						0, 0, NULL);
	kfree(name);
	if (!pool->zspage_cachep) {
		kmem_cache_destroy(pool->handle_cachep);
		pool->handle_cachep = NULL;
		return -EINVAL;
	}

	return 0;
}

static void destroy_cache(struct zs_pool *pool)
{
	kmem_cache_destroy(pool->handle_cachep);
	kmem_cache_destroy(pool->zspage_cachep);
}

static unsigned long cache_alloc_handle(struct zs_pool *pool, gfp_t gfp)
{
	return (unsigned long)kmem_cache_alloc(pool->handle_cachep,
			gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
}

static void cache_free_handle(struct zs_pool *pool, unsigned long handle)
{
	kmem_cache_free(pool->handle_cachep, (void *)handle);
}

static struct zspage *cache_alloc_zspage(struct zs_pool *pool, gfp_t flags)
{
	return kmem_cache_zalloc(pool->zspage_cachep,
			flags & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
}

static void cache_free_zspage(struct zs_pool *pool, struct zspage *zspage)
{
	kmem_cache_free(pool->zspage_cachep, zspage);
}

/* class->lock (which owns the handle) synchronizes races */
static void record_obj(unsigned long handle, unsigned long obj)
{
	*(unsigned long *)handle = obj;
}

/* zpool driver */

#ifdef CONFIG_ZPOOL

static void *zs_zpool_create(const char *name, gfp_t gfp)
{
	/*
	 * Ignore global gfp flags: zs_malloc() may be invoked from
	 * different contexts and its caller must provide a valid
	 * gfp mask.
	 */
	return zs_create_pool(name);
}

static void zs_zpool_destroy(void *pool)
{
	zs_destroy_pool(pool);
}

static int zs_zpool_malloc(void *pool, size_t size, gfp_t gfp,
			   unsigned long *handle, const int nid)
{
	*handle = zs_malloc(pool, size, gfp, nid);

	if (IS_ERR_VALUE(*handle))
		return PTR_ERR((void *)*handle);
	return 0;
}

static void zs_zpool_free(void *pool, unsigned long handle)
{
	zs_free(pool, handle);
}

static void *zs_zpool_obj_read_begin(void *pool, unsigned long handle,
				     void *local_copy)
{
	return zs_obj_read_begin(pool, handle, local_copy);
}

static void zs_zpool_obj_read_end(void *pool, unsigned long handle,
				  void *handle_mem)
{
	zs_obj_read_end(pool, handle, handle_mem);
}

static void zs_zpool_obj_write(void *pool, unsigned long handle,
			       void *handle_mem, size_t mem_len)
{
	zs_obj_write(pool, handle, handle_mem, mem_len);
}

static u64 zs_zpool_total_pages(void *pool)
{
	return zs_get_total_pages(pool);
}

static struct zpool_driver zs_zpool_driver = {
	.type =			"zsmalloc",
	.owner =		THIS_MODULE,
	.create =		zs_zpool_create,
	.destroy =		zs_zpool_destroy,
	.malloc =		zs_zpool_malloc,
	.free =			zs_zpool_free,
	.obj_read_begin =	zs_zpool_obj_read_begin,
	.obj_read_end =		zs_zpool_obj_read_end,
	.obj_write =		zs_zpool_obj_write,
	.total_pages =		zs_zpool_total_pages,
};

MODULE_ALIAS("zpool-zsmalloc");
#endif /* CONFIG_ZPOOL */

static inline bool __maybe_unused is_first_zpdesc(struct zpdesc *zpdesc)
{
	return PagePrivate(zpdesc_page(zpdesc));
}

/* Protected by class->lock */
static inline int get_zspage_inuse(struct zspage *zspage)
{
	return zspage->inuse;
}

static inline void mod_zspage_inuse(struct zspage *zspage, int val)
{
	zspage->inuse += val;
}

static struct zpdesc *get_first_zpdesc(struct zspage *zspage)
{
	struct zpdesc *first_zpdesc = zspage->first_zpdesc;

	VM_BUG_ON_PAGE(!is_first_zpdesc(first_zpdesc), zpdesc_page(first_zpdesc));
	return first_zpdesc;
}

#define FIRST_OBJ_PAGE_TYPE_MASK	0xffffff

static inline unsigned int get_first_obj_offset(struct zpdesc *zpdesc)
{
	VM_WARN_ON_ONCE(!PageZsmalloc(zpdesc_page(zpdesc)));
	return zpdesc->first_obj_offset & FIRST_OBJ_PAGE_TYPE_MASK;
}

static inline void set_first_obj_offset(struct zpdesc *zpdesc, unsigned int offset)
{
	/* With 24 bits available, we can support offsets into 16 MiB pages. */
	BUILD_BUG_ON(PAGE_SIZE > SZ_16M);
	VM_WARN_ON_ONCE(!PageZsmalloc(zpdesc_page(zpdesc)));
	VM_WARN_ON_ONCE(offset & ~FIRST_OBJ_PAGE_TYPE_MASK);
	zpdesc->first_obj_offset &= ~FIRST_OBJ_PAGE_TYPE_MASK;
	zpdesc->first_obj_offset |= offset & FIRST_OBJ_PAGE_TYPE_MASK;
}

static inline unsigned int get_freeobj(struct zspage *zspage)
{
	return zspage->freeobj;
}

static inline void set_freeobj(struct zspage *zspage, unsigned int obj)
{
	zspage->freeobj = obj;
}

static struct size_class *zspage_class(struct zs_pool *pool,
				       struct zspage *zspage)
{
	return pool->size_class[zspage->class];
}

/*
 * zsmalloc divides the pool into various size classes where each
 * class maintains a list of zspages where each zspage is divided
 * into equal sized chunks. Each allocation falls into one of these
 * classes depending on its size. This function returns the index of
 * the size class whose chunk size is big enough to hold the given size.
 */
static int get_size_class_index(int size)
{
	int idx = 0;

	if (likely(size > ZS_MIN_ALLOC_SIZE))
		idx = DIV_ROUND_UP(size - ZS_MIN_ALLOC_SIZE,
				ZS_SIZE_CLASS_DELTA);

	return min_t(int, ZS_SIZE_CLASSES - 1, idx);
}

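/*
 * Worked example (illustrative, assuming 4K pages): for a 100-byte
 * request, get_size_class_index() computes DIV_ROUND_UP(100 - 32, 16)
 * == 5, i.e. the class holding chunks of 32 + 5 * 16 == 112 bytes.
 * Note that zs_malloc() first adds ZS_HANDLE_SIZE to the request
 * before doing this lookup.
 */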
static inline void class_stat_add(struct size_class *class, int type,
				  unsigned long cnt)
{
	class->stats.objs[type] += cnt;
}

static inline void class_stat_sub(struct size_class *class, int type,
				  unsigned long cnt)
{
	class->stats.objs[type] -= cnt;
}

static inline unsigned long class_stat_read(struct size_class *class, int type)
{
	return class->stats.objs[type];
}

#ifdef CONFIG_ZSMALLOC_STAT

static void __init zs_stat_init(void)
{
	if (!debugfs_initialized()) {
		pr_warn("debugfs not available, stat dir not created\n");
		return;
	}

	zs_stat_root = debugfs_create_dir("zsmalloc", NULL);
}

static void __exit zs_stat_exit(void)
{
	debugfs_remove_recursive(zs_stat_root);
}

static unsigned long zs_can_compact(struct size_class *class);

static int zs_stats_size_show(struct seq_file *s, void *v)
{
	int i, fg;
	struct zs_pool *pool = s->private;
	struct size_class *class;
	int objs_per_zspage;
	unsigned long obj_allocated, obj_used, pages_used, freeable;
	unsigned long total_objs = 0, total_used_objs = 0, total_pages = 0;
	unsigned long total_freeable = 0;
	unsigned long inuse_totals[NR_FULLNESS_GROUPS] = {0, };

	seq_printf(s, " %5s %5s %9s %9s %9s %9s %9s %9s %9s %9s %9s %9s %9s %13s %10s %10s %16s %8s\n",
			"class", "size", "10%", "20%", "30%", "40%",
			"50%", "60%", "70%", "80%", "90%", "99%", "100%",
			"obj_allocated", "obj_used", "pages_used",
			"pages_per_zspage", "freeable");

	for (i = 0; i < ZS_SIZE_CLASSES; i++) {

		class = pool->size_class[i];

		if (class->index != i)
			continue;

		spin_lock(&class->lock);

		seq_printf(s, " %5u %5u ", i, class->size);
		for (fg = ZS_INUSE_RATIO_10; fg < NR_FULLNESS_GROUPS; fg++) {
			inuse_totals[fg] += class_stat_read(class, fg);
			seq_printf(s, "%9lu ", class_stat_read(class, fg));
		}

		obj_allocated = class_stat_read(class, ZS_OBJS_ALLOCATED);
		obj_used = class_stat_read(class, ZS_OBJS_INUSE);
		freeable = zs_can_compact(class);
		spin_unlock(&class->lock);

		objs_per_zspage = class->objs_per_zspage;
		pages_used = obj_allocated / objs_per_zspage *
				class->pages_per_zspage;

		seq_printf(s, "%13lu %10lu %10lu %16d %8lu\n",
			   obj_allocated, obj_used, pages_used,
			   class->pages_per_zspage, freeable);

		total_objs += obj_allocated;
		total_used_objs += obj_used;
		total_pages += pages_used;
		total_freeable += freeable;
	}

	seq_puts(s, "\n");
	seq_printf(s, " %5s %5s ", "Total", "");

	for (fg = ZS_INUSE_RATIO_10; fg < NR_FULLNESS_GROUPS; fg++)
		seq_printf(s, "%9lu ", inuse_totals[fg]);

	seq_printf(s, "%13lu %10lu %10lu %16s %8lu\n",
		   total_objs, total_used_objs, total_pages, "",
		   total_freeable);

	return 0;
}
DEFINE_SHOW_ATTRIBUTE(zs_stats_size);

static void zs_pool_stat_create(struct zs_pool *pool, const char *name)
{
	if (!zs_stat_root) {
		pr_warn("no root stat dir, not creating <%s> stat dir\n", name);
		return;
	}

	pool->stat_dentry = debugfs_create_dir(name, zs_stat_root);

	debugfs_create_file("classes", S_IFREG | 0444, pool->stat_dentry, pool,
			    &zs_stats_size_fops);
}

static void zs_pool_stat_destroy(struct zs_pool *pool)
{
	debugfs_remove_recursive(pool->stat_dentry);
}

#else /* CONFIG_ZSMALLOC_STAT */
static void __init zs_stat_init(void)
{
}

static void __exit zs_stat_exit(void)
{
}

static inline void zs_pool_stat_create(struct zs_pool *pool, const char *name)
{
}

static inline void zs_pool_stat_destroy(struct zs_pool *pool)
{
}
#endif

/*
 * For each size class, zspages are divided into different groups
 * depending on their usage ratio. This function returns the fullness
 * status of the given page.
 */
static int get_fullness_group(struct size_class *class, struct zspage *zspage)
{
	int inuse, objs_per_zspage, ratio;

	inuse = get_zspage_inuse(zspage);
	objs_per_zspage = class->objs_per_zspage;

	if (inuse == 0)
		return ZS_INUSE_RATIO_0;
	if (inuse == objs_per_zspage)
		return ZS_INUSE_RATIO_100;

	ratio = 100 * inuse / objs_per_zspage;
	/*
	 * Take integer division into consideration: a page with one inuse
	 * object out of 127 possible ones will end up having a 0 usage
	 * ratio, which is wrong as it belongs in the ZS_INUSE_RATIO_10
	 * fullness group.
	 */
	return ratio / 10 + 1;
}

/*
 * Each size class maintains various freelists and zspages are assigned
 * to one of these freelists based on the number of live objects they
 * have. This function inserts the given zspage into the freelist
 * identified by <class, fullness_group>.
 */
static void insert_zspage(struct size_class *class,
			  struct zspage *zspage,
			  int fullness)
{
	class_stat_add(class, fullness, 1);
	list_add(&zspage->list, &class->fullness_list[fullness]);
	zspage->fullness = fullness;
}

/*
 * This function removes the given zspage from the freelist identified
 * by <class, fullness_group>.
 */
static void remove_zspage(struct size_class *class, struct zspage *zspage)
{
	int fullness = zspage->fullness;

	VM_BUG_ON(list_empty(&class->fullness_list[fullness]));

	list_del_init(&zspage->list);
	class_stat_sub(class, fullness, 1);
}

/*
 * Each size class maintains zspages in different fullness groups depending
 * on the number of live objects they contain. When allocating or freeing
 * objects, the fullness status of the page can change, for instance, from
 * INUSE_RATIO_80 to INUSE_RATIO_70 when freeing an object. This function
 * checks if such a status change has occurred for the given page and
 * accordingly moves the page from the list of the old fullness group to that
 * of the new fullness group.
 */
static int fix_fullness_group(struct size_class *class, struct zspage *zspage)
{
	int newfg;

	newfg = get_fullness_group(class, zspage);
	if (newfg == zspage->fullness)
		goto out;

	remove_zspage(class, zspage);
	insert_zspage(class, zspage, newfg);
out:
	return newfg;
}

static struct zspage *get_zspage(struct zpdesc *zpdesc)
{
	struct zspage *zspage = zpdesc->zspage;

	BUG_ON(zspage->magic != ZSPAGE_MAGIC);
	return zspage;
}

static struct zpdesc *get_next_zpdesc(struct zpdesc *zpdesc)
{
	struct zspage *zspage = get_zspage(zpdesc);

	if (unlikely(ZsHugePage(zspage)))
		return NULL;

	return zpdesc->next;
}

/**
 * obj_to_location - get (<zpdesc>, <obj_idx>) from encoded object value
 * @obj: the encoded object value
 * @zpdesc: zpdesc object resides in zspage
 * @obj_idx: object index
 */
static void obj_to_location(unsigned long obj, struct zpdesc **zpdesc,
			    unsigned int *obj_idx)
{
	*zpdesc = pfn_zpdesc(obj >> OBJ_INDEX_BITS);
	*obj_idx = (obj & OBJ_INDEX_MASK);
}

static void obj_to_zpdesc(unsigned long obj, struct zpdesc **zpdesc)
{
	*zpdesc = pfn_zpdesc(obj >> OBJ_INDEX_BITS);
}

/**
 * location_to_obj - get obj value encoded from (<zpdesc>, <obj_idx>)
 * @zpdesc: zpdesc object resides in zspage
 * @obj_idx: object index
 */
static unsigned long location_to_obj(struct zpdesc *zpdesc, unsigned int obj_idx)
{
	unsigned long obj;

	obj = zpdesc_pfn(zpdesc) << OBJ_INDEX_BITS;
	obj |= obj_idx & OBJ_INDEX_MASK;

	return obj;
}

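/*
 * Round-trip sketch (illustrative): location_to_obj() and
 * obj_to_location() are inverses, so for any obj_idx <= OBJ_INDEX_MASK
 *
 *	obj_to_location(location_to_obj(zpdesc, idx), &zpdesc2, &idx2);
 *
 * yields zpdesc2 == zpdesc and idx2 == idx.
 */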
static unsigned long handle_to_obj(unsigned long handle)
{
	return *(unsigned long *)handle;
}

static inline bool obj_allocated(struct zpdesc *zpdesc, void *obj,
				 unsigned long *phandle)
{
	unsigned long handle;
	struct zspage *zspage = get_zspage(zpdesc);

	if (unlikely(ZsHugePage(zspage))) {
		VM_BUG_ON_PAGE(!is_first_zpdesc(zpdesc), zpdesc_page(zpdesc));
		handle = zpdesc->handle;
	} else
		handle = *(unsigned long *)obj;

	if (!(handle & OBJ_ALLOCATED_TAG))
		return false;

	/* Clear all tags before returning the handle */
	*phandle = handle & ~OBJ_TAG_MASK;
	return true;
}

static void reset_zpdesc(struct zpdesc *zpdesc)
{
	struct page *page = zpdesc_page(zpdesc);

	__ClearPageMovable(page);
	ClearPagePrivate(page);
	zpdesc->zspage = NULL;
	zpdesc->next = NULL;
	__ClearPageZsmalloc(page);
}

static int trylock_zspage(struct zspage *zspage)
{
	struct zpdesc *cursor, *fail;

	for (cursor = get_first_zpdesc(zspage); cursor != NULL; cursor =
					get_next_zpdesc(cursor)) {
		if (!zpdesc_trylock(cursor)) {
			fail = cursor;
			goto unlock;
		}
	}

	return 1;
unlock:
	for (cursor = get_first_zpdesc(zspage); cursor != fail; cursor =
					get_next_zpdesc(cursor))
		zpdesc_unlock(cursor);

	return 0;
}

static void __free_zspage(struct zs_pool *pool, struct size_class *class,
			  struct zspage *zspage)
{
	struct zpdesc *zpdesc, *next;

	assert_spin_locked(&class->lock);

	VM_BUG_ON(get_zspage_inuse(zspage));
	VM_BUG_ON(zspage->fullness != ZS_INUSE_RATIO_0);

	next = zpdesc = get_first_zpdesc(zspage);
	do {
		VM_BUG_ON_PAGE(!zpdesc_is_locked(zpdesc), zpdesc_page(zpdesc));
		next = get_next_zpdesc(zpdesc);
		reset_zpdesc(zpdesc);
		zpdesc_unlock(zpdesc);
		zpdesc_dec_zone_page_state(zpdesc);
		zpdesc_put(zpdesc);
		zpdesc = next;
	} while (zpdesc != NULL);

	cache_free_zspage(pool, zspage);

	class_stat_sub(class, ZS_OBJS_ALLOCATED, class->objs_per_zspage);
	atomic_long_sub(class->pages_per_zspage, &pool->pages_allocated);
}

static void free_zspage(struct zs_pool *pool, struct size_class *class,
			struct zspage *zspage)
{
	VM_BUG_ON(get_zspage_inuse(zspage));
	VM_BUG_ON(list_empty(&zspage->list));

	/*
	 * Since zs_free() cannot sleep, this function cannot call
	 * lock_page. The page locks that trylock_zspage() got will be
	 * released by __free_zspage.
	 */
	if (!trylock_zspage(zspage)) {
		kick_deferred_free(pool);
		return;
	}

	remove_zspage(class, zspage);
	__free_zspage(pool, class, zspage);
}

/* Initialize a newly allocated zspage */
static void init_zspage(struct size_class *class, struct zspage *zspage)
{
	unsigned int freeobj = 1;
	unsigned long off = 0;
	struct zpdesc *zpdesc = get_first_zpdesc(zspage);

	while (zpdesc) {
		struct zpdesc *next_zpdesc;
		struct link_free *link;
		void *vaddr;

		set_first_obj_offset(zpdesc, off);

		vaddr = kmap_local_zpdesc(zpdesc);
		link = (struct link_free *)vaddr + off / sizeof(*link);

		while ((off += class->size) < PAGE_SIZE) {
			link->next = freeobj++ << OBJ_TAG_BITS;
			link += class->size / sizeof(*link);
		}

		/*
		 * We now come to the last (full or partial) object on this
		 * page, which must point to the first object on the next
		 * page (if present)
		 */
		next_zpdesc = get_next_zpdesc(zpdesc);
		if (next_zpdesc) {
			link->next = freeobj++ << OBJ_TAG_BITS;
		} else {
			/*
			 * Reset OBJ_TAG_BITS in the last link to tell
			 * whether it's an allocated object or not.
			 */
			link->next = -1UL << OBJ_TAG_BITS;
		}
		kunmap_local(vaddr);
		zpdesc = next_zpdesc;
		off %= PAGE_SIZE;
	}

	set_freeobj(zspage, 0);
}

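/*
 * Freelist layout sketch (illustrative): for a class with 2048-byte
 * chunks on a 4K page, init_zspage() leaves two objects per page; the
 * link at offset 0 stores freeobj 1 (the object at offset 2048), that
 * object links to freeobj 2 (offset 0 of the next page, if any), and
 * the final link holds -1UL << OBJ_TAG_BITS to terminate the list.
 */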
static void create_page_chain(struct size_class *class, struct zspage *zspage,
			      struct zpdesc *zpdescs[])
{
	int i;
	struct zpdesc *zpdesc;
	struct zpdesc *prev_zpdesc = NULL;
	int nr_zpdescs = class->pages_per_zspage;

	/*
	 * Allocate individual pages and link them together as:
	 * 1. all pages are linked together using zpdesc->next
	 * 2. each sub-page points to zspage using zpdesc->zspage
	 *
	 * we set PG_private to identify the first zpdesc (i.e. no other zpdesc
	 * has this flag set).
	 */
	for (i = 0; i < nr_zpdescs; i++) {
		zpdesc = zpdescs[i];
		zpdesc->zspage = zspage;
		zpdesc->next = NULL;
		if (i == 0) {
			zspage->first_zpdesc = zpdesc;
			zpdesc_set_first(zpdesc);
			if (unlikely(class->objs_per_zspage == 1 &&
					class->pages_per_zspage == 1))
				SetZsHugePage(zspage);
		} else {
			prev_zpdesc->next = zpdesc;
		}
		prev_zpdesc = zpdesc;
	}
}

/*
 * Allocate a zspage for the given size class
 */
static struct zspage *alloc_zspage(struct zs_pool *pool,
				   struct size_class *class,
				   gfp_t gfp, const int nid)
{
	int i;
	struct zpdesc *zpdescs[ZS_MAX_PAGES_PER_ZSPAGE];
	struct zspage *zspage = cache_alloc_zspage(pool, gfp);

	if (!zspage)
		return NULL;

	zspage->magic = ZSPAGE_MAGIC;
	zspage->pool = pool;
	zspage->class = class->index;
	zspage_lock_init(zspage);

	for (i = 0; i < class->pages_per_zspage; i++) {
		struct zpdesc *zpdesc;

		zpdesc = alloc_zpdesc(gfp, nid);
		if (!zpdesc) {
			while (--i >= 0) {
				zpdesc_dec_zone_page_state(zpdescs[i]);
				__zpdesc_clear_zsmalloc(zpdescs[i]);
				free_zpdesc(zpdescs[i]);
			}
			cache_free_zspage(pool, zspage);
			return NULL;
		}
		__zpdesc_set_zsmalloc(zpdesc);

		zpdesc_inc_zone_page_state(zpdesc);
		zpdescs[i] = zpdesc;
	}

	create_page_chain(class, zspage, zpdescs);
	init_zspage(class, zspage);

	return zspage;
}

static struct zspage *find_get_zspage(struct size_class *class)
{
	int i;
	struct zspage *zspage;

	for (i = ZS_INUSE_RATIO_99; i >= ZS_INUSE_RATIO_0; i--) {
		zspage = list_first_entry_or_null(&class->fullness_list[i],
						  struct zspage, list);
		if (zspage)
			break;
	}

	return zspage;
}

static bool can_merge(struct size_class *prev, int pages_per_zspage,
		      int objs_per_zspage)
{
	if (prev->pages_per_zspage == pages_per_zspage &&
		prev->objs_per_zspage == objs_per_zspage)
		return true;

	return false;
}

static bool zspage_full(struct size_class *class, struct zspage *zspage)
{
	return get_zspage_inuse(zspage) == class->objs_per_zspage;
}

static bool zspage_empty(struct zspage *zspage)
{
	return get_zspage_inuse(zspage) == 0;
}

/**
 * zs_lookup_class_index() - Returns index of the zsmalloc &size_class
 * that holds objects of the provided size.
 * @pool: zsmalloc pool to use
 * @size: object size
 *
 * Context: Any context.
 *
 * Return: the index of the zsmalloc &size_class that holds objects of the
 * provided size.
 */
unsigned int zs_lookup_class_index(struct zs_pool *pool, unsigned int size)
{
	struct size_class *class;

	class = pool->size_class[get_size_class_index(size)];

	return class->index;
}
EXPORT_SYMBOL_GPL(zs_lookup_class_index);

unsigned long zs_get_total_pages(struct zs_pool *pool)
{
	return atomic_long_read(&pool->pages_allocated);
}
EXPORT_SYMBOL_GPL(zs_get_total_pages);

void *zs_obj_read_begin(struct zs_pool *pool, unsigned long handle,
			void *local_copy)
{
	struct zspage *zspage;
	struct zpdesc *zpdesc;
	unsigned long obj, off;
	unsigned int obj_idx;
	struct size_class *class;
	void *addr;

	/* Guarantee we can get zspage from handle safely */
	read_lock(&pool->lock);
	obj = handle_to_obj(handle);
	obj_to_location(obj, &zpdesc, &obj_idx);
	zspage = get_zspage(zpdesc);

	/* Make sure migration doesn't move any pages in this zspage */
	zspage_read_lock(zspage);
	read_unlock(&pool->lock);

	class = zspage_class(pool, zspage);
	off = offset_in_page(class->size * obj_idx);

	if (off + class->size <= PAGE_SIZE) {
		/* this object is contained entirely within a page */
		addr = kmap_local_zpdesc(zpdesc);
		addr += off;
	} else {
		size_t sizes[2];

		/* this object spans two pages */
		sizes[0] = PAGE_SIZE - off;
		sizes[1] = class->size - sizes[0];
		addr = local_copy;

		memcpy_from_page(addr, zpdesc_page(zpdesc),
				 off, sizes[0]);
		zpdesc = get_next_zpdesc(zpdesc);
		memcpy_from_page(addr + sizes[0],
				 zpdesc_page(zpdesc),
				 0, sizes[1]);
	}

	if (!ZsHugePage(zspage))
		addr += ZS_HANDLE_SIZE;

	return addr;
}
EXPORT_SYMBOL_GPL(zs_obj_read_begin);

void zs_obj_read_end(struct zs_pool *pool, unsigned long handle,
		     void *handle_mem)
{
	struct zspage *zspage;
	struct zpdesc *zpdesc;
	unsigned long obj, off;
	unsigned int obj_idx;
	struct size_class *class;

	obj = handle_to_obj(handle);
	obj_to_location(obj, &zpdesc, &obj_idx);
	zspage = get_zspage(zpdesc);
	class = zspage_class(pool, zspage);
	off = offset_in_page(class->size * obj_idx);

	if (off + class->size <= PAGE_SIZE) {
		if (!ZsHugePage(zspage))
			off += ZS_HANDLE_SIZE;
		handle_mem -= off;
		kunmap_local(handle_mem);
	}

	zspage_read_unlock(zspage);
}
EXPORT_SYMBOL_GPL(zs_obj_read_end);

void zs_obj_write(struct zs_pool *pool, unsigned long handle,
		  void *handle_mem, size_t mem_len)
{
	struct zspage *zspage;
	struct zpdesc *zpdesc;
	unsigned long obj, off;
	unsigned int obj_idx;
	struct size_class *class;

	/* Guarantee we can get zspage from handle safely */
	read_lock(&pool->lock);
	obj = handle_to_obj(handle);
	obj_to_location(obj, &zpdesc, &obj_idx);
	zspage = get_zspage(zpdesc);

	/* Make sure migration doesn't move any pages in this zspage */
	zspage_read_lock(zspage);
	read_unlock(&pool->lock);

	class = zspage_class(pool, zspage);
	off = offset_in_page(class->size * obj_idx);

	if (!ZsHugePage(zspage))
		off += ZS_HANDLE_SIZE;

	if (off + mem_len <= PAGE_SIZE) {
		/* this object is contained entirely within a page */
		void *dst = kmap_local_zpdesc(zpdesc);

		memcpy(dst + off, handle_mem, mem_len);
		kunmap_local(dst);
	} else {
		/* this object spans two pages */
		size_t sizes[2];

		sizes[0] = PAGE_SIZE - off;
		sizes[1] = mem_len - sizes[0];

		memcpy_to_page(zpdesc_page(zpdesc), off,
			       handle_mem, sizes[0]);
		zpdesc = get_next_zpdesc(zpdesc);
		memcpy_to_page(zpdesc_page(zpdesc), 0,
			       handle_mem + sizes[0], sizes[1]);
	}

	zspage_read_unlock(zspage);
}
EXPORT_SYMBOL_GPL(zs_obj_write);

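/*
 * Usage sketch (illustrative): the local_copy buffer must be large
 * enough for the object; it is only used when the object spans two
 * pages. The returned pointer must be released via zs_obj_read_end():
 *
 *	void *mem = zs_obj_read_begin(pool, handle, local_copy);
 *	... read the object payload from mem ...
 *	zs_obj_read_end(pool, handle, mem);
 *
 * Writes go through zs_obj_write(pool, handle, buf, len) instead.
 */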
/**
 * zs_huge_class_size() - Returns the size (in bytes) of the first huge
 *                        zsmalloc &size_class.
 * @pool: zsmalloc pool to use
 *
 * The function returns the size of the first huge class - any object of equal
 * or bigger size will be stored in a zspage consisting of a single physical
 * page.
 *
 * Context: Any context.
 *
 * Return: the size (in bytes) of the first huge zsmalloc &size_class.
 */
size_t zs_huge_class_size(struct zs_pool *pool)
{
	return huge_class_size;
}
EXPORT_SYMBOL_GPL(zs_huge_class_size);

static unsigned long obj_malloc(struct zs_pool *pool,
				struct zspage *zspage, unsigned long handle)
{
	int i, nr_zpdesc, offset;
	unsigned long obj;
	struct link_free *link;
	struct size_class *class;

	struct zpdesc *m_zpdesc;
	unsigned long m_offset;
	void *vaddr;

	class = pool->size_class[zspage->class];
	obj = get_freeobj(zspage);

	offset = obj * class->size;
	nr_zpdesc = offset >> PAGE_SHIFT;
	m_offset = offset_in_page(offset);
	m_zpdesc = get_first_zpdesc(zspage);

	for (i = 0; i < nr_zpdesc; i++)
		m_zpdesc = get_next_zpdesc(m_zpdesc);

	vaddr = kmap_local_zpdesc(m_zpdesc);
	link = (struct link_free *)vaddr + m_offset / sizeof(*link);
	set_freeobj(zspage, link->next >> OBJ_TAG_BITS);
	if (likely(!ZsHugePage(zspage)))
		/* record handle in the header of allocated chunk */
		link->handle = handle | OBJ_ALLOCATED_TAG;
	else
		zspage->first_zpdesc->handle = handle | OBJ_ALLOCATED_TAG;

	kunmap_local(vaddr);
	mod_zspage_inuse(zspage, 1);

	obj = location_to_obj(m_zpdesc, obj);
	record_obj(handle, obj);

	return obj;
}

/**
 * zs_malloc - Allocate block of given size from pool.
 * @pool: pool to allocate from
 * @size: size of block to allocate
 * @gfp: gfp flags when allocating object
 * @nid: The preferred node id to allocate new zspage (if needed)
 *
 * On success, handle to the allocated object is returned,
 * otherwise an ERR_PTR().
 * Allocation requests with size > ZS_MAX_ALLOC_SIZE will fail.
 */
unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp,
			const int nid)
{
	unsigned long handle;
	struct size_class *class;
	int newfg;
	struct zspage *zspage;

	if (unlikely(!size))
		return (unsigned long)ERR_PTR(-EINVAL);

	if (unlikely(size > ZS_MAX_ALLOC_SIZE))
		return (unsigned long)ERR_PTR(-ENOSPC);

	handle = cache_alloc_handle(pool, gfp);
	if (!handle)
		return (unsigned long)ERR_PTR(-ENOMEM);

	/* extra space in chunk to keep the handle */
	size += ZS_HANDLE_SIZE;
	class = pool->size_class[get_size_class_index(size)];

	/* class->lock effectively protects the zpage migration */
	spin_lock(&class->lock);
	zspage = find_get_zspage(class);
	if (likely(zspage)) {
		obj_malloc(pool, zspage, handle);
		/* Now move the zspage to another fullness group, if required */
		fix_fullness_group(class, zspage);
		class_stat_add(class, ZS_OBJS_INUSE, 1);

		goto out;
	}

	spin_unlock(&class->lock);

	zspage = alloc_zspage(pool, class, gfp, nid);
	if (!zspage) {
		cache_free_handle(pool, handle);
		return (unsigned long)ERR_PTR(-ENOMEM);
	}

	spin_lock(&class->lock);
	obj_malloc(pool, zspage, handle);
	newfg = get_fullness_group(class, zspage);
	insert_zspage(class, zspage, newfg);
	atomic_long_add(class->pages_per_zspage, &pool->pages_allocated);
	class_stat_add(class, ZS_OBJS_ALLOCATED, class->objs_per_zspage);
	class_stat_add(class, ZS_OBJS_INUSE, 1);

	/* We completely set up the zspage, so mark it as movable */
	SetZsPageMovable(pool, zspage);
out:
	spin_unlock(&class->lock);

	return handle;
}
EXPORT_SYMBOL_GPL(zs_malloc);

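/*
 * Caller sketch (illustrative, mirroring zs_zpool_malloc() above):
 * zs_malloc() returns a handle, not a pointer, and errors are encoded
 * with ERR_PTR():
 *
 *	unsigned long handle = zs_malloc(pool, len, GFP_KERNEL, nid);
 *	if (IS_ERR_VALUE(handle))
 *		return PTR_ERR((void *)handle);
 *	...
 *	zs_free(pool, handle);
 */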
static void obj_free(int class_size, unsigned long obj)
{
	struct link_free *link;
	struct zspage *zspage;
	struct zpdesc *f_zpdesc;
	unsigned long f_offset;
	unsigned int f_objidx;
	void *vaddr;

	obj_to_location(obj, &f_zpdesc, &f_objidx);
	f_offset = offset_in_page(class_size * f_objidx);
	zspage = get_zspage(f_zpdesc);

	vaddr = kmap_local_zpdesc(f_zpdesc);
	link = (struct link_free *)(vaddr + f_offset);

	/* Insert this object in containing zspage's freelist */
	if (likely(!ZsHugePage(zspage)))
		link->next = get_freeobj(zspage) << OBJ_TAG_BITS;
	else
		f_zpdesc->handle = 0;
	set_freeobj(zspage, f_objidx);

	kunmap_local(vaddr);
	mod_zspage_inuse(zspage, -1);
}

void zs_free(struct zs_pool *pool, unsigned long handle)
{
	struct zspage *zspage;
	struct zpdesc *f_zpdesc;
	unsigned long obj;
	struct size_class *class;
	int fullness;

	if (IS_ERR_OR_NULL((void *)handle))
		return;

	/*
	 * The pool->lock protects the race with zpage's migration
	 * so it's safe to get the page from handle.
	 */
	read_lock(&pool->lock);
	obj = handle_to_obj(handle);
	obj_to_zpdesc(obj, &f_zpdesc);
	zspage = get_zspage(f_zpdesc);
	class = zspage_class(pool, zspage);
	spin_lock(&class->lock);
	read_unlock(&pool->lock);

	class_stat_sub(class, ZS_OBJS_INUSE, 1);
	obj_free(class->size, obj);

	fullness = fix_fullness_group(class, zspage);
	if (fullness == ZS_INUSE_RATIO_0)
		free_zspage(pool, class, zspage);

	spin_unlock(&class->lock);
	cache_free_handle(pool, handle);
}
EXPORT_SYMBOL_GPL(zs_free);

251cbb95 MK |
1457 | static void zs_object_copy(struct size_class *class, unsigned long dst, |
1458 | unsigned long src) | |
312fcae2 | 1459 | { |
b5c1d8b5 | 1460 | struct zpdesc *s_zpdesc, *d_zpdesc; |
bfd093f5 | 1461 | unsigned int s_objidx, d_objidx; |
312fcae2 MK |
1462 | unsigned long s_off, d_off; |
1463 | void *s_addr, *d_addr; | |
1464 | int s_size, d_size, size; | |
1465 | int written = 0; | |
1466 | ||
1467 | s_size = d_size = class->size; | |
1468 | ||
b5c1d8b5 HY |
1469 | obj_to_location(src, &s_zpdesc, &s_objidx); |
1470 | obj_to_location(dst, &d_zpdesc, &d_objidx); | |
312fcae2 | 1471 | |
f24f66ee AR |
1472 | s_off = offset_in_page(class->size * s_objidx); |
1473 | d_off = offset_in_page(class->size * d_objidx); | |
312fcae2 MK |
1474 | |
1475 | if (s_off + class->size > PAGE_SIZE) | |
1476 | s_size = PAGE_SIZE - s_off; | |
1477 | ||
1478 | if (d_off + class->size > PAGE_SIZE) | |
1479 | d_size = PAGE_SIZE - d_off; | |
1480 | ||
b5c1d8b5 HY |
1481 | s_addr = kmap_local_zpdesc(s_zpdesc); |
1482 | d_addr = kmap_local_zpdesc(d_zpdesc); | |
312fcae2 MK |
1483 | |
1484 | while (1) { | |
1485 | size = min(s_size, d_size); | |
1486 | memcpy(d_addr + d_off, s_addr + s_off, size); | |
1487 | written += size; | |
1488 | ||
1489 | if (written == class->size) | |
1490 | break; | |
1491 | ||
495819ea SS |
1492 | s_off += size; |
1493 | s_size -= size; | |
1494 | d_off += size; | |
1495 | d_size -= size; | |
1496 | ||
050a388b | 1497 | /* |
e664c2cd PK |
1498 | * Calling kunmap_local(d_addr) first is necessary: kunmap_local() |
1499 | * calls must occur in reverse order of the kmap_local_page() calls. |
1500 | * So, to call kunmap_local(s_addr) we must first call |
1501 | * kunmap_local(d_addr). For more details see |
46e87152 | 1502 | * Documentation/mm/highmem.rst. |
050a388b | 1503 | */ |
495819ea | 1504 | if (s_off >= PAGE_SIZE) { |
91d0ec83 PK |
1505 | kunmap_local(d_addr); |
1506 | kunmap_local(s_addr); | |
b5c1d8b5 HY |
1507 | s_zpdesc = get_next_zpdesc(s_zpdesc); |
1508 | s_addr = kmap_local_zpdesc(s_zpdesc); | |
1509 | d_addr = kmap_local_zpdesc(d_zpdesc); | |
312fcae2 MK |
1510 | s_size = class->size - written; |
1511 | s_off = 0; | |
312fcae2 MK |
1512 | } |
1513 | ||
495819ea | 1514 | if (d_off >= PAGE_SIZE) { |
91d0ec83 | 1515 | kunmap_local(d_addr); |
b5c1d8b5 HY |
1516 | d_zpdesc = get_next_zpdesc(d_zpdesc); |
1517 | d_addr = kmap_local_zpdesc(d_zpdesc); | |
312fcae2 MK |
1518 | d_size = class->size - written; |
1519 | d_off = 0; | |
312fcae2 MK |
1520 | } |
1521 | } | |
1522 | ||
91d0ec83 PK |
1523 | kunmap_local(d_addr); |
1524 | kunmap_local(s_addr); | |
312fcae2 MK |
1525 | } |
1526 | ||
1527 | /* | |
f9044f17 | 1528 | * Find the first allocated object in the zspage, starting at |
312fcae2 MK |
1529 | * index *obj_idx, and return its handle. |
1530 | */ | |
f9044f17 | 1531 | static unsigned long find_alloced_obj(struct size_class *class, |
76fb5d99 | 1532 | struct zpdesc *zpdesc, int *obj_idx) |
312fcae2 | 1533 | { |
671f2fa8 | 1534 | unsigned int offset; |
cf675acb | 1535 | int index = *obj_idx; |
312fcae2 | 1536 | unsigned long handle = 0; |
76fb5d99 | 1537 | void *addr = kmap_local_zpdesc(zpdesc); |
312fcae2 | 1538 | |
fc5eec0d | 1539 | offset = get_first_obj_offset(zpdesc); |
312fcae2 MK |
1540 | offset += class->size * index; |
1541 | ||
1542 | while (offset < PAGE_SIZE) { | |
76fb5d99 | 1543 | if (obj_allocated(zpdesc, addr + offset, &handle)) |
b475d42d | 1544 | break; |
312fcae2 MK |
1545 | |
1546 | offset += class->size; | |
1547 | index++; | |
1548 | } | |
1549 | ||
91d0ec83 | 1550 | kunmap_local(addr); |
cf675acb GM |
1551 | |
1552 | *obj_idx = index; | |
1553 | ||
312fcae2 MK |
1554 | return handle; |
1555 | } | |
1556 | ||
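/*
 * Drain objects from @src_zspage into @dst_zspage: for each allocated
 * object, obj_malloc() reserves a slot in the destination (which also
 * records the new location in the handle), the payload is copied, and
 * the old slot is freed. The loop stops as soon as the destination is
 * full or the source is empty.
 */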
ada5caed MK |
1557 | static void migrate_zspage(struct zs_pool *pool, struct zspage *src_zspage, |
1558 | struct zspage *dst_zspage) | |
312fcae2 MK |
1559 | { |
1560 | unsigned long used_obj, free_obj; | |
1561 | unsigned long handle; | |
ada5caed | 1562 | int obj_idx = 0; |
65a1cf15 | 1563 | struct zpdesc *s_zpdesc = get_first_zpdesc(src_zspage); |
ada5caed | 1564 | struct size_class *class = pool->size_class[src_zspage->class]; |
312fcae2 MK |
1565 | |
1566 | while (1) { | |
65a1cf15 | 1567 | handle = find_alloced_obj(class, s_zpdesc, &obj_idx); |
312fcae2 | 1568 | if (!handle) { |
65a1cf15 HY |
1569 | s_zpdesc = get_next_zpdesc(s_zpdesc); |
1570 | if (!s_zpdesc) | |
312fcae2 | 1571 | break; |
41b88e14 | 1572 | obj_idx = 0; |
312fcae2 MK |
1573 | continue; |
1574 | } | |
1575 | ||
312fcae2 | 1576 | used_obj = handle_to_obj(handle); |
ada5caed | 1577 | free_obj = obj_malloc(pool, dst_zspage, handle); |
251cbb95 | 1578 | zs_object_copy(class, free_obj, used_obj); |
41b88e14 | 1579 | obj_idx++; |
b3067742 | 1580 | obj_free(class->size, used_obj); |
df9cd3cb | 1581 | |
4ce36584 | 1582 | /* Stop if there is no more space */ |
ada5caed | 1583 | if (zspage_full(class, dst_zspage)) |
4ce36584 SS |
1584 | break; |
1585 | ||
df9cd3cb | 1586 | /* Stop if there are no more objects to migrate */ |
ada5caed | 1587 | if (zspage_empty(src_zspage)) |
df9cd3cb | 1588 | break; |
312fcae2 | 1589 | } |
312fcae2 MK |
1590 | } |
1591 | ||
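/*
 * Source zspages are picked from the emptiest usable fullness groups
 * first (ZS_INUSE_RATIO_10 upward), destinations from the fullest
 * non-full groups (ZS_INUSE_RATIO_99 downward), so compaction drains
 * sparse zspages into nearly-full ones.
 */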
4c7ac972 | 1592 | static struct zspage *isolate_src_zspage(struct size_class *class) |
312fcae2 | 1593 | { |
3783689a | 1594 | struct zspage *zspage; |
4c7ac972 | 1595 | int fg; |
312fcae2 | 1596 | |
4c7ac972 SS |
1597 | for (fg = ZS_INUSE_RATIO_10; fg <= ZS_INUSE_RATIO_99; fg++) { |
1598 | zspage = list_first_entry_or_null(&class->fullness_list[fg], | |
1599 | struct zspage, list); | |
1600 | if (zspage) { | |
67eaedc1 | 1601 | remove_zspage(class, zspage); |
4c7ac972 SS |
1602 | return zspage; |
1603 | } | |
3783689a MK |
1604 | } |
1605 | ||
4c7ac972 SS |
1606 | return zspage; |
1607 | } | |
1608 | ||
1609 | static struct zspage *isolate_dst_zspage(struct size_class *class) | |
1610 | { | |
1611 | struct zspage *zspage; | |
1612 | int fg; | |
1613 | ||
1614 | for (fg = ZS_INUSE_RATIO_99; fg >= ZS_INUSE_RATIO_10; fg--) { | |
1615 | zspage = list_first_entry_or_null(&class->fullness_list[fg], | |
1616 | struct zspage, list); | |
3783689a | 1617 | if (zspage) { |
67eaedc1 | 1618 | remove_zspage(class, zspage); |
3783689a | 1619 | return zspage; |
312fcae2 MK |
1620 | } |
1621 | } | |
1622 | ||
3783689a | 1623 | return zspage; |
312fcae2 MK |
1624 | } |
1625 | ||
860c707d | 1626 | /* |
3783689a | 1627 | * putback_zspage - add @zspage into the right class's fullness list |
860c707d | 1628 | * @class: destination class |
3783689a | 1629 | * @zspage: target page |
860c707d | 1630 | * |
4c7ac972 | 1631 | * Return @zspage's fullness status |
860c707d | 1632 | */ |
4c7ac972 | 1633 | static int putback_zspage(struct size_class *class, struct zspage *zspage) |
312fcae2 | 1634 | { |
4c7ac972 | 1635 | int fullness; |
312fcae2 | 1636 | |
3783689a MK |
1637 | fullness = get_fullness_group(class, zspage); |
1638 | insert_zspage(class, zspage, fullness); | |
839373e6 | 1639 | |
860c707d | 1640 | return fullness; |
61989a80 | 1641 | } |
312fcae2 | 1642 | |
b3067742 | 1643 | #ifdef CONFIG_COMPACTION |
4d0a5402 CIK |
1644 | /* |
1645 | * To prevent the zspage from being destroyed during migration, | |
1646 | * zspage freeing must hold the locks of all pages in the zspage. | |
1647 | */ | |
1648 | static void lock_zspage(struct zspage *zspage) | |
1649 | { | |
c1b3bb73 | 1650 | struct zpdesc *curr_zpdesc, *zpdesc; |
4d0a5402 | 1651 | |
2505a981 SA |
1652 | /* |
1653 | * Pages we haven't locked yet can be migrated off the list while we're | |
1654 | * trying to lock them, so we need to be careful and only attempt to | |
e27af3f9 | 1655 | * lock each page under zspage_read_lock(). Otherwise, the page we lock |
2505a981 SA |
1656 | * may no longer belong to the zspage. This means that we may wait for |
1657 | * the wrong page to unlock, so we must take a reference to the page | |
e27af3f9 | 1658 | * prior to waiting for it to unlock outside zspage_read_lock(). |
2505a981 SA |
1659 | */ |
1660 | while (1) { | |
e27af3f9 | 1661 | zspage_read_lock(zspage); |
c1b3bb73 AS |
1662 | zpdesc = get_first_zpdesc(zspage); |
1663 | if (zpdesc_trylock(zpdesc)) | |
2505a981 | 1664 | break; |
c1b3bb73 | 1665 | zpdesc_get(zpdesc); |
e27af3f9 | 1666 | zspage_read_unlock(zspage); |
c1b3bb73 AS |
1667 | zpdesc_wait_locked(zpdesc); |
1668 | zpdesc_put(zpdesc); | |
2505a981 SA |
1669 | } |
1670 | ||
c1b3bb73 AS |
1671 | curr_zpdesc = zpdesc; |
1672 | while ((zpdesc = get_next_zpdesc(curr_zpdesc))) { | |
1673 | if (zpdesc_trylock(zpdesc)) { | |
1674 | curr_zpdesc = zpdesc; | |
2505a981 | 1675 | } else { |
c1b3bb73 | 1676 | zpdesc_get(zpdesc); |
e27af3f9 | 1677 | zspage_read_unlock(zspage); |
c1b3bb73 AS |
1678 | zpdesc_wait_locked(zpdesc); |
1679 | zpdesc_put(zpdesc); | |
e27af3f9 | 1680 | zspage_read_lock(zspage); |
2505a981 SA |
1681 | } |
1682 | } | |
e27af3f9 | 1683 | zspage_read_unlock(zspage); |
4d0a5402 | 1684 | } |
b3067742 | 1685 | #endif /* CONFIG_COMPACTION */ |
4d0a5402 | 1686 | |
568b567f | 1687 | #ifdef CONFIG_COMPACTION |
48b4800a | 1688 | |
68f2736a MWO |
1689 | static const struct movable_operations zsmalloc_mops; |
1690 | ||
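/*
 * Rebuild the zspage's page chain with @newzpdesc standing in for
 * @oldzpdesc, carrying over the first-object offset (and, for huge
 * classes, the stored handle) so object lookups keep working on the
 * replacement page.
 */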
48b4800a | 1691 | static void replace_sub_page(struct size_class *class, struct zspage *zspage, |
7d2e1a69 | 1692 | struct zpdesc *newzpdesc, struct zpdesc *oldzpdesc) |
48b4800a | 1693 | { |
7d2e1a69 AS |
1694 | struct zpdesc *zpdesc; |
1695 | struct zpdesc *zpdescs[ZS_MAX_PAGES_PER_ZSPAGE] = {NULL, }; | |
1696 | unsigned int first_obj_offset; | |
48b4800a MK |
1697 | int idx = 0; |
1698 | ||
7d2e1a69 | 1699 | zpdesc = get_first_zpdesc(zspage); |
48b4800a | 1700 | do { |
7d2e1a69 AS |
1701 | if (zpdesc == oldzpdesc) |
1702 | zpdescs[idx] = newzpdesc; | |
48b4800a | 1703 | else |
7d2e1a69 | 1704 | zpdescs[idx] = zpdesc; |
48b4800a | 1705 | idx++; |
7d2e1a69 | 1706 | } while ((zpdesc = get_next_zpdesc(zpdesc)) != NULL); |
48b4800a | 1707 | |
7d2e1a69 | 1708 | create_page_chain(class, zspage, zpdescs); |
fc5eec0d AS |
1709 | first_obj_offset = get_first_obj_offset(oldzpdesc); |
1710 | set_first_obj_offset(newzpdesc, first_obj_offset); | |
a41ec880 | 1711 | if (unlikely(ZsHugePage(zspage))) |
7d2e1a69 AS |
1712 | newzpdesc->handle = oldzpdesc->handle; |
1713 | __zpdesc_set_movable(newzpdesc, &zsmalloc_mops); | |
48b4800a MK |
1714 | } |
1715 | ||
4d0a5402 | 1716 | static bool zs_page_isolate(struct page *page, isolate_mode_t mode) |
48b4800a | 1717 | { |
48b4800a MK |
1718 | /* |
1719 | * Page is locked so zspage couldn't be destroyed. For detail, look at | |
1720 | * lock_zspage in free_zspage. | |
1721 | */ | |
48b4800a MK |
1722 | VM_BUG_ON_PAGE(PageIsolated(page), page); |
1723 | ||
48b4800a MK |
1724 | return true; |
1725 | } | |
1726 | ||
68f2736a MWO |
1727 | static int zs_page_migrate(struct page *newpage, struct page *page, |
1728 | enum migrate_mode mode) | |
48b4800a MK |
1729 | { |
1730 | struct zs_pool *pool; | |
1731 | struct size_class *class; | |
48b4800a | 1732 | struct zspage *zspage; |
b5c1d8b5 | 1733 | struct zpdesc *dummy; |
68721300 HY |
1734 | struct zpdesc *newzpdesc = page_zpdesc(newpage); |
1735 | struct zpdesc *zpdesc = page_zpdesc(page); | |
48b4800a | 1736 | void *s_addr, *d_addr, *addr; |
671f2fa8 | 1737 | unsigned int offset; |
3ae92ac2 | 1738 | unsigned long handle; |
48b4800a MK |
1739 | unsigned long old_obj, new_obj; |
1740 | unsigned int obj_idx; | |
48b4800a | 1741 | |
68721300 | 1742 | VM_BUG_ON_PAGE(!zpdesc_is_isolated(zpdesc), zpdesc_page(zpdesc)); |
48b4800a | 1743 | |
68f2736a | 1744 | /* The page is locked, so this pointer must remain valid */ |
6d0adf4b | 1745 | zspage = get_zspage(zpdesc); |
68f2736a | 1746 | pool = zspage->pool; |
b475d42d MK |
1747 | |
1748 | /* | |
64bd0197 | 1749 | * The pool->lock protects the race between zspage migration |
b475d42d MK |
1750 | * and zs_free. |
1751 | */ | |
0d6fa44e | 1752 | write_lock(&pool->lock); |
67f1c9cd | 1753 | class = zspage_class(pool, zspage); |
48b4800a | 1754 | |
64bd0197 CZ |
1755 | /* |
1756 | * The class lock protects zpage (object) alloc/free within the zspage. | |
1757 | */ | |
1758 | spin_lock(&class->lock); | |
07864f1a | 1759 | /* the zspage write_lock protects zpage access via zs_obj_read/write() */ |
e27af3f9 SS |
1760 | if (!zspage_write_trylock(zspage)) { |
1761 | spin_unlock(&class->lock); | |
1762 | write_unlock(&pool->lock); | |
1763 | return -EINVAL; | |
1764 | } | |
1765 | ||
1766 | /* We're committed, tell the world that this is a Zsmalloc page. */ | |
1767 | __zpdesc_set_zsmalloc(newzpdesc); | |
48b4800a | 1768 | |
fc5eec0d | 1769 | offset = get_first_obj_offset(zpdesc); |
68721300 | 1770 | s_addr = kmap_local_zpdesc(zpdesc); |
48b4800a MK |
1771 | |
1772 | /* | |
1773 | * At this point no user can access any object in the zspage, so it is safe to copy. | |
1774 | */ | |
68721300 | 1775 | d_addr = kmap_local_zpdesc(newzpdesc); |
afb2d666 | 1776 | copy_page(d_addr, s_addr); |
91d0ec83 | 1777 | kunmap_local(d_addr); |
48b4800a | 1778 | |
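/*
 * Fix up every live handle that pointed into the old page: decode the
 * object index from the old location, re-encode it against newzpdesc,
 * and store the result back via record_obj().
 */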
b475d42d | 1779 | for (addr = s_addr + offset; addr < s_addr + PAGE_SIZE; |
48b4800a | 1780 | addr += class->size) { |
68721300 | 1781 | if (obj_allocated(zpdesc, addr, &handle)) { |
48b4800a MK |
1782 | |
1783 | old_obj = handle_to_obj(handle); | |
1784 | obj_to_location(old_obj, &dummy, &obj_idx); | |
2d57eb9e | 1785 | new_obj = (unsigned long)location_to_obj(newzpdesc, obj_idx); |
48b4800a MK |
1786 | record_obj(handle, new_obj); |
1787 | } | |
1788 | } | |
91d0ec83 | 1789 | kunmap_local(s_addr); |
48b4800a | 1790 | |
68721300 | 1791 | replace_sub_page(class, zspage, newzpdesc, zpdesc); |
b475d42d MK |
1792 | /* |
1793 | * Since the data copy is complete and the new zspage structure is | |
64bd0197 | 1794 | * set up, it's okay to release pool->lock. |
b475d42d | 1795 | */ |
0d6fa44e | 1796 | write_unlock(&pool->lock); |
64bd0197 | 1797 | spin_unlock(&class->lock); |
e27af3f9 | 1798 | zspage_write_unlock(zspage); |
48b4800a | 1799 | |
68721300 HY |
1800 | zpdesc_get(newzpdesc); |
1801 | if (zpdesc_zone(newzpdesc) != zpdesc_zone(zpdesc)) { | |
1802 | zpdesc_dec_zone_page_state(zpdesc); | |
1803 | zpdesc_inc_zone_page_state(newzpdesc); | |
ac8f05da CM |
1804 | } |
1805 | ||
73349afa | 1806 | reset_zpdesc(zpdesc); |
68721300 | 1807 | zpdesc_put(zpdesc); |
48b4800a | 1808 | |
b475d42d | 1809 | return MIGRATEPAGE_SUCCESS; |
48b4800a MK |
1810 | } |
1811 | ||
4d0a5402 | 1812 | static void zs_page_putback(struct page *page) |
48b4800a | 1813 | { |
48b4800a | 1814 | VM_BUG_ON_PAGE(!PageIsolated(page), page); |
48b4800a MK |
1815 | } |
1816 | ||
68f2736a | 1817 | static const struct movable_operations zsmalloc_mops = { |
48b4800a | 1818 | .isolate_page = zs_page_isolate, |
68f2736a | 1819 | .migrate_page = zs_page_migrate, |
48b4800a MK |
1820 | .putback_page = zs_page_putback, |
1821 | }; | |
1822 | ||
48b4800a MK |
1823 | /* |
1824 | * The caller should hold the page_lock of all pages in the zspage. | |
1825 | * Here, we cannot use the zspage metadata. | |
1826 | */ | |
1827 | static void async_free_zspage(struct work_struct *work) | |
1828 | { | |
1829 | int i; | |
1830 | struct size_class *class; | |
48b4800a MK |
1831 | struct zspage *zspage, *tmp; |
1832 | LIST_HEAD(free_pages); | |
1833 | struct zs_pool *pool = container_of(work, struct zs_pool, | |
1834 | free_work); | |
1835 | ||
cf8e0fed | 1836 | for (i = 0; i < ZS_SIZE_CLASSES; i++) { |
48b4800a MK |
1837 | class = pool->size_class[i]; |
1838 | if (class->index != i) | |
1839 | continue; | |
1840 | ||
64bd0197 | 1841 | spin_lock(&class->lock); |
4c7ac972 SS |
1842 | list_splice_init(&class->fullness_list[ZS_INUSE_RATIO_0], |
1843 | &free_pages); | |
64bd0197 | 1844 | spin_unlock(&class->lock); |
48b4800a MK |
1845 | } |
1846 | ||
48b4800a MK |
1847 | list_for_each_entry_safe(zspage, tmp, &free_pages, list) { |
1848 | list_del(&zspage->list); | |
1849 | lock_zspage(zspage); | |
1850 | ||
ce335e07 | 1851 | class = zspage_class(pool, zspage); |
64bd0197 | 1852 | spin_lock(&class->lock); |
791abe1e | 1853 | class_stat_sub(class, ZS_INUSE_RATIO_0, 1); |
33848337 | 1854 | __free_zspage(pool, class, zspage); |
64bd0197 | 1855 | spin_unlock(&class->lock); |
48b4800a MK |
1856 | } |
1857 | } | |
1858 | ||
1859 | static void kick_deferred_free(struct zs_pool *pool) | |
1860 | { | |
1861 | schedule_work(&pool->free_work); | |
1862 | } | |
1863 | ||
68f2736a MWO |
1864 | static void zs_flush_migration(struct zs_pool *pool) |
1865 | { | |
1866 | flush_work(&pool->free_work); | |
1867 | } | |
1868 | ||
48b4800a MK |
1869 | static void init_deferred_free(struct zs_pool *pool) |
1870 | { | |
1871 | INIT_WORK(&pool->free_work, async_free_zspage); | |
1872 | } | |
1873 | ||
1874 | static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage) | |
1875 | { | |
74999813 | 1876 | struct zpdesc *zpdesc = get_first_zpdesc(zspage); |
48b4800a MK |
1877 | |
1878 | do { | |
74999813 AS |
1879 | WARN_ON(!zpdesc_trylock(zpdesc)); |
1880 | __zpdesc_set_movable(zpdesc, &zsmalloc_mops); | |
1881 | zpdesc_unlock(zpdesc); | |
1882 | } while ((zpdesc = get_next_zpdesc(zpdesc)) != NULL); | |
48b4800a | 1883 | } |
68f2736a MWO |
1884 | #else |
1885 | static inline void zs_flush_migration(struct zs_pool *pool) { } | |
48b4800a MK |
1886 | #endif |
1887 | ||
04f05909 SS |
1888 | /* |
1889 | * Based on the number of allocated but unused objects, calculate |
1890 | * and return the number of pages that we can free. |
04f05909 SS |
1892 | */ |
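/*
 * Illustrative math: with 3 objects per zspage, 4 pages per zspage,
 * 100 objects allocated and 40 in use, obj_wasted = (100 - 40) / 3 =
 * 20 zspages' worth of slack, i.e. 20 * 4 = 80 potentially freeable
 * pages.
 */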
1893 | static unsigned long zs_can_compact(struct size_class *class) | |
1894 | { | |
1895 | unsigned long obj_wasted; | |
791abe1e SS |
1896 | unsigned long obj_allocated = class_stat_read(class, ZS_OBJS_ALLOCATED); |
1897 | unsigned long obj_used = class_stat_read(class, ZS_OBJS_INUSE); | |
04f05909 | 1898 | |
44f43e99 SS |
1899 | if (obj_allocated <= obj_used) |
1900 | return 0; | |
04f05909 | 1901 | |
44f43e99 | 1902 | obj_wasted = obj_allocated - obj_used; |
b4fd07a0 | 1903 | obj_wasted /= class->objs_per_zspage; |
04f05909 | 1904 | |
6cbf16b3 | 1905 | return obj_wasted * class->pages_per_zspage; |
04f05909 SS |
1906 | } |
1907 | ||
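/*
 * Core compaction loop: repeatedly pair a sparse source zspage with a
 * dense destination and migrate objects between them. The locks are
 * dropped and retaken whenever the destination fills up or pool->lock
 * becomes contended, so allocation and free paths are not starved.
 */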
23959281 RY |
1908 | static unsigned long __zs_compact(struct zs_pool *pool, |
1909 | struct size_class *class) | |
312fcae2 | 1910 | { |
5a845e9f | 1911 | struct zspage *src_zspage = NULL; |
3783689a | 1912 | struct zspage *dst_zspage = NULL; |
23959281 | 1913 | unsigned long pages_freed = 0; |
312fcae2 | 1914 | |
c0547d0b NP |
1915 | /* |
1916 | * Protect the race between zspage migration and zs_free, |
1917 | * as well as zspage allocation/free. |
1918 | */ | |
0d6fa44e | 1919 | write_lock(&pool->lock); |
64bd0197 | 1920 | spin_lock(&class->lock); |
5a845e9f SS |
1921 | while (zs_can_compact(class)) { |
1922 | int fg; | |
312fcae2 | 1923 | |
5a845e9f SS |
1924 | if (!dst_zspage) { |
1925 | dst_zspage = isolate_dst_zspage(class); | |
1926 | if (!dst_zspage) | |
1927 | break; | |
5a845e9f SS |
1928 | } |
1929 | ||
1930 | src_zspage = isolate_src_zspage(class); | |
1931 | if (!src_zspage) | |
04f05909 SS |
1932 | break; |
1933 | ||
e27af3f9 SS |
1934 | if (!zspage_write_trylock(src_zspage)) |
1935 | break; | |
1936 | ||
ada5caed | 1937 | migrate_zspage(pool, src_zspage, dst_zspage); |
e27af3f9 | 1938 | zspage_write_unlock(src_zspage); |
312fcae2 | 1939 | |
59def443 | 1940 | fg = putback_zspage(class, src_zspage); |
5a845e9f SS |
1941 | if (fg == ZS_INUSE_RATIO_0) { |
1942 | free_zspage(pool, class, src_zspage); | |
1943 | pages_freed += class->pages_per_zspage; | |
5a845e9f | 1944 | } |
f7ddb612 | 1945 | src_zspage = NULL; |
312fcae2 | 1946 | |
5a845e9f | 1947 | if (get_fullness_group(class, dst_zspage) == ZS_INUSE_RATIO_100 |
0d6fa44e | 1948 | || rwlock_is_contended(&pool->lock)) { |
4aa409ca | 1949 | putback_zspage(class, dst_zspage); |
b475d42d | 1950 | dst_zspage = NULL; |
b475d42d | 1951 | |
64bd0197 | 1952 | spin_unlock(&class->lock); |
0d6fa44e | 1953 | write_unlock(&pool->lock); |
5a845e9f | 1954 | cond_resched(); |
0d6fa44e | 1955 | write_lock(&pool->lock); |
64bd0197 | 1956 | spin_lock(&class->lock); |
5a845e9f | 1957 | } |
312fcae2 MK |
1958 | } |
1959 | ||
59def443 | 1960 | if (src_zspage) |
4aa409ca | 1961 | putback_zspage(class, src_zspage); |
312fcae2 | 1962 | |
59def443 | 1963 | if (dst_zspage) |
5a845e9f | 1964 | putback_zspage(class, dst_zspage); |
59def443 | 1965 | |
64bd0197 | 1966 | spin_unlock(&class->lock); |
0d6fa44e | 1967 | write_unlock(&pool->lock); |
23959281 RY |
1968 | |
1969 | return pages_freed; | |
312fcae2 MK |
1970 | } |
1971 | ||
1972 | unsigned long zs_compact(struct zs_pool *pool) | |
1973 | { | |
1974 | int i; | |
312fcae2 | 1975 | struct size_class *class; |
23959281 | 1976 | unsigned long pages_freed = 0; |
312fcae2 | 1977 | |
d2658f20 | 1978 | /* |
0d6fa44e | 1979 | * Pool compaction is performed under pool->lock so it is basically |
d2658f20 | 1980 | * single-threaded. Having more than one thread in __zs_compact() |
0d6fa44e SS |
1981 | * will increase pool->lock contention, which will impact other |
1982 | * zsmalloc operations that need pool->lock. | |
d2658f20 SS |
1983 | */ |
1984 | if (atomic_xchg(&pool->compaction_in_progress, 1)) | |
1985 | return 0; | |
1986 | ||
cf8e0fed | 1987 | for (i = ZS_SIZE_CLASSES - 1; i >= 0; i--) { |
312fcae2 | 1988 | class = pool->size_class[i]; |
312fcae2 MK |
1989 | if (class->index != i) |
1990 | continue; | |
23959281 | 1991 | pages_freed += __zs_compact(pool, class); |
312fcae2 | 1992 | } |
23959281 | 1993 | atomic_long_add(pages_freed, &pool->stats.pages_compacted); |
d2658f20 | 1994 | atomic_set(&pool->compaction_in_progress, 0); |
312fcae2 | 1995 | |
23959281 | 1996 | return pages_freed; |
312fcae2 MK |
1997 | } |
1998 | EXPORT_SYMBOL_GPL(zs_compact); | |
61989a80 | 1999 | |
7d3f3938 SS |
2000 | void zs_pool_stats(struct zs_pool *pool, struct zs_pool_stats *stats) |
2001 | { | |
2002 | memcpy(stats, &pool->stats, sizeof(struct zs_pool_stats)); | |
2003 | } | |
2004 | EXPORT_SYMBOL_GPL(zs_pool_stats); | |
2005 | ||
ab9d306d SS |
2006 | static unsigned long zs_shrinker_scan(struct shrinker *shrinker, |
2007 | struct shrink_control *sc) | |
2008 | { | |
2009 | unsigned long pages_freed; | |
c19b548b | 2010 | struct zs_pool *pool = shrinker->private_data; |
ab9d306d | 2011 | |
ab9d306d SS |
2012 | /* |
2013 | * Compact the classes and calculate the compaction delta. |
2014 | * This can run concurrently with a manually (user-)triggered |
2015 | * compaction. |
2016 | */ | |
23959281 | 2017 | pages_freed = zs_compact(pool); |
ab9d306d SS |
2018 | |
2019 | return pages_freed ? pages_freed : SHRINK_STOP; | |
2020 | } | |
2021 | ||
2022 | static unsigned long zs_shrinker_count(struct shrinker *shrinker, | |
2023 | struct shrink_control *sc) | |
2024 | { | |
2025 | int i; | |
2026 | struct size_class *class; | |
2027 | unsigned long pages_to_free = 0; | |
c19b548b | 2028 | struct zs_pool *pool = shrinker->private_data; |
ab9d306d | 2029 | |
cf8e0fed | 2030 | for (i = ZS_SIZE_CLASSES - 1; i >= 0; i--) { |
ab9d306d | 2031 | class = pool->size_class[i]; |
ab9d306d SS |
2032 | if (class->index != i) |
2033 | continue; | |
2034 | ||
ab9d306d | 2035 | pages_to_free += zs_can_compact(class); |
ab9d306d SS |
2036 | } |
2037 | ||
2038 | return pages_to_free; | |
2039 | } | |
2040 | ||
2041 | static void zs_unregister_shrinker(struct zs_pool *pool) | |
2042 | { | |
c19b548b | 2043 | shrinker_free(pool->shrinker); |
ab9d306d SS |
2044 | } |
2045 | ||
2046 | static int zs_register_shrinker(struct zs_pool *pool) | |
2047 | { | |
c19b548b QZ |
2048 | pool->shrinker = shrinker_alloc(0, "mm-zspool:%s", pool->name); |
2049 | if (!pool->shrinker) | |
2050 | return -ENOMEM; | |
2051 | ||
2052 | pool->shrinker->scan_objects = zs_shrinker_scan; | |
2053 | pool->shrinker->count_objects = zs_shrinker_count; | |
2054 | pool->shrinker->batch = 0; | |
2055 | pool->shrinker->private_data = pool; | |
ab9d306d | 2056 | |
c19b548b QZ |
2057 | shrinker_register(pool->shrinker); |
2058 | ||
2059 | return 0; | |
ab9d306d SS |
2060 | } |
2061 | ||
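/*
 * Pick the chain length (pages per zspage) that wastes the least space
 * for a given class size. Worked example (assuming 4 KiB pages and
 * ZS_MAX_PAGES_PER_ZSPAGE >= 4) for a hypothetical 3264-byte class:
 * one page wastes 4096 % 3264 = 832 bytes, two pages waste 1664, three
 * waste 2496, but four pages waste only 16384 % 3264 = 64, so a chain
 * size of 4 is chosen.
 */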
6260ae35 SS |
2062 | static int calculate_zspage_chain_size(int class_size) |
2063 | { | |
2064 | int i, min_waste = INT_MAX; | |
2065 | int chain_size = 1; | |
2066 | ||
e1d1f354 SS |
2067 | if (is_power_of_2(class_size)) |
2068 | return chain_size; | |
2069 | ||
6260ae35 SS |
2070 | for (i = 1; i <= ZS_MAX_PAGES_PER_ZSPAGE; i++) { |
2071 | int waste; | |
2072 | ||
2073 | waste = (i * PAGE_SIZE) % class_size; | |
2074 | if (waste < min_waste) { | |
2075 | min_waste = waste; | |
2076 | chain_size = i; | |
2077 | } | |
2078 | } | |
2079 | ||
2080 | return chain_size; | |
2081 | } | |
2082 | ||
00a61d86 | 2083 | /** |
66cdef66 | 2084 | * zs_create_pool - Creates an allocation pool to work from. |
fd854463 | 2085 | * @name: name of the pool to be created |
166cfda7 | 2086 | * |
66cdef66 GM |
2087 | * This function must be called before anything else when using |
2088 | * the zsmalloc allocator. | |
166cfda7 | 2089 | * |
66cdef66 GM |
2090 | * On success, a pointer to the newly created pool is returned, |
2091 | * otherwise NULL. | |
396b7fd6 | 2092 | */ |
d0d8da2d | 2093 | struct zs_pool *zs_create_pool(const char *name) |
61989a80 | 2094 | { |
66cdef66 GM |
2095 | int i; |
2096 | struct zs_pool *pool; | |
2097 | struct size_class *prev_class = NULL; | |
61989a80 | 2098 | |
66cdef66 GM |
2099 | pool = kzalloc(sizeof(*pool), GFP_KERNEL); |
2100 | if (!pool) | |
2101 | return NULL; | |
61989a80 | 2102 | |
48b4800a | 2103 | init_deferred_free(pool); |
0d6fa44e | 2104 | rwlock_init(&pool->lock); |
d2658f20 | 2105 | atomic_set(&pool->compaction_in_progress, 0); |
61989a80 | 2106 | |
2e40e163 MK |
2107 | pool->name = kstrdup(name, GFP_KERNEL); |
2108 | if (!pool->name) | |
2109 | goto err; | |
2110 | ||
3783689a | 2111 | if (create_cache(pool)) |
2e40e163 MK |
2112 | goto err; |
2113 | ||
c60369f0 | 2114 | /* |
399d8eeb | 2115 | * Iterate in reverse order, because the size of the size_class we |
66cdef66 | 2116 | * want to merge into must be larger than or equal to the current size. |
c60369f0 | 2117 | */ |
cf8e0fed | 2118 | for (i = ZS_SIZE_CLASSES - 1; i >= 0; i--) { |
66cdef66 GM |
2119 | int size; |
2120 | int pages_per_zspage; | |
64d90465 | 2121 | int objs_per_zspage; |
66cdef66 | 2122 | struct size_class *class; |
4c7ac972 | 2123 | int fullness; |
c60369f0 | 2124 | |
66cdef66 GM |
2125 | size = ZS_MIN_ALLOC_SIZE + i * ZS_SIZE_CLASS_DELTA; |
2126 | if (size > ZS_MAX_ALLOC_SIZE) | |
2127 | size = ZS_MAX_ALLOC_SIZE; | |
6260ae35 | 2128 | pages_per_zspage = calculate_zspage_chain_size(size); |
64d90465 | 2129 | objs_per_zspage = pages_per_zspage * PAGE_SIZE / size; |
61989a80 | 2130 | |
010b495e SS |
2131 | /* |
2132 | * We iterate from biggest down to smallest classes, | |
2133 | * so huge_class_size holds the size of the first huge | |
2134 | * class. Any object bigger than or equal to that will | |
2135 | * end up in the huge class. |
2136 | */ | |
2137 | if (pages_per_zspage != 1 && objs_per_zspage != 1 && | |
2138 | !huge_class_size) { | |
2139 | huge_class_size = size; | |
2140 | /* | |
2141 | * The object uses ZS_HANDLE_SIZE bytes to store the | |
2142 | * handle. We need to subtract it, because zs_malloc() | |
2143 | * unconditionally adds handle size before it performs | |
2144 | * size class search - so object may be smaller than | |
2145 | * huge class size, yet it still can end up in the huge | |
2146 | * class because it grows by ZS_HANDLE_SIZE extra bytes | |
2147 | * right before class lookup. | |
2148 | */ | |
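/*
 * Illustrative example (assuming 64-bit, so ZS_HANDLE_SIZE == 8): with
 * size == 3264 here, huge_class_size becomes 3257, since a 3257-byte
 * request grows to 3265 bytes once the handle is added and can no
 * longer be served by a 3264-byte slot, while a 3256-byte request
 * still can.
 */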
2149 | huge_class_size -= (ZS_HANDLE_SIZE - 1); | |
2150 | } | |
2151 | ||
66cdef66 GM |
2152 | /* |
2153 | * size_class is used for normal zsmalloc operations such | |
2154 | * as alloc/free for that size. Although it is natural that we | |
2155 | * have one size_class for each size, we can get better memory | |
2156 | * utilization if we use one size_class for many different sizes | |
2157 | * whose size_classes have the same characteristics. So, we make | |
2158 | * a size_class point to the previous size_class if possible. | |
2160 | */ | |
2161 | if (prev_class) { | |
64d90465 | 2162 | if (can_merge(prev_class, pages_per_zspage, objs_per_zspage)) { |
66cdef66 GM |
2163 | pool->size_class[i] = prev_class; |
2164 | continue; | |
2165 | } | |
2166 | } | |
2167 | ||
2168 | class = kzalloc(sizeof(struct size_class), GFP_KERNEL); | |
2169 | if (!class) | |
2170 | goto err; | |
2171 | ||
2172 | class->size = size; | |
2173 | class->index = i; | |
2174 | class->pages_per_zspage = pages_per_zspage; | |
64d90465 | 2175 | class->objs_per_zspage = objs_per_zspage; |
64bd0197 | 2176 | spin_lock_init(&class->lock); |
66cdef66 | 2177 | pool->size_class[i] = class; |
4c7ac972 SS |
2178 | |
2179 | fullness = ZS_INUSE_RATIO_0; | |
2180 | while (fullness < NR_FULLNESS_GROUPS) { | |
3783689a | 2181 | INIT_LIST_HEAD(&class->fullness_list[fullness]); |
4c7ac972 SS |
2182 | fullness++; |
2183 | } | |
66cdef66 GM |
2184 | |
2185 | prev_class = class; | |
61989a80 NG |
2186 | } |
2187 | ||
d34f6157 DS |
2188 | /* debug only, don't abort if it fails */ |
2189 | zs_pool_stat_create(pool, name); | |
0f050d99 | 2190 | |
ab9d306d | 2191 | /* |
93144ca3 AK |
2192 | * Not critical, since the shrinker is only used to trigger internal |
2193 | * defragmentation of the pool, which is an optional feature. If |
2194 | * registration fails we can still use the pool normally and the user |
2195 | * can trigger compaction manually. Thus, ignore the return code. |
ab9d306d | 2196 | */ |
93144ca3 AK |
2197 | zs_register_shrinker(pool); |
2198 | ||
66cdef66 GM |
2199 | return pool; |
2200 | ||
2201 | err: | |
2202 | zs_destroy_pool(pool); | |
2203 | return NULL; | |
61989a80 | 2204 | } |
66cdef66 | 2205 | EXPORT_SYMBOL_GPL(zs_create_pool); |
61989a80 | 2206 | |
66cdef66 | 2207 | void zs_destroy_pool(struct zs_pool *pool) |
61989a80 | 2208 | { |
66cdef66 | 2209 | int i; |
61989a80 | 2210 | |
ab9d306d | 2211 | zs_unregister_shrinker(pool); |
68f2736a | 2212 | zs_flush_migration(pool); |
0f050d99 GM |
2213 | zs_pool_stat_destroy(pool); |
2214 | ||
cf8e0fed | 2215 | for (i = 0; i < ZS_SIZE_CLASSES; i++) { |
66cdef66 GM |
2216 | int fg; |
2217 | struct size_class *class = pool->size_class[i]; | |
61989a80 | 2218 | |
4249a05f AR |
2219 | if (!class) |
2220 | continue; | |
2221 | ||
66cdef66 GM |
2222 | if (class->index != i) |
2223 | continue; | |
61989a80 | 2224 | |
4c7ac972 SS |
2225 | for (fg = ZS_INUSE_RATIO_0; fg < NR_FULLNESS_GROUPS; fg++) { |
2226 | if (list_empty(&class->fullness_list[fg])) | |
2227 | continue; | |
2228 | ||
2229 | pr_err("Class-%d fullness group %d is not empty\n", | |
2230 | class->size, fg); | |
66cdef66 GM |
2231 | } |
2232 | kfree(class); | |
2233 | } | |
f553646a | 2234 | |
3783689a | 2235 | destroy_cache(pool); |
0f050d99 | 2236 | kfree(pool->name); |
66cdef66 GM |
2237 | kfree(pool); |
2238 | } | |
2239 | EXPORT_SYMBOL_GPL(zs_destroy_pool); | |
b7418510 | 2240 | |
66cdef66 GM |
2241 | static int __init zs_init(void) |
2242 | { | |
66cdef66 GM |
2243 | #ifdef CONFIG_ZPOOL |
2244 | zpool_register_driver(&zs_zpool_driver); | |
2245 | #endif | |
4abaac9b | 2246 | zs_stat_init(); |
66cdef66 | 2247 | return 0; |
61989a80 | 2248 | } |
61989a80 | 2249 | |
66cdef66 | 2250 | static void __exit zs_exit(void) |
61989a80 | 2251 | { |
66cdef66 GM |
2252 | #ifdef CONFIG_ZPOOL |
2253 | zpool_unregister_driver(&zs_zpool_driver); | |
2254 | #endif | |
0f050d99 | 2255 | zs_stat_exit(); |
61989a80 | 2256 | } |
069f101f BH |
2257 | |
2258 | module_init(zs_init); | |
2259 | module_exit(zs_exit); | |
2260 | ||
2261 | MODULE_LICENSE("Dual BSD/GPL"); | |
2262 | MODULE_AUTHOR("Nitin Gupta <ngupta@vflare.org>"); | |
a831896a | 2263 | MODULE_DESCRIPTION("zsmalloc memory allocator"); |