1 /*
2  * Compressed RAM block device
3  *
4  * Copyright (C) 2008, 2009, 2010  Nitin Gupta
5  *               2012, 2013 Minchan Kim
6  *
7  * This code is released using a dual license strategy: BSD/GPL
8  * You can choose the licence that better fits your requirements.
9  *
10  * Released under the terms of 3-clause BSD License
11  * Released under the terms of GNU General Public License Version 2.0
12  *
13  */
14
15 #define KMSG_COMPONENT "zram"
16 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
17
18 #include <linux/module.h>
19 #include <linux/kernel.h>
20 #include <linux/bio.h>
21 #include <linux/bitops.h>
22 #include <linux/blkdev.h>
23 #include <linux/buffer_head.h>
24 #include <linux/device.h>
25 #include <linux/highmem.h>
26 #include <linux/slab.h>
27 #include <linux/backing-dev.h>
28 #include <linux/string.h>
29 #include <linux/vmalloc.h>
30 #include <linux/err.h>
31 #include <linux/idr.h>
32 #include <linux/sysfs.h>
33 #include <linux/debugfs.h>
34 #include <linux/cpuhotplug.h>
35 #include <linux/part_stat.h>
36
37 #include "zram_drv.h"
38
39 static DEFINE_IDR(zram_index_idr);
40 /* the idr index must be protected by zram_index_mutex */
41 static DEFINE_MUTEX(zram_index_mutex);
42
43 static int zram_major;
44 static const char *default_compressor = CONFIG_ZRAM_DEF_COMP;
45
46 /* Module params (documentation at end) */
47 static unsigned int num_devices = 1;
48 /*
49  * Pages that compress to a size equal to or greater than this are
50  * stored uncompressed in memory.
51  */
52 static size_t huge_class_size;
53
54 static const struct block_device_operations zram_devops;
55
56 static void zram_free_page(struct zram *zram, size_t index);
57 static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
58                                 u32 index, int offset, struct bio *bio);
59
60
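/*
 * Each table entry embeds its own lock: the ZRAM_LOCK bit of the flags
 * word is used as a bit spinlock serializing access to that slot.
 */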
61 static int zram_slot_trylock(struct zram *zram, u32 index)
62 {
63         return bit_spin_trylock(ZRAM_LOCK, &zram->table[index].flags);
64 }
65
66 static void zram_slot_lock(struct zram *zram, u32 index)
67 {
68         bit_spin_lock(ZRAM_LOCK, &zram->table[index].flags);
69 }
70
71 static void zram_slot_unlock(struct zram *zram, u32 index)
72 {
73         bit_spin_unlock(ZRAM_LOCK, &zram->table[index].flags);
74 }
75
76 static inline bool init_done(struct zram *zram)
77 {
78         return zram->disksize;
79 }
80
81 static inline struct zram *dev_to_zram(struct device *dev)
82 {
83         return (struct zram *)dev_to_disk(dev)->private_data;
84 }
85
86 static unsigned long zram_get_handle(struct zram *zram, u32 index)
87 {
88         return zram->table[index].handle;
89 }
90
91 static void zram_set_handle(struct zram *zram, u32 index, unsigned long handle)
92 {
93         zram->table[index].handle = handle;
94 }
95
96 /* flag operations require the table entry's bit_spin_lock() to be held */
97 static bool zram_test_flag(struct zram *zram, u32 index,
98                         enum zram_pageflags flag)
99 {
100         return zram->table[index].flags & BIT(flag);
101 }
102
103 static void zram_set_flag(struct zram *zram, u32 index,
104                         enum zram_pageflags flag)
105 {
106         zram->table[index].flags |= BIT(flag);
107 }
108
109 static void zram_clear_flag(struct zram *zram, u32 index,
110                         enum zram_pageflags flag)
111 {
112         zram->table[index].flags &= ~BIT(flag);
113 }
114
115 static inline void zram_set_element(struct zram *zram, u32 index,
116                         unsigned long element)
117 {
118         zram->table[index].element = element;
119 }
120
121 static unsigned long zram_get_element(struct zram *zram, u32 index)
122 {
123         return zram->table[index].element;
124 }
125
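/*
 * The low ZRAM_FLAG_SHIFT bits of table[index].flags hold the size of
 * the compressed object; the zram_pageflags live in the bits above.
 */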
126 static size_t zram_get_obj_size(struct zram *zram, u32 index)
127 {
128         return zram->table[index].flags & (BIT(ZRAM_FLAG_SHIFT) - 1);
129 }
130
131 static void zram_set_obj_size(struct zram *zram,
132                                         u32 index, size_t size)
133 {
134         unsigned long flags = zram->table[index].flags >> ZRAM_FLAG_SHIFT;
135
136         zram->table[index].flags = (flags << ZRAM_FLAG_SHIFT) | size;
137 }
138
139 static inline bool zram_allocated(struct zram *zram, u32 index)
140 {
141         return zram_get_obj_size(zram, index) ||
142                         zram_test_flag(zram, index, ZRAM_SAME) ||
143                         zram_test_flag(zram, index, ZRAM_WB);
144 }
145
146 #if PAGE_SIZE != 4096
147 static inline bool is_partial_io(struct bio_vec *bvec)
148 {
149         return bvec->bv_len != PAGE_SIZE;
150 }
151 #else
152 static inline bool is_partial_io(struct bio_vec *bvec)
153 {
154         return false;
155 }
156 #endif
157
158 /*
159  * Check if request is within bounds and aligned on zram logical blocks.
160  */
161 static inline bool valid_io_request(struct zram *zram,
162                 sector_t start, unsigned int size)
163 {
164         u64 end, bound;
165
166         /* unaligned request */
167         if (unlikely(start & (ZRAM_SECTOR_PER_LOGICAL_BLOCK - 1)))
168                 return false;
169         if (unlikely(size & (ZRAM_LOGICAL_BLOCK_SIZE - 1)))
170                 return false;
171
172         end = start + (size >> SECTOR_SHIFT);
173         bound = zram->disksize >> SECTOR_SHIFT;
174         /* out of range */
175         if (unlikely(start >= bound || end > bound || start > end))
176                 return false;
177
178         /* I/O request is valid */
179         return true;
180 }
181
182 static void update_position(u32 *index, int *offset, struct bio_vec *bvec)
183 {
184         *index  += (*offset + bvec->bv_len) / PAGE_SIZE;
185         *offset = (*offset + bvec->bv_len) % PAGE_SIZE;
186 }
187
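/*
 * Maintain a running maximum of pages used by the pool, using a
 * lock-free cmpxchg loop so concurrent updaters never lose a peak.
 */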
188 static inline void update_used_max(struct zram *zram,
189                                         const unsigned long pages)
190 {
191         unsigned long old_max, cur_max;
192
193         old_max = atomic_long_read(&zram->stats.max_used_pages);
194
195         do {
196                 cur_max = old_max;
197                 if (pages > cur_max)
198                         old_max = atomic_long_cmpxchg(
199                                 &zram->stats.max_used_pages, cur_max, pages);
200         } while (old_max != cur_max);
201 }
202
203 static inline void zram_fill_page(void *ptr, unsigned long len,
204                                         unsigned long value)
205 {
206         WARN_ON_ONCE(!IS_ALIGNED(len, sizeof(unsigned long)));
207         memset_l(ptr, value, len / sizeof(unsigned long));
208 }
209
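/*
 * Detect a "same filled" page: one whose contents are a single machine
 * word repeated across the whole page (a zero-filled page being the
 * common case). Such pages are not compressed; only the repeated word
 * is stored, in the slot's element field, and the page is reconstructed
 * on read via zram_fill_page().
 */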
210 static bool page_same_filled(void *ptr, unsigned long *element)
211 {
212         unsigned long *page;
213         unsigned long val;
214         unsigned int pos, last_pos = PAGE_SIZE / sizeof(*page) - 1;
215
216         page = (unsigned long *)ptr;
217         val = page[0];
218
219         if (val != page[last_pos])
220                 return false;
221
222         for (pos = 1; pos < last_pos; pos++) {
223                 if (val != page[pos])
224                         return false;
225         }
226
227         *element = val;
228
229         return true;
230 }
231
232 static ssize_t initstate_show(struct device *dev,
233                 struct device_attribute *attr, char *buf)
234 {
235         u32 val;
236         struct zram *zram = dev_to_zram(dev);
237
238         down_read(&zram->init_lock);
239         val = init_done(zram);
240         up_read(&zram->init_lock);
241
242         return scnprintf(buf, PAGE_SIZE, "%u\n", val);
243 }
244
245 static ssize_t disksize_show(struct device *dev,
246                 struct device_attribute *attr, char *buf)
247 {
248         struct zram *zram = dev_to_zram(dev);
249
250         return scnprintf(buf, PAGE_SIZE, "%llu\n", zram->disksize);
251 }
252
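/*
 * The mem_limit value is parsed with memparse(), so K/M/G suffixes are
 * accepted. A hypothetical example (assuming the device is zram0):
 *
 *	echo 256M > /sys/block/zram0/mem_limit
 *
 * Writing 0 clears limit_pages and thereby removes the limit.
 */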
253 static ssize_t mem_limit_store(struct device *dev,
254                 struct device_attribute *attr, const char *buf, size_t len)
255 {
256         u64 limit;
257         char *tmp;
258         struct zram *zram = dev_to_zram(dev);
259
260         limit = memparse(buf, &tmp);
261         if (buf == tmp) /* no chars parsed, invalid input */
262                 return -EINVAL;
263
264         down_write(&zram->init_lock);
265         zram->limit_pages = PAGE_ALIGN(limit) >> PAGE_SHIFT;
266         up_write(&zram->init_lock);
267
268         return len;
269 }
270
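/*
 * Only "0" is accepted here; writing it resets the max_used_pages
 * watermark to the pool's current size. A hypothetical example
 * (assuming the device is zram0):
 *
 *	echo 0 > /sys/block/zram0/mem_used_max
 */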
271 static ssize_t mem_used_max_store(struct device *dev,
272                 struct device_attribute *attr, const char *buf, size_t len)
273 {
274         int err;
275         unsigned long val;
276         struct zram *zram = dev_to_zram(dev);
277
278         err = kstrtoul(buf, 10, &val);
279         if (err || val != 0)
280                 return -EINVAL;
281
282         down_read(&zram->init_lock);
283         if (init_done(zram)) {
284                 atomic_long_set(&zram->stats.max_used_pages,
285                                 zs_get_total_pages(zram->mem_pool));
286         }
287         up_read(&zram->init_lock);
288
289         return len;
290 }
291
292 /*
293  * Mark all pages that are older than or equal to cutoff as IDLE.
294  * Callers should hold the zram init lock in read mode.
295  */
296 static void mark_idle(struct zram *zram, ktime_t cutoff)
297 {
298         int is_idle = 1;
299         unsigned long nr_pages = zram->disksize >> PAGE_SHIFT;
300         int index;
301
302         for (index = 0; index < nr_pages; index++) {
303                 /*
304                  * Do not mark ZRAM_UNDER_WB slots as ZRAM_IDLE, to close the race.
305                  * See the comment in writeback_store.
306                  */
307                 zram_slot_lock(zram, index);
308                 if (zram_allocated(zram, index) &&
309                                 !zram_test_flag(zram, index, ZRAM_UNDER_WB)) {
310 #ifdef CONFIG_ZRAM_MEMORY_TRACKING
311                         is_idle = !cutoff || ktime_after(cutoff, zram->table[index].ac_time);
312 #endif
313                         if (is_idle)
314                                 zram_set_flag(zram, index, ZRAM_IDLE);
315                 }
316                 zram_slot_unlock(zram, index);
317         }
318 }
319
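/*
 * Accepts "all" to mark every allocated slot idle, or (when
 * CONFIG_ZRAM_MEMORY_TRACKING is enabled) an age in seconds to mark
 * only slots not accessed within that window. A hypothetical example
 * (assuming the device is zram0):
 *
 *	echo all > /sys/block/zram0/idle
 */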
320 static ssize_t idle_store(struct device *dev,
321                 struct device_attribute *attr, const char *buf, size_t len)
322 {
323         struct zram *zram = dev_to_zram(dev);
324         ktime_t cutoff_time = 0;
325         ssize_t rv = -EINVAL;
326
327         if (!sysfs_streq(buf, "all")) {
328                 /*
329                  * If it did not parse as 'all', try to treat it as an integer
330                  * when we have memory tracking enabled.
331                  */
332                 u64 age_sec;
333
334                 if (IS_ENABLED(CONFIG_ZRAM_MEMORY_TRACKING) && !kstrtoull(buf, 0, &age_sec))
335                         cutoff_time = ktime_sub(ktime_get_boottime(),
336                                         ns_to_ktime(age_sec * NSEC_PER_SEC));
337                 else
338                         goto out;
339         }
340
341         down_read(&zram->init_lock);
342         if (!init_done(zram))
343                 goto out_unlock;
344
345         /*
346          * A cutoff_time of 0 marks everything as idle; this is the
347          * "all" behavior.
348          */
349         mark_idle(zram, cutoff_time);
350         rv = len;
351
352 out_unlock:
353         up_read(&zram->init_lock);
354 out:
355         return rv;
356 }
357
358 #ifdef CONFIG_ZRAM_WRITEBACK
359 static ssize_t writeback_limit_enable_store(struct device *dev,
360                 struct device_attribute *attr, const char *buf, size_t len)
361 {
362         struct zram *zram = dev_to_zram(dev);
363         u64 val;
364         ssize_t ret = -EINVAL;
365
366         if (kstrtoull(buf, 10, &val))
367                 return ret;
368
369         down_read(&zram->init_lock);
370         spin_lock(&zram->wb_limit_lock);
371         zram->wb_limit_enable = val;
372         spin_unlock(&zram->wb_limit_lock);
373         up_read(&zram->init_lock);
374         ret = len;
375
376         return ret;
377 }
378
379 static ssize_t writeback_limit_enable_show(struct device *dev,
380                 struct device_attribute *attr, char *buf)
381 {
382         bool val;
383         struct zram *zram = dev_to_zram(dev);
384
385         down_read(&zram->init_lock);
386         spin_lock(&zram->wb_limit_lock);
387         val = zram->wb_limit_enable;
388         spin_unlock(&zram->wb_limit_lock);
389         up_read(&zram->init_lock);
390
391         return scnprintf(buf, PAGE_SIZE, "%d\n", val);
392 }
393
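/*
 * The writeback limit is accounted in 4K units: writeback_store()
 * subtracts 1UL << (PAGE_SHIFT - 12) from bd_wb_limit per page written,
 * and refuses further writeback once the budget reaches zero while
 * writeback_limit_enable is set.
 */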
394 static ssize_t writeback_limit_store(struct device *dev,
395                 struct device_attribute *attr, const char *buf, size_t len)
396 {
397         struct zram *zram = dev_to_zram(dev);
398         u64 val;
399         ssize_t ret = -EINVAL;
400
401         if (kstrtoull(buf, 10, &val))
402                 return ret;
403
404         down_read(&zram->init_lock);
405         spin_lock(&zram->wb_limit_lock);
406         zram->bd_wb_limit = val;
407         spin_unlock(&zram->wb_limit_lock);
408         up_read(&zram->init_lock);
409         ret = len;
410
411         return ret;
412 }
413
414 static ssize_t writeback_limit_show(struct device *dev,
415                 struct device_attribute *attr, char *buf)
416 {
417         u64 val;
418         struct zram *zram = dev_to_zram(dev);
419
420         down_read(&zram->init_lock);
421         spin_lock(&zram->wb_limit_lock);
422         val = zram->bd_wb_limit;
423         spin_unlock(&zram->wb_limit_lock);
424         up_read(&zram->init_lock);
425
426         return scnprintf(buf, PAGE_SIZE, "%llu\n", val);
427 }
428
429 static void reset_bdev(struct zram *zram)
430 {
431         struct block_device *bdev;
432
433         if (!zram->backing_dev)
434                 return;
435
436         bdev = zram->bdev;
437         blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
438         /* hope filp_close() flushes all outstanding I/O */
439         filp_close(zram->backing_dev, NULL);
440         zram->backing_dev = NULL;
441         zram->bdev = NULL;
442         zram->disk->fops = &zram_devops;
443         kvfree(zram->bitmap);
444         zram->bitmap = NULL;
445 }
446
447 static ssize_t backing_dev_show(struct device *dev,
448                 struct device_attribute *attr, char *buf)
449 {
450         struct file *file;
451         struct zram *zram = dev_to_zram(dev);
452         char *p;
453         ssize_t ret;
454
455         down_read(&zram->init_lock);
456         file = zram->backing_dev;
457         if (!file) {
458                 memcpy(buf, "none\n", 5);
459                 up_read(&zram->init_lock);
460                 return 5;
461         }
462
463         p = file_path(file, buf, PAGE_SIZE - 1);
464         if (IS_ERR(p)) {
465                 ret = PTR_ERR(p);
466                 goto out;
467         }
468
469         ret = strlen(p);
470         memmove(buf, p, ret);
471         buf[ret++] = '\n';
472 out:
473         up_read(&zram->init_lock);
474         return ret;
475 }
476
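/*
 * Writing a path to a block device here configures it as the writeback
 * backing store; this must be done before the device is initialized
 * (i.e. before disksize is set). A hypothetical example (assuming the
 * device is zram0 and /dev/sdX1 is a spare partition):
 *
 *	echo /dev/sdX1 > /sys/block/zram0/backing_dev
 */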
477 static ssize_t backing_dev_store(struct device *dev,
478                 struct device_attribute *attr, const char *buf, size_t len)
479 {
480         char *file_name;
481         size_t sz;
482         struct file *backing_dev = NULL;
483         struct inode *inode;
484         struct address_space *mapping;
485         unsigned int bitmap_sz;
486         unsigned long nr_pages, *bitmap = NULL;
487         struct block_device *bdev = NULL;
488         int err;
489         struct zram *zram = dev_to_zram(dev);
490
491         file_name = kmalloc(PATH_MAX, GFP_KERNEL);
492         if (!file_name)
493                 return -ENOMEM;
494
495         down_write(&zram->init_lock);
496         if (init_done(zram)) {
497                 pr_info("Can't setup backing device for initialized device\n");
498                 err = -EBUSY;
499                 goto out;
500         }
501
502         strscpy(file_name, buf, PATH_MAX);
503         /* ignore trailing newline */
504         sz = strlen(file_name);
505         if (sz > 0 && file_name[sz - 1] == '\n')
506                 file_name[sz - 1] = 0x00;
507
508         backing_dev = filp_open(file_name, O_RDWR|O_LARGEFILE, 0);
509         if (IS_ERR(backing_dev)) {
510                 err = PTR_ERR(backing_dev);
511                 backing_dev = NULL;
512                 goto out;
513         }
514
515         mapping = backing_dev->f_mapping;
516         inode = mapping->host;
517
518         /* Only block devices are supported at the moment */
519         if (!S_ISBLK(inode->i_mode)) {
520                 err = -ENOTBLK;
521                 goto out;
522         }
523
524         bdev = blkdev_get_by_dev(inode->i_rdev,
525                         FMODE_READ | FMODE_WRITE | FMODE_EXCL, zram);
526         if (IS_ERR(bdev)) {
527                 err = PTR_ERR(bdev);
528                 bdev = NULL;
529                 goto out;
530         }
531
532         nr_pages = i_size_read(inode) >> PAGE_SHIFT;
533         bitmap_sz = BITS_TO_LONGS(nr_pages) * sizeof(long);
534         bitmap = kvzalloc(bitmap_sz, GFP_KERNEL);
535         if (!bitmap) {
536                 err = -ENOMEM;
537                 goto out;
538         }
539
540         reset_bdev(zram);
541
542         zram->bdev = bdev;
543         zram->backing_dev = backing_dev;
544         zram->bitmap = bitmap;
545         zram->nr_pages = nr_pages;
546         up_write(&zram->init_lock);
547
548         pr_info("setup backing device %s\n", file_name);
549         kfree(file_name);
550
551         return len;
552 out:
553         kvfree(bitmap);
554
555         if (bdev)
556                 blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
557
558         if (backing_dev)
559                 filp_close(backing_dev, NULL);
560
561         up_write(&zram->init_lock);
562
563         kfree(file_name);
564
565         return err;
566 }
567
568 static unsigned long alloc_block_bdev(struct zram *zram)
569 {
570         unsigned long blk_idx = 1;
571 retry:
572         /* skip bit 0 to avoid confusion with zram.handle == 0 */
573         blk_idx = find_next_zero_bit(zram->bitmap, zram->nr_pages, blk_idx);
574         if (blk_idx == zram->nr_pages)
575                 return 0;
576
577         if (test_and_set_bit(blk_idx, zram->bitmap))
578                 goto retry;
579
580         atomic64_inc(&zram->stats.bd_count);
581         return blk_idx;
582 }
583
584 static void free_block_bdev(struct zram *zram, unsigned long blk_idx)
585 {
586         int was_set;
587
588         was_set = test_and_clear_bit(blk_idx, zram->bitmap);
589         WARN_ON_ONCE(!was_set);
590         atomic64_dec(&zram->stats.bd_count);
591 }
592
593 static void zram_page_end_io(struct bio *bio)
594 {
595         struct page *page = bio_first_page_all(bio);
596
597         page_endio(page, op_is_write(bio_op(bio)),
598                         blk_status_to_errno(bio->bi_status));
599         bio_put(bio);
600 }
601
602 /*
603  * Returns 1 if the submission is successful.
604  */
605 static int read_from_bdev_async(struct zram *zram, struct bio_vec *bvec,
606                         unsigned long entry, struct bio *parent)
607 {
608         struct bio *bio;
609
610         bio = bio_alloc(zram->bdev, 1, parent ? parent->bi_opf : REQ_OP_READ,
611                         GFP_NOIO);
612         if (!bio)
613                 return -ENOMEM;
614
615         bio->bi_iter.bi_sector = entry * (PAGE_SIZE >> 9);
616         if (!bio_add_page(bio, bvec->bv_page, bvec->bv_len, bvec->bv_offset)) {
617                 bio_put(bio);
618                 return -EIO;
619         }
620
621         if (!parent)
622                 bio->bi_end_io = zram_page_end_io;
623         else
624                 bio_chain(bio, parent);
625
626         submit_bio(bio);
627         return 1;
628 }
629
630 #define PAGE_WB_SIG "page_index="
631
632 #define PAGE_WRITEBACK 0
633 #define HUGE_WRITEBACK (1<<0)
634 #define IDLE_WRITEBACK (1<<1)
635
636
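/*
 * Accepted values: "idle", "huge", "huge_idle", or "page_index=<n>" to
 * write back a single slot. A hypothetical example (assuming the device
 * is zram0 with a backing_dev configured):
 *
 *	echo idle > /sys/block/zram0/writeback
 */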
637 static ssize_t writeback_store(struct device *dev,
638                 struct device_attribute *attr, const char *buf, size_t len)
639 {
640         struct zram *zram = dev_to_zram(dev);
641         unsigned long nr_pages = zram->disksize >> PAGE_SHIFT;
642         unsigned long index = 0;
643         struct bio bio;
644         struct bio_vec bio_vec;
645         struct page *page;
646         ssize_t ret = len;
647         int mode, err;
648         unsigned long blk_idx = 0;
649
650         if (sysfs_streq(buf, "idle"))
651                 mode = IDLE_WRITEBACK;
652         else if (sysfs_streq(buf, "huge"))
653                 mode = HUGE_WRITEBACK;
654         else if (sysfs_streq(buf, "huge_idle"))
655                 mode = IDLE_WRITEBACK | HUGE_WRITEBACK;
656         else {
657                 if (strncmp(buf, PAGE_WB_SIG, sizeof(PAGE_WB_SIG) - 1))
658                         return -EINVAL;
659
660                 if (kstrtol(buf + sizeof(PAGE_WB_SIG) - 1, 10, &index) ||
661                                 index >= nr_pages)
662                         return -EINVAL;
663
664                 nr_pages = 1;
665                 mode = PAGE_WRITEBACK;
666         }
667
668         down_read(&zram->init_lock);
669         if (!init_done(zram)) {
670                 ret = -EINVAL;
671                 goto release_init_lock;
672         }
673
674         if (!zram->backing_dev) {
675                 ret = -ENODEV;
676                 goto release_init_lock;
677         }
678
679         page = alloc_page(GFP_KERNEL);
680         if (!page) {
681                 ret = -ENOMEM;
682                 goto release_init_lock;
683         }
684
685         for (; nr_pages != 0; index++, nr_pages--) {
686                 struct bio_vec bvec;
687
688                 bvec.bv_page = page;
689                 bvec.bv_len = PAGE_SIZE;
690                 bvec.bv_offset = 0;
691
692                 spin_lock(&zram->wb_limit_lock);
693                 if (zram->wb_limit_enable && !zram->bd_wb_limit) {
694                         spin_unlock(&zram->wb_limit_lock);
695                         ret = -EIO;
696                         break;
697                 }
698                 spin_unlock(&zram->wb_limit_lock);
699
700                 if (!blk_idx) {
701                         blk_idx = alloc_block_bdev(zram);
702                         if (!blk_idx) {
703                                 ret = -ENOSPC;
704                                 break;
705                         }
706                 }
707
708                 zram_slot_lock(zram, index);
709                 if (!zram_allocated(zram, index))
710                         goto next;
711
712                 if (zram_test_flag(zram, index, ZRAM_WB) ||
713                                 zram_test_flag(zram, index, ZRAM_SAME) ||
714                                 zram_test_flag(zram, index, ZRAM_UNDER_WB))
715                         goto next;
716
717                 if (mode & IDLE_WRITEBACK &&
718                           !zram_test_flag(zram, index, ZRAM_IDLE))
719                         goto next;
720                 if (mode & HUGE_WRITEBACK &&
721                           !zram_test_flag(zram, index, ZRAM_HUGE))
722                         goto next;
723                 /*
724                  * Clearing ZRAM_UNDER_WB is the caller's duty.
725                  * IOW, zram_free_page() never clears it.
726                  */
727                 zram_set_flag(zram, index, ZRAM_UNDER_WB);
728                 /* Needed to handle the hugepage writeback race */
729                 zram_set_flag(zram, index, ZRAM_IDLE);
730                 zram_slot_unlock(zram, index);
731                 if (zram_bvec_read(zram, &bvec, index, 0, NULL)) {
732                         zram_slot_lock(zram, index);
733                         zram_clear_flag(zram, index, ZRAM_UNDER_WB);
734                         zram_clear_flag(zram, index, ZRAM_IDLE);
735                         zram_slot_unlock(zram, index);
736                         continue;
737                 }
738
739                 bio_init(&bio, zram->bdev, &bio_vec, 1,
740                          REQ_OP_WRITE | REQ_SYNC);
741                 bio.bi_iter.bi_sector = blk_idx * (PAGE_SIZE >> 9);
742
743                 bio_add_page(&bio, bvec.bv_page, bvec.bv_len,
744                                 bvec.bv_offset);
745                 /*
746                  * XXX: A single-page I/O is inefficient for writes,
747                  * but it is not a bad starting point.
748                  */
749                 err = submit_bio_wait(&bio);
750                 if (err) {
751                         zram_slot_lock(zram, index);
752                         zram_clear_flag(zram, index, ZRAM_UNDER_WB);
753                         zram_clear_flag(zram, index, ZRAM_IDLE);
754                         zram_slot_unlock(zram, index);
755                         /*
756                          * Return the last I/O error unless every
757                          * I/O succeeded.
758                          */
759                         ret = err;
760                         continue;
761                 }
762
763                 atomic64_inc(&zram->stats.bd_writes);
764                 /*
765                  * We released zram_slot_lock, so we need to check whether the
766                  * slot was changed. If the slot was freed, zram_allocated()
767                  * catches it easily.
768                  * A subtler case is when the slot is freed, reallocated and
769                  * marked ZRAM_IDLE again. To close that race, idle_store()
770                  * does not mark a slot ZRAM_IDLE once it finds the slot is
771                  * ZRAM_UNDER_WB; thus checking the ZRAM_IDLE bit closes the race.
772                  */
773                 zram_slot_lock(zram, index);
774                 if (!zram_allocated(zram, index) ||
775                           !zram_test_flag(zram, index, ZRAM_IDLE)) {
776                         zram_clear_flag(zram, index, ZRAM_UNDER_WB);
777                         zram_clear_flag(zram, index, ZRAM_IDLE);
778                         goto next;
779                 }
780
781                 zram_free_page(zram, index);
782                 zram_clear_flag(zram, index, ZRAM_UNDER_WB);
783                 zram_set_flag(zram, index, ZRAM_WB);
784                 zram_set_element(zram, index, blk_idx);
785                 blk_idx = 0;
786                 atomic64_inc(&zram->stats.pages_stored);
787                 spin_lock(&zram->wb_limit_lock);
788                 if (zram->wb_limit_enable && zram->bd_wb_limit > 0)
789                         zram->bd_wb_limit -=  1UL << (PAGE_SHIFT - 12);
790                 spin_unlock(&zram->wb_limit_lock);
791 next:
792                 zram_slot_unlock(zram, index);
793         }
794
795         if (blk_idx)
796                 free_block_bdev(zram, blk_idx);
797         __free_page(page);
798 release_init_lock:
799         up_read(&zram->init_lock);
800
801         return ret;
802 }
803
804 struct zram_work {
805         struct work_struct work;
806         struct zram *zram;
807         unsigned long entry;
808         struct bio *bio;
809         struct bio_vec bvec;
810 };
811
812 #if PAGE_SIZE != 4096
813 static void zram_sync_read(struct work_struct *work)
814 {
815         struct zram_work *zw = container_of(work, struct zram_work, work);
816         struct zram *zram = zw->zram;
817         unsigned long entry = zw->entry;
818         struct bio *bio = zw->bio;
819
820         read_from_bdev_async(zram, &zw->bvec, entry, bio);
821 }
822
823 /*
824  * The block layer wants one ->submit_bio to be active at a time, so if we
825  * use chained I/O with the parent I/O in the same context, it deadlocks.
826  * To avoid that, use a worker thread context.
827  */
828 static int read_from_bdev_sync(struct zram *zram, struct bio_vec *bvec,
829                                 unsigned long entry, struct bio *bio)
830 {
831         struct zram_work work;
832
833         work.bvec = *bvec;
834         work.zram = zram;
835         work.entry = entry;
836         work.bio = bio;
837
838         INIT_WORK_ONSTACK(&work.work, zram_sync_read);
839         queue_work(system_unbound_wq, &work.work);
840         flush_work(&work.work);
841         destroy_work_on_stack(&work.work);
842
843         return 1;
844 }
845 #else
846 static int read_from_bdev_sync(struct zram *zram, struct bio_vec *bvec,
847                                 unsigned long entry, struct bio *bio)
848 {
849         WARN_ON(1);
850         return -EIO;
851 }
852 #endif
853
854 static int read_from_bdev(struct zram *zram, struct bio_vec *bvec,
855                         unsigned long entry, struct bio *parent, bool sync)
856 {
857         atomic64_inc(&zram->stats.bd_reads);
858         if (sync)
859                 return read_from_bdev_sync(zram, bvec, entry, parent);
860         else
861                 return read_from_bdev_async(zram, bvec, entry, parent);
862 }
863 #else
864 static inline void reset_bdev(struct zram *zram) {};
865 static int read_from_bdev(struct zram *zram, struct bio_vec *bvec,
866                         unsigned long entry, struct bio *parent, bool sync)
867 {
868         return -EIO;
869 }
870
871 static void free_block_bdev(struct zram *zram, unsigned long blk_idx) {};
872 #endif
873
874 #ifdef CONFIG_ZRAM_MEMORY_TRACKING
875
876 static struct dentry *zram_debugfs_root;
877
878 static void zram_debugfs_create(void)
879 {
880         zram_debugfs_root = debugfs_create_dir("zram", NULL);
881 }
882
883 static void zram_debugfs_destroy(void)
884 {
885         debugfs_remove_recursive(zram_debugfs_root);
886 }
887
888 static void zram_accessed(struct zram *zram, u32 index)
889 {
890         zram_clear_flag(zram, index, ZRAM_IDLE);
891         zram->table[index].ac_time = ktime_get_boottime();
892 }
893
894 static ssize_t read_block_state(struct file *file, char __user *buf,
895                                 size_t count, loff_t *ppos)
896 {
897         char *kbuf;
898         ssize_t index, written = 0;
899         struct zram *zram = file->private_data;
900         unsigned long nr_pages = zram->disksize >> PAGE_SHIFT;
901         struct timespec64 ts;
902
903         kbuf = kvmalloc(count, GFP_KERNEL);
904         if (!kbuf)
905                 return -ENOMEM;
906
907         down_read(&zram->init_lock);
908         if (!init_done(zram)) {
909                 up_read(&zram->init_lock);
910                 kvfree(kbuf);
911                 return -EINVAL;
912         }
913
914         for (index = *ppos; index < nr_pages; index++) {
915                 int copied;
916
917                 zram_slot_lock(zram, index);
918                 if (!zram_allocated(zram, index))
919                         goto next;
920
921                 ts = ktime_to_timespec64(zram->table[index].ac_time);
922                 copied = snprintf(kbuf + written, count,
923                         "%12zd %12lld.%06lu %c%c%c%c\n",
924                         index, (s64)ts.tv_sec,
925                         ts.tv_nsec / NSEC_PER_USEC,
926                         zram_test_flag(zram, index, ZRAM_SAME) ? 's' : '.',
927                         zram_test_flag(zram, index, ZRAM_WB) ? 'w' : '.',
928                         zram_test_flag(zram, index, ZRAM_HUGE) ? 'h' : '.',
929                         zram_test_flag(zram, index, ZRAM_IDLE) ? 'i' : '.');
930
931                 if (count <= copied) {
932                         zram_slot_unlock(zram, index);
933                         break;
934                 }
935                 written += copied;
936                 count -= copied;
937 next:
938                 zram_slot_unlock(zram, index);
939                 *ppos += 1;
940         }
941
942         up_read(&zram->init_lock);
943         if (copy_to_user(buf, kbuf, written))
944                 written = -EFAULT;
945         kvfree(kbuf);
946
947         return written;
948 }
949
950 static const struct file_operations proc_zram_block_state_op = {
951         .open = simple_open,
952         .read = read_block_state,
953         .llseek = default_llseek,
954 };
955
956 static void zram_debugfs_register(struct zram *zram)
957 {
958         if (!zram_debugfs_root)
959                 return;
960
961         zram->debugfs_dir = debugfs_create_dir(zram->disk->disk_name,
962                                                 zram_debugfs_root);
963         debugfs_create_file("block_state", 0400, zram->debugfs_dir,
964                                 zram, &proc_zram_block_state_op);
965 }
966
967 static void zram_debugfs_unregister(struct zram *zram)
968 {
969         debugfs_remove_recursive(zram->debugfs_dir);
970 }
971 #else
972 static void zram_debugfs_create(void) {};
973 static void zram_debugfs_destroy(void) {};
974 static void zram_accessed(struct zram *zram, u32 index)
975 {
976         zram_clear_flag(zram, index, ZRAM_IDLE);
977 };
978 static void zram_debugfs_register(struct zram *zram) {};
979 static void zram_debugfs_unregister(struct zram *zram) {};
980 #endif
981
982 /*
983  * We switched to per-cpu streams and this attr is not needed anymore.
984  * However, we will keep it around for some time, because:
985  * a) we may revert per-cpu streams in the future
986  * b) it's visible to user space and we need to follow our two-year
987  *    retirement rule; but we already have a number of 'soon to be
988  *    altered' attrs, so max_comp_streams needs to wait for the next
989  *    layoff cycle.
990  */
991 static ssize_t max_comp_streams_show(struct device *dev,
992                 struct device_attribute *attr, char *buf)
993 {
994         return scnprintf(buf, PAGE_SIZE, "%d\n", num_online_cpus());
995 }
996
997 static ssize_t max_comp_streams_store(struct device *dev,
998                 struct device_attribute *attr, const char *buf, size_t len)
999 {
1000         return len;
1001 }
1002
1003 static ssize_t comp_algorithm_show(struct device *dev,
1004                 struct device_attribute *attr, char *buf)
1005 {
1006         size_t sz;
1007         struct zram *zram = dev_to_zram(dev);
1008
1009         down_read(&zram->init_lock);
1010         sz = zcomp_available_show(zram->compressor, buf);
1011         up_read(&zram->init_lock);
1012
1013         return sz;
1014 }
1015
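/*
 * The compression algorithm can only be changed before the device is
 * initialized (i.e. before disksize is set). A hypothetical example
 * (assuming the device is zram0 and lzo is available):
 *
 *	echo lzo > /sys/block/zram0/comp_algorithm
 */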
1016 static ssize_t comp_algorithm_store(struct device *dev,
1017                 struct device_attribute *attr, const char *buf, size_t len)
1018 {
1019         struct zram *zram = dev_to_zram(dev);
1020         char compressor[ARRAY_SIZE(zram->compressor)];
1021         size_t sz;
1022
1023         strscpy(compressor, buf, sizeof(compressor));
1024         /* ignore trailing newline */
1025         sz = strlen(compressor);
1026         if (sz > 0 && compressor[sz - 1] == '\n')
1027                 compressor[sz - 1] = 0x00;
1028
1029         if (!zcomp_available_algorithm(compressor))
1030                 return -EINVAL;
1031
1032         down_write(&zram->init_lock);
1033         if (init_done(zram)) {
1034                 up_write(&zram->init_lock);
1035                 pr_info("Can't change algorithm for initialized device\n");
1036                 return -EBUSY;
1037         }
1038
1039         strcpy(zram->compressor, compressor);
1040         up_write(&zram->init_lock);
1041         return len;
1042 }
1043
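/*
 * Any write to this attribute triggers zsmalloc pool compaction; the
 * written value itself is ignored. A hypothetical example (assuming the
 * device is zram0):
 *
 *	echo 1 > /sys/block/zram0/compact
 */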
1044 static ssize_t compact_store(struct device *dev,
1045                 struct device_attribute *attr, const char *buf, size_t len)
1046 {
1047         struct zram *zram = dev_to_zram(dev);
1048
1049         down_read(&zram->init_lock);
1050         if (!init_done(zram)) {
1051                 up_read(&zram->init_lock);
1052                 return -EINVAL;
1053         }
1054
1055         zs_compact(zram->mem_pool);
1056         up_read(&zram->init_lock);
1057
1058         return len;
1059 }
1060
1061 static ssize_t io_stat_show(struct device *dev,
1062                 struct device_attribute *attr, char *buf)
1063 {
1064         struct zram *zram = dev_to_zram(dev);
1065         ssize_t ret;
1066
1067         down_read(&zram->init_lock);
1068         ret = scnprintf(buf, PAGE_SIZE,
1069                         "%8llu %8llu %8llu %8llu\n",
1070                         (u64)atomic64_read(&zram->stats.failed_reads),
1071                         (u64)atomic64_read(&zram->stats.failed_writes),
1072                         (u64)atomic64_read(&zram->stats.invalid_io),
1073                         (u64)atomic64_read(&zram->stats.notify_free));
1074         up_read(&zram->init_lock);
1075
1076         return ret;
1077 }
1078
1079 static ssize_t mm_stat_show(struct device *dev,
1080                 struct device_attribute *attr, char *buf)
1081 {
1082         struct zram *zram = dev_to_zram(dev);
1083         struct zs_pool_stats pool_stats;
1084         u64 orig_size, mem_used = 0;
1085         long max_used;
1086         ssize_t ret;
1087
1088         memset(&pool_stats, 0x00, sizeof(struct zs_pool_stats));
1089
1090         down_read(&zram->init_lock);
1091         if (init_done(zram)) {
1092                 mem_used = zs_get_total_pages(zram->mem_pool);
1093                 zs_pool_stats(zram->mem_pool, &pool_stats);
1094         }
1095
1096         orig_size = atomic64_read(&zram->stats.pages_stored);
1097         max_used = atomic_long_read(&zram->stats.max_used_pages);
1098
1099         ret = scnprintf(buf, PAGE_SIZE,
1100                         "%8llu %8llu %8llu %8lu %8ld %8llu %8lu %8llu %8llu\n",
1101                         orig_size << PAGE_SHIFT,
1102                         (u64)atomic64_read(&zram->stats.compr_data_size),
1103                         mem_used << PAGE_SHIFT,
1104                         zram->limit_pages << PAGE_SHIFT,
1105                         max_used << PAGE_SHIFT,
1106                         (u64)atomic64_read(&zram->stats.same_pages),
1107                         atomic_long_read(&pool_stats.pages_compacted),
1108                         (u64)atomic64_read(&zram->stats.huge_pages),
1109                         (u64)atomic64_read(&zram->stats.huge_pages_since));
1110         up_read(&zram->init_lock);
1111
1112         return ret;
1113 }
1114
1115 #ifdef CONFIG_ZRAM_WRITEBACK
1116 #define FOUR_K(x) ((x) * (1 << (PAGE_SHIFT - 12)))
1117 static ssize_t bd_stat_show(struct device *dev,
1118                 struct device_attribute *attr, char *buf)
1119 {
1120         struct zram *zram = dev_to_zram(dev);
1121         ssize_t ret;
1122
1123         down_read(&zram->init_lock);
1124         ret = scnprintf(buf, PAGE_SIZE,
1125                 "%8llu %8llu %8llu\n",
1126                         FOUR_K((u64)atomic64_read(&zram->stats.bd_count)),
1127                         FOUR_K((u64)atomic64_read(&zram->stats.bd_reads)),
1128                         FOUR_K((u64)atomic64_read(&zram->stats.bd_writes)));
1129         up_read(&zram->init_lock);
1130
1131         return ret;
1132 }
1133 #endif
1134
1135 static ssize_t debug_stat_show(struct device *dev,
1136                 struct device_attribute *attr, char *buf)
1137 {
1138         int version = 1;
1139         struct zram *zram = dev_to_zram(dev);
1140         ssize_t ret;
1141
1142         down_read(&zram->init_lock);
1143         ret = scnprintf(buf, PAGE_SIZE,
1144                         "version: %d\n%8llu %8llu\n",
1145                         version,
1146                         (u64)atomic64_read(&zram->stats.writestall),
1147                         (u64)atomic64_read(&zram->stats.miss_free));
1148         up_read(&zram->init_lock);
1149
1150         return ret;
1151 }
1152
1153 static DEVICE_ATTR_RO(io_stat);
1154 static DEVICE_ATTR_RO(mm_stat);
1155 #ifdef CONFIG_ZRAM_WRITEBACK
1156 static DEVICE_ATTR_RO(bd_stat);
1157 #endif
1158 static DEVICE_ATTR_RO(debug_stat);
1159
1160 static void zram_meta_free(struct zram *zram, u64 disksize)
1161 {
1162         size_t num_pages = disksize >> PAGE_SHIFT;
1163         size_t index;
1164
1165         /* Free all pages that are still in this zram device */
1166         for (index = 0; index < num_pages; index++)
1167                 zram_free_page(zram, index);
1168
1169         zs_destroy_pool(zram->mem_pool);
1170         vfree(zram->table);
1171 }
1172
1173 static bool zram_meta_alloc(struct zram *zram, u64 disksize)
1174 {
1175         size_t num_pages;
1176
1177         num_pages = disksize >> PAGE_SHIFT;
1178         zram->table = vzalloc(array_size(num_pages, sizeof(*zram->table)));
1179         if (!zram->table)
1180                 return false;
1181
1182         zram->mem_pool = zs_create_pool(zram->disk->disk_name);
1183         if (!zram->mem_pool) {
1184                 vfree(zram->table);
1185                 return false;
1186         }
1187
1188         if (!huge_class_size)
1189                 huge_class_size = zs_huge_class_size(zram->mem_pool);
1190         return true;
1191 }
1192
1193 /*
1194  * To protect against concurrent access to the same index entry, the
1195  * caller should hold this table entry's bit_spinlock, which indicates
1196  * that the entry is being accessed.
1197  */
1198 static void zram_free_page(struct zram *zram, size_t index)
1199 {
1200         unsigned long handle;
1201
1202 #ifdef CONFIG_ZRAM_MEMORY_TRACKING
1203         zram->table[index].ac_time = 0;
1204 #endif
1205         if (zram_test_flag(zram, index, ZRAM_IDLE))
1206                 zram_clear_flag(zram, index, ZRAM_IDLE);
1207
1208         if (zram_test_flag(zram, index, ZRAM_HUGE)) {
1209                 zram_clear_flag(zram, index, ZRAM_HUGE);
1210                 atomic64_dec(&zram->stats.huge_pages);
1211         }
1212
1213         if (zram_test_flag(zram, index, ZRAM_WB)) {
1214                 zram_clear_flag(zram, index, ZRAM_WB);
1215                 free_block_bdev(zram, zram_get_element(zram, index));
1216                 goto out;
1217         }
1218
1219         /*
1220          * No memory is allocated for same-element-filled pages.
1221          * Simply clear the ZRAM_SAME flag.
1222          */
1223         if (zram_test_flag(zram, index, ZRAM_SAME)) {
1224                 zram_clear_flag(zram, index, ZRAM_SAME);
1225                 atomic64_dec(&zram->stats.same_pages);
1226                 goto out;
1227         }
1228
1229         handle = zram_get_handle(zram, index);
1230         if (!handle)
1231                 return;
1232
1233         zs_free(zram->mem_pool, handle);
1234
1235         atomic64_sub(zram_get_obj_size(zram, index),
1236                         &zram->stats.compr_data_size);
1237 out:
1238         atomic64_dec(&zram->stats.pages_stored);
1239         zram_set_handle(zram, index, 0);
1240         zram_set_obj_size(zram, index, 0);
1241         WARN_ON_ONCE(zram->table[index].flags &
1242                 ~(1UL << ZRAM_LOCK | 1UL << ZRAM_UNDER_WB));
1243 }
1244
1245 static int __zram_bvec_read(struct zram *zram, struct page *page, u32 index,
1246                                 struct bio *bio, bool partial_io)
1247 {
1248         struct zcomp_strm *zstrm;
1249         unsigned long handle;
1250         unsigned int size;
1251         void *src, *dst;
1252         int ret;
1253
1254         zram_slot_lock(zram, index);
1255         if (zram_test_flag(zram, index, ZRAM_WB)) {
1256                 struct bio_vec bvec;
1257
1258                 zram_slot_unlock(zram, index);
1259                 /* A null bio means rw_page was used; we must fall back to bio */
1260                 if (!bio)
1261                         return -EOPNOTSUPP;
1262
1263                 bvec.bv_page = page;
1264                 bvec.bv_len = PAGE_SIZE;
1265                 bvec.bv_offset = 0;
1266                 return read_from_bdev(zram, &bvec,
1267                                 zram_get_element(zram, index),
1268                                 bio, partial_io);
1269         }
1270
1271         handle = zram_get_handle(zram, index);
1272         if (!handle || zram_test_flag(zram, index, ZRAM_SAME)) {
1273                 unsigned long value;
1274                 void *mem;
1275
1276                 value = handle ? zram_get_element(zram, index) : 0;
1277                 mem = kmap_atomic(page);
1278                 zram_fill_page(mem, PAGE_SIZE, value);
1279                 kunmap_atomic(mem);
1280                 zram_slot_unlock(zram, index);
1281                 return 0;
1282         }
1283
1284         size = zram_get_obj_size(zram, index);
1285
1286         if (size != PAGE_SIZE)
1287                 zstrm = zcomp_stream_get(zram->comp);
1288
1289         src = zs_map_object(zram->mem_pool, handle, ZS_MM_RO);
1290         if (size == PAGE_SIZE) {
1291                 dst = kmap_atomic(page);
1292                 memcpy(dst, src, PAGE_SIZE);
1293                 kunmap_atomic(dst);
1294                 ret = 0;
1295         } else {
1296                 dst = kmap_atomic(page);
1297                 ret = zcomp_decompress(zstrm, src, size, dst);
1298                 kunmap_atomic(dst);
1299                 zcomp_stream_put(zram->comp);
1300         }
1301         zs_unmap_object(zram->mem_pool, handle);
1302         zram_slot_unlock(zram, index);
1303
1304         /* Should NEVER happen. Return bio error if it does. */
1305         if (WARN_ON(ret))
1306                 pr_err("Decompression failed! err=%d, page=%u\n", ret, index);
1307
1308         return ret;
1309 }
1310
1311 static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
1312                                 u32 index, int offset, struct bio *bio)
1313 {
1314         int ret;
1315         struct page *page;
1316
1317         page = bvec->bv_page;
1318         if (is_partial_io(bvec)) {
1319                 /* Use a temporary buffer to decompress the page */
1320                 page = alloc_page(GFP_NOIO|__GFP_HIGHMEM);
1321                 if (!page)
1322                         return -ENOMEM;
1323         }
1324
1325         ret = __zram_bvec_read(zram, page, index, bio, is_partial_io(bvec));
1326         if (unlikely(ret))
1327                 goto out;
1328
1329         if (is_partial_io(bvec)) {
1330                 void *src = kmap_atomic(page);
1331
1332                 memcpy_to_bvec(bvec, src + offset);
1333                 kunmap_atomic(src);
1334         }
1335 out:
1336         if (is_partial_io(bvec))
1337                 __free_page(page);
1338
1339         return ret;
1340 }
1341
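/*
 * Store one full page at @index: detect same-filled pages first, then
 * compress, allocate a zsmalloc handle (fast path without direct
 * reclaim, slow path with it, re-compressing if needed), copy the
 * compressed data in and update the slot metadata and stats.
 */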
1342 static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec,
1343                                 u32 index, struct bio *bio)
1344 {
1345         int ret = 0;
1346         unsigned long alloced_pages;
1347         unsigned long handle = -ENOMEM;
1348         unsigned int comp_len = 0;
1349         void *src, *dst, *mem;
1350         struct zcomp_strm *zstrm;
1351         struct page *page = bvec->bv_page;
1352         unsigned long element = 0;
1353         enum zram_pageflags flags = 0;
1354
1355         mem = kmap_atomic(page);
1356         if (page_same_filled(mem, &element)) {
1357                 kunmap_atomic(mem);
1358                 /* Free memory associated with this sector now. */
1359                 flags = ZRAM_SAME;
1360                 atomic64_inc(&zram->stats.same_pages);
1361                 goto out;
1362         }
1363         kunmap_atomic(mem);
1364
1365 compress_again:
1366         zstrm = zcomp_stream_get(zram->comp);
1367         src = kmap_atomic(page);
1368         ret = zcomp_compress(zstrm, src, &comp_len);
1369         kunmap_atomic(src);
1370
1371         if (unlikely(ret)) {
1372                 zcomp_stream_put(zram->comp);
1373                 pr_err("Compression failed! err=%d\n", ret);
1374                 zs_free(zram->mem_pool, handle);
1375                 return ret;
1376         }
1377
1378         if (comp_len >= huge_class_size)
1379                 comp_len = PAGE_SIZE;
1380         /*
1381          * handle allocation has 2 paths:
1382          * a) fast path is executed with preemption disabled (for
1383          *  per-cpu streams) and has __GFP_DIRECT_RECLAIM bit clear,
1384          *  since we can't sleep;
1385          * b) slow path enables preemption and attempts to allocate
1386          *  the page with __GFP_DIRECT_RECLAIM bit set. we have to
1387          *  put per-cpu compression stream and, thus, to re-do
1388          *  the compression once handle is allocated.
1389          *
1390          * If we have a valid (non-error) handle here then we are coming
1391          * from the slow path and the handle has already been allocated.
1392          */
1393         if (IS_ERR((void *)handle))
1394                 handle = zs_malloc(zram->mem_pool, comp_len,
1395                                 __GFP_KSWAPD_RECLAIM |
1396                                 __GFP_NOWARN |
1397                                 __GFP_HIGHMEM |
1398                                 __GFP_MOVABLE);
1399         if (IS_ERR((void *)handle)) {
1400                 zcomp_stream_put(zram->comp);
1401                 atomic64_inc(&zram->stats.writestall);
1402                 handle = zs_malloc(zram->mem_pool, comp_len,
1403                                 GFP_NOIO | __GFP_HIGHMEM |
1404                                 __GFP_MOVABLE);
1405                 if (IS_ERR((void *)handle))
1406                         return PTR_ERR((void *)handle);
1407
1408                 if (comp_len != PAGE_SIZE)
1409                         goto compress_again;
1410                 /*
1411                  * If the page is not compressible, we need to re-acquire the
1412                  * per-cpu stream and execute the code below. The
1413                  * zcomp_stream_get() call disables CPU hotplug and grabs the
1414                  * zstrm buffer back, so that the dereference of the zstrm
1415                  * variable below is valid.
1416                  */
1417                 zstrm = zcomp_stream_get(zram->comp);
1418         }
1419
1420         alloced_pages = zs_get_total_pages(zram->mem_pool);
1421         update_used_max(zram, alloced_pages);
1422
1423         if (zram->limit_pages && alloced_pages > zram->limit_pages) {
1424                 zcomp_stream_put(zram->comp);
1425                 zs_free(zram->mem_pool, handle);
1426                 return -ENOMEM;
1427         }
1428
1429         dst = zs_map_object(zram->mem_pool, handle, ZS_MM_WO);
1430
1431         src = zstrm->buffer;
1432         if (comp_len == PAGE_SIZE)
1433                 src = kmap_atomic(page);
1434         memcpy(dst, src, comp_len);
1435         if (comp_len == PAGE_SIZE)
1436                 kunmap_atomic(src);
1437
1438         zcomp_stream_put(zram->comp);
1439         zs_unmap_object(zram->mem_pool, handle);
1440         atomic64_add(comp_len, &zram->stats.compr_data_size);
1441 out:
1442         /*
1443          * Free memory associated with this sector
1444          * before overwriting unused sectors.
1445          */
1446         zram_slot_lock(zram, index);
1447         zram_free_page(zram, index);
1448
1449         if (comp_len == PAGE_SIZE) {
1450                 zram_set_flag(zram, index, ZRAM_HUGE);
1451                 atomic64_inc(&zram->stats.huge_pages);
1452                 atomic64_inc(&zram->stats.huge_pages_since);
1453         }
1454
1455         if (flags) {
1456                 zram_set_flag(zram, index, flags);
1457                 zram_set_element(zram, index, element);
1458         }  else {
1459                 zram_set_handle(zram, index, handle);
1460                 zram_set_obj_size(zram, index, comp_len);
1461         }
1462         zram_slot_unlock(zram, index);
1463
1464         /* Update stats */
1465         atomic64_inc(&zram->stats.pages_stored);
1466         return ret;
1467 }
1468
1469 static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec,
1470                                 u32 index, int offset, struct bio *bio)
1471 {
1472         int ret;
1473         struct page *page = NULL;
1474         struct bio_vec vec;
1475
1476         vec = *bvec;
1477         if (is_partial_io(bvec)) {
1478                 void *dst;
1479                 /*
1480                  * This is a partial I/O. We need to read the full page
1481                  * before writing the changes.
1482                  */
1483                 page = alloc_page(GFP_NOIO|__GFP_HIGHMEM);
1484                 if (!page)
1485                         return -ENOMEM;
1486
1487                 ret = __zram_bvec_read(zram, page, index, bio, true);
1488                 if (ret)
1489                         goto out;
1490
1491                 dst = kmap_atomic(page);
1492                 memcpy_from_bvec(dst + offset, bvec);
1493                 kunmap_atomic(dst);
1494
1495                 vec.bv_page = page;
1496                 vec.bv_len = PAGE_SIZE;
1497                 vec.bv_offset = 0;
1498         }
1499
1500         ret = __zram_bvec_write(zram, &vec, index, bio);
1501 out:
1502         if (is_partial_io(bvec))
1503                 __free_page(page);
1504         return ret;
1505 }
1506
1507 /*
1508  * zram_bio_discard - handler on discard request
1509  * @index: physical block index in PAGE_SIZE units
1510  * @offset: byte offset within physical block
1511  */
1512 static void zram_bio_discard(struct zram *zram, u32 index,
1513                              int offset, struct bio *bio)
1514 {
1515         size_t n = bio->bi_iter.bi_size;
1516
1517         /*
1518          * zram manages data in physical block size units. Because the logical
1519          * block size isn't identical to the physical block size on some
1520          * architectures, we could get a discard request pointing to a specific
1521          * offset within a certain physical block.  Although we could handle
1522          * this request by reading that physical block, decompressing it,
1523          * partially zeroing it, re-compressing it and then re-storing it,
1524          * this isn't reasonable because our intent with a discard request is
1525          * to save memory.  So skipping this logical block is appropriate here.
1526          */
1527         if (offset) {
1528                 if (n <= (PAGE_SIZE - offset))
1529                         return;
1530
1531                 n -= (PAGE_SIZE - offset);
1532                 index++;
1533         }
1534
1535         while (n >= PAGE_SIZE) {
1536                 zram_slot_lock(zram, index);
1537                 zram_free_page(zram, index);
1538                 zram_slot_unlock(zram, index);
1539                 atomic64_inc(&zram->stats.notify_free);
1540                 index++;
1541                 n -= PAGE_SIZE;
1542         }
1543 }
1544
1545 /*
1546  * Returns a negative errno on error. Otherwise returns 0 or 1:
1547  * 0 if the I/O request was completed synchronously,
1548  * 1 if the I/O request was successfully submitted (asynchronously).
1549  */
1550 static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index,
1551                         int offset, enum req_op op, struct bio *bio)
1552 {
1553         int ret;
1554
1555         if (!op_is_write(op)) {
1556                 atomic64_inc(&zram->stats.num_reads);
1557                 ret = zram_bvec_read(zram, bvec, index, offset, bio);
1558                 flush_dcache_page(bvec->bv_page);
1559         } else {
1560                 atomic64_inc(&zram->stats.num_writes);
1561                 ret = zram_bvec_write(zram, bvec, index, offset, bio);
1562         }
1563
1564         zram_slot_lock(zram, index);
1565         zram_accessed(zram, index);
1566         zram_slot_unlock(zram, index);
1567
1568         if (unlikely(ret < 0)) {
1569                 if (!op_is_write(op))
1570                         atomic64_inc(&zram->stats.failed_reads);
1571                 else
1572                         atomic64_inc(&zram->stats.failed_writes);
1573         }
1574
1575         return ret;
1576 }
1577
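/*
 * Split the bio into PAGE_SIZE-or-smaller chunks and feed each chunk to
 * zram_bvec_rw(), accounting the I/O and completing the bio at the end.
 * Discard and write-zeroes requests are routed to zram_bio_discard().
 */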
1578 static void __zram_make_request(struct zram *zram, struct bio *bio)
1579 {
1580         int offset;
1581         u32 index;
1582         struct bio_vec bvec;
1583         struct bvec_iter iter;
1584         unsigned long start_time;
1585
1586         index = bio->bi_iter.bi_sector >> SECTORS_PER_PAGE_SHIFT;
1587         offset = (bio->bi_iter.bi_sector &
1588                   (SECTORS_PER_PAGE - 1)) << SECTOR_SHIFT;
1589
1590         switch (bio_op(bio)) {
1591         case REQ_OP_DISCARD:
1592         case REQ_OP_WRITE_ZEROES:
1593                 zram_bio_discard(zram, index, offset, bio);
1594                 bio_endio(bio);
1595                 return;
1596         default:
1597                 break;
1598         }
1599
1600         start_time = bio_start_io_acct(bio);
1601         bio_for_each_segment(bvec, bio, iter) {
1602                 struct bio_vec bv = bvec;
1603                 unsigned int unwritten = bvec.bv_len;
1604
1605                 do {
1606                         bv.bv_len = min_t(unsigned int, PAGE_SIZE - offset,
1607                                                         unwritten);
1608                         if (zram_bvec_rw(zram, &bv, index, offset,
1609                                          bio_op(bio), bio) < 0) {
1610                                 bio->bi_status = BLK_STS_IOERR;
1611                                 break;
1612                         }
1613
1614                         bv.bv_offset += bv.bv_len;
1615                         unwritten -= bv.bv_len;
1616
1617                         update_position(&index, &offset, &bv);
1618                 } while (unwritten);
1619         }
1620         bio_end_io_acct(bio, start_time);
1621         bio_endio(bio);
1622 }
1623
1624 /*
1625  * Handler function for all zram I/O requests.
1626  */
1627 static void zram_submit_bio(struct bio *bio)
1628 {
1629         struct zram *zram = bio->bi_bdev->bd_disk->private_data;
1630
1631         if (!valid_io_request(zram, bio->bi_iter.bi_sector,
1632                                         bio->bi_iter.bi_size)) {
1633                 atomic64_inc(&zram->stats.invalid_io);
1634                 bio_io_error(bio);
1635                 return;
1636         }
1637
1638         __zram_make_request(zram, bio);
1639 }
1640
1641 static void zram_slot_free_notify(struct block_device *bdev,
1642                                 unsigned long index)
1643 {
1644         struct zram *zram;
1645
1646         zram = bdev->bd_disk->private_data;
1647
1648         atomic64_inc(&zram->stats.notify_free);
1649         if (!zram_slot_trylock(zram, index)) {
1650                 atomic64_inc(&zram->stats.miss_free);
1651                 return;
1652         }
1653
1654         zram_free_page(zram, index);
1655         zram_slot_unlock(zram, index);
1656 }
1657
1658 static int zram_rw_page(struct block_device *bdev, sector_t sector,
1659                        struct page *page, enum req_op op)
1660 {
1661         int offset, ret;
1662         u32 index;
1663         struct zram *zram;
1664         struct bio_vec bv;
1665         unsigned long start_time;
1666
1667         if (PageTransHuge(page))
1668                 return -ENOTSUPP;
1669         zram = bdev->bd_disk->private_data;
1670
1671         if (!valid_io_request(zram, sector, PAGE_SIZE)) {
1672                 atomic64_inc(&zram->stats.invalid_io);
1673                 ret = -EINVAL;
1674                 goto out;
1675         }
1676
1677         index = sector >> SECTORS_PER_PAGE_SHIFT;
1678         offset = (sector & (SECTORS_PER_PAGE - 1)) << SECTOR_SHIFT;
1679
1680         bv.bv_page = page;
1681         bv.bv_len = PAGE_SIZE;
1682         bv.bv_offset = 0;
1683
1684         start_time = bdev_start_io_acct(bdev->bd_disk->part0,
1685                         SECTORS_PER_PAGE, op, jiffies);
1686         ret = zram_bvec_rw(zram, &bv, index, offset, op, NULL);
1687         bdev_end_io_acct(bdev->bd_disk->part0, op, start_time);
1688 out:
1689         /*
1690          * If the I/O fails, just return an error (i.e., non-zero) without
1691          * calling page_endio.
1692          * This causes the upper layers that use rw_page (e.g., swap_readpage,
1693          * __swap_writepage) to resubmit the I/O as a bio request, and
1694          * bio->bi_end_io then handles the error
1695          * (e.g., SetPageError, set_page_dirty and other follow-up work).
1696          */
1697         if (unlikely(ret < 0))
1698                 return ret;
1699
1700         switch (ret) {
1701         case 0:
1702                 page_endio(page, op_is_write(op), 0);
1703                 break;
1704         case 1:
1705                 ret = 0;
1706                 break;
1707         default:
1708                 WARN_ON(1);
1709         }
1710         return ret;
1711 }
1712
1713 static void zram_reset_device(struct zram *zram)
1714 {
1715         down_write(&zram->init_lock);
1716
1717         zram->limit_pages = 0;
1718
1719         if (!init_done(zram)) {
1720                 up_write(&zram->init_lock);
1721                 return;
1722         }
1723
1724         set_capacity_and_notify(zram->disk, 0);
1725         part_stat_set_all(zram->disk->part0, 0);
1726
1727         /* All in-flight I/O on all CPUs has completed, so it's safe to free */
1728         zram_meta_free(zram, zram->disksize);
1729         zram->disksize = 0;
1730         memset(&zram->stats, 0, sizeof(zram->stats));
1731         zcomp_destroy(zram->comp);
1732         zram->comp = NULL;
1733         reset_bdev(zram);
1734
1735         up_write(&zram->init_lock);
1736 }
1737
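/*
 * Hedged usage note (illustrative, not part of the original source): the
 * disksize attribute accepts memparse-style suffixes, so something like
 * "echo 1G > /sys/block/zram0/disksize" sets a 1 GiB (page-aligned)
 * capacity; the write fails with -EBUSY once the device is initialized.
 */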
1738 static ssize_t disksize_store(struct device *dev,
1739                 struct device_attribute *attr, const char *buf, size_t len)
1740 {
1741         u64 disksize;
1742         struct zcomp *comp;
1743         struct zram *zram = dev_to_zram(dev);
1744         int err;
1745
1746         disksize = memparse(buf, NULL);
1747         if (!disksize)
1748                 return -EINVAL;
1749
1750         down_write(&zram->init_lock);
1751         if (init_done(zram)) {
1752                 pr_info("Cannot change disksize for initialized device\n");
1753                 err = -EBUSY;
1754                 goto out_unlock;
1755         }
1756
1757         disksize = PAGE_ALIGN(disksize);
1758         if (!zram_meta_alloc(zram, disksize)) {
1759                 err = -ENOMEM;
1760                 goto out_unlock;
1761         }
1762
1763         comp = zcomp_create(zram->compressor);
1764         if (IS_ERR(comp)) {
1765                 pr_err("Cannot initialise %s compressing backend\n",
1766                                 zram->compressor);
1767                 err = PTR_ERR(comp);
1768                 goto out_free_meta;
1769         }
1770
1771         zram->comp = comp;
1772         zram->disksize = disksize;
1773         set_capacity_and_notify(zram->disk, zram->disksize >> SECTOR_SHIFT);
1774         up_write(&zram->init_lock);
1775
1776         return len;
1777
1778 out_free_meta:
1779         zram_meta_free(zram, disksize);
1780 out_unlock:
1781         up_write(&zram->init_lock);
1782         return err;
1783 }
1784
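/*
 * Hedged usage note (illustrative, not part of the original source): writing
 * any non-zero value resets the device, e.g. "echo 1 > /sys/block/zram0/reset";
 * the write fails with -EBUSY while the device is still open or claimed.
 */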
1785 static ssize_t reset_store(struct device *dev,
1786                 struct device_attribute *attr, const char *buf, size_t len)
1787 {
1788         int ret;
1789         unsigned short do_reset;
1790         struct zram *zram;
1791         struct gendisk *disk;
1792
1793         ret = kstrtou16(buf, 10, &do_reset);
1794         if (ret)
1795                 return ret;
1796
1797         if (!do_reset)
1798                 return -EINVAL;
1799
1800         zram = dev_to_zram(dev);
1801         disk = zram->disk;
1802
1803         mutex_lock(&disk->open_mutex);
1804         /* Do not reset an active or claimed device */
1805         if (disk_openers(disk) || zram->claim) {
1806                 mutex_unlock(&disk->open_mutex);
1807                 return -EBUSY;
1808         }
1809
1810         /* From now on, nobody can open /dev/zram[0-9] */
1811         zram->claim = true;
1812         mutex_unlock(&disk->open_mutex);
1813
1814         /* Make sure all the pending I/O are finished */
1815         sync_blockdev(disk->part0);
1816         zram_reset_device(zram);
1817
1818         mutex_lock(&disk->open_mutex);
1819         zram->claim = false;
1820         mutex_unlock(&disk->open_mutex);
1821
1822         return len;
1823 }
1824
1825 static int zram_open(struct block_device *bdev, fmode_t mode)
1826 {
1827         int ret = 0;
1828         struct zram *zram;
1829
1830         WARN_ON(!mutex_is_locked(&bdev->bd_disk->open_mutex));
1831
1832         zram = bdev->bd_disk->private_data;
1833         /* zram was claimed for reset, so the open request fails */
1834         if (zram->claim)
1835                 ret = -EBUSY;
1836
1837         return ret;
1838 }
1839
1840 static const struct block_device_operations zram_devops = {
1841         .open = zram_open,
1842         .submit_bio = zram_submit_bio,
1843         .swap_slot_free_notify = zram_slot_free_notify,
1844         .rw_page = zram_rw_page,
1845         .owner = THIS_MODULE
1846 };
1847
1848 static DEVICE_ATTR_WO(compact);
1849 static DEVICE_ATTR_RW(disksize);
1850 static DEVICE_ATTR_RO(initstate);
1851 static DEVICE_ATTR_WO(reset);
1852 static DEVICE_ATTR_WO(mem_limit);
1853 static DEVICE_ATTR_WO(mem_used_max);
1854 static DEVICE_ATTR_WO(idle);
1855 static DEVICE_ATTR_RW(max_comp_streams);
1856 static DEVICE_ATTR_RW(comp_algorithm);
1857 #ifdef CONFIG_ZRAM_WRITEBACK
1858 static DEVICE_ATTR_RW(backing_dev);
1859 static DEVICE_ATTR_WO(writeback);
1860 static DEVICE_ATTR_RW(writeback_limit);
1861 static DEVICE_ATTR_RW(writeback_limit_enable);
1862 #endif
1863
1864 static struct attribute *zram_disk_attrs[] = {
1865         &dev_attr_disksize.attr,
1866         &dev_attr_initstate.attr,
1867         &dev_attr_reset.attr,
1868         &dev_attr_compact.attr,
1869         &dev_attr_mem_limit.attr,
1870         &dev_attr_mem_used_max.attr,
1871         &dev_attr_idle.attr,
1872         &dev_attr_max_comp_streams.attr,
1873         &dev_attr_comp_algorithm.attr,
1874 #ifdef CONFIG_ZRAM_WRITEBACK
1875         &dev_attr_backing_dev.attr,
1876         &dev_attr_writeback.attr,
1877         &dev_attr_writeback_limit.attr,
1878         &dev_attr_writeback_limit_enable.attr,
1879 #endif
1880         &dev_attr_io_stat.attr,
1881         &dev_attr_mm_stat.attr,
1882 #ifdef CONFIG_ZRAM_WRITEBACK
1883         &dev_attr_bd_stat.attr,
1884 #endif
1885         &dev_attr_debug_stat.attr,
1886         NULL,
1887 };
1888
1889 ATTRIBUTE_GROUPS(zram_disk);
1890
1891 /*
1892  * Allocate and initialize a new zram device. The function returns
1893  * a device_id (>= 0) upon success, and a negative value otherwise.
1894  */
1895 static int zram_add(void)
1896 {
1897         struct zram *zram;
1898         int ret, device_id;
1899
1900         zram = kzalloc(sizeof(struct zram), GFP_KERNEL);
1901         if (!zram)
1902                 return -ENOMEM;
1903
1904         ret = idr_alloc(&zram_index_idr, zram, 0, 0, GFP_KERNEL);
1905         if (ret < 0)
1906                 goto out_free_dev;
1907         device_id = ret;
1908
1909         init_rwsem(&zram->init_lock);
1910 #ifdef CONFIG_ZRAM_WRITEBACK
1911         spin_lock_init(&zram->wb_limit_lock);
1912 #endif
1913
1914         /* gendisk structure */
1915         zram->disk = blk_alloc_disk(NUMA_NO_NODE);
1916         if (!zram->disk) {
1917                 pr_err("Error allocating disk structure for device %d\n",
1918                         device_id);
1919                 ret = -ENOMEM;
1920                 goto out_free_idr;
1921         }
1922
1923         zram->disk->major = zram_major;
1924         zram->disk->first_minor = device_id;
1925         zram->disk->minors = 1;
1926         zram->disk->flags |= GENHD_FL_NO_PART;
1927         zram->disk->fops = &zram_devops;
1928         zram->disk->private_data = zram;
1929         snprintf(zram->disk->disk_name, 16, "zram%d", device_id);
1930
1931         /* Actual capacity is set via sysfs (/sys/block/zram<id>/disksize) */
1932         set_capacity(zram->disk, 0);
1933         /* zram devices somewhat resemble non-rotational disks */
1934         blk_queue_flag_set(QUEUE_FLAG_NONROT, zram->disk->queue);
1935         blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, zram->disk->queue);
1936
1937         /*
1938          * To ensure that we always get PAGE_SIZE-aligned
1939          * and n*PAGE_SIZE-sized I/O requests.
1940          */
1941         blk_queue_physical_block_size(zram->disk->queue, PAGE_SIZE);
1942         blk_queue_logical_block_size(zram->disk->queue,
1943                                         ZRAM_LOGICAL_BLOCK_SIZE);
1944         blk_queue_io_min(zram->disk->queue, PAGE_SIZE);
1945         blk_queue_io_opt(zram->disk->queue, PAGE_SIZE);
1946         zram->disk->queue->limits.discard_granularity = PAGE_SIZE;
1947         blk_queue_max_discard_sectors(zram->disk->queue, UINT_MAX);
1948
1949         /*
1950          * zram_bio_discard() will clear all logical blocks if the logical block
1951          * size is identical to the physical block size (PAGE_SIZE). But if it is
1952          * different, we will skip discarding the parts of logical blocks that
1953          * fall within the portion of the request range which isn't aligned to
1954          * the physical block size. So we can't ensure that all discarded
1955          * logical blocks are zeroed.
1956          */
1957         if (ZRAM_LOGICAL_BLOCK_SIZE == PAGE_SIZE)
1958                 blk_queue_max_write_zeroes_sectors(zram->disk->queue, UINT_MAX);
1959
1960         blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, zram->disk->queue);
1961         ret = device_add_disk(NULL, zram->disk, zram_disk_groups);
1962         if (ret)
1963                 goto out_cleanup_disk;
1964
1965         strscpy(zram->compressor, default_compressor, sizeof(zram->compressor));
1966
1967         zram_debugfs_register(zram);
1968         pr_info("Added device: %s\n", zram->disk->disk_name);
1969         return device_id;
1970
1971 out_cleanup_disk:
1972         put_disk(zram->disk);
1973 out_free_idr:
1974         idr_remove(&zram_index_idr, device_id);
1975 out_free_dev:
1976         kfree(zram);
1977         return ret;
1978 }
1979
1980 static int zram_remove(struct zram *zram)
1981 {
1982         bool claimed;
1983
1984         mutex_lock(&zram->disk->open_mutex);
1985         if (disk_openers(zram->disk)) {
1986                 mutex_unlock(&zram->disk->open_mutex);
1987                 return -EBUSY;
1988         }
1989
1990         claimed = zram->claim;
1991         if (!claimed)
1992                 zram->claim = true;
1993         mutex_unlock(&zram->disk->open_mutex);
1994
1995         zram_debugfs_unregister(zram);
1996
1997         if (claimed) {
1998                 /*
1999                  * If we were claimed by reset_store(), del_gendisk() will
2000                  * wait until reset_store() is done, so there is nothing to do.
2001                  */
2002                 ;
2003         } else {
2004                 /* Make sure all the pending I/O are finished */
2005                 sync_blockdev(zram->disk->part0);
2006                 zram_reset_device(zram);
2007         }
2008
2009         pr_info("Removed device: %s\n", zram->disk->disk_name);
2010
2011         del_gendisk(zram->disk);
2012
2013         /* del_gendisk drains pending reset_store */
2014         WARN_ON_ONCE(claimed && zram->claim);
2015
2016         /*
2017          * disksize_store() may be called in between zram_reset_device()
2018          * and del_gendisk(), so run the last reset to avoid leaking
2019          * anything allocated with disksize_store()
2020          */
2021         zram_reset_device(zram);
2022
2023         put_disk(zram->disk);
2024         kfree(zram);
2025         return 0;
2026 }
2027
2028 /* zram-control sysfs attributes */
2029
2030 /*
2031  * NOTE: the hot_add attribute is not the usual read-only sysfs attribute, in
2032  * the sense that reading from this file does alter the state of your system:
2033  * it creates a new uninitialized zram device and returns that device's
2034  * device_id (or an error code if it fails to create a new device).
2035  */
2036 static ssize_t hot_add_show(struct class *class,
2037                         struct class_attribute *attr,
2038                         char *buf)
2039 {
2040         int ret;
2041
2042         mutex_lock(&zram_index_mutex);
2043         ret = zram_add();
2044         mutex_unlock(&zram_index_mutex);
2045
2046         if (ret < 0)
2047                 return ret;
2048         return scnprintf(buf, PAGE_SIZE, "%d\n", ret);
2049 }
2050 static struct class_attribute class_attr_hot_add =
2051         __ATTR(hot_add, 0400, hot_add_show, NULL);
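/*
 * Hedged userspace sketch (illustrative only, not part of the driver): reading
 * hot_add yields the new device_id, which can then be sized via sysfs. The
 * paths below follow the class and attribute names defined in this file.
 *
 *	int fd = open("/sys/class/zram-control/hot_add", O_RDONLY);
 *	char id[16] = "";
 *	read(fd, id, sizeof(id) - 1);	// e.g. "1\n" -> /dev/zram1
 *	close(fd);
 *	// configure the new device, e.g. write "256M" to
 *	// /sys/block/zram1/disksize before first use
 */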
2052
2053 static ssize_t hot_remove_store(struct class *class,
2054                         struct class_attribute *attr,
2055                         const char *buf,
2056                         size_t count)
2057 {
2058         struct zram *zram;
2059         int ret, dev_id;
2060
2061         /* dev_id is gendisk->first_minor, which is `int' */
2062         ret = kstrtoint(buf, 10, &dev_id);
2063         if (ret)
2064                 return ret;
2065         if (dev_id < 0)
2066                 return -EINVAL;
2067
2068         mutex_lock(&zram_index_mutex);
2069
2070         zram = idr_find(&zram_index_idr, dev_id);
2071         if (zram) {
2072                 ret = zram_remove(zram);
2073                 if (!ret)
2074                         idr_remove(&zram_index_idr, dev_id);
2075         } else {
2076                 ret = -ENODEV;
2077         }
2078
2079         mutex_unlock(&zram_index_mutex);
2080         return ret ? ret : count;
2081 }
2082 static CLASS_ATTR_WO(hot_remove);
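/*
 * Hedged usage note (illustrative): writing a device_id removes that device,
 * e.g. "echo 1 > /sys/class/zram-control/hot_remove"; the request fails with
 * -EBUSY if the device is still open and with -ENODEV if the id is unknown.
 */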
2083
2084 static struct attribute *zram_control_class_attrs[] = {
2085         &class_attr_hot_add.attr,
2086         &class_attr_hot_remove.attr,
2087         NULL,
2088 };
2089 ATTRIBUTE_GROUPS(zram_control_class);
2090
2091 static struct class zram_control_class = {
2092         .name           = "zram-control",
2093         .owner          = THIS_MODULE,
2094         .class_groups   = zram_control_class_groups,
2095 };
2096
2097 static int zram_remove_cb(int id, void *ptr, void *data)
2098 {
2099         WARN_ON_ONCE(zram_remove(ptr));
2100         return 0;
2101 }
2102
2103 static void destroy_devices(void)
2104 {
2105         class_unregister(&zram_control_class);
2106         idr_for_each(&zram_index_idr, &zram_remove_cb, NULL);
2107         zram_debugfs_destroy();
2108         idr_destroy(&zram_index_idr);
2109         unregister_blkdev(zram_major, "zram");
2110         cpuhp_remove_multi_state(CPUHP_ZCOMP_PREPARE);
2111 }
2112
2113 static int __init zram_init(void)
2114 {
2115         int ret;
2116
2117         BUILD_BUG_ON(__NR_ZRAM_PAGEFLAGS > BITS_PER_LONG);
2118
2119         ret = cpuhp_setup_state_multi(CPUHP_ZCOMP_PREPARE, "block/zram:prepare",
2120                                       zcomp_cpu_up_prepare, zcomp_cpu_dead);
2121         if (ret < 0)
2122                 return ret;
2123
2124         ret = class_register(&zram_control_class);
2125         if (ret) {
2126                 pr_err("Unable to register zram-control class\n");
2127                 cpuhp_remove_multi_state(CPUHP_ZCOMP_PREPARE);
2128                 return ret;
2129         }
2130
2131         zram_debugfs_create();
2132         zram_major = register_blkdev(0, "zram");
2133         if (zram_major <= 0) {
2134                 pr_err("Unable to get major number\n");
2135                 class_unregister(&zram_control_class);
2136                 cpuhp_remove_multi_state(CPUHP_ZCOMP_PREPARE);
2137                 return -EBUSY;
2138         }
2139
2140         while (num_devices != 0) {
2141                 mutex_lock(&zram_index_mutex);
2142                 ret = zram_add();
2143                 mutex_unlock(&zram_index_mutex);
2144                 if (ret < 0)
2145                         goto out_error;
2146                 num_devices--;
2147         }
2148
2149         return 0;
2150
2151 out_error:
2152         destroy_devices();
2153         return ret;
2154 }
2155
2156 static void __exit zram_exit(void)
2157 {
2158         destroy_devices();
2159 }
2160
2161 module_init(zram_init);
2162 module_exit(zram_exit);
2163
2164 module_param(num_devices, uint, 0);
2165 MODULE_PARM_DESC(num_devices, "Number of pre-created zram devices");
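/*
 * Hedged usage note (illustrative): the number of pre-created devices can be
 * chosen at load time, e.g. "modprobe zram num_devices=4"; further devices
 * can still be added later through /sys/class/zram-control/hot_add.
 */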
2166
2167 MODULE_LICENSE("Dual BSD/GPL");
2168 MODULE_AUTHOR("Nitin Gupta <ngupta@vflare.org>");
2169 MODULE_DESCRIPTION("Compressed RAM Block Device");