zram: don't return errors from read_from_bdev_async
[linux-block.git] / drivers / block / zram / zram_drv.c
1 /*
2  * Compressed RAM block device
3  *
4  * Copyright (C) 2008, 2009, 2010  Nitin Gupta
5  *               2012, 2013 Minchan Kim
6  *
7  * This code is released using a dual license strategy: BSD/GPL
8  * You can choose the licence that better fits your requirements.
9  *
10  * Released under the terms of 3-clause BSD License
11  * Released under the terms of GNU General Public License Version 2.0
12  *
13  */
14
15 #define KMSG_COMPONENT "zram"
16 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
17
18 #include <linux/module.h>
19 #include <linux/kernel.h>
20 #include <linux/bio.h>
21 #include <linux/bitops.h>
22 #include <linux/blkdev.h>
23 #include <linux/buffer_head.h>
24 #include <linux/device.h>
25 #include <linux/highmem.h>
26 #include <linux/slab.h>
27 #include <linux/backing-dev.h>
28 #include <linux/string.h>
29 #include <linux/vmalloc.h>
30 #include <linux/err.h>
31 #include <linux/idr.h>
32 #include <linux/sysfs.h>
33 #include <linux/debugfs.h>
34 #include <linux/cpuhotplug.h>
35 #include <linux/part_stat.h>
36
37 #include "zram_drv.h"
38
39 static DEFINE_IDR(zram_index_idr);
40 /* idr index must be protected */
41 static DEFINE_MUTEX(zram_index_mutex);
42
43 static int zram_major;
44 static const char *default_compressor = CONFIG_ZRAM_DEF_COMP;
45
46 /* Module params (documentation at end) */
47 static unsigned int num_devices = 1;
48 /*
49  * Pages that compress to sizes equals or greater than this are stored
50  * uncompressed in memory.
51  */
52 static size_t huge_class_size;
53
54 static const struct block_device_operations zram_devops;
55
56 static void zram_free_page(struct zram *zram, size_t index);
57 static int zram_read_page(struct zram *zram, struct page *page, u32 index,
58                           struct bio *bio, bool partial_io);
59
60 static int zram_slot_trylock(struct zram *zram, u32 index)
61 {
62         return bit_spin_trylock(ZRAM_LOCK, &zram->table[index].flags);
63 }
64
65 static void zram_slot_lock(struct zram *zram, u32 index)
66 {
67         bit_spin_lock(ZRAM_LOCK, &zram->table[index].flags);
68 }
69
70 static void zram_slot_unlock(struct zram *zram, u32 index)
71 {
72         bit_spin_unlock(ZRAM_LOCK, &zram->table[index].flags);
73 }
74
75 static inline bool init_done(struct zram *zram)
76 {
77         return zram->disksize;
78 }
79
80 static inline struct zram *dev_to_zram(struct device *dev)
81 {
82         return (struct zram *)dev_to_disk(dev)->private_data;
83 }
84
85 static unsigned long zram_get_handle(struct zram *zram, u32 index)
86 {
87         return zram->table[index].handle;
88 }
89
90 static void zram_set_handle(struct zram *zram, u32 index, unsigned long handle)
91 {
92         zram->table[index].handle = handle;
93 }
94
95 /* flag operations require table entry bit_spin_lock() being held */
96 static bool zram_test_flag(struct zram *zram, u32 index,
97                         enum zram_pageflags flag)
98 {
99         return zram->table[index].flags & BIT(flag);
100 }
101
102 static void zram_set_flag(struct zram *zram, u32 index,
103                         enum zram_pageflags flag)
104 {
105         zram->table[index].flags |= BIT(flag);
106 }
107
108 static void zram_clear_flag(struct zram *zram, u32 index,
109                         enum zram_pageflags flag)
110 {
111         zram->table[index].flags &= ~BIT(flag);
112 }
113
114 static inline void zram_set_element(struct zram *zram, u32 index,
115                         unsigned long element)
116 {
117         zram->table[index].element = element;
118 }
119
120 static unsigned long zram_get_element(struct zram *zram, u32 index)
121 {
122         return zram->table[index].element;
123 }
124
125 static size_t zram_get_obj_size(struct zram *zram, u32 index)
126 {
127         return zram->table[index].flags & (BIT(ZRAM_FLAG_SHIFT) - 1);
128 }
129
130 static void zram_set_obj_size(struct zram *zram,
131                                         u32 index, size_t size)
132 {
133         unsigned long flags = zram->table[index].flags >> ZRAM_FLAG_SHIFT;
134
135         zram->table[index].flags = (flags << ZRAM_FLAG_SHIFT) | size;
136 }
137
138 static inline bool zram_allocated(struct zram *zram, u32 index)
139 {
140         return zram_get_obj_size(zram, index) ||
141                         zram_test_flag(zram, index, ZRAM_SAME) ||
142                         zram_test_flag(zram, index, ZRAM_WB);
143 }
144
145 #if PAGE_SIZE != 4096
146 static inline bool is_partial_io(struct bio_vec *bvec)
147 {
148         return bvec->bv_len != PAGE_SIZE;
149 }
150 #define ZRAM_PARTIAL_IO         1
151 #else
152 static inline bool is_partial_io(struct bio_vec *bvec)
153 {
154         return false;
155 }
156 #endif
157
158 static inline void zram_set_priority(struct zram *zram, u32 index, u32 prio)
159 {
160         prio &= ZRAM_COMP_PRIORITY_MASK;
161         /*
162          * Clear previous priority value first, in case if we recompress
163          * further an already recompressed page
164          */
165         zram->table[index].flags &= ~(ZRAM_COMP_PRIORITY_MASK <<
166                                       ZRAM_COMP_PRIORITY_BIT1);
167         zram->table[index].flags |= (prio << ZRAM_COMP_PRIORITY_BIT1);
168 }
169
170 static inline u32 zram_get_priority(struct zram *zram, u32 index)
171 {
172         u32 prio = zram->table[index].flags >> ZRAM_COMP_PRIORITY_BIT1;
173
174         return prio & ZRAM_COMP_PRIORITY_MASK;
175 }
176
177 static inline void update_used_max(struct zram *zram,
178                                         const unsigned long pages)
179 {
180         unsigned long cur_max = atomic_long_read(&zram->stats.max_used_pages);
181
182         do {
183                 if (cur_max >= pages)
184                         return;
185         } while (!atomic_long_try_cmpxchg(&zram->stats.max_used_pages,
186                                           &cur_max, pages));
187 }
188
189 static inline void zram_fill_page(void *ptr, unsigned long len,
190                                         unsigned long value)
191 {
192         WARN_ON_ONCE(!IS_ALIGNED(len, sizeof(unsigned long)));
193         memset_l(ptr, value, len / sizeof(unsigned long));
194 }
195
196 static bool page_same_filled(void *ptr, unsigned long *element)
197 {
198         unsigned long *page;
199         unsigned long val;
200         unsigned int pos, last_pos = PAGE_SIZE / sizeof(*page) - 1;
201
202         page = (unsigned long *)ptr;
203         val = page[0];
204
205         if (val != page[last_pos])
206                 return false;
207
208         for (pos = 1; pos < last_pos; pos++) {
209                 if (val != page[pos])
210                         return false;
211         }
212
213         *element = val;
214
215         return true;
216 }
217
218 static ssize_t initstate_show(struct device *dev,
219                 struct device_attribute *attr, char *buf)
220 {
221         u32 val;
222         struct zram *zram = dev_to_zram(dev);
223
224         down_read(&zram->init_lock);
225         val = init_done(zram);
226         up_read(&zram->init_lock);
227
228         return scnprintf(buf, PAGE_SIZE, "%u\n", val);
229 }
230
231 static ssize_t disksize_show(struct device *dev,
232                 struct device_attribute *attr, char *buf)
233 {
234         struct zram *zram = dev_to_zram(dev);
235
236         return scnprintf(buf, PAGE_SIZE, "%llu\n", zram->disksize);
237 }
238
239 static ssize_t mem_limit_store(struct device *dev,
240                 struct device_attribute *attr, const char *buf, size_t len)
241 {
242         u64 limit;
243         char *tmp;
244         struct zram *zram = dev_to_zram(dev);
245
246         limit = memparse(buf, &tmp);
247         if (buf == tmp) /* no chars parsed, invalid input */
248                 return -EINVAL;
249
250         down_write(&zram->init_lock);
251         zram->limit_pages = PAGE_ALIGN(limit) >> PAGE_SHIFT;
252         up_write(&zram->init_lock);
253
254         return len;
255 }
256
257 static ssize_t mem_used_max_store(struct device *dev,
258                 struct device_attribute *attr, const char *buf, size_t len)
259 {
260         int err;
261         unsigned long val;
262         struct zram *zram = dev_to_zram(dev);
263
264         err = kstrtoul(buf, 10, &val);
265         if (err || val != 0)
266                 return -EINVAL;
267
268         down_read(&zram->init_lock);
269         if (init_done(zram)) {
270                 atomic_long_set(&zram->stats.max_used_pages,
271                                 zs_get_total_pages(zram->mem_pool));
272         }
273         up_read(&zram->init_lock);
274
275         return len;
276 }
277
278 /*
279  * Mark all pages which are older than or equal to cutoff as IDLE.
280  * Callers should hold the zram init lock in read mode
281  */
282 static void mark_idle(struct zram *zram, ktime_t cutoff)
283 {
284         int is_idle = 1;
285         unsigned long nr_pages = zram->disksize >> PAGE_SHIFT;
286         int index;
287
288         for (index = 0; index < nr_pages; index++) {
289                 /*
290                  * Do not mark ZRAM_UNDER_WB slot as ZRAM_IDLE to close race.
291                  * See the comment in writeback_store.
292                  */
293                 zram_slot_lock(zram, index);
294                 if (zram_allocated(zram, index) &&
295                                 !zram_test_flag(zram, index, ZRAM_UNDER_WB)) {
296 #ifdef CONFIG_ZRAM_MEMORY_TRACKING
297                         is_idle = !cutoff || ktime_after(cutoff, zram->table[index].ac_time);
298 #endif
299                         if (is_idle)
300                                 zram_set_flag(zram, index, ZRAM_IDLE);
301                 }
302                 zram_slot_unlock(zram, index);
303         }
304 }
305
306 static ssize_t idle_store(struct device *dev,
307                 struct device_attribute *attr, const char *buf, size_t len)
308 {
309         struct zram *zram = dev_to_zram(dev);
310         ktime_t cutoff_time = 0;
311         ssize_t rv = -EINVAL;
312
313         if (!sysfs_streq(buf, "all")) {
314                 /*
315                  * If it did not parse as 'all' try to treat it as an integer
316                  * when we have memory tracking enabled.
317                  */
318                 u64 age_sec;
319
320                 if (IS_ENABLED(CONFIG_ZRAM_MEMORY_TRACKING) && !kstrtoull(buf, 0, &age_sec))
321                         cutoff_time = ktime_sub(ktime_get_boottime(),
322                                         ns_to_ktime(age_sec * NSEC_PER_SEC));
323                 else
324                         goto out;
325         }
326
327         down_read(&zram->init_lock);
328         if (!init_done(zram))
329                 goto out_unlock;
330
331         /*
332          * A cutoff_time of 0 marks everything as idle, this is the
333          * "all" behavior.
334          */
335         mark_idle(zram, cutoff_time);
336         rv = len;
337
338 out_unlock:
339         up_read(&zram->init_lock);
340 out:
341         return rv;
342 }
343
344 #ifdef CONFIG_ZRAM_WRITEBACK
345 static ssize_t writeback_limit_enable_store(struct device *dev,
346                 struct device_attribute *attr, const char *buf, size_t len)
347 {
348         struct zram *zram = dev_to_zram(dev);
349         u64 val;
350         ssize_t ret = -EINVAL;
351
352         if (kstrtoull(buf, 10, &val))
353                 return ret;
354
355         down_read(&zram->init_lock);
356         spin_lock(&zram->wb_limit_lock);
357         zram->wb_limit_enable = val;
358         spin_unlock(&zram->wb_limit_lock);
359         up_read(&zram->init_lock);
360         ret = len;
361
362         return ret;
363 }
364
365 static ssize_t writeback_limit_enable_show(struct device *dev,
366                 struct device_attribute *attr, char *buf)
367 {
368         bool val;
369         struct zram *zram = dev_to_zram(dev);
370
371         down_read(&zram->init_lock);
372         spin_lock(&zram->wb_limit_lock);
373         val = zram->wb_limit_enable;
374         spin_unlock(&zram->wb_limit_lock);
375         up_read(&zram->init_lock);
376
377         return scnprintf(buf, PAGE_SIZE, "%d\n", val);
378 }
379
380 static ssize_t writeback_limit_store(struct device *dev,
381                 struct device_attribute *attr, const char *buf, size_t len)
382 {
383         struct zram *zram = dev_to_zram(dev);
384         u64 val;
385         ssize_t ret = -EINVAL;
386
387         if (kstrtoull(buf, 10, &val))
388                 return ret;
389
390         down_read(&zram->init_lock);
391         spin_lock(&zram->wb_limit_lock);
392         zram->bd_wb_limit = val;
393         spin_unlock(&zram->wb_limit_lock);
394         up_read(&zram->init_lock);
395         ret = len;
396
397         return ret;
398 }
399
400 static ssize_t writeback_limit_show(struct device *dev,
401                 struct device_attribute *attr, char *buf)
402 {
403         u64 val;
404         struct zram *zram = dev_to_zram(dev);
405
406         down_read(&zram->init_lock);
407         spin_lock(&zram->wb_limit_lock);
408         val = zram->bd_wb_limit;
409         spin_unlock(&zram->wb_limit_lock);
410         up_read(&zram->init_lock);
411
412         return scnprintf(buf, PAGE_SIZE, "%llu\n", val);
413 }
414
415 static void reset_bdev(struct zram *zram)
416 {
417         struct block_device *bdev;
418
419         if (!zram->backing_dev)
420                 return;
421
422         bdev = zram->bdev;
423         blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
424         /* hope filp_close flush all of IO */
425         filp_close(zram->backing_dev, NULL);
426         zram->backing_dev = NULL;
427         zram->bdev = NULL;
428         zram->disk->fops = &zram_devops;
429         kvfree(zram->bitmap);
430         zram->bitmap = NULL;
431 }
432
433 static ssize_t backing_dev_show(struct device *dev,
434                 struct device_attribute *attr, char *buf)
435 {
436         struct file *file;
437         struct zram *zram = dev_to_zram(dev);
438         char *p;
439         ssize_t ret;
440
441         down_read(&zram->init_lock);
442         file = zram->backing_dev;
443         if (!file) {
444                 memcpy(buf, "none\n", 5);
445                 up_read(&zram->init_lock);
446                 return 5;
447         }
448
449         p = file_path(file, buf, PAGE_SIZE - 1);
450         if (IS_ERR(p)) {
451                 ret = PTR_ERR(p);
452                 goto out;
453         }
454
455         ret = strlen(p);
456         memmove(buf, p, ret);
457         buf[ret++] = '\n';
458 out:
459         up_read(&zram->init_lock);
460         return ret;
461 }
462
463 static ssize_t backing_dev_store(struct device *dev,
464                 struct device_attribute *attr, const char *buf, size_t len)
465 {
466         char *file_name;
467         size_t sz;
468         struct file *backing_dev = NULL;
469         struct inode *inode;
470         struct address_space *mapping;
471         unsigned int bitmap_sz;
472         unsigned long nr_pages, *bitmap = NULL;
473         struct block_device *bdev = NULL;
474         int err;
475         struct zram *zram = dev_to_zram(dev);
476
477         file_name = kmalloc(PATH_MAX, GFP_KERNEL);
478         if (!file_name)
479                 return -ENOMEM;
480
481         down_write(&zram->init_lock);
482         if (init_done(zram)) {
483                 pr_info("Can't setup backing device for initialized device\n");
484                 err = -EBUSY;
485                 goto out;
486         }
487
488         strscpy(file_name, buf, PATH_MAX);
489         /* ignore trailing newline */
490         sz = strlen(file_name);
491         if (sz > 0 && file_name[sz - 1] == '\n')
492                 file_name[sz - 1] = 0x00;
493
494         backing_dev = filp_open(file_name, O_RDWR|O_LARGEFILE, 0);
495         if (IS_ERR(backing_dev)) {
496                 err = PTR_ERR(backing_dev);
497                 backing_dev = NULL;
498                 goto out;
499         }
500
501         mapping = backing_dev->f_mapping;
502         inode = mapping->host;
503
504         /* Support only block device in this moment */
505         if (!S_ISBLK(inode->i_mode)) {
506                 err = -ENOTBLK;
507                 goto out;
508         }
509
510         bdev = blkdev_get_by_dev(inode->i_rdev,
511                         FMODE_READ | FMODE_WRITE | FMODE_EXCL, zram);
512         if (IS_ERR(bdev)) {
513                 err = PTR_ERR(bdev);
514                 bdev = NULL;
515                 goto out;
516         }
517
518         nr_pages = i_size_read(inode) >> PAGE_SHIFT;
519         bitmap_sz = BITS_TO_LONGS(nr_pages) * sizeof(long);
520         bitmap = kvzalloc(bitmap_sz, GFP_KERNEL);
521         if (!bitmap) {
522                 err = -ENOMEM;
523                 goto out;
524         }
525
526         reset_bdev(zram);
527
528         zram->bdev = bdev;
529         zram->backing_dev = backing_dev;
530         zram->bitmap = bitmap;
531         zram->nr_pages = nr_pages;
532         up_write(&zram->init_lock);
533
534         pr_info("setup backing device %s\n", file_name);
535         kfree(file_name);
536
537         return len;
538 out:
539         kvfree(bitmap);
540
541         if (bdev)
542                 blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
543
544         if (backing_dev)
545                 filp_close(backing_dev, NULL);
546
547         up_write(&zram->init_lock);
548
549         kfree(file_name);
550
551         return err;
552 }
553
554 static unsigned long alloc_block_bdev(struct zram *zram)
555 {
556         unsigned long blk_idx = 1;
557 retry:
558         /* skip 0 bit to confuse zram.handle = 0 */
559         blk_idx = find_next_zero_bit(zram->bitmap, zram->nr_pages, blk_idx);
560         if (blk_idx == zram->nr_pages)
561                 return 0;
562
563         if (test_and_set_bit(blk_idx, zram->bitmap))
564                 goto retry;
565
566         atomic64_inc(&zram->stats.bd_count);
567         return blk_idx;
568 }
569
570 static void free_block_bdev(struct zram *zram, unsigned long blk_idx)
571 {
572         int was_set;
573
574         was_set = test_and_clear_bit(blk_idx, zram->bitmap);
575         WARN_ON_ONCE(!was_set);
576         atomic64_dec(&zram->stats.bd_count);
577 }
578
579 static void zram_page_end_io(struct bio *bio)
580 {
581         struct page *page = bio_first_page_all(bio);
582
583         page_endio(page, op_is_write(bio_op(bio)),
584                         blk_status_to_errno(bio->bi_status));
585         bio_put(bio);
586 }
587
588 static void read_from_bdev_async(struct zram *zram, struct page *page,
589                         unsigned long entry, struct bio *parent)
590 {
591         struct bio *bio;
592
593         bio = bio_alloc(zram->bdev, 1, parent ? parent->bi_opf : REQ_OP_READ,
594                         GFP_NOIO);
595
596         bio->bi_iter.bi_sector = entry * (PAGE_SIZE >> 9);
597         __bio_add_page(bio, page, PAGE_SIZE, 0);
598
599         if (!parent)
600                 bio->bi_end_io = zram_page_end_io;
601         else
602                 bio_chain(bio, parent);
603
604         submit_bio(bio);
605 }
606
607 #define PAGE_WB_SIG "page_index="
608
609 #define PAGE_WRITEBACK                  0
610 #define HUGE_WRITEBACK                  (1<<0)
611 #define IDLE_WRITEBACK                  (1<<1)
612 #define INCOMPRESSIBLE_WRITEBACK        (1<<2)
613
614 static ssize_t writeback_store(struct device *dev,
615                 struct device_attribute *attr, const char *buf, size_t len)
616 {
617         struct zram *zram = dev_to_zram(dev);
618         unsigned long nr_pages = zram->disksize >> PAGE_SHIFT;
619         unsigned long index = 0;
620         struct bio bio;
621         struct bio_vec bio_vec;
622         struct page *page;
623         ssize_t ret = len;
624         int mode, err;
625         unsigned long blk_idx = 0;
626
627         if (sysfs_streq(buf, "idle"))
628                 mode = IDLE_WRITEBACK;
629         else if (sysfs_streq(buf, "huge"))
630                 mode = HUGE_WRITEBACK;
631         else if (sysfs_streq(buf, "huge_idle"))
632                 mode = IDLE_WRITEBACK | HUGE_WRITEBACK;
633         else if (sysfs_streq(buf, "incompressible"))
634                 mode = INCOMPRESSIBLE_WRITEBACK;
635         else {
636                 if (strncmp(buf, PAGE_WB_SIG, sizeof(PAGE_WB_SIG) - 1))
637                         return -EINVAL;
638
639                 if (kstrtol(buf + sizeof(PAGE_WB_SIG) - 1, 10, &index) ||
640                                 index >= nr_pages)
641                         return -EINVAL;
642
643                 nr_pages = 1;
644                 mode = PAGE_WRITEBACK;
645         }
646
647         down_read(&zram->init_lock);
648         if (!init_done(zram)) {
649                 ret = -EINVAL;
650                 goto release_init_lock;
651         }
652
653         if (!zram->backing_dev) {
654                 ret = -ENODEV;
655                 goto release_init_lock;
656         }
657
658         page = alloc_page(GFP_KERNEL);
659         if (!page) {
660                 ret = -ENOMEM;
661                 goto release_init_lock;
662         }
663
664         for (; nr_pages != 0; index++, nr_pages--) {
665                 spin_lock(&zram->wb_limit_lock);
666                 if (zram->wb_limit_enable && !zram->bd_wb_limit) {
667                         spin_unlock(&zram->wb_limit_lock);
668                         ret = -EIO;
669                         break;
670                 }
671                 spin_unlock(&zram->wb_limit_lock);
672
673                 if (!blk_idx) {
674                         blk_idx = alloc_block_bdev(zram);
675                         if (!blk_idx) {
676                                 ret = -ENOSPC;
677                                 break;
678                         }
679                 }
680
681                 zram_slot_lock(zram, index);
682                 if (!zram_allocated(zram, index))
683                         goto next;
684
685                 if (zram_test_flag(zram, index, ZRAM_WB) ||
686                                 zram_test_flag(zram, index, ZRAM_SAME) ||
687                                 zram_test_flag(zram, index, ZRAM_UNDER_WB))
688                         goto next;
689
690                 if (mode & IDLE_WRITEBACK &&
691                     !zram_test_flag(zram, index, ZRAM_IDLE))
692                         goto next;
693                 if (mode & HUGE_WRITEBACK &&
694                     !zram_test_flag(zram, index, ZRAM_HUGE))
695                         goto next;
696                 if (mode & INCOMPRESSIBLE_WRITEBACK &&
697                     !zram_test_flag(zram, index, ZRAM_INCOMPRESSIBLE))
698                         goto next;
699
700                 /*
701                  * Clearing ZRAM_UNDER_WB is duty of caller.
702                  * IOW, zram_free_page never clear it.
703                  */
704                 zram_set_flag(zram, index, ZRAM_UNDER_WB);
705                 /* Need for hugepage writeback racing */
706                 zram_set_flag(zram, index, ZRAM_IDLE);
707                 zram_slot_unlock(zram, index);
708                 if (zram_read_page(zram, page, index, NULL, false)) {
709                         zram_slot_lock(zram, index);
710                         zram_clear_flag(zram, index, ZRAM_UNDER_WB);
711                         zram_clear_flag(zram, index, ZRAM_IDLE);
712                         zram_slot_unlock(zram, index);
713                         continue;
714                 }
715
716                 bio_init(&bio, zram->bdev, &bio_vec, 1,
717                          REQ_OP_WRITE | REQ_SYNC);
718                 bio.bi_iter.bi_sector = blk_idx * (PAGE_SIZE >> 9);
719                 bio_add_page(&bio, page, PAGE_SIZE, 0);
720
721                 /*
722                  * XXX: A single page IO would be inefficient for write
723                  * but it would be not bad as starter.
724                  */
725                 err = submit_bio_wait(&bio);
726                 if (err) {
727                         zram_slot_lock(zram, index);
728                         zram_clear_flag(zram, index, ZRAM_UNDER_WB);
729                         zram_clear_flag(zram, index, ZRAM_IDLE);
730                         zram_slot_unlock(zram, index);
731                         /*
732                          * BIO errors are not fatal, we continue and simply
733                          * attempt to writeback the remaining objects (pages).
734                          * At the same time we need to signal user-space that
735                          * some writes (at least one, but also could be all of
736                          * them) were not successful and we do so by returning
737                          * the most recent BIO error.
738                          */
739                         ret = err;
740                         continue;
741                 }
742
743                 atomic64_inc(&zram->stats.bd_writes);
744                 /*
745                  * We released zram_slot_lock so need to check if the slot was
746                  * changed. If there is freeing for the slot, we can catch it
747                  * easily by zram_allocated.
748                  * A subtle case is the slot is freed/reallocated/marked as
749                  * ZRAM_IDLE again. To close the race, idle_store doesn't
750                  * mark ZRAM_IDLE once it found the slot was ZRAM_UNDER_WB.
751                  * Thus, we could close the race by checking ZRAM_IDLE bit.
752                  */
753                 zram_slot_lock(zram, index);
754                 if (!zram_allocated(zram, index) ||
755                           !zram_test_flag(zram, index, ZRAM_IDLE)) {
756                         zram_clear_flag(zram, index, ZRAM_UNDER_WB);
757                         zram_clear_flag(zram, index, ZRAM_IDLE);
758                         goto next;
759                 }
760
761                 zram_free_page(zram, index);
762                 zram_clear_flag(zram, index, ZRAM_UNDER_WB);
763                 zram_set_flag(zram, index, ZRAM_WB);
764                 zram_set_element(zram, index, blk_idx);
765                 blk_idx = 0;
766                 atomic64_inc(&zram->stats.pages_stored);
767                 spin_lock(&zram->wb_limit_lock);
768                 if (zram->wb_limit_enable && zram->bd_wb_limit > 0)
769                         zram->bd_wb_limit -=  1UL << (PAGE_SHIFT - 12);
770                 spin_unlock(&zram->wb_limit_lock);
771 next:
772                 zram_slot_unlock(zram, index);
773         }
774
775         if (blk_idx)
776                 free_block_bdev(zram, blk_idx);
777         __free_page(page);
778 release_init_lock:
779         up_read(&zram->init_lock);
780
781         return ret;
782 }
783
784 struct zram_work {
785         struct work_struct work;
786         struct zram *zram;
787         unsigned long entry;
788         struct bio *bio;
789         struct page *page;
790 };
791
792 static void zram_sync_read(struct work_struct *work)
793 {
794         struct zram_work *zw = container_of(work, struct zram_work, work);
795         struct zram *zram = zw->zram;
796         unsigned long entry = zw->entry;
797         struct bio *bio = zw->bio;
798
799         read_from_bdev_async(zram, zw->page, entry, bio);
800 }
801
802 /*
803  * Block layer want one ->submit_bio to be active at a time, so if we use
804  * chained IO with parent IO in same context, it's a deadlock. To avoid that,
805  * use a worker thread context.
806  */
807 static int read_from_bdev_sync(struct zram *zram, struct page *page,
808                                 unsigned long entry, struct bio *bio)
809 {
810         struct zram_work work;
811
812         work.page = page;
813         work.zram = zram;
814         work.entry = entry;
815         work.bio = bio;
816
817         INIT_WORK_ONSTACK(&work.work, zram_sync_read);
818         queue_work(system_unbound_wq, &work.work);
819         flush_work(&work.work);
820         destroy_work_on_stack(&work.work);
821
822         return 1;
823 }
824
825 static int read_from_bdev(struct zram *zram, struct page *page,
826                         unsigned long entry, struct bio *parent, bool sync)
827 {
828         atomic64_inc(&zram->stats.bd_reads);
829         if (sync) {
830                 if (WARN_ON_ONCE(!IS_ENABLED(ZRAM_PARTIAL_IO)))
831                         return -EIO;
832                 return read_from_bdev_sync(zram, page, entry, parent);
833         }
834         read_from_bdev_async(zram, page, entry, parent);
835         return 1;
836 }
837 #else
838 static inline void reset_bdev(struct zram *zram) {};
839 static int read_from_bdev(struct zram *zram, struct page *page,
840                         unsigned long entry, struct bio *parent, bool sync)
841 {
842         return -EIO;
843 }
844
845 static void free_block_bdev(struct zram *zram, unsigned long blk_idx) {};
846 #endif
847
848 #ifdef CONFIG_ZRAM_MEMORY_TRACKING
849
850 static struct dentry *zram_debugfs_root;
851
852 static void zram_debugfs_create(void)
853 {
854         zram_debugfs_root = debugfs_create_dir("zram", NULL);
855 }
856
857 static void zram_debugfs_destroy(void)
858 {
859         debugfs_remove_recursive(zram_debugfs_root);
860 }
861
862 static void zram_accessed(struct zram *zram, u32 index)
863 {
864         zram_clear_flag(zram, index, ZRAM_IDLE);
865         zram->table[index].ac_time = ktime_get_boottime();
866 }
867
868 static ssize_t read_block_state(struct file *file, char __user *buf,
869                                 size_t count, loff_t *ppos)
870 {
871         char *kbuf;
872         ssize_t index, written = 0;
873         struct zram *zram = file->private_data;
874         unsigned long nr_pages = zram->disksize >> PAGE_SHIFT;
875         struct timespec64 ts;
876
877         kbuf = kvmalloc(count, GFP_KERNEL);
878         if (!kbuf)
879                 return -ENOMEM;
880
881         down_read(&zram->init_lock);
882         if (!init_done(zram)) {
883                 up_read(&zram->init_lock);
884                 kvfree(kbuf);
885                 return -EINVAL;
886         }
887
888         for (index = *ppos; index < nr_pages; index++) {
889                 int copied;
890
891                 zram_slot_lock(zram, index);
892                 if (!zram_allocated(zram, index))
893                         goto next;
894
895                 ts = ktime_to_timespec64(zram->table[index].ac_time);
896                 copied = snprintf(kbuf + written, count,
897                         "%12zd %12lld.%06lu %c%c%c%c%c%c\n",
898                         index, (s64)ts.tv_sec,
899                         ts.tv_nsec / NSEC_PER_USEC,
900                         zram_test_flag(zram, index, ZRAM_SAME) ? 's' : '.',
901                         zram_test_flag(zram, index, ZRAM_WB) ? 'w' : '.',
902                         zram_test_flag(zram, index, ZRAM_HUGE) ? 'h' : '.',
903                         zram_test_flag(zram, index, ZRAM_IDLE) ? 'i' : '.',
904                         zram_get_priority(zram, index) ? 'r' : '.',
905                         zram_test_flag(zram, index,
906                                        ZRAM_INCOMPRESSIBLE) ? 'n' : '.');
907
908                 if (count <= copied) {
909                         zram_slot_unlock(zram, index);
910                         break;
911                 }
912                 written += copied;
913                 count -= copied;
914 next:
915                 zram_slot_unlock(zram, index);
916                 *ppos += 1;
917         }
918
919         up_read(&zram->init_lock);
920         if (copy_to_user(buf, kbuf, written))
921                 written = -EFAULT;
922         kvfree(kbuf);
923
924         return written;
925 }
926
927 static const struct file_operations proc_zram_block_state_op = {
928         .open = simple_open,
929         .read = read_block_state,
930         .llseek = default_llseek,
931 };
932
933 static void zram_debugfs_register(struct zram *zram)
934 {
935         if (!zram_debugfs_root)
936                 return;
937
938         zram->debugfs_dir = debugfs_create_dir(zram->disk->disk_name,
939                                                 zram_debugfs_root);
940         debugfs_create_file("block_state", 0400, zram->debugfs_dir,
941                                 zram, &proc_zram_block_state_op);
942 }
943
944 static void zram_debugfs_unregister(struct zram *zram)
945 {
946         debugfs_remove_recursive(zram->debugfs_dir);
947 }
948 #else
949 static void zram_debugfs_create(void) {};
950 static void zram_debugfs_destroy(void) {};
951 static void zram_accessed(struct zram *zram, u32 index)
952 {
953         zram_clear_flag(zram, index, ZRAM_IDLE);
954 };
955 static void zram_debugfs_register(struct zram *zram) {};
956 static void zram_debugfs_unregister(struct zram *zram) {};
957 #endif
958
959 /*
960  * We switched to per-cpu streams and this attr is not needed anymore.
961  * However, we will keep it around for some time, because:
962  * a) we may revert per-cpu streams in the future
963  * b) it's visible to user space and we need to follow our 2 years
964  *    retirement rule; but we already have a number of 'soon to be
965  *    altered' attrs, so max_comp_streams need to wait for the next
966  *    layoff cycle.
967  */
968 static ssize_t max_comp_streams_show(struct device *dev,
969                 struct device_attribute *attr, char *buf)
970 {
971         return scnprintf(buf, PAGE_SIZE, "%d\n", num_online_cpus());
972 }
973
974 static ssize_t max_comp_streams_store(struct device *dev,
975                 struct device_attribute *attr, const char *buf, size_t len)
976 {
977         return len;
978 }
979
980 static void comp_algorithm_set(struct zram *zram, u32 prio, const char *alg)
981 {
982         /* Do not free statically defined compression algorithms */
983         if (zram->comp_algs[prio] != default_compressor)
984                 kfree(zram->comp_algs[prio]);
985
986         zram->comp_algs[prio] = alg;
987 }
988
989 static ssize_t __comp_algorithm_show(struct zram *zram, u32 prio, char *buf)
990 {
991         ssize_t sz;
992
993         down_read(&zram->init_lock);
994         sz = zcomp_available_show(zram->comp_algs[prio], buf);
995         up_read(&zram->init_lock);
996
997         return sz;
998 }
999
1000 static int __comp_algorithm_store(struct zram *zram, u32 prio, const char *buf)
1001 {
1002         char *compressor;
1003         size_t sz;
1004
1005         sz = strlen(buf);
1006         if (sz >= CRYPTO_MAX_ALG_NAME)
1007                 return -E2BIG;
1008
1009         compressor = kstrdup(buf, GFP_KERNEL);
1010         if (!compressor)
1011                 return -ENOMEM;
1012
1013         /* ignore trailing newline */
1014         if (sz > 0 && compressor[sz - 1] == '\n')
1015                 compressor[sz - 1] = 0x00;
1016
1017         if (!zcomp_available_algorithm(compressor)) {
1018                 kfree(compressor);
1019                 return -EINVAL;
1020         }
1021
1022         down_write(&zram->init_lock);
1023         if (init_done(zram)) {
1024                 up_write(&zram->init_lock);
1025                 kfree(compressor);
1026                 pr_info("Can't change algorithm for initialized device\n");
1027                 return -EBUSY;
1028         }
1029
1030         comp_algorithm_set(zram, prio, compressor);
1031         up_write(&zram->init_lock);
1032         return 0;
1033 }
1034
1035 static ssize_t comp_algorithm_show(struct device *dev,
1036                                    struct device_attribute *attr,
1037                                    char *buf)
1038 {
1039         struct zram *zram = dev_to_zram(dev);
1040
1041         return __comp_algorithm_show(zram, ZRAM_PRIMARY_COMP, buf);
1042 }
1043
1044 static ssize_t comp_algorithm_store(struct device *dev,
1045                                     struct device_attribute *attr,
1046                                     const char *buf,
1047                                     size_t len)
1048 {
1049         struct zram *zram = dev_to_zram(dev);
1050         int ret;
1051
1052         ret = __comp_algorithm_store(zram, ZRAM_PRIMARY_COMP, buf);
1053         return ret ? ret : len;
1054 }
1055
1056 #ifdef CONFIG_ZRAM_MULTI_COMP
1057 static ssize_t recomp_algorithm_show(struct device *dev,
1058                                      struct device_attribute *attr,
1059                                      char *buf)
1060 {
1061         struct zram *zram = dev_to_zram(dev);
1062         ssize_t sz = 0;
1063         u32 prio;
1064
1065         for (prio = ZRAM_SECONDARY_COMP; prio < ZRAM_MAX_COMPS; prio++) {
1066                 if (!zram->comp_algs[prio])
1067                         continue;
1068
1069                 sz += scnprintf(buf + sz, PAGE_SIZE - sz - 2, "#%d: ", prio);
1070                 sz += __comp_algorithm_show(zram, prio, buf + sz);
1071         }
1072
1073         return sz;
1074 }
1075
1076 static ssize_t recomp_algorithm_store(struct device *dev,
1077                                       struct device_attribute *attr,
1078                                       const char *buf,
1079                                       size_t len)
1080 {
1081         struct zram *zram = dev_to_zram(dev);
1082         int prio = ZRAM_SECONDARY_COMP;
1083         char *args, *param, *val;
1084         char *alg = NULL;
1085         int ret;
1086
1087         args = skip_spaces(buf);
1088         while (*args) {
1089                 args = next_arg(args, &param, &val);
1090
1091                 if (!val || !*val)
1092                         return -EINVAL;
1093
1094                 if (!strcmp(param, "algo")) {
1095                         alg = val;
1096                         continue;
1097                 }
1098
1099                 if (!strcmp(param, "priority")) {
1100                         ret = kstrtoint(val, 10, &prio);
1101                         if (ret)
1102                                 return ret;
1103                         continue;
1104                 }
1105         }
1106
1107         if (!alg)
1108                 return -EINVAL;
1109
1110         if (prio < ZRAM_SECONDARY_COMP || prio >= ZRAM_MAX_COMPS)
1111                 return -EINVAL;
1112
1113         ret = __comp_algorithm_store(zram, prio, alg);
1114         return ret ? ret : len;
1115 }
1116 #endif
1117
1118 static ssize_t compact_store(struct device *dev,
1119                 struct device_attribute *attr, const char *buf, size_t len)
1120 {
1121         struct zram *zram = dev_to_zram(dev);
1122
1123         down_read(&zram->init_lock);
1124         if (!init_done(zram)) {
1125                 up_read(&zram->init_lock);
1126                 return -EINVAL;
1127         }
1128
1129         zs_compact(zram->mem_pool);
1130         up_read(&zram->init_lock);
1131
1132         return len;
1133 }
1134
1135 static ssize_t io_stat_show(struct device *dev,
1136                 struct device_attribute *attr, char *buf)
1137 {
1138         struct zram *zram = dev_to_zram(dev);
1139         ssize_t ret;
1140
1141         down_read(&zram->init_lock);
1142         ret = scnprintf(buf, PAGE_SIZE,
1143                         "%8llu %8llu 0 %8llu\n",
1144                         (u64)atomic64_read(&zram->stats.failed_reads),
1145                         (u64)atomic64_read(&zram->stats.failed_writes),
1146                         (u64)atomic64_read(&zram->stats.notify_free));
1147         up_read(&zram->init_lock);
1148
1149         return ret;
1150 }
1151
1152 static ssize_t mm_stat_show(struct device *dev,
1153                 struct device_attribute *attr, char *buf)
1154 {
1155         struct zram *zram = dev_to_zram(dev);
1156         struct zs_pool_stats pool_stats;
1157         u64 orig_size, mem_used = 0;
1158         long max_used;
1159         ssize_t ret;
1160
1161         memset(&pool_stats, 0x00, sizeof(struct zs_pool_stats));
1162
1163         down_read(&zram->init_lock);
1164         if (init_done(zram)) {
1165                 mem_used = zs_get_total_pages(zram->mem_pool);
1166                 zs_pool_stats(zram->mem_pool, &pool_stats);
1167         }
1168
1169         orig_size = atomic64_read(&zram->stats.pages_stored);
1170         max_used = atomic_long_read(&zram->stats.max_used_pages);
1171
1172         ret = scnprintf(buf, PAGE_SIZE,
1173                         "%8llu %8llu %8llu %8lu %8ld %8llu %8lu %8llu %8llu\n",
1174                         orig_size << PAGE_SHIFT,
1175                         (u64)atomic64_read(&zram->stats.compr_data_size),
1176                         mem_used << PAGE_SHIFT,
1177                         zram->limit_pages << PAGE_SHIFT,
1178                         max_used << PAGE_SHIFT,
1179                         (u64)atomic64_read(&zram->stats.same_pages),
1180                         atomic_long_read(&pool_stats.pages_compacted),
1181                         (u64)atomic64_read(&zram->stats.huge_pages),
1182                         (u64)atomic64_read(&zram->stats.huge_pages_since));
1183         up_read(&zram->init_lock);
1184
1185         return ret;
1186 }
1187
1188 #ifdef CONFIG_ZRAM_WRITEBACK
1189 #define FOUR_K(x) ((x) * (1 << (PAGE_SHIFT - 12)))
1190 static ssize_t bd_stat_show(struct device *dev,
1191                 struct device_attribute *attr, char *buf)
1192 {
1193         struct zram *zram = dev_to_zram(dev);
1194         ssize_t ret;
1195
1196         down_read(&zram->init_lock);
1197         ret = scnprintf(buf, PAGE_SIZE,
1198                 "%8llu %8llu %8llu\n",
1199                         FOUR_K((u64)atomic64_read(&zram->stats.bd_count)),
1200                         FOUR_K((u64)atomic64_read(&zram->stats.bd_reads)),
1201                         FOUR_K((u64)atomic64_read(&zram->stats.bd_writes)));
1202         up_read(&zram->init_lock);
1203
1204         return ret;
1205 }
1206 #endif
1207
1208 static ssize_t debug_stat_show(struct device *dev,
1209                 struct device_attribute *attr, char *buf)
1210 {
1211         int version = 1;
1212         struct zram *zram = dev_to_zram(dev);
1213         ssize_t ret;
1214
1215         down_read(&zram->init_lock);
1216         ret = scnprintf(buf, PAGE_SIZE,
1217                         "version: %d\n%8llu %8llu\n",
1218                         version,
1219                         (u64)atomic64_read(&zram->stats.writestall),
1220                         (u64)atomic64_read(&zram->stats.miss_free));
1221         up_read(&zram->init_lock);
1222
1223         return ret;
1224 }
1225
1226 static DEVICE_ATTR_RO(io_stat);
1227 static DEVICE_ATTR_RO(mm_stat);
1228 #ifdef CONFIG_ZRAM_WRITEBACK
1229 static DEVICE_ATTR_RO(bd_stat);
1230 #endif
1231 static DEVICE_ATTR_RO(debug_stat);
1232
1233 static void zram_meta_free(struct zram *zram, u64 disksize)
1234 {
1235         size_t num_pages = disksize >> PAGE_SHIFT;
1236         size_t index;
1237
1238         /* Free all pages that are still in this zram device */
1239         for (index = 0; index < num_pages; index++)
1240                 zram_free_page(zram, index);
1241
1242         zs_destroy_pool(zram->mem_pool);
1243         vfree(zram->table);
1244 }
1245
1246 static bool zram_meta_alloc(struct zram *zram, u64 disksize)
1247 {
1248         size_t num_pages;
1249
1250         num_pages = disksize >> PAGE_SHIFT;
1251         zram->table = vzalloc(array_size(num_pages, sizeof(*zram->table)));
1252         if (!zram->table)
1253                 return false;
1254
1255         zram->mem_pool = zs_create_pool(zram->disk->disk_name);
1256         if (!zram->mem_pool) {
1257                 vfree(zram->table);
1258                 return false;
1259         }
1260
1261         if (!huge_class_size)
1262                 huge_class_size = zs_huge_class_size(zram->mem_pool);
1263         return true;
1264 }
1265
1266 /*
1267  * To protect concurrent access to the same index entry,
1268  * caller should hold this table index entry's bit_spinlock to
1269  * indicate this index entry is accessing.
1270  */
1271 static void zram_free_page(struct zram *zram, size_t index)
1272 {
1273         unsigned long handle;
1274
1275 #ifdef CONFIG_ZRAM_MEMORY_TRACKING
1276         zram->table[index].ac_time = 0;
1277 #endif
1278         if (zram_test_flag(zram, index, ZRAM_IDLE))
1279                 zram_clear_flag(zram, index, ZRAM_IDLE);
1280
1281         if (zram_test_flag(zram, index, ZRAM_HUGE)) {
1282                 zram_clear_flag(zram, index, ZRAM_HUGE);
1283                 atomic64_dec(&zram->stats.huge_pages);
1284         }
1285
1286         if (zram_test_flag(zram, index, ZRAM_INCOMPRESSIBLE))
1287                 zram_clear_flag(zram, index, ZRAM_INCOMPRESSIBLE);
1288
1289         zram_set_priority(zram, index, 0);
1290
1291         if (zram_test_flag(zram, index, ZRAM_WB)) {
1292                 zram_clear_flag(zram, index, ZRAM_WB);
1293                 free_block_bdev(zram, zram_get_element(zram, index));
1294                 goto out;
1295         }
1296
1297         /*
1298          * No memory is allocated for same element filled pages.
1299          * Simply clear same page flag.
1300          */
1301         if (zram_test_flag(zram, index, ZRAM_SAME)) {
1302                 zram_clear_flag(zram, index, ZRAM_SAME);
1303                 atomic64_dec(&zram->stats.same_pages);
1304                 goto out;
1305         }
1306
1307         handle = zram_get_handle(zram, index);
1308         if (!handle)
1309                 return;
1310
1311         zs_free(zram->mem_pool, handle);
1312
1313         atomic64_sub(zram_get_obj_size(zram, index),
1314                         &zram->stats.compr_data_size);
1315 out:
1316         atomic64_dec(&zram->stats.pages_stored);
1317         zram_set_handle(zram, index, 0);
1318         zram_set_obj_size(zram, index, 0);
1319         WARN_ON_ONCE(zram->table[index].flags &
1320                 ~(1UL << ZRAM_LOCK | 1UL << ZRAM_UNDER_WB));
1321 }
1322
1323 /*
1324  * Reads (decompresses if needed) a page from zspool (zsmalloc).
1325  * Corresponding ZRAM slot should be locked.
1326  */
1327 static int zram_read_from_zspool(struct zram *zram, struct page *page,
1328                                  u32 index)
1329 {
1330         struct zcomp_strm *zstrm;
1331         unsigned long handle;
1332         unsigned int size;
1333         void *src, *dst;
1334         u32 prio;
1335         int ret;
1336
1337         handle = zram_get_handle(zram, index);
1338         if (!handle || zram_test_flag(zram, index, ZRAM_SAME)) {
1339                 unsigned long value;
1340                 void *mem;
1341
1342                 value = handle ? zram_get_element(zram, index) : 0;
1343                 mem = kmap_atomic(page);
1344                 zram_fill_page(mem, PAGE_SIZE, value);
1345                 kunmap_atomic(mem);
1346                 return 0;
1347         }
1348
1349         size = zram_get_obj_size(zram, index);
1350
1351         if (size != PAGE_SIZE) {
1352                 prio = zram_get_priority(zram, index);
1353                 zstrm = zcomp_stream_get(zram->comps[prio]);
1354         }
1355
1356         src = zs_map_object(zram->mem_pool, handle, ZS_MM_RO);
1357         if (size == PAGE_SIZE) {
1358                 dst = kmap_atomic(page);
1359                 memcpy(dst, src, PAGE_SIZE);
1360                 kunmap_atomic(dst);
1361                 ret = 0;
1362         } else {
1363                 dst = kmap_atomic(page);
1364                 ret = zcomp_decompress(zstrm, src, size, dst);
1365                 kunmap_atomic(dst);
1366                 zcomp_stream_put(zram->comps[prio]);
1367         }
1368         zs_unmap_object(zram->mem_pool, handle);
1369         return ret;
1370 }
1371
1372 static int zram_read_page(struct zram *zram, struct page *page, u32 index,
1373                           struct bio *bio, bool partial_io)
1374 {
1375         int ret;
1376
1377         zram_slot_lock(zram, index);
1378         if (!zram_test_flag(zram, index, ZRAM_WB)) {
1379                 /* Slot should be locked through out the function call */
1380                 ret = zram_read_from_zspool(zram, page, index);
1381                 zram_slot_unlock(zram, index);
1382         } else {
1383                 /*
1384                  * The slot should be unlocked before reading from the backing
1385                  * device.
1386                  */
1387                 zram_slot_unlock(zram, index);
1388
1389                 ret = read_from_bdev(zram, page, zram_get_element(zram, index),
1390                                      bio, partial_io);
1391         }
1392
1393         /* Should NEVER happen. Return bio error if it does. */
1394         if (WARN_ON(ret < 0))
1395                 pr_err("Decompression failed! err=%d, page=%u\n", ret, index);
1396
1397         return ret;
1398 }
1399
1400 /*
1401  * Use a temporary buffer to decompress the page, as the decompressor
1402  * always expects a full page for the output.
1403  */
1404 static int zram_bvec_read_partial(struct zram *zram, struct bio_vec *bvec,
1405                                   u32 index, int offset, struct bio *bio)
1406 {
1407         struct page *page = alloc_page(GFP_NOIO);
1408         int ret;
1409
1410         if (!page)
1411                 return -ENOMEM;
1412         ret = zram_read_page(zram, page, index, bio, true);
1413         if (likely(!ret))
1414                 memcpy_to_bvec(bvec, page_address(page) + offset);
1415         __free_page(page);
1416         return ret;
1417 }
1418
1419 static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
1420                           u32 index, int offset, struct bio *bio)
1421 {
1422         if (is_partial_io(bvec))
1423                 return zram_bvec_read_partial(zram, bvec, index, offset, bio);
1424         return zram_read_page(zram, bvec->bv_page, index, bio, false);
1425 }
1426
1427 static int zram_write_page(struct zram *zram, struct page *page, u32 index)
1428 {
1429         int ret = 0;
1430         unsigned long alloced_pages;
1431         unsigned long handle = -ENOMEM;
1432         unsigned int comp_len = 0;
1433         void *src, *dst, *mem;
1434         struct zcomp_strm *zstrm;
1435         unsigned long element = 0;
1436         enum zram_pageflags flags = 0;
1437
1438         mem = kmap_atomic(page);
1439         if (page_same_filled(mem, &element)) {
1440                 kunmap_atomic(mem);
1441                 /* Free memory associated with this sector now. */
1442                 flags = ZRAM_SAME;
1443                 atomic64_inc(&zram->stats.same_pages);
1444                 goto out;
1445         }
1446         kunmap_atomic(mem);
1447
1448 compress_again:
1449         zstrm = zcomp_stream_get(zram->comps[ZRAM_PRIMARY_COMP]);
1450         src = kmap_atomic(page);
1451         ret = zcomp_compress(zstrm, src, &comp_len);
1452         kunmap_atomic(src);
1453
1454         if (unlikely(ret)) {
1455                 zcomp_stream_put(zram->comps[ZRAM_PRIMARY_COMP]);
1456                 pr_err("Compression failed! err=%d\n", ret);
1457                 zs_free(zram->mem_pool, handle);
1458                 return ret;
1459         }
1460
1461         if (comp_len >= huge_class_size)
1462                 comp_len = PAGE_SIZE;
1463         /*
1464          * handle allocation has 2 paths:
1465          * a) fast path is executed with preemption disabled (for
1466          *  per-cpu streams) and has __GFP_DIRECT_RECLAIM bit clear,
1467          *  since we can't sleep;
1468          * b) slow path enables preemption and attempts to allocate
1469          *  the page with __GFP_DIRECT_RECLAIM bit set. we have to
1470          *  put per-cpu compression stream and, thus, to re-do
1471          *  the compression once handle is allocated.
1472          *
1473          * if we have a 'non-null' handle here then we are coming
1474          * from the slow path and handle has already been allocated.
1475          */
1476         if (IS_ERR_VALUE(handle))
1477                 handle = zs_malloc(zram->mem_pool, comp_len,
1478                                 __GFP_KSWAPD_RECLAIM |
1479                                 __GFP_NOWARN |
1480                                 __GFP_HIGHMEM |
1481                                 __GFP_MOVABLE);
1482         if (IS_ERR_VALUE(handle)) {
1483                 zcomp_stream_put(zram->comps[ZRAM_PRIMARY_COMP]);
1484                 atomic64_inc(&zram->stats.writestall);
1485                 handle = zs_malloc(zram->mem_pool, comp_len,
1486                                 GFP_NOIO | __GFP_HIGHMEM |
1487                                 __GFP_MOVABLE);
1488                 if (IS_ERR_VALUE(handle))
1489                         return PTR_ERR((void *)handle);
1490
1491                 if (comp_len != PAGE_SIZE)
1492                         goto compress_again;
1493                 /*
1494                  * If the page is not compressible, you need to acquire the
1495                  * lock and execute the code below. The zcomp_stream_get()
1496                  * call is needed to disable the cpu hotplug and grab the
1497                  * zstrm buffer back. It is necessary that the dereferencing
1498                  * of the zstrm variable below occurs correctly.
1499                  */
1500                 zstrm = zcomp_stream_get(zram->comps[ZRAM_PRIMARY_COMP]);
1501         }
1502
1503         alloced_pages = zs_get_total_pages(zram->mem_pool);
1504         update_used_max(zram, alloced_pages);
1505
1506         if (zram->limit_pages && alloced_pages > zram->limit_pages) {
1507                 zcomp_stream_put(zram->comps[ZRAM_PRIMARY_COMP]);
1508                 zs_free(zram->mem_pool, handle);
1509                 return -ENOMEM;
1510         }
1511
1512         dst = zs_map_object(zram->mem_pool, handle, ZS_MM_WO);
1513
1514         src = zstrm->buffer;
1515         if (comp_len == PAGE_SIZE)
1516                 src = kmap_atomic(page);
1517         memcpy(dst, src, comp_len);
1518         if (comp_len == PAGE_SIZE)
1519                 kunmap_atomic(src);
1520
1521         zcomp_stream_put(zram->comps[ZRAM_PRIMARY_COMP]);
1522         zs_unmap_object(zram->mem_pool, handle);
1523         atomic64_add(comp_len, &zram->stats.compr_data_size);
1524 out:
1525         /*
1526          * Free memory associated with this sector
1527          * before overwriting unused sectors.
1528          */
1529         zram_slot_lock(zram, index);
1530         zram_free_page(zram, index);
1531
1532         if (comp_len == PAGE_SIZE) {
1533                 zram_set_flag(zram, index, ZRAM_HUGE);
1534                 atomic64_inc(&zram->stats.huge_pages);
1535                 atomic64_inc(&zram->stats.huge_pages_since);
1536         }
1537
1538         if (flags) {
1539                 zram_set_flag(zram, index, flags);
1540                 zram_set_element(zram, index, element);
1541         }  else {
1542                 zram_set_handle(zram, index, handle);
1543                 zram_set_obj_size(zram, index, comp_len);
1544         }
1545         zram_slot_unlock(zram, index);
1546
1547         /* Update stats */
1548         atomic64_inc(&zram->stats.pages_stored);
1549         return ret;
1550 }
1551
1552 /*
1553  * This is a partial IO. Read the full page before writing the changes.
1554  */
1555 static int zram_bvec_write_partial(struct zram *zram, struct bio_vec *bvec,
1556                                    u32 index, int offset, struct bio *bio)
1557 {
1558         struct page *page = alloc_page(GFP_NOIO);
1559         int ret;
1560
1561         if (!page)
1562                 return -ENOMEM;
1563
1564         ret = zram_read_page(zram, page, index, bio, true);
1565         if (!ret) {
1566                 memcpy_from_bvec(page_address(page) + offset, bvec);
1567                 ret = zram_write_page(zram, page, index);
1568         }
1569         __free_page(page);
1570         return ret;
1571 }
1572
1573 static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec,
1574                            u32 index, int offset, struct bio *bio)
1575 {
1576         if (is_partial_io(bvec))
1577                 return zram_bvec_write_partial(zram, bvec, index, offset, bio);
1578         return zram_write_page(zram, bvec->bv_page, index);
1579 }
1580
1581 #ifdef CONFIG_ZRAM_MULTI_COMP
1582 /*
1583  * This function will decompress (unless it's ZRAM_HUGE) the page and then
1584  * attempt to compress it using provided compression algorithm priority
1585  * (which is potentially more effective).
1586  *
1587  * Corresponding ZRAM slot should be locked.
1588  */
1589 static int zram_recompress(struct zram *zram, u32 index, struct page *page,
1590                            u32 threshold, u32 prio, u32 prio_max)
1591 {
1592         struct zcomp_strm *zstrm = NULL;
1593         unsigned long handle_old;
1594         unsigned long handle_new;
1595         unsigned int comp_len_old;
1596         unsigned int comp_len_new;
1597         unsigned int class_index_old;
1598         unsigned int class_index_new;
1599         u32 num_recomps = 0;
1600         void *src, *dst;
1601         int ret;
1602
1603         handle_old = zram_get_handle(zram, index);
1604         if (!handle_old)
1605                 return -EINVAL;
1606
1607         comp_len_old = zram_get_obj_size(zram, index);
1608         /*
1609          * Do not recompress objects that are already "small enough".
1610          */
1611         if (comp_len_old < threshold)
1612                 return 0;
1613
1614         ret = zram_read_from_zspool(zram, page, index);
1615         if (ret)
1616                 return ret;
1617
1618         class_index_old = zs_lookup_class_index(zram->mem_pool, comp_len_old);
1619         /*
1620          * Iterate the secondary comp algorithms list (in order of priority)
1621          * and try to recompress the page.
1622          */
1623         for (; prio < prio_max; prio++) {
1624                 if (!zram->comps[prio])
1625                         continue;
1626
1627                 /*
1628                  * Skip if the object is already re-compressed with a higher
1629                  * priority algorithm (or same algorithm).
1630                  */
1631                 if (prio <= zram_get_priority(zram, index))
1632                         continue;
1633
1634                 num_recomps++;
1635                 zstrm = zcomp_stream_get(zram->comps[prio]);
1636                 src = kmap_atomic(page);
1637                 ret = zcomp_compress(zstrm, src, &comp_len_new);
1638                 kunmap_atomic(src);
1639
1640                 if (ret) {
1641                         zcomp_stream_put(zram->comps[prio]);
1642                         return ret;
1643                 }
1644
1645                 class_index_new = zs_lookup_class_index(zram->mem_pool,
1646                                                         comp_len_new);
1647
1648                 /* Continue until we make progress */
1649                 if (class_index_new >= class_index_old ||
1650                     (threshold && comp_len_new >= threshold)) {
1651                         zcomp_stream_put(zram->comps[prio]);
1652                         continue;
1653                 }
1654
1655                 /* Recompression was successful so break out */
1656                 break;
1657         }
1658
1659         /*
1660          * We did not try to recompress, e.g. when we have only one
1661          * secondary algorithm and the page is already recompressed
1662          * using that algorithm
1663          */
1664         if (!zstrm)
1665                 return 0;
1666
1667         if (class_index_new >= class_index_old) {
1668                 /*
1669                  * Secondary algorithms failed to re-compress the page
1670                  * in a way that would save memory, mark the object as
1671                  * incompressible so that we will not try to compress
1672                  * it again.
1673                  *
1674                  * We need to make sure that all secondary algorithms have
1675                  * failed, so we test if the number of recompressions matches
1676                  * the number of active secondary algorithms.
1677                  */
1678                 if (num_recomps == zram->num_active_comps - 1)
1679                         zram_set_flag(zram, index, ZRAM_INCOMPRESSIBLE);
1680                 return 0;
1681         }
1682
1683         /* Successful recompression but above threshold */
1684         if (threshold && comp_len_new >= threshold)
1685                 return 0;
1686
1687         /*
1688          * No direct reclaim (slow path) for handle allocation and no
1689          * re-compression attempt (unlike in zram_write_bvec()) since
1690          * we already have stored that object in zsmalloc. If we cannot
1691          * alloc memory for recompressed object then we bail out and
1692          * simply keep the old (existing) object in zsmalloc.
1693          */
1694         handle_new = zs_malloc(zram->mem_pool, comp_len_new,
1695                                __GFP_KSWAPD_RECLAIM |
1696                                __GFP_NOWARN |
1697                                __GFP_HIGHMEM |
1698                                __GFP_MOVABLE);
1699         if (IS_ERR_VALUE(handle_new)) {
1700                 zcomp_stream_put(zram->comps[prio]);
1701                 return PTR_ERR((void *)handle_new);
1702         }
1703
1704         dst = zs_map_object(zram->mem_pool, handle_new, ZS_MM_WO);
1705         memcpy(dst, zstrm->buffer, comp_len_new);
1706         zcomp_stream_put(zram->comps[prio]);
1707
1708         zs_unmap_object(zram->mem_pool, handle_new);
1709
1710         zram_free_page(zram, index);
1711         zram_set_handle(zram, index, handle_new);
1712         zram_set_obj_size(zram, index, comp_len_new);
1713         zram_set_priority(zram, index, prio);
1714
1715         atomic64_add(comp_len_new, &zram->stats.compr_data_size);
1716         atomic64_inc(&zram->stats.pages_stored);
1717
1718         return 0;
1719 }
1720
1721 #define RECOMPRESS_IDLE         (1 << 0)
1722 #define RECOMPRESS_HUGE         (1 << 1)
1723
1724 static ssize_t recompress_store(struct device *dev,
1725                                 struct device_attribute *attr,
1726                                 const char *buf, size_t len)
1727 {
1728         u32 prio = ZRAM_SECONDARY_COMP, prio_max = ZRAM_MAX_COMPS;
1729         struct zram *zram = dev_to_zram(dev);
1730         unsigned long nr_pages = zram->disksize >> PAGE_SHIFT;
1731         char *args, *param, *val, *algo = NULL;
1732         u32 mode = 0, threshold = 0;
1733         unsigned long index;
1734         struct page *page;
1735         ssize_t ret;
1736
1737         args = skip_spaces(buf);
1738         while (*args) {
1739                 args = next_arg(args, &param, &val);
1740
1741                 if (!val || !*val)
1742                         return -EINVAL;
1743
1744                 if (!strcmp(param, "type")) {
1745                         if (!strcmp(val, "idle"))
1746                                 mode = RECOMPRESS_IDLE;
1747                         if (!strcmp(val, "huge"))
1748                                 mode = RECOMPRESS_HUGE;
1749                         if (!strcmp(val, "huge_idle"))
1750                                 mode = RECOMPRESS_IDLE | RECOMPRESS_HUGE;
1751                         continue;
1752                 }
1753
1754                 if (!strcmp(param, "threshold")) {
1755                         /*
1756                          * We will re-compress only idle objects equal or
1757                          * greater in size than watermark.
1758                          */
1759                         ret = kstrtouint(val, 10, &threshold);
1760                         if (ret)
1761                                 return ret;
1762                         continue;
1763                 }
1764
1765                 if (!strcmp(param, "algo")) {
1766                         algo = val;
1767                         continue;
1768                 }
1769         }
1770
1771         if (threshold >= PAGE_SIZE)
1772                 return -EINVAL;
1773
1774         down_read(&zram->init_lock);
1775         if (!init_done(zram)) {
1776                 ret = -EINVAL;
1777                 goto release_init_lock;
1778         }
1779
1780         if (algo) {
1781                 bool found = false;
1782
1783                 for (; prio < ZRAM_MAX_COMPS; prio++) {
1784                         if (!zram->comp_algs[prio])
1785                                 continue;
1786
1787                         if (!strcmp(zram->comp_algs[prio], algo)) {
1788                                 prio_max = min(prio + 1, ZRAM_MAX_COMPS);
1789                                 found = true;
1790                                 break;
1791                         }
1792                 }
1793
1794                 if (!found) {
1795                         ret = -EINVAL;
1796                         goto release_init_lock;
1797                 }
1798         }
1799
1800         page = alloc_page(GFP_KERNEL);
1801         if (!page) {
1802                 ret = -ENOMEM;
1803                 goto release_init_lock;
1804         }
1805
1806         ret = len;
1807         for (index = 0; index < nr_pages; index++) {
1808                 int err = 0;
1809
1810                 zram_slot_lock(zram, index);
1811
1812                 if (!zram_allocated(zram, index))
1813                         goto next;
1814
1815                 if (mode & RECOMPRESS_IDLE &&
1816                     !zram_test_flag(zram, index, ZRAM_IDLE))
1817                         goto next;
1818
1819                 if (mode & RECOMPRESS_HUGE &&
1820                     !zram_test_flag(zram, index, ZRAM_HUGE))
1821                         goto next;
1822
1823                 if (zram_test_flag(zram, index, ZRAM_WB) ||
1824                     zram_test_flag(zram, index, ZRAM_UNDER_WB) ||
1825                     zram_test_flag(zram, index, ZRAM_SAME) ||
1826                     zram_test_flag(zram, index, ZRAM_INCOMPRESSIBLE))
1827                         goto next;
1828
1829                 err = zram_recompress(zram, index, page, threshold,
1830                                       prio, prio_max);
1831 next:
1832                 zram_slot_unlock(zram, index);
1833                 if (err) {
1834                         ret = err;
1835                         break;
1836                 }
1837
1838                 cond_resched();
1839         }
1840
1841         __free_page(page);
1842
1843 release_init_lock:
1844         up_read(&zram->init_lock);
1845         return ret;
1846 }
1847 #endif
1848
1849 static void zram_bio_discard(struct zram *zram, struct bio *bio)
1850 {
1851         size_t n = bio->bi_iter.bi_size;
1852         u32 index = bio->bi_iter.bi_sector >> SECTORS_PER_PAGE_SHIFT;
1853         u32 offset = (bio->bi_iter.bi_sector & (SECTORS_PER_PAGE - 1)) <<
1854                         SECTOR_SHIFT;
1855
1856         /*
1857          * zram manages data in physical block size units. Because logical block
1858          * size isn't identical with physical block size on some arch, we
1859          * could get a discard request pointing to a specific offset within a
1860          * certain physical block.  Although we can handle this request by
1861          * reading that physiclal block and decompressing and partially zeroing
1862          * and re-compressing and then re-storing it, this isn't reasonable
1863          * because our intent with a discard request is to save memory.  So
1864          * skipping this logical block is appropriate here.
1865          */
1866         if (offset) {
1867                 if (n <= (PAGE_SIZE - offset))
1868                         return;
1869
1870                 n -= (PAGE_SIZE - offset);
1871                 index++;
1872         }
1873
1874         while (n >= PAGE_SIZE) {
1875                 zram_slot_lock(zram, index);
1876                 zram_free_page(zram, index);
1877                 zram_slot_unlock(zram, index);
1878                 atomic64_inc(&zram->stats.notify_free);
1879                 index++;
1880                 n -= PAGE_SIZE;
1881         }
1882
1883         bio_endio(bio);
1884 }
1885
1886 static void zram_bio_read(struct zram *zram, struct bio *bio)
1887 {
1888         struct bvec_iter iter;
1889         struct bio_vec bv;
1890         unsigned long start_time;
1891
1892         start_time = bio_start_io_acct(bio);
1893         bio_for_each_segment(bv, bio, iter) {
1894                 u32 index = iter.bi_sector >> SECTORS_PER_PAGE_SHIFT;
1895                 u32 offset = (iter.bi_sector & (SECTORS_PER_PAGE - 1)) <<
1896                                 SECTOR_SHIFT;
1897
1898                 if (zram_bvec_read(zram, &bv, index, offset, bio) < 0) {
1899                         atomic64_inc(&zram->stats.failed_reads);
1900                         bio->bi_status = BLK_STS_IOERR;
1901                         break;
1902                 }
1903                 flush_dcache_page(bv.bv_page);
1904
1905                 zram_slot_lock(zram, index);
1906                 zram_accessed(zram, index);
1907                 zram_slot_unlock(zram, index);
1908         }
1909         bio_end_io_acct(bio, start_time);
1910         bio_endio(bio);
1911 }
1912
1913 static void zram_bio_write(struct zram *zram, struct bio *bio)
1914 {
1915         struct bvec_iter iter;
1916         struct bio_vec bv;
1917         unsigned long start_time;
1918
1919         start_time = bio_start_io_acct(bio);
1920         bio_for_each_segment(bv, bio, iter) {
1921                 u32 index = iter.bi_sector >> SECTORS_PER_PAGE_SHIFT;
1922                 u32 offset = (iter.bi_sector & (SECTORS_PER_PAGE - 1)) <<
1923                                 SECTOR_SHIFT;
1924
1925                 if (zram_bvec_write(zram, &bv, index, offset, bio) < 0) {
1926                         atomic64_inc(&zram->stats.failed_writes);
1927                         bio->bi_status = BLK_STS_IOERR;
1928                         break;
1929                 }
1930
1931                 zram_slot_lock(zram, index);
1932                 zram_accessed(zram, index);
1933                 zram_slot_unlock(zram, index);
1934         }
1935         bio_end_io_acct(bio, start_time);
1936         bio_endio(bio);
1937 }
1938
1939 /*
1940  * Handler function for all zram I/O requests.
1941  */
1942 static void zram_submit_bio(struct bio *bio)
1943 {
1944         struct zram *zram = bio->bi_bdev->bd_disk->private_data;
1945
1946         switch (bio_op(bio)) {
1947         case REQ_OP_READ:
1948                 zram_bio_read(zram, bio);
1949                 break;
1950         case REQ_OP_WRITE:
1951                 zram_bio_write(zram, bio);
1952                 break;
1953         case REQ_OP_DISCARD:
1954         case REQ_OP_WRITE_ZEROES:
1955                 zram_bio_discard(zram, bio);
1956                 break;
1957         default:
1958                 WARN_ON_ONCE(1);
1959                 bio_endio(bio);
1960         }
1961 }
1962
1963 static void zram_slot_free_notify(struct block_device *bdev,
1964                                 unsigned long index)
1965 {
1966         struct zram *zram;
1967
1968         zram = bdev->bd_disk->private_data;
1969
1970         atomic64_inc(&zram->stats.notify_free);
1971         if (!zram_slot_trylock(zram, index)) {
1972                 atomic64_inc(&zram->stats.miss_free);
1973                 return;
1974         }
1975
1976         zram_free_page(zram, index);
1977         zram_slot_unlock(zram, index);
1978 }
1979
1980 static void zram_destroy_comps(struct zram *zram)
1981 {
1982         u32 prio;
1983
1984         for (prio = 0; prio < ZRAM_MAX_COMPS; prio++) {
1985                 struct zcomp *comp = zram->comps[prio];
1986
1987                 zram->comps[prio] = NULL;
1988                 if (!comp)
1989                         continue;
1990                 zcomp_destroy(comp);
1991                 zram->num_active_comps--;
1992         }
1993 }
1994
1995 static void zram_reset_device(struct zram *zram)
1996 {
1997         down_write(&zram->init_lock);
1998
1999         zram->limit_pages = 0;
2000
2001         if (!init_done(zram)) {
2002                 up_write(&zram->init_lock);
2003                 return;
2004         }
2005
2006         set_capacity_and_notify(zram->disk, 0);
2007         part_stat_set_all(zram->disk->part0, 0);
2008
2009         /* I/O operation under all of CPU are done so let's free */
2010         zram_meta_free(zram, zram->disksize);
2011         zram->disksize = 0;
2012         zram_destroy_comps(zram);
2013         memset(&zram->stats, 0, sizeof(zram->stats));
2014         reset_bdev(zram);
2015
2016         comp_algorithm_set(zram, ZRAM_PRIMARY_COMP, default_compressor);
2017         up_write(&zram->init_lock);
2018 }
2019
2020 static ssize_t disksize_store(struct device *dev,
2021                 struct device_attribute *attr, const char *buf, size_t len)
2022 {
2023         u64 disksize;
2024         struct zcomp *comp;
2025         struct zram *zram = dev_to_zram(dev);
2026         int err;
2027         u32 prio;
2028
2029         disksize = memparse(buf, NULL);
2030         if (!disksize)
2031                 return -EINVAL;
2032
2033         down_write(&zram->init_lock);
2034         if (init_done(zram)) {
2035                 pr_info("Cannot change disksize for initialized device\n");
2036                 err = -EBUSY;
2037                 goto out_unlock;
2038         }
2039
2040         disksize = PAGE_ALIGN(disksize);
2041         if (!zram_meta_alloc(zram, disksize)) {
2042                 err = -ENOMEM;
2043                 goto out_unlock;
2044         }
2045
2046         for (prio = 0; prio < ZRAM_MAX_COMPS; prio++) {
2047                 if (!zram->comp_algs[prio])
2048                         continue;
2049
2050                 comp = zcomp_create(zram->comp_algs[prio]);
2051                 if (IS_ERR(comp)) {
2052                         pr_err("Cannot initialise %s compressing backend\n",
2053                                zram->comp_algs[prio]);
2054                         err = PTR_ERR(comp);
2055                         goto out_free_comps;
2056                 }
2057
2058                 zram->comps[prio] = comp;
2059                 zram->num_active_comps++;
2060         }
2061         zram->disksize = disksize;
2062         set_capacity_and_notify(zram->disk, zram->disksize >> SECTOR_SHIFT);
2063         up_write(&zram->init_lock);
2064
2065         return len;
2066
2067 out_free_comps:
2068         zram_destroy_comps(zram);
2069         zram_meta_free(zram, disksize);
2070 out_unlock:
2071         up_write(&zram->init_lock);
2072         return err;
2073 }
2074
2075 static ssize_t reset_store(struct device *dev,
2076                 struct device_attribute *attr, const char *buf, size_t len)
2077 {
2078         int ret;
2079         unsigned short do_reset;
2080         struct zram *zram;
2081         struct gendisk *disk;
2082
2083         ret = kstrtou16(buf, 10, &do_reset);
2084         if (ret)
2085                 return ret;
2086
2087         if (!do_reset)
2088                 return -EINVAL;
2089
2090         zram = dev_to_zram(dev);
2091         disk = zram->disk;
2092
2093         mutex_lock(&disk->open_mutex);
2094         /* Do not reset an active device or claimed device */
2095         if (disk_openers(disk) || zram->claim) {
2096                 mutex_unlock(&disk->open_mutex);
2097                 return -EBUSY;
2098         }
2099
2100         /* From now on, anyone can't open /dev/zram[0-9] */
2101         zram->claim = true;
2102         mutex_unlock(&disk->open_mutex);
2103
2104         /* Make sure all the pending I/O are finished */
2105         sync_blockdev(disk->part0);
2106         zram_reset_device(zram);
2107
2108         mutex_lock(&disk->open_mutex);
2109         zram->claim = false;
2110         mutex_unlock(&disk->open_mutex);
2111
2112         return len;
2113 }
2114
2115 static int zram_open(struct block_device *bdev, fmode_t mode)
2116 {
2117         int ret = 0;
2118         struct zram *zram;
2119
2120         WARN_ON(!mutex_is_locked(&bdev->bd_disk->open_mutex));
2121
2122         zram = bdev->bd_disk->private_data;
2123         /* zram was claimed to reset so open request fails */
2124         if (zram->claim)
2125                 ret = -EBUSY;
2126
2127         return ret;
2128 }
2129
2130 static const struct block_device_operations zram_devops = {
2131         .open = zram_open,
2132         .submit_bio = zram_submit_bio,
2133         .swap_slot_free_notify = zram_slot_free_notify,
2134         .owner = THIS_MODULE
2135 };
2136
2137 static DEVICE_ATTR_WO(compact);
2138 static DEVICE_ATTR_RW(disksize);
2139 static DEVICE_ATTR_RO(initstate);
2140 static DEVICE_ATTR_WO(reset);
2141 static DEVICE_ATTR_WO(mem_limit);
2142 static DEVICE_ATTR_WO(mem_used_max);
2143 static DEVICE_ATTR_WO(idle);
2144 static DEVICE_ATTR_RW(max_comp_streams);
2145 static DEVICE_ATTR_RW(comp_algorithm);
2146 #ifdef CONFIG_ZRAM_WRITEBACK
2147 static DEVICE_ATTR_RW(backing_dev);
2148 static DEVICE_ATTR_WO(writeback);
2149 static DEVICE_ATTR_RW(writeback_limit);
2150 static DEVICE_ATTR_RW(writeback_limit_enable);
2151 #endif
2152 #ifdef CONFIG_ZRAM_MULTI_COMP
2153 static DEVICE_ATTR_RW(recomp_algorithm);
2154 static DEVICE_ATTR_WO(recompress);
2155 #endif
2156
2157 static struct attribute *zram_disk_attrs[] = {
2158         &dev_attr_disksize.attr,
2159         &dev_attr_initstate.attr,
2160         &dev_attr_reset.attr,
2161         &dev_attr_compact.attr,
2162         &dev_attr_mem_limit.attr,
2163         &dev_attr_mem_used_max.attr,
2164         &dev_attr_idle.attr,
2165         &dev_attr_max_comp_streams.attr,
2166         &dev_attr_comp_algorithm.attr,
2167 #ifdef CONFIG_ZRAM_WRITEBACK
2168         &dev_attr_backing_dev.attr,
2169         &dev_attr_writeback.attr,
2170         &dev_attr_writeback_limit.attr,
2171         &dev_attr_writeback_limit_enable.attr,
2172 #endif
2173         &dev_attr_io_stat.attr,
2174         &dev_attr_mm_stat.attr,
2175 #ifdef CONFIG_ZRAM_WRITEBACK
2176         &dev_attr_bd_stat.attr,
2177 #endif
2178         &dev_attr_debug_stat.attr,
2179 #ifdef CONFIG_ZRAM_MULTI_COMP
2180         &dev_attr_recomp_algorithm.attr,
2181         &dev_attr_recompress.attr,
2182 #endif
2183         NULL,
2184 };
2185
2186 ATTRIBUTE_GROUPS(zram_disk);
2187
2188 /*
2189  * Allocate and initialize new zram device. the function returns
2190  * '>= 0' device_id upon success, and negative value otherwise.
2191  */
2192 static int zram_add(void)
2193 {
2194         struct zram *zram;
2195         int ret, device_id;
2196
2197         zram = kzalloc(sizeof(struct zram), GFP_KERNEL);
2198         if (!zram)
2199                 return -ENOMEM;
2200
2201         ret = idr_alloc(&zram_index_idr, zram, 0, 0, GFP_KERNEL);
2202         if (ret < 0)
2203                 goto out_free_dev;
2204         device_id = ret;
2205
2206         init_rwsem(&zram->init_lock);
2207 #ifdef CONFIG_ZRAM_WRITEBACK
2208         spin_lock_init(&zram->wb_limit_lock);
2209 #endif
2210
2211         /* gendisk structure */
2212         zram->disk = blk_alloc_disk(NUMA_NO_NODE);
2213         if (!zram->disk) {
2214                 pr_err("Error allocating disk structure for device %d\n",
2215                         device_id);
2216                 ret = -ENOMEM;
2217                 goto out_free_idr;
2218         }
2219
2220         zram->disk->major = zram_major;
2221         zram->disk->first_minor = device_id;
2222         zram->disk->minors = 1;
2223         zram->disk->flags |= GENHD_FL_NO_PART;
2224         zram->disk->fops = &zram_devops;
2225         zram->disk->private_data = zram;
2226         snprintf(zram->disk->disk_name, 16, "zram%d", device_id);
2227
2228         /* Actual capacity set using sysfs (/sys/block/zram<id>/disksize */
2229         set_capacity(zram->disk, 0);
2230         /* zram devices sort of resembles non-rotational disks */
2231         blk_queue_flag_set(QUEUE_FLAG_NONROT, zram->disk->queue);
2232         blk_queue_flag_set(QUEUE_FLAG_SYNCHRONOUS, zram->disk->queue);
2233         blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, zram->disk->queue);
2234
2235         /*
2236          * To ensure that we always get PAGE_SIZE aligned
2237          * and n*PAGE_SIZED sized I/O requests.
2238          */
2239         blk_queue_physical_block_size(zram->disk->queue, PAGE_SIZE);
2240         blk_queue_logical_block_size(zram->disk->queue,
2241                                         ZRAM_LOGICAL_BLOCK_SIZE);
2242         blk_queue_io_min(zram->disk->queue, PAGE_SIZE);
2243         blk_queue_io_opt(zram->disk->queue, PAGE_SIZE);
2244         zram->disk->queue->limits.discard_granularity = PAGE_SIZE;
2245         blk_queue_max_discard_sectors(zram->disk->queue, UINT_MAX);
2246
2247         /*
2248          * zram_bio_discard() will clear all logical blocks if logical block
2249          * size is identical with physical block size(PAGE_SIZE). But if it is
2250          * different, we will skip discarding some parts of logical blocks in
2251          * the part of the request range which isn't aligned to physical block
2252          * size.  So we can't ensure that all discarded logical blocks are
2253          * zeroed.
2254          */
2255         if (ZRAM_LOGICAL_BLOCK_SIZE == PAGE_SIZE)
2256                 blk_queue_max_write_zeroes_sectors(zram->disk->queue, UINT_MAX);
2257
2258         blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, zram->disk->queue);
2259         ret = device_add_disk(NULL, zram->disk, zram_disk_groups);
2260         if (ret)
2261                 goto out_cleanup_disk;
2262
2263         comp_algorithm_set(zram, ZRAM_PRIMARY_COMP, default_compressor);
2264
2265         zram_debugfs_register(zram);
2266         pr_info("Added device: %s\n", zram->disk->disk_name);
2267         return device_id;
2268
2269 out_cleanup_disk:
2270         put_disk(zram->disk);
2271 out_free_idr:
2272         idr_remove(&zram_index_idr, device_id);
2273 out_free_dev:
2274         kfree(zram);
2275         return ret;
2276 }
2277
2278 static int zram_remove(struct zram *zram)
2279 {
2280         bool claimed;
2281
2282         mutex_lock(&zram->disk->open_mutex);
2283         if (disk_openers(zram->disk)) {
2284                 mutex_unlock(&zram->disk->open_mutex);
2285                 return -EBUSY;
2286         }
2287
2288         claimed = zram->claim;
2289         if (!claimed)
2290                 zram->claim = true;
2291         mutex_unlock(&zram->disk->open_mutex);
2292
2293         zram_debugfs_unregister(zram);
2294
2295         if (claimed) {
2296                 /*
2297                  * If we were claimed by reset_store(), del_gendisk() will
2298                  * wait until reset_store() is done, so nothing need to do.
2299                  */
2300                 ;
2301         } else {
2302                 /* Make sure all the pending I/O are finished */
2303                 sync_blockdev(zram->disk->part0);
2304                 zram_reset_device(zram);
2305         }
2306
2307         pr_info("Removed device: %s\n", zram->disk->disk_name);
2308
2309         del_gendisk(zram->disk);
2310
2311         /* del_gendisk drains pending reset_store */
2312         WARN_ON_ONCE(claimed && zram->claim);
2313
2314         /*
2315          * disksize_store() may be called in between zram_reset_device()
2316          * and del_gendisk(), so run the last reset to avoid leaking
2317          * anything allocated with disksize_store()
2318          */
2319         zram_reset_device(zram);
2320
2321         put_disk(zram->disk);
2322         kfree(zram);
2323         return 0;
2324 }
2325
2326 /* zram-control sysfs attributes */
2327
2328 /*
2329  * NOTE: hot_add attribute is not the usual read-only sysfs attribute. In a
2330  * sense that reading from this file does alter the state of your system -- it
2331  * creates a new un-initialized zram device and returns back this device's
2332  * device_id (or an error code if it fails to create a new device).
2333  */
2334 static ssize_t hot_add_show(struct class *class,
2335                         struct class_attribute *attr,
2336                         char *buf)
2337 {
2338         int ret;
2339
2340         mutex_lock(&zram_index_mutex);
2341         ret = zram_add();
2342         mutex_unlock(&zram_index_mutex);
2343
2344         if (ret < 0)
2345                 return ret;
2346         return scnprintf(buf, PAGE_SIZE, "%d\n", ret);
2347 }
2348 static struct class_attribute class_attr_hot_add =
2349         __ATTR(hot_add, 0400, hot_add_show, NULL);
2350
2351 static ssize_t hot_remove_store(struct class *class,
2352                         struct class_attribute *attr,
2353                         const char *buf,
2354                         size_t count)
2355 {
2356         struct zram *zram;
2357         int ret, dev_id;
2358
2359         /* dev_id is gendisk->first_minor, which is `int' */
2360         ret = kstrtoint(buf, 10, &dev_id);
2361         if (ret)
2362                 return ret;
2363         if (dev_id < 0)
2364                 return -EINVAL;
2365
2366         mutex_lock(&zram_index_mutex);
2367
2368         zram = idr_find(&zram_index_idr, dev_id);
2369         if (zram) {
2370                 ret = zram_remove(zram);
2371                 if (!ret)
2372                         idr_remove(&zram_index_idr, dev_id);
2373         } else {
2374                 ret = -ENODEV;
2375         }
2376
2377         mutex_unlock(&zram_index_mutex);
2378         return ret ? ret : count;
2379 }
2380 static CLASS_ATTR_WO(hot_remove);
2381
2382 static struct attribute *zram_control_class_attrs[] = {
2383         &class_attr_hot_add.attr,
2384         &class_attr_hot_remove.attr,
2385         NULL,
2386 };
2387 ATTRIBUTE_GROUPS(zram_control_class);
2388
2389 static struct class zram_control_class = {
2390         .name           = "zram-control",
2391         .owner          = THIS_MODULE,
2392         .class_groups   = zram_control_class_groups,
2393 };
2394
2395 static int zram_remove_cb(int id, void *ptr, void *data)
2396 {
2397         WARN_ON_ONCE(zram_remove(ptr));
2398         return 0;
2399 }
2400
2401 static void destroy_devices(void)
2402 {
2403         class_unregister(&zram_control_class);
2404         idr_for_each(&zram_index_idr, &zram_remove_cb, NULL);
2405         zram_debugfs_destroy();
2406         idr_destroy(&zram_index_idr);
2407         unregister_blkdev(zram_major, "zram");
2408         cpuhp_remove_multi_state(CPUHP_ZCOMP_PREPARE);
2409 }
2410
2411 static int __init zram_init(void)
2412 {
2413         int ret;
2414
2415         BUILD_BUG_ON(__NR_ZRAM_PAGEFLAGS > BITS_PER_LONG);
2416
2417         ret = cpuhp_setup_state_multi(CPUHP_ZCOMP_PREPARE, "block/zram:prepare",
2418                                       zcomp_cpu_up_prepare, zcomp_cpu_dead);
2419         if (ret < 0)
2420                 return ret;
2421
2422         ret = class_register(&zram_control_class);
2423         if (ret) {
2424                 pr_err("Unable to register zram-control class\n");
2425                 cpuhp_remove_multi_state(CPUHP_ZCOMP_PREPARE);
2426                 return ret;
2427         }
2428
2429         zram_debugfs_create();
2430         zram_major = register_blkdev(0, "zram");
2431         if (zram_major <= 0) {
2432                 pr_err("Unable to get major number\n");
2433                 class_unregister(&zram_control_class);
2434                 cpuhp_remove_multi_state(CPUHP_ZCOMP_PREPARE);
2435                 return -EBUSY;
2436         }
2437
2438         while (num_devices != 0) {
2439                 mutex_lock(&zram_index_mutex);
2440                 ret = zram_add();
2441                 mutex_unlock(&zram_index_mutex);
2442                 if (ret < 0)
2443                         goto out_error;
2444                 num_devices--;
2445         }
2446
2447         return 0;
2448
2449 out_error:
2450         destroy_devices();
2451         return ret;
2452 }
2453
2454 static void __exit zram_exit(void)
2455 {
2456         destroy_devices();
2457 }
2458
2459 module_init(zram_init);
2460 module_exit(zram_exit);
2461
2462 module_param(num_devices, uint, 0);
2463 MODULE_PARM_DESC(num_devices, "Number of pre-created zram devices");
2464
2465 MODULE_LICENSE("Dual BSD/GPL");
2466 MODULE_AUTHOR("Nitin Gupta <ngupta@vflare.org>");
2467 MODULE_DESCRIPTION("Compressed RAM Block Device");