drivers/md/dm-bufio.c

   1 // SPDX-License-Identifier: GPL-2.0-only
   2 /*
   3  * Copyright (C) 2009-2011 Red Hat, Inc.
   4  *
   5  * Author: Mikulas Patocka <mpatocka@redhat.com>
   6  *
   7  * This file is released under the GPL.
   8  */
   9
  10 #include <linux/dm-bufio.h>
  11
  12 #include <linux/device-mapper.h>
  13 #include <linux/dm-io.h>
  14 #include <linux/slab.h>
  15 #include <linux/sched/mm.h>
  16 #include <linux/jiffies.h>
  17 #include <linux/vmalloc.h>
  18 #include <linux/shrinker.h>
  19 #include <linux/module.h>
  20 #include <linux/rbtree.h>
  21 #include <linux/stacktrace.h>
  22 #include <linux/jump_label.h>
  23
  24 #include "dm.h"
  25
  26 #define DM_MSG_PREFIX "bufio"
  27
  28 /*
  29  * Memory management policy:
  30  *      Limit the number of buffers to DM_BUFIO_MEMORY_PERCENT of main memory
  31  *      or DM_BUFIO_VMALLOC_PERCENT of vmalloc memory (whichever is lower).
  32  *      Always allocate at least DM_BUFIO_MIN_BUFFERS buffers.
  33  *      Start background writeback when there are DM_BUFIO_WRITEBACK_PERCENT
  34  *      dirty buffers.
  35  */
  36 #define DM_BUFIO_MIN_BUFFERS            8
  37
  38 #define DM_BUFIO_MEMORY_PERCENT         2
  39 #define DM_BUFIO_VMALLOC_PERCENT        25
  40 #define DM_BUFIO_WRITEBACK_RATIO        3
  41 #define DM_BUFIO_LOW_WATERMARK_RATIO    16
  42
  43 /*
  44  * Check buffer ages in this interval (seconds)
  45  */
  46 #define DM_BUFIO_WORK_TIMER_SECS        30
  47
  48 /*
  49  * Free buffers when they are older than this (seconds)
  50  */
  51 #define DM_BUFIO_DEFAULT_AGE_SECS       300
  52
  53 /*
  54  * The nr of bytes of cached data to keep around.
  55  */
  56 #define DM_BUFIO_DEFAULT_RETAIN_BYTES   (256 * 1024)
  57
  58 /*
  59  * Align buffer writes to this boundary.
  60  * Tests show that SSDs have the highest IOPS when using 4k writes.
  61  */
  62 #define DM_BUFIO_WRITE_ALIGN            4096
  63
  64 /*
  65  * dm_buffer->list_mode
  66  */
  67 #define LIST_CLEAN      0
  68 #define LIST_DIRTY      1
  69 #define LIST_SIZE       2
  70
  71 /*--------------------------------------------------------------*/
  72
  73 /*
  74  * Rather than use an LRU list, we use a clock algorithm where entries
  75  * are held in a circular list.  When an entry is 'hit' a reference bit
  76  * is set.  The least recently used entry is approximated by running a
  77  * cursor around the list selecting unreferenced entries. Referenced
  78  * entries have their reference bit cleared as the cursor passes them.
  79  */
  80 struct lru_entry {
  81         struct list_head list;
  82         atomic_t referenced;
  83 };
  84
  85 struct lru_iter {
  86         struct lru *lru;
  87         struct list_head list;
  88         struct lru_entry *stop;
  89         struct lru_entry *e;
  90 };
  91
  92 struct lru {
  93         struct list_head *cursor;
  94         unsigned long count;
  95
  96         struct list_head iterators;
  97 };
  98
  99 /*--------------*/
 100
 101 static void lru_init(struct lru *lru)
 102 {
 103         lru->cursor = NULL;
 104         lru->count = 0;
 105         INIT_LIST_HEAD(&lru->iterators);
 106 }
 107
 108 static void lru_destroy(struct lru *lru)
 109 {
 110         WARN_ON_ONCE(lru->cursor);
 111         WARN_ON_ONCE(!list_empty(&lru->iterators));
 112 }
 113
 114 /*
 115  * Insert a new entry into the lru.
 116  */
 117 static void lru_insert(struct lru *lru, struct lru_entry *le)
 118 {
 119         /*
 120          * Don't be tempted to set to 1, makes the lru aspect
 121          * perform poorly.
 122          */
 123         atomic_set(&le->referenced, 0);
 124
 125         if (lru->cursor) {
 126                 list_add_tail(&le->list, lru->cursor);
 127         } else {
 128                 INIT_LIST_HEAD(&le->list);
 129                 lru->cursor = &le->list;
 130         }
 131         lru->count++;
 132 }
 133
 134 /*--------------*/
 135
 136 /*
 137  * Convert a list_head pointer to an lru_entry pointer.
 138  */
 139 static inline struct lru_entry *to_le(struct list_head *l)
 140 {
 141         return container_of(l, struct lru_entry, list);
 142 }
 143
 144 /*
 145  * Initialize an lru_iter and add it to the list of cursors in the lru.
 146  */
 147 static void lru_iter_begin(struct lru *lru, struct lru_iter *it)
 148 {
 149         it->lru = lru;
 150         it->stop = lru->cursor ? to_le(lru->cursor->prev) : NULL;
 151         it->e = lru->cursor ? to_le(lru->cursor) : NULL;
 152         list_add(&it->list, &lru->iterators);
 153 }
 154
 155 /*
 156  * Remove an lru_iter from the list of cursors in the lru.
 157  */
 158 static inline void lru_iter_end(struct lru_iter *it)
 159 {
 160         list_del(&it->list);
 161 }
 162
 163 /* Predicate function type to be used with lru_iter_next */
 164 typedef bool (*iter_predicate)(struct lru_entry *le, void *context);
 165
 166 /*
 167  * Advance the cursor to the next entry that passes the
 168  * predicate, and return that entry.  Returns NULL if the
 169  * iteration is complete.
 170  */
 171 static struct lru_entry *lru_iter_next(struct lru_iter *it,
 172                                        iter_predicate pred, void *context)
 173 {
 174         struct lru_entry *e;
 175
 176         while (it->e) {
 177                 e = it->e;
 178
 179                 /* advance the cursor */
 180                 if (it->e == it->stop)
 181                         it->e = NULL;
 182                 else
 183                         it->e = to_le(it->e->list.next);
 184
 185                 if (pred(e, context))
 186                         return e;
 187         }
 188
 189         return NULL;
 190 }
 191
 192 /*
 193  * Invalidate a specific lru_entry and update all cursors in
 194  * the lru accordingly.
 195  */
 196 static void lru_iter_invalidate(struct lru *lru, struct lru_entry *e)
 197 {
 198         struct lru_iter *it;
 199
 200         list_for_each_entry(it, &lru->iterators, list) {
 201                 /* Move c->e forwards if necc. */
 202                 if (it->e == e) {
 203                         it->e = to_le(it->e->list.next);
 204                         if (it->e == e)
 205                                 it->e = NULL;
 206                 }
 207
 208                 /* Move it->stop backwards if necc. */
 209                 if (it->stop == e) {
 210                         it->stop = to_le(it->stop->list.prev);
 211                         if (it->stop == e)
 212                                 it->stop = NULL;
 213                 }
 214         }
 215 }
 216
 217 /*--------------*/
 218
 219 /*
 220  * Remove a specific entry from the lru.
 221  */
 222 static void lru_remove(struct lru *lru, struct lru_entry *le)
 223 {
 224         lru_iter_invalidate(lru, le);
 225         if (lru->count == 1) {
 226                 lru->cursor = NULL;
 227         } else {
 228                 if (lru->cursor == &le->list)
 229                         lru->cursor = lru->cursor->next;
 230                 list_del(&le->list);
 231         }
 232         lru->count--;
 233 }
 234
 235 /*
 236  * Mark as referenced.
 237  */
 238 static inline void lru_reference(struct lru_entry *le)
 239 {
 240         atomic_set(&le->referenced, 1);
 241 }
 242
 243 /*--------------*/
 244
 245 /*
 246  * Remove the least recently used entry (approx), that passes the predicate.
 247  * Returns NULL on failure.
 248  */
 249 enum evict_result {
 250         ER_EVICT,
 251         ER_DONT_EVICT,
 252         ER_STOP, /* stop looking for something to evict */
 253 };
 254
 255 typedef enum evict_result (*le_predicate)(struct lru_entry *le, void *context);
 256
 257 static struct lru_entry *lru_evict(struct lru *lru, le_predicate pred, void *context)
 258 {
 259         unsigned long tested = 0;
 260         struct list_head *h = lru->cursor;
 261         struct lru_entry *le;
 262
 263         if (!h)
 264                 return NULL;
 265         /*
 266          * In the worst case we have to loop around twice. Once to clear
 267          * the reference flags, and then again to discover the predicate
 268          * fails for all entries.
 269          */
 270         while (tested < lru->count) {
 271                 le = container_of(h, struct lru_entry, list);
 272
 273                 if (atomic_read(&le->referenced)) {
 274                         atomic_set(&le->referenced, 0);
 275                 } else {
 276                         tested++;
 277                         switch (pred(le, context)) {
 278                         case ER_EVICT:
 279                                 /*
 280                                  * Adjust the cursor, so we start the next
 281                                  * search from here.
 282                                  */
 283                                 lru->cursor = le->list.next;
 284                                 lru_remove(lru, le);
 285                                 return le;
 286
 287                         case ER_DONT_EVICT:
 288                                 break;
 289
 290                         case ER_STOP:
 291                                 lru->cursor = le->list.next;
 292                                 return NULL;
 293                         }
 294                 }
 295
 296                 h = h->next;
 297
 298                 cond_resched();
 299         }
 300
 301         return NULL;
 302 }
 303
 304 /*--------------------------------------------------------------*/
 305
 306 /*
 307  * Buffer state bits.
 308  */
 309 #define B_READING       0
 310 #define B_WRITING       1
 311 #define B_DIRTY         2
 312
 313 /*
 314  * Describes how the block was allocated:
 315  * kmem_cache_alloc(), __get_free_pages() or vmalloc().
 316  * See the comment at alloc_buffer_data.
 317  */
 318 enum data_mode {
 319         DATA_MODE_SLAB = 0,
 320         DATA_MODE_GET_FREE_PAGES = 1,
 321         DATA_MODE_VMALLOC = 2,
 322         DATA_MODE_LIMIT = 3
 323 };
 324
 325 struct dm_buffer {
 326         /* protected by the locks in dm_buffer_cache */
 327         struct rb_node node;
 328
 329         /* immutable, so don't need protecting */
 330         sector_t block;
 331         void *data;
 332         unsigned char data_mode;                /* DATA_MODE_* */
 333
 334         /*
 335          * These two fields are used in isolation, so do not need
 336          * a surrounding lock.
 337          */
 338         atomic_t hold_count;
 339         unsigned long last_accessed;
 340
 341         /*
 342          * Everything else is protected by the mutex in
 343          * dm_bufio_client
 344          */
 345         unsigned long state;
 346         struct lru_entry lru;
 347         unsigned char list_mode;                /* LIST_* */
 348         blk_status_t read_error;
 349         blk_status_t write_error;
 350         unsigned int dirty_start;
 351         unsigned int dirty_end;
 352         unsigned int write_start;
 353         unsigned int write_end;
 354         struct list_head write_list;
 355         struct dm_bufio_client *c;
 356         void (*end_io)(struct dm_buffer *b, blk_status_t bs);
 357 #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
 358 #define MAX_STACK 10
 359         unsigned int stack_len;
 360         unsigned long stack_entries[MAX_STACK];
 361 #endif
 362 };
 363
 364 /*--------------------------------------------------------------*/
 365
 366 /*
 367  * The buffer cache manages buffers, particularly:
 368  *  - inc/dec of holder count
 369  *  - setting the last_accessed field
 370  *  - maintains clean/dirty state along with lru
 371  *  - selecting buffers that match predicates
 372  *
 373  * It does *not* handle:
 374  *  - allocation/freeing of buffers.
 375  *  - IO
 376  *  - Eviction or cache sizing.
 377  *
 378  * cache_get() and cache_put() are threadsafe, you do not need to
 379  * protect these calls with a surrounding mutex.  All the other
 380  * methods are not threadsafe; they do use locking primitives, but
 381  * only enough to ensure get/put are threadsafe.
 382  */
 383
 384 struct buffer_tree {
 385         struct rw_semaphore lock;
 386         struct rb_root root;
 387 } ____cacheline_aligned_in_smp;
 388
 389 struct dm_buffer_cache {
 390         struct lru lru[LIST_SIZE];
 391         /*
 392          * We spread entries across multiple trees to reduce contention
 393          * on the locks.
 394          */
 395         unsigned int num_locks;
 396         struct buffer_tree trees[];
 397 };
 398
 399 static inline unsigned int cache_index(sector_t block, unsigned int num_locks)
 400 {
 401         return dm_hash_locks_index(block, num_locks);
 402 }
 403
 404 static inline void cache_read_lock(struct dm_buffer_cache *bc, sector_t block)
 405 {
 406         down_read(&bc->trees[cache_index(block, bc->num_locks)].lock);
 407 }
 408
 409 static inline void cache_read_unlock(struct dm_buffer_cache *bc, sector_t block)
 410 {
 411         up_read(&bc->trees[cache_index(block, bc->num_locks)].lock);
 412 }
 413
 414 static inline void cache_write_lock(struct dm_buffer_cache *bc, sector_t block)
 415 {
 416         down_write(&bc->trees[cache_index(block, bc->num_locks)].lock);
 417 }
 418
 419 static inline void cache_write_unlock(struct dm_buffer_cache *bc, sector_t block)
 420 {
 421         up_write(&bc->trees[cache_index(block, bc->num_locks)].lock);
 422 }
 423
 424 /*
 425  * Sometimes we want to repeatedly get and drop locks as part of an iteration.
 426  * This struct helps avoid redundant drop and gets of the same lock.
 427  */
 428 struct lock_history {
 429         struct dm_buffer_cache *cache;
 430         bool write;
 431         unsigned int previous;
 432         unsigned int no_previous;
 433 };
 434
 435 static void lh_init(struct lock_history *lh, struct dm_buffer_cache *cache, bool write)
 436 {
 437         lh->cache = cache;
 438         lh->write = write;
 439         lh->no_previous = cache->num_locks;
 440         lh->previous = lh->no_previous;
 441 }
 442
 443 static void __lh_lock(struct lock_history *lh, unsigned int index)
 444 {
 445         if (lh->write)
 446                 down_write(&lh->cache->trees[index].lock);
 447         else
 448                 down_read(&lh->cache->trees[index].lock);
 449 }
 450
 451 static void __lh_unlock(struct lock_history *lh, unsigned int index)
 452 {
 453         if (lh->write)
 454                 up_write(&lh->cache->trees[index].lock);
 455         else
 456                 up_read(&lh->cache->trees[index].lock);
 457 }
 458
 459 /*
 460  * Make sure you call this since it will unlock the final lock.
 461  */
 462 static void lh_exit(struct lock_history *lh)
 463 {
 464         if (lh->previous != lh->no_previous) {
 465                 __lh_unlock(lh, lh->previous);
 466                 lh->previous = lh->no_previous;
 467         }
 468 }
 469
 470 /*
 471  * Named 'next' because there is no corresponding
 472  * 'up/unlock' call since it's done automatically.
 473  */
 474 static void lh_next(struct lock_history *lh, sector_t b)
 475 {
 476         unsigned int index = cache_index(b, lh->no_previous); /* no_previous is num_locks */
 477
 478         if (lh->previous != lh->no_previous) {
 479                 if (lh->previous != index) {
 480                         __lh_unlock(lh, lh->previous);
 481                         __lh_lock(lh, index);
 482                         lh->previous = index;
 483                 }
 484         } else {
 485                 __lh_lock(lh, index);
 486                 lh->previous = index;
 487         }
 488 }
 489
 490 static inline struct dm_buffer *le_to_buffer(struct lru_entry *le)
 491 {
 492         return container_of(le, struct dm_buffer, lru);
 493 }
 494
 495 static struct dm_buffer *list_to_buffer(struct list_head *l)
 496 {
 497         struct lru_entry *le = list_entry(l, struct lru_entry, list);
 498
 499         if (!le)
 500                 return NULL;
 501
 502         return le_to_buffer(le);
 503 }
 504
 505 static void cache_init(struct dm_buffer_cache *bc, unsigned int num_locks)
 506 {
 507         unsigned int i;
 508
 509         bc->num_locks = num_locks;
 510
 511         for (i = 0; i < bc->num_locks; i++) {
 512                 init_rwsem(&bc->trees[i].lock);
 513                 bc->trees[i].root = RB_ROOT;
 514         }
 515
 516         lru_init(&bc->lru[LIST_CLEAN]);
 517         lru_init(&bc->lru[LIST_DIRTY]);
 518 }
 519
 520 static void cache_destroy(struct dm_buffer_cache *bc)
 521 {
 522         unsigned int i;
 523
 524         for (i = 0; i < bc->num_locks; i++)
 525                 WARN_ON_ONCE(!RB_EMPTY_ROOT(&bc->trees[i].root));
 526
 527         lru_destroy(&bc->lru[LIST_CLEAN]);
 528         lru_destroy(&bc->lru[LIST_DIRTY]);
 529 }
 530
 531 /*--------------*/
 532
 533 /*
 534  * not threadsafe, or racey depending how you look at it
 535  */
 536 static inline unsigned long cache_count(struct dm_buffer_cache *bc, int list_mode)
 537 {
 538         return bc->lru[list_mode].count;
 539 }
 540
 541 static inline unsigned long cache_total(struct dm_buffer_cache *bc)
 542 {
 543         return cache_count(bc, LIST_CLEAN) + cache_count(bc, LIST_DIRTY);
 544 }
 545
 546 /*--------------*/
 547
 548 /*
 549  * Gets a specific buffer, indexed by block.
 550  * If the buffer is found then its holder count will be incremented and
 551  * lru_reference will be called.
 552  *
 553  * threadsafe
 554  */
 555 static struct dm_buffer *__cache_get(const struct rb_root *root, sector_t block)
 556 {
 557         struct rb_node *n = root->rb_node;
 558         struct dm_buffer *b;
 559
 560         while (n) {
 561                 b = container_of(n, struct dm_buffer, node);
 562
 563                 if (b->block == block)
 564                         return b;
 565
 566                 n = block < b->block ? n->rb_left : n->rb_right;
 567         }
 568
 569         return NULL;
 570 }
 571
 572 static void __cache_inc_buffer(struct dm_buffer *b)
 573 {
 574         atomic_inc(&b->hold_count);
 575         WRITE_ONCE(b->last_accessed, jiffies);
 576 }
 577
 578 static struct dm_buffer *cache_get(struct dm_buffer_cache *bc, sector_t block)
 579 {
 580         struct dm_buffer *b;
 581
 582         cache_read_lock(bc, block);
 583         b = __cache_get(&bc->trees[cache_index(block, bc->num_locks)].root, block);
 584         if (b) {
 585                 lru_reference(&b->lru);
 586                 __cache_inc_buffer(b);
 587         }
 588         cache_read_unlock(bc, block);
 589
 590         return b;
 591 }
 592
 593 /*--------------*/
 594
 595 /*
 596  * Returns true if the hold count hits zero.
 597  * threadsafe
 598  */
 599 static bool cache_put(struct dm_buffer_cache *bc, struct dm_buffer *b)
 600 {
 601         bool r;
 602
 603         cache_read_lock(bc, b->block);
 604         BUG_ON(!atomic_read(&b->hold_count));
 605         r = atomic_dec_and_test(&b->hold_count);
 606         cache_read_unlock(bc, b->block);
 607
 608         return r;
 609 }
 610
 611 /*--------------*/
 612
 613 typedef enum evict_result (*b_predicate)(struct dm_buffer *, void *);
 614
 615 /*
 616  * Evicts a buffer based on a predicate.  The oldest buffer that
 617  * matches the predicate will be selected.  In addition to the
 618  * predicate the hold_count of the selected buffer will be zero.
 619  */
 620 struct evict_wrapper {
 621         struct lock_history *lh;
 622         b_predicate pred;
 623         void *context;
 624 };
 625
 626 /*
 627  * Wraps the buffer predicate turning it into an lru predicate.  Adds
 628  * extra test for hold_count.
 629  */
 630 static enum evict_result __evict_pred(struct lru_entry *le, void *context)
 631 {
 632         struct evict_wrapper *w = context;
 633         struct dm_buffer *b = le_to_buffer(le);
 634
 635         lh_next(w->lh, b->block);
 636
 637         if (atomic_read(&b->hold_count))
 638                 return ER_DONT_EVICT;
 639
 640         return w->pred(b, w->context);
 641 }
 642
 643 static struct dm_buffer *__cache_evict(struct dm_buffer_cache *bc, int list_mode,
 644                                        b_predicate pred, void *context,
 645                                        struct lock_history *lh)
 646 {
 647         struct evict_wrapper w = {.lh = lh, .pred = pred, .context = context};
 648         struct lru_entry *le;
 649         struct dm_buffer *b;
 650
 651         le = lru_evict(&bc->lru[list_mode], __evict_pred, &w);
 652         if (!le)
 653                 return NULL;
 654
 655         b = le_to_buffer(le);
 656         /* __evict_pred will have locked the appropriate tree. */
 657         rb_erase(&b->node, &bc->trees[cache_index(b->block, bc->num_locks)].root);
 658
 659         return b;
 660 }
 661
 662 static struct dm_buffer *cache_evict(struct dm_buffer_cache *bc, int list_mode,
 663                                      b_predicate pred, void *context)
 664 {
 665         struct dm_buffer *b;
 666         struct lock_history lh;
 667
 668         lh_init(&lh, bc, true);
 669         b = __cache_evict(bc, list_mode, pred, context, &lh);
 670         lh_exit(&lh);
 671
 672         return b;
 673 }
 674
 675 /*--------------*/
 676
 677 /*
 678  * Mark a buffer as clean or dirty. Not threadsafe.
 679  */
 680 static void cache_mark(struct dm_buffer_cache *bc, struct dm_buffer *b, int list_mode)
 681 {
 682         cache_write_lock(bc, b->block);
 683         if (list_mode != b->list_mode) {
 684                 lru_remove(&bc->lru[b->list_mode], &b->lru);
 685                 b->list_mode = list_mode;
 686                 lru_insert(&bc->lru[b->list_mode], &b->lru);
 687         }
 688         cache_write_unlock(bc, b->block);
 689 }
 690
 691 /*--------------*/
 692
 693 /*
 694  * Runs through the lru associated with 'old_mode', if the predicate matches then
 695  * it moves them to 'new_mode'.  Not threadsafe.
 696  */
 697 static void __cache_mark_many(struct dm_buffer_cache *bc, int old_mode, int new_mode,
 698                               b_predicate pred, void *context, struct lock_history *lh)
 699 {
 700         struct lru_entry *le;
 701         struct dm_buffer *b;
 702         struct evict_wrapper w = {.lh = lh, .pred = pred, .context = context};
 703
 704         while (true) {
 705                 le = lru_evict(&bc->lru[old_mode], __evict_pred, &w);
 706                 if (!le)
 707                         break;
 708
 709                 b = le_to_buffer(le);
 710                 b->list_mode = new_mode;
 711                 lru_insert(&bc->lru[b->list_mode], &b->lru);
 712         }
 713 }
 714
 715 static void cache_mark_many(struct dm_buffer_cache *bc, int old_mode, int new_mode,
 716                             b_predicate pred, void *context)
 717 {
 718         struct lock_history lh;
 719
 720         lh_init(&lh, bc, true);
 721         __cache_mark_many(bc, old_mode, new_mode, pred, context, &lh);
 722         lh_exit(&lh);
 723 }
 724
 725 /*--------------*/
 726
 727 /*
 728  * Iterates through all clean or dirty entries calling a function for each
 729  * entry.  The callback may terminate the iteration early.  Not threadsafe.
 730  */
 731
 732 /*
 733  * Iterator functions should return one of these actions to indicate
 734  * how the iteration should proceed.
 735  */
 736 enum it_action {
 737         IT_NEXT,
 738         IT_COMPLETE,
 739 };
 740
 741 typedef enum it_action (*iter_fn)(struct dm_buffer *b, void *context);
 742
 743 static void __cache_iterate(struct dm_buffer_cache *bc, int list_mode,
 744                             iter_fn fn, void *context, struct lock_history *lh)
 745 {
 746         struct lru *lru = &bc->lru[list_mode];
 747         struct lru_entry *le, *first;
 748
 749         if (!lru->cursor)
 750                 return;
 751
 752         first = le = to_le(lru->cursor);
 753         do {
 754                 struct dm_buffer *b = le_to_buffer(le);
 755
 756                 lh_next(lh, b->block);
 757
 758                 switch (fn(b, context)) {
 759                 case IT_NEXT:
 760                         break;
 761
 762                 case IT_COMPLETE:
 763                         return;
 764                 }
 765                 cond_resched();
 766
 767                 le = to_le(le->list.next);
 768         } while (le != first);
 769 }
 770
 771 static void cache_iterate(struct dm_buffer_cache *bc, int list_mode,
 772                           iter_fn fn, void *context)
 773 {
 774         struct lock_history lh;
 775
 776         lh_init(&lh, bc, false);
 777         __cache_iterate(bc, list_mode, fn, context, &lh);
 778         lh_exit(&lh);
 779 }
 780
 781 /*--------------*/
 782
 783 /*
 784  * Passes ownership of the buffer to the cache. Returns false if the
 785  * buffer was already present (in which case ownership does not pass).
 786  * eg, a race with another thread.
 787  *
 788  * Holder count should be 1 on insertion.
 789  *
 790  * Not threadsafe.
 791  */
 792 static bool __cache_insert(struct rb_root *root, struct dm_buffer *b)
 793 {
 794         struct rb_node **new = &root->rb_node, *parent = NULL;
 795         struct dm_buffer *found;
 796
 797         while (*new) {
 798                 found = container_of(*new, struct dm_buffer, node);
 799
 800                 if (found->block == b->block)
 801                         return false;
 802
 803                 parent = *new;
 804                 new = b->block < found->block ?
 805                         &found->node.rb_left : &found->node.rb_right;
 806         }
 807
 808         rb_link_node(&b->node, parent, new);
 809         rb_insert_color(&b->node, root);
 810
 811         return true;
 812 }
 813
 814 static bool cache_insert(struct dm_buffer_cache *bc, struct dm_buffer *b)
 815 {
 816         bool r;
 817
 818         if (WARN_ON_ONCE(b->list_mode >= LIST_SIZE))
 819                 return false;
 820
 821         cache_write_lock(bc, b->block);
 822         BUG_ON(atomic_read(&b->hold_count) != 1);
 823         r = __cache_insert(&bc->trees[cache_index(b->block, bc->num_locks)].root, b);
 824         if (r)
 825                 lru_insert(&bc->lru[b->list_mode], &b->lru);
 826         cache_write_unlock(bc, b->block);
 827
 828         return r;
 829 }
 830
 831 /*--------------*/
 832
 833 /*
 834  * Removes buffer from cache, ownership of the buffer passes back to the caller.
 835  * Fails if the hold_count is not one (ie. the caller holds the only reference).
 836  *
 837  * Not threadsafe.
 838  */
 839 static bool cache_remove(struct dm_buffer_cache *bc, struct dm_buffer *b)
 840 {
 841         bool r;
 842
 843         cache_write_lock(bc, b->block);
 844
 845         if (atomic_read(&b->hold_count) != 1) {
 846                 r = false;
 847         } else {
 848                 r = true;
 849                 rb_erase(&b->node, &bc->trees[cache_index(b->block, bc->num_locks)].root);
 850                 lru_remove(&bc->lru[b->list_mode], &b->lru);
 851         }
 852
 853         cache_write_unlock(bc, b->block);
 854
 855         return r;
 856 }
 857
 858 /*--------------*/
 859
 860 typedef void (*b_release)(struct dm_buffer *);
 861
 862 static struct dm_buffer *__find_next(struct rb_root *root, sector_t block)
 863 {
 864         struct rb_node *n = root->rb_node;
 865         struct dm_buffer *b;
 866         struct dm_buffer *best = NULL;
 867
 868         while (n) {
 869                 b = container_of(n, struct dm_buffer, node);
 870
 871                 if (b->block == block)
 872                         return b;
 873
 874                 if (block <= b->block) {
 875                         n = n->rb_left;
 876                         best = b;
 877                 } else {
 878                         n = n->rb_right;
 879                 }
 880         }
 881
 882         return best;
 883 }
 884
 885 static void __remove_range(struct dm_buffer_cache *bc,
 886                            struct rb_root *root,
 887                            sector_t begin, sector_t end,
 888                            b_predicate pred, b_release release)
 889 {
 890         struct dm_buffer *b;
 891
 892         while (true) {
 893                 cond_resched();
 894
 895                 b = __find_next(root, begin);
 896                 if (!b || (b->block >= end))
 897                         break;
 898
 899                 begin = b->block + 1;
 900
 901                 if (atomic_read(&b->hold_count))
 902                         continue;
 903
 904                 if (pred(b, NULL) == ER_EVICT) {
 905                         rb_erase(&b->node, root);
 906                         lru_remove(&bc->lru[b->list_mode], &b->lru);
 907                         release(b);
 908                 }
 909         }
 910 }
 911
 912 static void cache_remove_range(struct dm_buffer_cache *bc,
 913                                sector_t begin, sector_t end,
 914                                b_predicate pred, b_release release)
 915 {
 916         unsigned int i;
 917
 918         for (i = 0; i < bc->num_locks; i++) {
 919                 down_write(&bc->trees[i].lock);
 920                 __remove_range(bc, &bc->trees[i].root, begin, end, pred, release);
 921                 up_write(&bc->trees[i].lock);
 922         }
 923 }
 924
 925 /*----------------------------------------------------------------*/
 926
 927 /*
 928  * Linking of buffers:
 929  *      All buffers are linked to buffer_cache with their node field.
 930  *
 931  *      Clean buffers that are not being written (B_WRITING not set)
 932  *      are linked to lru[LIST_CLEAN] with their lru_list field.
 933  *
 934  *      Dirty and clean buffers that are being written are linked to
 935  *      lru[LIST_DIRTY] with their lru_list field. When the write
 936  *      finishes, the buffer cannot be relinked immediately (because we
 937  *      are in an interrupt context and relinking requires process
 938  *      context), so some clean-not-writing buffers can be held on
 939  *      dirty_lru too.  They are later added to lru in the process
 940  *      context.
 941  */
 942 struct dm_bufio_client {
 943         struct block_device *bdev;
 944         unsigned int block_size;
 945         s8 sectors_per_block_bits;
 946
 947         bool no_sleep;
 948         struct mutex lock;
 949         spinlock_t spinlock;
 950
 951         int async_write_error;
 952
 953         void (*alloc_callback)(struct dm_buffer *buf);
 954         void (*write_callback)(struct dm_buffer *buf);
 955         struct kmem_cache *slab_buffer;
 956         struct kmem_cache *slab_cache;
 957         struct dm_io_client *dm_io;
 958
 959         struct list_head reserved_buffers;
 960         unsigned int need_reserved_buffers;
 961
 962         unsigned int minimum_buffers;
 963
 964         sector_t start;
 965
 966         struct shrinker shrinker;
 967         struct work_struct shrink_work;
 968         atomic_long_t need_shrink;
 969
 970         wait_queue_head_t free_buffer_wait;
 971
 972         struct list_head client_list;
 973
 974         /*
 975          * Used by global_cleanup to sort the clients list.
 976          */
 977         unsigned long oldest_buffer;
 978
 979         struct dm_buffer_cache cache; /* must be last member */
 980 };
 981
 982 static DEFINE_STATIC_KEY_FALSE(no_sleep_enabled);
 983
 984 /*----------------------------------------------------------------*/
 985
 986 #define dm_bufio_in_request()   (!!current->bio_list)
 987
 988 static void dm_bufio_lock(struct dm_bufio_client *c)
 989 {
 990         if (static_branch_unlikely(&no_sleep_enabled) && c->no_sleep)
 991                 spin_lock_bh(&c->spinlock);
 992         else
 993                 mutex_lock_nested(&c->lock, dm_bufio_in_request());
 994 }
 995
 996 static void dm_bufio_unlock(struct dm_bufio_client *c)
 997 {
 998         if (static_branch_unlikely(&no_sleep_enabled) && c->no_sleep)
 999                 spin_unlock_bh(&c->spinlock);
1000         else
1001                 mutex_unlock(&c->lock);
1002 }
1003
1004 /*----------------------------------------------------------------*/
1005
1006 /*
1007  * Default cache size: available memory divided by the ratio.
1008  */
1009 static unsigned long dm_bufio_default_cache_size;
1010
1011 /*
1012  * Total cache size set by the user.
1013  */
1014 static unsigned long dm_bufio_cache_size;
1015
1016 /*
1017  * A copy of dm_bufio_cache_size because dm_bufio_cache_size can change
1018  * at any time.  If it disagrees, the user has changed cache size.
1019  */
1020 static unsigned long dm_bufio_cache_size_latch;
1021
1022 static DEFINE_SPINLOCK(global_spinlock);
1023
1024 /*
1025  * Buffers are freed after this timeout
1026  */
1027 static unsigned int dm_bufio_max_age = DM_BUFIO_DEFAULT_AGE_SECS;
1028 static unsigned long dm_bufio_retain_bytes = DM_BUFIO_DEFAULT_RETAIN_BYTES;
1029
1030 static unsigned long dm_bufio_peak_allocated;
1031 static unsigned long dm_bufio_allocated_kmem_cache;
1032 static unsigned long dm_bufio_allocated_get_free_pages;
1033 static unsigned long dm_bufio_allocated_vmalloc;
1034 static unsigned long dm_bufio_current_allocated;
1035
1036 /*----------------------------------------------------------------*/
1037
1038 /*
1039  * The current number of clients.
1040  */
1041 static int dm_bufio_client_count;
1042
1043 /*
1044  * The list of all clients.
1045  */
1046 static LIST_HEAD(dm_bufio_all_clients);
1047
1048 /*
1049  * This mutex protects dm_bufio_cache_size_latch and dm_bufio_client_count
1050  */
1051 static DEFINE_MUTEX(dm_bufio_clients_lock);
1052
1053 static struct workqueue_struct *dm_bufio_wq;
1054 static struct delayed_work dm_bufio_cleanup_old_work;
1055 static struct work_struct dm_bufio_replacement_work;
1056
1057
1058 #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
1059 static void buffer_record_stack(struct dm_buffer *b)
1060 {
1061         b->stack_len = stack_trace_save(b->stack_entries, MAX_STACK, 2);
1062 }
1063 #endif
1064
1065 /*----------------------------------------------------------------*/
1066
1067 static void adjust_total_allocated(struct dm_buffer *b, bool unlink)
1068 {
1069         unsigned char data_mode;
1070         long diff;
1071
1072         static unsigned long * const class_ptr[DATA_MODE_LIMIT] = {
1073                 &dm_bufio_allocated_kmem_cache,
1074                 &dm_bufio_allocated_get_free_pages,
1075                 &dm_bufio_allocated_vmalloc,
1076         };
1077
1078         data_mode = b->data_mode;
1079         diff = (long)b->c->block_size;
1080         if (unlink)
1081                 diff = -diff;
1082
1083         spin_lock(&global_spinlock);
1084
1085         *class_ptr[data_mode] += diff;
1086
1087         dm_bufio_current_allocated += diff;
1088
1089         if (dm_bufio_current_allocated > dm_bufio_peak_allocated)
1090                 dm_bufio_peak_allocated = dm_bufio_current_allocated;
1091
1092         if (!unlink) {
1093                 if (dm_bufio_current_allocated > dm_bufio_cache_size)
1094                         queue_work(dm_bufio_wq, &dm_bufio_replacement_work);
1095         }
1096
1097         spin_unlock(&global_spinlock);
1098 }
1099
1100 /*
1101  * Change the number of clients and recalculate per-client limit.
1102  */
1103 static void __cache_size_refresh(void)
1104 {
1105         if (WARN_ON(!mutex_is_locked(&dm_bufio_clients_lock)))
1106                 return;
1107         if (WARN_ON(dm_bufio_client_count < 0))
1108                 return;
1109
1110         dm_bufio_cache_size_latch = READ_ONCE(dm_bufio_cache_size);
1111
1112         /*
1113          * Use default if set to 0 and report the actual cache size used.
1114          */
1115         if (!dm_bufio_cache_size_latch) {
1116                 (void)cmpxchg(&dm_bufio_cache_size, 0,
1117                               dm_bufio_default_cache_size);
1118                 dm_bufio_cache_size_latch = dm_bufio_default_cache_size;
1119         }
1120 }
1121
1122 /*
1123  * Allocating buffer data.
1124  *
1125  * Small buffers are allocated with kmem_cache, to use space optimally.
1126  *
1127  * For large buffers, we choose between get_free_pages and vmalloc.
1128  * Each has advantages and disadvantages.
1129  *
1130  * __get_free_pages can randomly fail if the memory is fragmented.
1131  * __vmalloc won't randomly fail, but vmalloc space is limited (it may be
1132  * as low as 128M) so using it for caching is not appropriate.
1133  *
1134  * If the allocation may fail we use __get_free_pages. Memory fragmentation
1135  * won't have a fatal effect here, but it just causes flushes of some other
1136  * buffers and more I/O will be performed. Don't use __get_free_pages if it
1137  * always fails (i.e. order > MAX_ORDER).
1138  *
1139  * If the allocation shouldn't fail we use __vmalloc. This is only for the
1140  * initial reserve allocation, so there's no risk of wasting all vmalloc
1141  * space.
1142  */
1143 static void *alloc_buffer_data(struct dm_bufio_client *c, gfp_t gfp_mask,
1144                                unsigned char *data_mode)
1145 {
1146         if (unlikely(c->slab_cache != NULL)) {
1147                 *data_mode = DATA_MODE_SLAB;
1148                 return kmem_cache_alloc(c->slab_cache, gfp_mask);
1149         }
1150
1151         if (c->block_size <= KMALLOC_MAX_SIZE &&
1152             gfp_mask & __GFP_NORETRY) {
1153                 *data_mode = DATA_MODE_GET_FREE_PAGES;
1154                 return (void *)__get_free_pages(gfp_mask,
1155                                                 c->sectors_per_block_bits - (PAGE_SHIFT - SECTOR_SHIFT));
1156         }
1157
1158         *data_mode = DATA_MODE_VMALLOC;
1159
1160         /*
1161          * __vmalloc allocates the data pages and auxiliary structures with
1162          * gfp_flags that were specified, but pagetables are always allocated
1163          * with GFP_KERNEL, no matter what was specified as gfp_mask.
1164          *
1165          * Consequently, we must set per-process flag PF_MEMALLOC_NOIO so that
1166          * all allocations done by this process (including pagetables) are done
1167          * as if GFP_NOIO was specified.
1168          */
1169         if (gfp_mask & __GFP_NORETRY) {
1170                 unsigned int noio_flag = memalloc_noio_save();
1171                 void *ptr = __vmalloc(c->block_size, gfp_mask);
1172
1173                 memalloc_noio_restore(noio_flag);
1174                 return ptr;
1175         }
1176
1177         return __vmalloc(c->block_size, gfp_mask);
1178 }
1179
1180 /*
1181  * Free buffer's data.
1182  */
1183 static void free_buffer_data(struct dm_bufio_client *c,
1184                              void *data, unsigned char data_mode)
1185 {
1186         switch (data_mode) {
1187         case DATA_MODE_SLAB:
1188                 kmem_cache_free(c->slab_cache, data);
1189                 break;
1190
1191         case DATA_MODE_GET_FREE_PAGES:
1192                 free_pages((unsigned long)data,
1193                            c->sectors_per_block_bits - (PAGE_SHIFT - SECTOR_SHIFT));
1194                 break;
1195
1196         case DATA_MODE_VMALLOC:
1197                 vfree(data);
1198                 break;
1199
1200         default:
1201                 DMCRIT("dm_bufio_free_buffer_data: bad data mode: %d",
1202                        data_mode);
1203                 BUG();
1204         }
1205 }
1206
1207 /*
1208  * Allocate buffer and its data.
1209  */
1210 static struct dm_buffer *alloc_buffer(struct dm_bufio_client *c, gfp_t gfp_mask)
1211 {
1212         struct dm_buffer *b = kmem_cache_alloc(c->slab_buffer, gfp_mask);
1213
1214         if (!b)
1215                 return NULL;
1216
1217         b->c = c;
1218
1219         b->data = alloc_buffer_data(c, gfp_mask, &b->data_mode);
1220         if (!b->data) {
1221                 kmem_cache_free(c->slab_buffer, b);
1222                 return NULL;
1223         }
1224         adjust_total_allocated(b, false);
1225
1226 #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
1227         b->stack_len = 0;
1228 #endif
1229         return b;
1230 }
1231
1232 /*
1233  * Free buffer and its data.
1234  */
1235 static void free_buffer(struct dm_buffer *b)
1236 {
1237         struct dm_bufio_client *c = b->c;
1238
1239         adjust_total_allocated(b, true);
1240         free_buffer_data(c, b->data, b->data_mode);
1241         kmem_cache_free(c->slab_buffer, b);
1242 }
1243
1244 /*
1245  *--------------------------------------------------------------------------
1246  * Submit I/O on the buffer.
1247  *
1248  * Bio interface is faster but it has some problems:
1249  *      the vector list is limited (increasing this limit increases
1250  *      memory-consumption per buffer, so it is not viable);
1251  *
1252  *      the memory must be direct-mapped, not vmalloced;
1253  *
1254  * If the buffer is small enough (up to DM_BUFIO_INLINE_VECS pages) and
1255  * it is not vmalloced, try using the bio interface.
1256  *
1257  * If the buffer is big, if it is vmalloced or if the underlying device
1258  * rejects the bio because it is too large, use dm-io layer to do the I/O.
1259  * The dm-io layer splits the I/O into multiple requests, avoiding the above
1260  * shortcomings.
1261  *--------------------------------------------------------------------------
1262  */
1263
1264 /*
1265  * dm-io completion routine. It just calls b->bio.bi_end_io, pretending
1266  * that the request was handled directly with bio interface.
1267  */
1268 static void dmio_complete(unsigned long error, void *context)
1269 {
1270         struct dm_buffer *b = context;
1271
1272         b->end_io(b, unlikely(error != 0) ? BLK_STS_IOERR : 0);
1273 }
1274
1275 static void use_dmio(struct dm_buffer *b, enum req_op op, sector_t sector,
1276                      unsigned int n_sectors, unsigned int offset)
1277 {
1278         int r;
1279         struct dm_io_request io_req = {
1280                 .bi_opf = op,
1281                 .notify.fn = dmio_complete,
1282                 .notify.context = b,
1283                 .client = b->c->dm_io,
1284         };
1285         struct dm_io_region region = {
1286                 .bdev = b->c->bdev,
1287                 .sector = sector,
1288                 .count = n_sectors,
1289         };
1290
1291         if (b->data_mode != DATA_MODE_VMALLOC) {
1292                 io_req.mem.type = DM_IO_KMEM;
1293                 io_req.mem.ptr.addr = (char *)b->data + offset;
1294         } else {
1295                 io_req.mem.type = DM_IO_VMA;
1296                 io_req.mem.ptr.vma = (char *)b->data + offset;
1297         }
1298
1299         r = dm_io(&io_req, 1, &region, NULL);
1300         if (unlikely(r))
1301                 b->end_io(b, errno_to_blk_status(r));
1302 }
1303
1304 static void bio_complete(struct bio *bio)
1305 {
1306         struct dm_buffer *b = bio->bi_private;
1307         blk_status_t status = bio->bi_status;
1308
1309         bio_uninit(bio);
1310         kfree(bio);
1311         b->end_io(b, status);
1312 }
1313
1314 static void use_bio(struct dm_buffer *b, enum req_op op, sector_t sector,
1315                     unsigned int n_sectors, unsigned int offset)
1316 {
1317         struct bio *bio;
1318         char *ptr;
1319         unsigned int len;
1320
1321         bio = bio_kmalloc(1, GFP_NOWAIT | __GFP_NORETRY | __GFP_NOWARN);
1322         if (!bio) {
1323                 use_dmio(b, op, sector, n_sectors, offset);
1324                 return;
1325         }
1326         bio_init(bio, b->c->bdev, bio->bi_inline_vecs, 1, op);
1327         bio->bi_iter.bi_sector = sector;
1328         bio->bi_end_io = bio_complete;
1329         bio->bi_private = b;
1330
1331         ptr = (char *)b->data + offset;
1332         len = n_sectors << SECTOR_SHIFT;
1333
1334         __bio_add_page(bio, virt_to_page(ptr), len, offset_in_page(ptr));
1335
1336         submit_bio(bio);
1337 }
1338
1339 static inline sector_t block_to_sector(struct dm_bufio_client *c, sector_t block)
1340 {
1341         sector_t sector;
1342
1343         if (likely(c->sectors_per_block_bits >= 0))
1344                 sector = block << c->sectors_per_block_bits;
1345         else
1346                 sector = block * (c->block_size >> SECTOR_SHIFT);
1347         sector += c->start;
1348
1349         return sector;
1350 }
1351
1352 static void submit_io(struct dm_buffer *b, enum req_op op,
1353                       void (*end_io)(struct dm_buffer *, blk_status_t))
1354 {
1355         unsigned int n_sectors;
1356         sector_t sector;
1357         unsigned int offset, end;
1358
1359         b->end_io = end_io;
1360
1361         sector = block_to_sector(b->c, b->block);
1362
1363         if (op != REQ_OP_WRITE) {
1364                 n_sectors = b->c->block_size >> SECTOR_SHIFT;
1365                 offset = 0;
1366         } else {
1367                 if (b->c->write_callback)
1368                         b->c->write_callback(b);
1369                 offset = b->write_start;
1370                 end = b->write_end;
1371                 offset &= -DM_BUFIO_WRITE_ALIGN;
1372                 end += DM_BUFIO_WRITE_ALIGN - 1;
1373                 end &= -DM_BUFIO_WRITE_ALIGN;
1374                 if (unlikely(end > b->c->block_size))
1375                         end = b->c->block_size;
1376
1377                 sector += offset >> SECTOR_SHIFT;
1378                 n_sectors = (end - offset) >> SECTOR_SHIFT;
1379         }
1380
1381         if (b->data_mode != DATA_MODE_VMALLOC)
1382                 use_bio(b, op, sector, n_sectors, offset);
1383         else
1384                 use_dmio(b, op, sector, n_sectors, offset);
1385 }
1386
1387 /*
1388  *--------------------------------------------------------------
1389  * Writing dirty buffers
1390  *--------------------------------------------------------------
1391  */
1392
1393 /*
1394  * The endio routine for write.
1395  *
1396  * Set the error, clear B_WRITING bit and wake anyone who was waiting on
1397  * it.
1398  */
1399 static void write_endio(struct dm_buffer *b, blk_status_t status)
1400 {
1401         b->write_error = status;
1402         if (unlikely(status)) {
1403                 struct dm_bufio_client *c = b->c;
1404
1405                 (void)cmpxchg(&c->async_write_error, 0,
1406                                 blk_status_to_errno(status));
1407         }
1408
1409         BUG_ON(!test_bit(B_WRITING, &b->state));
1410
1411         smp_mb__before_atomic();
1412         clear_bit(B_WRITING, &b->state);
1413         smp_mb__after_atomic();
1414
1415         wake_up_bit(&b->state, B_WRITING);
1416 }
1417
1418 /*
1419  * Initiate a write on a dirty buffer, but don't wait for it.
1420  *
1421  * - If the buffer is not dirty, exit.
1422  * - If there some previous write going on, wait for it to finish (we can't
1423  *   have two writes on the same buffer simultaneously).
1424  * - Submit our write and don't wait on it. We set B_WRITING indicating
1425  *   that there is a write in progress.
1426  */
1427 static void __write_dirty_buffer(struct dm_buffer *b,
1428                                  struct list_head *write_list)
1429 {
1430         if (!test_bit(B_DIRTY, &b->state))
1431                 return;
1432
1433         clear_bit(B_DIRTY, &b->state);
1434         wait_on_bit_lock_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE);
1435
1436         b->write_start = b->dirty_start;
1437         b->write_end = b->dirty_end;
1438
1439         if (!write_list)
1440                 submit_io(b, REQ_OP_WRITE, write_endio);
1441         else
1442                 list_add_tail(&b->write_list, write_list);
1443 }
1444
1445 static void __flush_write_list(struct list_head *write_list)
1446 {
1447         struct blk_plug plug;
1448
1449         blk_start_plug(&plug);
1450         while (!list_empty(write_list)) {
1451                 struct dm_buffer *b =
1452                         list_entry(write_list->next, struct dm_buffer, write_list);
1453                 list_del(&b->write_list);
1454                 submit_io(b, REQ_OP_WRITE, write_endio);
1455                 cond_resched();
1456         }
1457         blk_finish_plug(&plug);
1458 }
1459
1460 /*
1461  * Wait until any activity on the buffer finishes.  Possibly write the
1462  * buffer if it is dirty.  When this function finishes, there is no I/O
1463  * running on the buffer and the buffer is not dirty.
1464  */
1465 static void __make_buffer_clean(struct dm_buffer *b)
1466 {
1467         BUG_ON(atomic_read(&b->hold_count));
1468
1469         /* smp_load_acquire() pairs with read_endio()'s smp_mb__before_atomic() */
1470         if (!smp_load_acquire(&b->state))       /* fast case */
1471                 return;
1472
1473         wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE);
1474         __write_dirty_buffer(b, NULL);
1475         wait_on_bit_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE);
1476 }
1477
1478 static enum evict_result is_clean(struct dm_buffer *b, void *context)
1479 {
1480         struct dm_bufio_client *c = context;
1481
1482         /* These should never happen */
1483         if (WARN_ON_ONCE(test_bit(B_WRITING, &b->state)))
1484                 return ER_DONT_EVICT;
1485         if (WARN_ON_ONCE(test_bit(B_DIRTY, &b->state)))
1486                 return ER_DONT_EVICT;
1487         if (WARN_ON_ONCE(b->list_mode != LIST_CLEAN))
1488                 return ER_DONT_EVICT;
1489
1490         if (static_branch_unlikely(&no_sleep_enabled) && c->no_sleep &&
1491             unlikely(test_bit(B_READING, &b->state)))
1492                 return ER_DONT_EVICT;
1493
1494         return ER_EVICT;
1495 }
1496
1497 static enum evict_result is_dirty(struct dm_buffer *b, void *context)
1498 {
1499         /* These should never happen */
1500         if (WARN_ON_ONCE(test_bit(B_READING, &b->state)))
1501                 return ER_DONT_EVICT;
1502         if (WARN_ON_ONCE(b->list_mode != LIST_DIRTY))
1503                 return ER_DONT_EVICT;
1504
1505         return ER_EVICT;
1506 }
1507
1508 /*
1509  * Find some buffer that is not held by anybody, clean it, unlink it and
1510  * return it.
1511  */
1512 static struct dm_buffer *__get_unclaimed_buffer(struct dm_bufio_client *c)
1513 {
1514         struct dm_buffer *b;
1515
1516         b = cache_evict(&c->cache, LIST_CLEAN, is_clean, c);
1517         if (b) {
1518                 /* this also waits for pending reads */
1519                 __make_buffer_clean(b);
1520                 return b;
1521         }
1522
1523         if (static_branch_unlikely(&no_sleep_enabled) && c->no_sleep)
1524                 return NULL;
1525
1526         b = cache_evict(&c->cache, LIST_DIRTY, is_dirty, NULL);
1527         if (b) {
1528                 __make_buffer_clean(b);
1529                 return b;
1530         }
1531
1532         return NULL;
1533 }
1534
1535 /*
1536  * Wait until some other threads free some buffer or release hold count on
1537  * some buffer.
1538  *
1539  * This function is entered with c->lock held, drops it and regains it
1540  * before exiting.
1541  */
1542 static void __wait_for_free_buffer(struct dm_bufio_client *c)
1543 {
1544         DECLARE_WAITQUEUE(wait, current);
1545
1546         add_wait_queue(&c->free_buffer_wait, &wait);
1547         set_current_state(TASK_UNINTERRUPTIBLE);
1548         dm_bufio_unlock(c);
1549
1550         /*
1551          * It's possible to miss a wake up event since we don't always
1552          * hold c->lock when wake_up is called.  So we have a timeout here,
1553          * just in case.
1554          */
1555         io_schedule_timeout(5 * HZ);
1556
1557         remove_wait_queue(&c->free_buffer_wait, &wait);
1558
1559         dm_bufio_lock(c);
1560 }
1561
1562 enum new_flag {
1563         NF_FRESH = 0,
1564         NF_READ = 1,
1565         NF_GET = 2,
1566         NF_PREFETCH = 3
1567 };
1568
1569 /*
1570  * Allocate a new buffer. If the allocation is not possible, wait until
1571  * some other thread frees a buffer.
1572  *
1573  * May drop the lock and regain it.
1574  */
1575 static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client *c, enum new_flag nf)
1576 {
1577         struct dm_buffer *b;
1578         bool tried_noio_alloc = false;
1579
1580         /*
1581          * dm-bufio is resistant to allocation failures (it just keeps
1582          * one buffer reserved in cases all the allocations fail).
1583          * So set flags to not try too hard:
1584          *      GFP_NOWAIT: don't wait; if we need to sleep we'll release our
1585          *                  mutex and wait ourselves.
1586          *      __GFP_NORETRY: don't retry and rather return failure
1587          *      __GFP_NOMEMALLOC: don't use emergency reserves
1588          *      __GFP_NOWARN: don't print a warning in case of failure
1589          *
1590          * For debugging, if we set the cache size to 1, no new buffers will
1591          * be allocated.
1592          */
1593         while (1) {
1594                 if (dm_bufio_cache_size_latch != 1) {
1595                         b = alloc_buffer(c, GFP_NOWAIT | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
1596                         if (b)
1597                                 return b;
1598                 }
1599
1600                 if (nf == NF_PREFETCH)
1601                         return NULL;
1602
1603                 if (dm_bufio_cache_size_latch != 1 && !tried_noio_alloc) {
1604                         dm_bufio_unlock(c);
1605                         b = alloc_buffer(c, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
1606                         dm_bufio_lock(c);
1607                         if (b)
1608                                 return b;
1609                         tried_noio_alloc = true;
1610                 }
1611
1612                 if (!list_empty(&c->reserved_buffers)) {
1613                         b = list_to_buffer(c->reserved_buffers.next);
1614                         list_del(&b->lru.list);
1615                         c->need_reserved_buffers++;
1616
1617                         return b;
1618                 }
1619
1620                 b = __get_unclaimed_buffer(c);
1621                 if (b)
1622                         return b;
1623
1624                 __wait_for_free_buffer(c);
1625         }
1626 }
1627
1628 static struct dm_buffer *__alloc_buffer_wait(struct dm_bufio_client *c, enum new_flag nf)
1629 {
1630         struct dm_buffer *b = __alloc_buffer_wait_no_callback(c, nf);
1631
1632         if (!b)
1633                 return NULL;
1634
1635         if (c->alloc_callback)
1636                 c->alloc_callback(b);
1637
1638         return b;
1639 }
1640
1641 /*
1642  * Free a buffer and wake other threads waiting for free buffers.
1643  */
1644 static void __free_buffer_wake(struct dm_buffer *b)
1645 {
1646         struct dm_bufio_client *c = b->c;
1647
1648         b->block = -1;
1649         if (!c->need_reserved_buffers)
1650                 free_buffer(b);
1651         else {
1652                 list_add(&b->lru.list, &c->reserved_buffers);
1653                 c->need_reserved_buffers--;
1654         }
1655
1656         /*
1657          * We hold the bufio lock here, so no one can add entries to the
1658          * wait queue anyway.
1659          */
1660         if (unlikely(waitqueue_active(&c->free_buffer_wait)))
1661                 wake_up(&c->free_buffer_wait);
1662 }
1663
1664 static enum evict_result cleaned(struct dm_buffer *b, void *context)
1665 {
1666         if (WARN_ON_ONCE(test_bit(B_READING, &b->state)))
1667                 return ER_DONT_EVICT; /* should never happen */
1668
1669         if (test_bit(B_DIRTY, &b->state) || test_bit(B_WRITING, &b->state))
1670                 return ER_DONT_EVICT;
1671         else
1672                 return ER_EVICT;
1673 }
1674
1675 static void __move_clean_buffers(struct dm_bufio_client *c)
1676 {
1677         cache_mark_many(&c->cache, LIST_DIRTY, LIST_CLEAN, cleaned, NULL);
1678 }
1679
1680 struct write_context {
1681         int no_wait;
1682         struct list_head *write_list;
1683 };
1684
1685 static enum it_action write_one(struct dm_buffer *b, void *context)
1686 {
1687         struct write_context *wc = context;
1688
1689         if (wc->no_wait && test_bit(B_WRITING, &b->state))
1690                 return IT_COMPLETE;
1691
1692         __write_dirty_buffer(b, wc->write_list);
1693         return IT_NEXT;
1694 }
1695
1696 static void __write_dirty_buffers_async(struct dm_bufio_client *c, int no_wait,
1697                                         struct list_head *write_list)
1698 {
1699         struct write_context wc = {.no_wait = no_wait, .write_list = write_list};
1700
1701         __move_clean_buffers(c);
1702         cache_iterate(&c->cache, LIST_DIRTY, write_one, &wc);
1703 }
1704
1705 /*
1706  * Check if we're over watermark.
1707  * If we are over threshold_buffers, start freeing buffers.
1708  * If we're over "limit_buffers", block until we get under the limit.
1709  */
1710 static void __check_watermark(struct dm_bufio_client *c,
1711                               struct list_head *write_list)
1712 {
1713         if (cache_count(&c->cache, LIST_DIRTY) >
1714             cache_count(&c->cache, LIST_CLEAN) * DM_BUFIO_WRITEBACK_RATIO)
1715                 __write_dirty_buffers_async(c, 1, write_list);
1716 }
1717
1718 /*
1719  *--------------------------------------------------------------
1720  * Getting a buffer
1721  *--------------------------------------------------------------
1722  */
1723
1724 static void cache_put_and_wake(struct dm_bufio_client *c, struct dm_buffer *b)
1725 {
1726         /*
1727          * Relying on waitqueue_active() is racey, but we sleep
1728          * with schedule_timeout anyway.
1729          */
1730         if (cache_put(&c->cache, b) &&
1731             unlikely(waitqueue_active(&c->free_buffer_wait)))
1732                 wake_up(&c->free_buffer_wait);
1733 }
1734
1735 /*
1736  * This assumes you have already checked the cache to see if the buffer
1737  * is already present (it will recheck after dropping the lock for allocation).
1738  */
1739 static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block,
1740                                      enum new_flag nf, int *need_submit,
1741                                      struct list_head *write_list)
1742 {
1743         struct dm_buffer *b, *new_b = NULL;
1744
1745         *need_submit = 0;
1746
1747         /* This can't be called with NF_GET */
1748         if (WARN_ON_ONCE(nf == NF_GET))
1749                 return NULL;
1750
1751         new_b = __alloc_buffer_wait(c, nf);
1752         if (!new_b)
1753                 return NULL;
1754
1755         /*
1756          * We've had a period where the mutex was unlocked, so need to
1757          * recheck the buffer tree.
1758          */
1759         b = cache_get(&c->cache, block);
1760         if (b) {
1761                 __free_buffer_wake(new_b);
1762                 goto found_buffer;
1763         }
1764
1765         __check_watermark(c, write_list);
1766
1767         b = new_b;
1768         atomic_set(&b->hold_count, 1);
1769         WRITE_ONCE(b->last_accessed, jiffies);
1770         b->block = block;
1771         b->read_error = 0;
1772         b->write_error = 0;
1773         b->list_mode = LIST_CLEAN;
1774
1775         if (nf == NF_FRESH)
1776                 b->state = 0;
1777         else {
1778                 b->state = 1 << B_READING;
1779                 *need_submit = 1;
1780         }
1781
1782         /*
1783          * We mustn't insert into the cache until the B_READING state
1784          * is set.  Otherwise another thread could get it and use
1785          * it before it had been read.
1786          */
1787         cache_insert(&c->cache, b);
1788
1789         return b;
1790
1791 found_buffer:
1792         if (nf == NF_PREFETCH) {
1793                 cache_put_and_wake(c, b);
1794                 return NULL;
1795         }
1796
1797         /*
1798          * Note: it is essential that we don't wait for the buffer to be
1799          * read if dm_bufio_get function is used. Both dm_bufio_get and
1800          * dm_bufio_prefetch can be used in the driver request routine.
1801          * If the user called both dm_bufio_prefetch and dm_bufio_get on
1802          * the same buffer, it would deadlock if we waited.
1803          */
1804         if (nf == NF_GET && unlikely(test_bit_acquire(B_READING, &b->state))) {
1805                 cache_put_and_wake(c, b);
1806                 return NULL;
1807         }
1808
1809         return b;
1810 }
1811
1812 /*
1813  * The endio routine for reading: set the error, clear the bit and wake up
1814  * anyone waiting on the buffer.
1815  */
1816 static void read_endio(struct dm_buffer *b, blk_status_t status)
1817 {
1818         b->read_error = status;
1819
1820         BUG_ON(!test_bit(B_READING, &b->state));
1821
1822         smp_mb__before_atomic();
1823         clear_bit(B_READING, &b->state);
1824         smp_mb__after_atomic();
1825
1826         wake_up_bit(&b->state, B_READING);
1827 }
1828
1829 /*
1830  * A common routine for dm_bufio_new and dm_bufio_read.  Operation of these
1831  * functions is similar except that dm_bufio_new doesn't read the
1832  * buffer from the disk (assuming that the caller overwrites all the data
1833  * and uses dm_bufio_mark_buffer_dirty to write new data back).
1834  */
1835 static void *new_read(struct dm_bufio_client *c, sector_t block,
1836                       enum new_flag nf, struct dm_buffer **bp)
1837 {
1838         int need_submit = 0;
1839         struct dm_buffer *b;
1840
1841         LIST_HEAD(write_list);
1842
1843         *bp = NULL;
1844
1845         /*
1846          * Fast path, hopefully the block is already in the cache.  No need
1847          * to get the client lock for this.
1848          */
1849         b = cache_get(&c->cache, block);
1850         if (b) {
1851                 if (nf == NF_PREFETCH) {
1852                         cache_put_and_wake(c, b);
1853                         return NULL;
1854                 }
1855
1856                 /*
1857                  * Note: it is essential that we don't wait for the buffer to be
1858                  * read if dm_bufio_get function is used. Both dm_bufio_get and
1859                  * dm_bufio_prefetch can be used in the driver request routine.
1860                  * If the user called both dm_bufio_prefetch and dm_bufio_get on
1861                  * the same buffer, it would deadlock if we waited.
1862                  */
1863                 if (nf == NF_GET && unlikely(test_bit_acquire(B_READING, &b->state))) {
1864                         cache_put_and_wake(c, b);
1865                         return NULL;
1866                 }
1867         }
1868
1869         if (!b) {
1870                 if (nf == NF_GET)
1871                         return NULL;
1872
1873                 dm_bufio_lock(c);
1874                 b = __bufio_new(c, block, nf, &need_submit, &write_list);
1875                 dm_bufio_unlock(c);
1876         }
1877
1878 #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
1879         if (b && (atomic_read(&b->hold_count) == 1))
1880                 buffer_record_stack(b);
1881 #endif
1882
1883         __flush_write_list(&write_list);
1884
1885         if (!b)
1886                 return NULL;
1887
1888         if (need_submit)
1889                 submit_io(b, REQ_OP_READ, read_endio);
1890
1891         wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE);
1892
1893         if (b->read_error) {
1894                 int error = blk_status_to_errno(b->read_error);
1895
1896                 dm_bufio_release(b);
1897
1898                 return ERR_PTR(error);
1899         }
1900
1901         *bp = b;
1902
1903         return b->data;
1904 }
1905
1906 void *dm_bufio_get(struct dm_bufio_client *c, sector_t block,
1907                    struct dm_buffer **bp)
1908 {
1909         return new_read(c, block, NF_GET, bp);
1910 }
1911 EXPORT_SYMBOL_GPL(dm_bufio_get);
1912
1913 void *dm_bufio_read(struct dm_bufio_client *c, sector_t block,
1914                     struct dm_buffer **bp)
1915 {
1916         if (WARN_ON_ONCE(dm_bufio_in_request()))
1917                 return ERR_PTR(-EINVAL);
1918
1919         return new_read(c, block, NF_READ, bp);
1920 }
1921 EXPORT_SYMBOL_GPL(dm_bufio_read);
1922
1923 void *dm_bufio_new(struct dm_bufio_client *c, sector_t block,
1924                    struct dm_buffer **bp)
1925 {
1926         if (WARN_ON_ONCE(dm_bufio_in_request()))
1927                 return ERR_PTR(-EINVAL);
1928
1929         return new_read(c, block, NF_FRESH, bp);
1930 }
1931 EXPORT_SYMBOL_GPL(dm_bufio_new);
1932
1933 void dm_bufio_prefetch(struct dm_bufio_client *c,
1934                        sector_t block, unsigned int n_blocks)
1935 {
1936         struct blk_plug plug;
1937
1938         LIST_HEAD(write_list);
1939
1940         if (WARN_ON_ONCE(dm_bufio_in_request()))
1941                 return; /* should never happen */
1942
1943         blk_start_plug(&plug);
1944
1945         for (; n_blocks--; block++) {
1946                 int need_submit;
1947                 struct dm_buffer *b;
1948
1949                 b = cache_get(&c->cache, block);
1950                 if (b) {
1951                         /* already in cache */
1952                         cache_put_and_wake(c, b);
1953                         continue;
1954                 }
1955
1956                 dm_bufio_lock(c);
1957                 b = __bufio_new(c, block, NF_PREFETCH, &need_submit,
1958                                 &write_list);
1959                 if (unlikely(!list_empty(&write_list))) {
1960                         dm_bufio_unlock(c);
1961                         blk_finish_plug(&plug);
1962                         __flush_write_list(&write_list);
1963                         blk_start_plug(&plug);
1964                         dm_bufio_lock(c);
1965                 }
1966                 if (unlikely(b != NULL)) {
1967                         dm_bufio_unlock(c);
1968
1969                         if (need_submit)
1970                                 submit_io(b, REQ_OP_READ, read_endio);
1971                         dm_bufio_release(b);
1972
1973                         cond_resched();
1974
1975                         if (!n_blocks)
1976                                 goto flush_plug;
1977                         dm_bufio_lock(c);
1978                 }
1979                 dm_bufio_unlock(c);
1980         }
1981
1982 flush_plug:
1983         blk_finish_plug(&plug);
1984 }
1985 EXPORT_SYMBOL_GPL(dm_bufio_prefetch);
1986
1987 void dm_bufio_release(struct dm_buffer *b)
1988 {
1989         struct dm_bufio_client *c = b->c;
1990
1991         /*
1992          * If there were errors on the buffer, and the buffer is not
1993          * to be written, free the buffer. There is no point in caching
1994          * invalid buffer.
1995          */
1996         if ((b->read_error || b->write_error) &&
1997             !test_bit_acquire(B_READING, &b->state) &&
1998             !test_bit(B_WRITING, &b->state) &&
1999             !test_bit(B_DIRTY, &b->state)) {
2000                 dm_bufio_lock(c);
2001
2002                 /* cache remove can fail if there are other holders */
2003                 if (cache_remove(&c->cache, b)) {
2004                         __free_buffer_wake(b);
2005                         dm_bufio_unlock(c);
2006                         return;
2007                 }
2008
2009                 dm_bufio_unlock(c);
2010         }
2011
2012         cache_put_and_wake(c, b);
2013 }
2014 EXPORT_SYMBOL_GPL(dm_bufio_release);
2015
2016 void dm_bufio_mark_partial_buffer_dirty(struct dm_buffer *b,
2017                                         unsigned int start, unsigned int end)
2018 {
2019         struct dm_bufio_client *c = b->c;
2020
2021         BUG_ON(start >= end);
2022         BUG_ON(end > b->c->block_size);
2023
2024         dm_bufio_lock(c);
2025
2026         BUG_ON(test_bit(B_READING, &b->state));
2027
2028         if (!test_and_set_bit(B_DIRTY, &b->state)) {
2029                 b->dirty_start = start;
2030                 b->dirty_end = end;
2031                 cache_mark(&c->cache, b, LIST_DIRTY);
2032         } else {
2033                 if (start < b->dirty_start)
2034                         b->dirty_start = start;
2035                 if (end > b->dirty_end)
2036                         b->dirty_end = end;
2037         }
2038
2039         dm_bufio_unlock(c);
2040 }
2041 EXPORT_SYMBOL_GPL(dm_bufio_mark_partial_buffer_dirty);
2042
2043 void dm_bufio_mark_buffer_dirty(struct dm_buffer *b)
2044 {
2045         dm_bufio_mark_partial_buffer_dirty(b, 0, b->c->block_size);
2046 }
2047 EXPORT_SYMBOL_GPL(dm_bufio_mark_buffer_dirty);
2048
2049 void dm_bufio_write_dirty_buffers_async(struct dm_bufio_client *c)
2050 {
2051         LIST_HEAD(write_list);
2052
2053         if (WARN_ON_ONCE(dm_bufio_in_request()))
2054                 return; /* should never happen */
2055
2056         dm_bufio_lock(c);
2057         __write_dirty_buffers_async(c, 0, &write_list);
2058         dm_bufio_unlock(c);
2059         __flush_write_list(&write_list);
2060 }
2061 EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers_async);
2062
2063 /*
2064  * For performance, it is essential that the buffers are written asynchronously
2065  * and simultaneously (so that the block layer can merge the writes) and then
2066  * waited upon.
2067  *
2068  * Finally, we flush hardware disk cache.
2069  */
2070 static bool is_writing(struct lru_entry *e, void *context)
2071 {
2072         struct dm_buffer *b = le_to_buffer(e);
2073
2074         return test_bit(B_WRITING, &b->state);
2075 }
2076
2077 int dm_bufio_write_dirty_buffers(struct dm_bufio_client *c)
2078 {
2079         int a, f;
2080         unsigned long nr_buffers;
2081         struct lru_entry *e;
2082         struct lru_iter it;
2083
2084         LIST_HEAD(write_list);
2085
2086         dm_bufio_lock(c);
2087         __write_dirty_buffers_async(c, 0, &write_list);
2088         dm_bufio_unlock(c);
2089         __flush_write_list(&write_list);
2090         dm_bufio_lock(c);
2091
2092         nr_buffers = cache_count(&c->cache, LIST_DIRTY);
2093         lru_iter_begin(&c->cache.lru[LIST_DIRTY], &it);
2094         while ((e = lru_iter_next(&it, is_writing, c))) {
2095                 struct dm_buffer *b = le_to_buffer(e);
2096                 __cache_inc_buffer(b);
2097
2098                 BUG_ON(test_bit(B_READING, &b->state));
2099
2100                 if (nr_buffers) {
2101                         nr_buffers--;
2102                         dm_bufio_unlock(c);
2103                         wait_on_bit_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE);
2104                         dm_bufio_lock(c);
2105                 } else {
2106                         wait_on_bit_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE);
2107                 }
2108
2109                 if (!test_bit(B_DIRTY, &b->state) && !test_bit(B_WRITING, &b->state))
2110                         cache_mark(&c->cache, b, LIST_CLEAN);
2111
2112                 cache_put_and_wake(c, b);
2113
2114                 cond_resched();
2115         }
2116         lru_iter_end(&it);
2117
2118         wake_up(&c->free_buffer_wait);
2119         dm_bufio_unlock(c);
2120
2121         a = xchg(&c->async_write_error, 0);
2122         f = dm_bufio_issue_flush(c);
2123         if (a)
2124                 return a;
2125
2126         return f;
2127 }
2128 EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers);
2129
2130 /*
2131  * Use dm-io to send an empty barrier to flush the device.
2132  */
2133 int dm_bufio_issue_flush(struct dm_bufio_client *c)
2134 {
2135         struct dm_io_request io_req = {
2136                 .bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC,
2137                 .mem.type = DM_IO_KMEM,
2138                 .mem.ptr.addr = NULL,
2139                 .client = c->dm_io,
2140         };
2141         struct dm_io_region io_reg = {
2142                 .bdev = c->bdev,
2143                 .sector = 0,
2144                 .count = 0,
2145         };
2146
2147         if (WARN_ON_ONCE(dm_bufio_in_request()))
2148                 return -EINVAL;
2149
2150         return dm_io(&io_req, 1, &io_reg, NULL);
2151 }
2152 EXPORT_SYMBOL_GPL(dm_bufio_issue_flush);
2153
2154 /*
2155  * Use dm-io to send a discard request to flush the device.
2156  */
2157 int dm_bufio_issue_discard(struct dm_bufio_client *c, sector_t block, sector_t count)
2158 {
2159         struct dm_io_request io_req = {
2160                 .bi_opf = REQ_OP_DISCARD | REQ_SYNC,
2161                 .mem.type = DM_IO_KMEM,
2162                 .mem.ptr.addr = NULL,
2163                 .client = c->dm_io,
2164         };
2165         struct dm_io_region io_reg = {
2166                 .bdev = c->bdev,
2167                 .sector = block_to_sector(c, block),
2168                 .count = block_to_sector(c, count),
2169         };
2170
2171         if (WARN_ON_ONCE(dm_bufio_in_request()))
2172                 return -EINVAL; /* discards are optional */
2173
2174         return dm_io(&io_req, 1, &io_reg, NULL);
2175 }
2176 EXPORT_SYMBOL_GPL(dm_bufio_issue_discard);
2177
2178 static bool forget_buffer(struct dm_bufio_client *c, sector_t block)
2179 {
2180         struct dm_buffer *b;
2181
2182         b = cache_get(&c->cache, block);
2183         if (b) {
2184                 if (likely(!smp_load_acquire(&b->state))) {
2185                         if (cache_remove(&c->cache, b))
2186                                 __free_buffer_wake(b);
2187                         else
2188                                 cache_put_and_wake(c, b);
2189                 } else {
2190                         cache_put_and_wake(c, b);
2191                 }
2192         }
2193
2194         return b ? true : false;
2195 }
2196
2197 /*
2198  * Free the given buffer.
2199  *
2200  * This is just a hint, if the buffer is in use or dirty, this function
2201  * does nothing.
2202  */
2203 void dm_bufio_forget(struct dm_bufio_client *c, sector_t block)
2204 {
2205         dm_bufio_lock(c);
2206         forget_buffer(c, block);
2207         dm_bufio_unlock(c);
2208 }
2209 EXPORT_SYMBOL_GPL(dm_bufio_forget);
2210
2211 static enum evict_result idle(struct dm_buffer *b, void *context)
2212 {
2213         return b->state ? ER_DONT_EVICT : ER_EVICT;
2214 }
2215
2216 void dm_bufio_forget_buffers(struct dm_bufio_client *c, sector_t block, sector_t n_blocks)
2217 {
2218         dm_bufio_lock(c);
2219         cache_remove_range(&c->cache, block, block + n_blocks, idle, __free_buffer_wake);
2220         dm_bufio_unlock(c);
2221 }
2222 EXPORT_SYMBOL_GPL(dm_bufio_forget_buffers);
2223
2224 void dm_bufio_set_minimum_buffers(struct dm_bufio_client *c, unsigned int n)
2225 {
2226         c->minimum_buffers = n;
2227 }
2228 EXPORT_SYMBOL_GPL(dm_bufio_set_minimum_buffers);
2229
2230 unsigned int dm_bufio_get_block_size(struct dm_bufio_client *c)
2231 {
2232         return c->block_size;
2233 }
2234 EXPORT_SYMBOL_GPL(dm_bufio_get_block_size);
2235
2236 sector_t dm_bufio_get_device_size(struct dm_bufio_client *c)
2237 {
2238         sector_t s = bdev_nr_sectors(c->bdev);
2239
2240         if (s >= c->start)
2241                 s -= c->start;
2242         else
2243                 s = 0;
2244         if (likely(c->sectors_per_block_bits >= 0))
2245                 s >>= c->sectors_per_block_bits;
2246         else
2247                 sector_div(s, c->block_size >> SECTOR_SHIFT);
2248         return s;
2249 }
2250 EXPORT_SYMBOL_GPL(dm_bufio_get_device_size);
2251
2252 struct dm_io_client *dm_bufio_get_dm_io_client(struct dm_bufio_client *c)
2253 {
2254         return c->dm_io;
2255 }
2256 EXPORT_SYMBOL_GPL(dm_bufio_get_dm_io_client);
2257
2258 sector_t dm_bufio_get_block_number(struct dm_buffer *b)
2259 {
2260         return b->block;
2261 }
2262 EXPORT_SYMBOL_GPL(dm_bufio_get_block_number);
2263
2264 void *dm_bufio_get_block_data(struct dm_buffer *b)
2265 {
2266         return b->data;
2267 }
2268 EXPORT_SYMBOL_GPL(dm_bufio_get_block_data);
2269
2270 void *dm_bufio_get_aux_data(struct dm_buffer *b)
2271 {
2272         return b + 1;
2273 }
2274 EXPORT_SYMBOL_GPL(dm_bufio_get_aux_data);
2275
2276 struct dm_bufio_client *dm_bufio_get_client(struct dm_buffer *b)
2277 {
2278         return b->c;
2279 }
2280 EXPORT_SYMBOL_GPL(dm_bufio_get_client);
2281
2282 static enum it_action warn_leak(struct dm_buffer *b, void *context)
2283 {
2284         bool *warned = context;
2285
2286         WARN_ON(!(*warned));
2287         *warned = true;
2288         DMERR("leaked buffer %llx, hold count %u, list %d",
2289               (unsigned long long)b->block, atomic_read(&b->hold_count), b->list_mode);
2290 #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
2291         stack_trace_print(b->stack_entries, b->stack_len, 1);
2292         /* mark unclaimed to avoid WARN_ON at end of drop_buffers() */
2293         atomic_set(&b->hold_count, 0);
2294 #endif
2295         return IT_NEXT;
2296 }
2297
2298 static void drop_buffers(struct dm_bufio_client *c)
2299 {
2300         int i;
2301         struct dm_buffer *b;
2302
2303         if (WARN_ON(dm_bufio_in_request()))
2304                 return; /* should never happen */
2305
2306         /*
2307          * An optimization so that the buffers are not written one-by-one.
2308          */
2309         dm_bufio_write_dirty_buffers_async(c);
2310
2311         dm_bufio_lock(c);
2312
2313         while ((b = __get_unclaimed_buffer(c)))
2314                 __free_buffer_wake(b);
2315
2316         for (i = 0; i < LIST_SIZE; i++) {
2317                 bool warned = false;
2318
2319                 cache_iterate(&c->cache, i, warn_leak, &warned);
2320         }
2321
2322 #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
2323         while ((b = __get_unclaimed_buffer(c)))
2324                 __free_buffer_wake(b);
2325 #endif
2326
2327         for (i = 0; i < LIST_SIZE; i++)
2328                 WARN_ON(cache_count(&c->cache, i));
2329
2330         dm_bufio_unlock(c);
2331 }
2332
2333 static unsigned long get_retain_buffers(struct dm_bufio_client *c)
2334 {
2335         unsigned long retain_bytes = READ_ONCE(dm_bufio_retain_bytes);
2336
2337         if (likely(c->sectors_per_block_bits >= 0))
2338                 retain_bytes >>= c->sectors_per_block_bits + SECTOR_SHIFT;
2339         else
2340                 retain_bytes /= c->block_size;
2341
2342         return retain_bytes;
2343 }
2344
2345 static void __scan(struct dm_bufio_client *c)
2346 {
2347         int l;
2348         struct dm_buffer *b;
2349         unsigned long freed = 0;
2350         unsigned long retain_target = get_retain_buffers(c);
2351         unsigned long count = cache_total(&c->cache);
2352
2353         for (l = 0; l < LIST_SIZE; l++) {
2354                 while (true) {
2355                         if (count - freed <= retain_target)
2356                                 atomic_long_set(&c->need_shrink, 0);
2357                         if (!atomic_long_read(&c->need_shrink))
2358                                 break;
2359
2360                         b = cache_evict(&c->cache, l,
2361                                         l == LIST_CLEAN ? is_clean : is_dirty, c);
2362                         if (!b)
2363                                 break;
2364
2365                         __make_buffer_clean(b);
2366                         __free_buffer_wake(b);
2367
2368                         atomic_long_dec(&c->need_shrink);
2369                         freed++;
2370                         cond_resched();
2371                 }
2372         }
2373 }
2374
2375 static void shrink_work(struct work_struct *w)
2376 {
2377         struct dm_bufio_client *c = container_of(w, struct dm_bufio_client, shrink_work);
2378
2379         dm_bufio_lock(c);
2380         __scan(c);
2381         dm_bufio_unlock(c);
2382 }
2383
2384 static unsigned long dm_bufio_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
2385 {
2386         struct dm_bufio_client *c;
2387
2388         c = container_of(shrink, struct dm_bufio_client, shrinker);
2389         atomic_long_add(sc->nr_to_scan, &c->need_shrink);
2390         queue_work(dm_bufio_wq, &c->shrink_work);
2391
2392         return sc->nr_to_scan;
2393 }
2394
2395 static unsigned long dm_bufio_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
2396 {
2397         struct dm_bufio_client *c = container_of(shrink, struct dm_bufio_client, shrinker);
2398         unsigned long count = cache_total(&c->cache);
2399         unsigned long retain_target = get_retain_buffers(c);
2400         unsigned long queued_for_cleanup = atomic_long_read(&c->need_shrink);
2401
2402         if (unlikely(count < retain_target))
2403                 count = 0;
2404         else
2405                 count -= retain_target;
2406
2407         if (unlikely(count < queued_for_cleanup))
2408                 count = 0;
2409         else
2410                 count -= queued_for_cleanup;
2411
2412         return count;
2413 }
2414
2415 /*
2416  * Create the buffering interface
2417  */
2418 struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsigned int block_size,
2419                                                unsigned int reserved_buffers, unsigned int aux_size,
2420                                                void (*alloc_callback)(struct dm_buffer *),
2421                                                void (*write_callback)(struct dm_buffer *),
2422                                                unsigned int flags)
2423 {
2424         int r;
2425         unsigned int num_locks;
2426         struct dm_bufio_client *c;
2427         char slab_name[27];
2428
2429         if (!block_size || block_size & ((1 << SECTOR_SHIFT) - 1)) {
2430                 DMERR("%s: block size not specified or is not multiple of 512b", __func__);
2431                 r = -EINVAL;
2432                 goto bad_client;
2433         }
2434
2435         num_locks = dm_num_hash_locks();
2436         c = kzalloc(sizeof(*c) + (num_locks * sizeof(struct buffer_tree)), GFP_KERNEL);
2437         if (!c) {
2438                 r = -ENOMEM;
2439                 goto bad_client;
2440         }
2441         cache_init(&c->cache, num_locks);
2442
2443         c->bdev = bdev;
2444         c->block_size = block_size;
2445         if (is_power_of_2(block_size))
2446                 c->sectors_per_block_bits = __ffs(block_size) - SECTOR_SHIFT;
2447         else
2448                 c->sectors_per_block_bits = -1;
2449
2450         c->alloc_callback = alloc_callback;
2451         c->write_callback = write_callback;
2452
2453         if (flags & DM_BUFIO_CLIENT_NO_SLEEP) {
2454                 c->no_sleep = true;
2455                 static_branch_inc(&no_sleep_enabled);
2456         }
2457
2458         mutex_init(&c->lock);
2459         spin_lock_init(&c->spinlock);
2460         INIT_LIST_HEAD(&c->reserved_buffers);
2461         c->need_reserved_buffers = reserved_buffers;
2462
2463         dm_bufio_set_minimum_buffers(c, DM_BUFIO_MIN_BUFFERS);
2464
2465         init_waitqueue_head(&c->free_buffer_wait);
2466         c->async_write_error = 0;
2467
2468         c->dm_io = dm_io_client_create();
2469         if (IS_ERR(c->dm_io)) {
2470                 r = PTR_ERR(c->dm_io);
2471                 goto bad_dm_io;
2472         }
2473
2474         if (block_size <= KMALLOC_MAX_SIZE &&
2475             (block_size < PAGE_SIZE || !is_power_of_2(block_size))) {
2476                 unsigned int align = min(1U << __ffs(block_size), (unsigned int)PAGE_SIZE);
2477
2478                 snprintf(slab_name, sizeof(slab_name), "dm_bufio_cache-%u", block_size);
2479                 c->slab_cache = kmem_cache_create(slab_name, block_size, align,
2480                                                   SLAB_RECLAIM_ACCOUNT, NULL);
2481                 if (!c->slab_cache) {
2482                         r = -ENOMEM;
2483                         goto bad;
2484                 }
2485         }
2486         if (aux_size)
2487                 snprintf(slab_name, sizeof(slab_name), "dm_bufio_buffer-%u", aux_size);
2488         else
2489                 snprintf(slab_name, sizeof(slab_name), "dm_bufio_buffer");
2490         c->slab_buffer = kmem_cache_create(slab_name, sizeof(struct dm_buffer) + aux_size,
2491                                            0, SLAB_RECLAIM_ACCOUNT, NULL);
2492         if (!c->slab_buffer) {
2493                 r = -ENOMEM;
2494                 goto bad;
2495         }
2496
2497         while (c->need_reserved_buffers) {
2498                 struct dm_buffer *b = alloc_buffer(c, GFP_KERNEL);
2499
2500                 if (!b) {
2501                         r = -ENOMEM;
2502                         goto bad;
2503                 }
2504                 __free_buffer_wake(b);
2505         }
2506
2507         INIT_WORK(&c->shrink_work, shrink_work);
2508         atomic_long_set(&c->need_shrink, 0);
2509
2510         c->shrinker.count_objects = dm_bufio_shrink_count;
2511         c->shrinker.scan_objects = dm_bufio_shrink_scan;
2512         c->shrinker.seeks = 1;
2513         c->shrinker.batch = 0;
2514         r = register_shrinker(&c->shrinker, "dm-bufio:(%u:%u)",
2515                               MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev));
2516         if (r)
2517                 goto bad;
2518
2519         mutex_lock(&dm_bufio_clients_lock);
2520         dm_bufio_client_count++;
2521         list_add(&c->client_list, &dm_bufio_all_clients);
2522         __cache_size_refresh();
2523         mutex_unlock(&dm_bufio_clients_lock);
2524
2525         return c;
2526
2527 bad:
2528         while (!list_empty(&c->reserved_buffers)) {
2529                 struct dm_buffer *b = list_to_buffer(c->reserved_buffers.next);
2530
2531                 list_del(&b->lru.list);
2532                 free_buffer(b);
2533         }
2534         kmem_cache_destroy(c->slab_cache);
2535         kmem_cache_destroy(c->slab_buffer);
2536         dm_io_client_destroy(c->dm_io);
2537 bad_dm_io:
2538         mutex_destroy(&c->lock);
2539         if (c->no_sleep)
2540                 static_branch_dec(&no_sleep_enabled);
2541         kfree(c);
2542 bad_client:
2543         return ERR_PTR(r);
2544 }
2545 EXPORT_SYMBOL_GPL(dm_bufio_client_create);
2546
2547 /*
2548  * Free the buffering interface.
2549  * It is required that there are no references on any buffers.
2550  */
2551 void dm_bufio_client_destroy(struct dm_bufio_client *c)
2552 {
2553         unsigned int i;
2554
2555         drop_buffers(c);
2556
2557         unregister_shrinker(&c->shrinker);
2558         flush_work(&c->shrink_work);
2559
2560         mutex_lock(&dm_bufio_clients_lock);
2561
2562         list_del(&c->client_list);
2563         dm_bufio_client_count--;
2564         __cache_size_refresh();
2565
2566         mutex_unlock(&dm_bufio_clients_lock);
2567
2568         WARN_ON(c->need_reserved_buffers);
2569
2570         while (!list_empty(&c->reserved_buffers)) {
2571                 struct dm_buffer *b = list_to_buffer(c->reserved_buffers.next);
2572
2573                 list_del(&b->lru.list);
2574                 free_buffer(b);
2575         }
2576
2577         for (i = 0; i < LIST_SIZE; i++)
2578                 if (cache_count(&c->cache, i))
2579                         DMERR("leaked buffer count %d: %lu", i, cache_count(&c->cache, i));
2580
2581         for (i = 0; i < LIST_SIZE; i++)
2582                 WARN_ON(cache_count(&c->cache, i));
2583
2584         cache_destroy(&c->cache);
2585         kmem_cache_destroy(c->slab_cache);
2586         kmem_cache_destroy(c->slab_buffer);
2587         dm_io_client_destroy(c->dm_io);
2588         mutex_destroy(&c->lock);
2589         if (c->no_sleep)
2590                 static_branch_dec(&no_sleep_enabled);
2591         kfree(c);
2592 }
2593 EXPORT_SYMBOL_GPL(dm_bufio_client_destroy);
2594
2595 void dm_bufio_set_sector_offset(struct dm_bufio_client *c, sector_t start)
2596 {
2597         c->start = start;
2598 }
2599 EXPORT_SYMBOL_GPL(dm_bufio_set_sector_offset);
2600
2601 /*--------------------------------------------------------------*/
2602
2603 static unsigned int get_max_age_hz(void)
2604 {
2605         unsigned int max_age = READ_ONCE(dm_bufio_max_age);
2606
2607         if (max_age > UINT_MAX / HZ)
2608                 max_age = UINT_MAX / HZ;
2609
2610         return max_age * HZ;
2611 }
2612
2613 static bool older_than(struct dm_buffer *b, unsigned long age_hz)
2614 {
2615         return time_after_eq(jiffies, READ_ONCE(b->last_accessed) + age_hz);
2616 }
2617
2618 struct evict_params {
2619         gfp_t gfp;
2620         unsigned long age_hz;
2621
2622         /*
2623          * This gets updated with the largest last_accessed (ie. most
2624          * recently used) of the evicted buffers.  It will not be reinitialised
2625          * by __evict_many(), so you can use it across multiple invocations.
2626          */
2627         unsigned long last_accessed;
2628 };
2629
2630 /*
2631  * We may not be able to evict this buffer if IO pending or the client
2632  * is still using it.
2633  *
2634  * And if GFP_NOFS is used, we must not do any I/O because we hold
2635  * dm_bufio_clients_lock and we would risk deadlock if the I/O gets
2636  * rerouted to different bufio client.
2637  */
2638 static enum evict_result select_for_evict(struct dm_buffer *b, void *context)
2639 {
2640         struct evict_params *params = context;
2641
2642         if (!(params->gfp & __GFP_FS) ||
2643             (static_branch_unlikely(&no_sleep_enabled) && b->c->no_sleep)) {
2644                 if (test_bit_acquire(B_READING, &b->state) ||
2645                     test_bit(B_WRITING, &b->state) ||
2646                     test_bit(B_DIRTY, &b->state))
2647                         return ER_DONT_EVICT;
2648         }
2649
2650         return older_than(b, params->age_hz) ? ER_EVICT : ER_STOP;
2651 }
2652
2653 static unsigned long __evict_many(struct dm_bufio_client *c,
2654                                   struct evict_params *params,
2655                                   int list_mode, unsigned long max_count)
2656 {
2657         unsigned long count;
2658         unsigned long last_accessed;
2659         struct dm_buffer *b;
2660
2661         for (count = 0; count < max_count; count++) {
2662                 b = cache_evict(&c->cache, list_mode, select_for_evict, params);
2663                 if (!b)
2664                         break;
2665
2666                 last_accessed = READ_ONCE(b->last_accessed);
2667                 if (time_after_eq(params->last_accessed, last_accessed))
2668                         params->last_accessed = last_accessed;
2669
2670                 __make_buffer_clean(b);
2671                 __free_buffer_wake(b);
2672
2673                 cond_resched();
2674         }
2675
2676         return count;
2677 }
2678
2679 static void evict_old_buffers(struct dm_bufio_client *c, unsigned long age_hz)
2680 {
2681         struct evict_params params = {.gfp = 0, .age_hz = age_hz, .last_accessed = 0};
2682         unsigned long retain = get_retain_buffers(c);
2683         unsigned long count;
2684         LIST_HEAD(write_list);
2685
2686         dm_bufio_lock(c);
2687
2688         __check_watermark(c, &write_list);
2689         if (unlikely(!list_empty(&write_list))) {
2690                 dm_bufio_unlock(c);
2691                 __flush_write_list(&write_list);
2692                 dm_bufio_lock(c);
2693         }
2694
2695         count = cache_total(&c->cache);
2696         if (count > retain)
2697                 __evict_many(c, &params, LIST_CLEAN, count - retain);
2698
2699         dm_bufio_unlock(c);
2700 }
2701
2702 static void cleanup_old_buffers(void)
2703 {
2704         unsigned long max_age_hz = get_max_age_hz();
2705         struct dm_bufio_client *c;
2706
2707         mutex_lock(&dm_bufio_clients_lock);
2708
2709         __cache_size_refresh();
2710
2711         list_for_each_entry(c, &dm_bufio_all_clients, client_list)
2712                 evict_old_buffers(c, max_age_hz);
2713
2714         mutex_unlock(&dm_bufio_clients_lock);
2715 }
2716
2717 static void work_fn(struct work_struct *w)
2718 {
2719         cleanup_old_buffers();
2720
2721         queue_delayed_work(dm_bufio_wq, &dm_bufio_cleanup_old_work,
2722                            DM_BUFIO_WORK_TIMER_SECS * HZ);
2723 }
2724
2725 /*--------------------------------------------------------------*/
2726
2727 /*
2728  * Global cleanup tries to evict the oldest buffers from across _all_
2729  * the clients.  It does this by repeatedly evicting a few buffers from
2730  * the client that holds the oldest buffer.  It's approximate, but hopefully
2731  * good enough.
2732  */
2733 static struct dm_bufio_client *__pop_client(void)
2734 {
2735         struct list_head *h;
2736
2737         if (list_empty(&dm_bufio_all_clients))
2738                 return NULL;
2739
2740         h = dm_bufio_all_clients.next;
2741         list_del(h);
2742         return container_of(h, struct dm_bufio_client, client_list);
2743 }
2744
2745 /*
2746  * Inserts the client in the global client list based on its
2747  * 'oldest_buffer' field.
2748  */
2749 static void __insert_client(struct dm_bufio_client *new_client)
2750 {
2751         struct dm_bufio_client *c;
2752         struct list_head *h = dm_bufio_all_clients.next;
2753
2754         while (h != &dm_bufio_all_clients) {
2755                 c = container_of(h, struct dm_bufio_client, client_list);
2756                 if (time_after_eq(c->oldest_buffer, new_client->oldest_buffer))
2757                         break;
2758                 h = h->next;
2759         }
2760
2761         list_add_tail(&new_client->client_list, h);
2762 }
2763
2764 static unsigned long __evict_a_few(unsigned long nr_buffers)
2765 {
2766         unsigned long count;
2767         struct dm_bufio_client *c;
2768         struct evict_params params = {
2769                 .gfp = GFP_KERNEL,
2770                 .age_hz = 0,
2771                 /* set to jiffies in case there are no buffers in this client */
2772                 .last_accessed = jiffies
2773         };
2774
2775         c = __pop_client();
2776         if (!c)
2777                 return 0;
2778
2779         dm_bufio_lock(c);
2780         count = __evict_many(c, &params, LIST_CLEAN, nr_buffers);
2781         dm_bufio_unlock(c);
2782
2783         if (count)
2784                 c->oldest_buffer = params.last_accessed;
2785         __insert_client(c);
2786
2787         return count;
2788 }
2789
2790 static void check_watermarks(void)
2791 {
2792         LIST_HEAD(write_list);
2793         struct dm_bufio_client *c;
2794
2795         mutex_lock(&dm_bufio_clients_lock);
2796         list_for_each_entry(c, &dm_bufio_all_clients, client_list) {
2797                 dm_bufio_lock(c);
2798                 __check_watermark(c, &write_list);
2799                 dm_bufio_unlock(c);
2800         }
2801         mutex_unlock(&dm_bufio_clients_lock);
2802
2803         __flush_write_list(&write_list);
2804 }
2805
2806 static void evict_old(void)
2807 {
2808         unsigned long threshold = dm_bufio_cache_size -
2809                 dm_bufio_cache_size / DM_BUFIO_LOW_WATERMARK_RATIO;
2810
2811         mutex_lock(&dm_bufio_clients_lock);
2812         while (dm_bufio_current_allocated > threshold) {
2813                 if (!__evict_a_few(64))
2814                         break;
2815                 cond_resched();
2816         }
2817         mutex_unlock(&dm_bufio_clients_lock);
2818 }
2819
2820 static void do_global_cleanup(struct work_struct *w)
2821 {
2822         check_watermarks();
2823         evict_old();
2824 }
2825
2826 /*
2827  *--------------------------------------------------------------
2828  * Module setup
2829  *--------------------------------------------------------------
2830  */
2831
2832 /*
2833  * This is called only once for the whole dm_bufio module.
2834  * It initializes memory limit.
2835  */
2836 static int __init dm_bufio_init(void)
2837 {
2838         __u64 mem;
2839
2840         dm_bufio_allocated_kmem_cache = 0;
2841         dm_bufio_allocated_get_free_pages = 0;
2842         dm_bufio_allocated_vmalloc = 0;
2843         dm_bufio_current_allocated = 0;
2844
2845         mem = (__u64)mult_frac(totalram_pages() - totalhigh_pages(),
2846                                DM_BUFIO_MEMORY_PERCENT, 100) << PAGE_SHIFT;
2847
2848         if (mem > ULONG_MAX)
2849                 mem = ULONG_MAX;
2850
2851 #ifdef CONFIG_MMU
2852         if (mem > mult_frac(VMALLOC_TOTAL, DM_BUFIO_VMALLOC_PERCENT, 100))
2853                 mem = mult_frac(VMALLOC_TOTAL, DM_BUFIO_VMALLOC_PERCENT, 100);
2854 #endif
2855
2856         dm_bufio_default_cache_size = mem;
2857
2858         mutex_lock(&dm_bufio_clients_lock);
2859         __cache_size_refresh();
2860         mutex_unlock(&dm_bufio_clients_lock);
2861
2862         dm_bufio_wq = alloc_workqueue("dm_bufio_cache", WQ_MEM_RECLAIM, 0);
2863         if (!dm_bufio_wq)
2864                 return -ENOMEM;
2865
2866         INIT_DELAYED_WORK(&dm_bufio_cleanup_old_work, work_fn);
2867         INIT_WORK(&dm_bufio_replacement_work, do_global_cleanup);
2868         queue_delayed_work(dm_bufio_wq, &dm_bufio_cleanup_old_work,
2869                            DM_BUFIO_WORK_TIMER_SECS * HZ);
2870
2871         return 0;
2872 }
2873
2874 /*
2875  * This is called once when unloading the dm_bufio module.
2876  */
2877 static void __exit dm_bufio_exit(void)
2878 {
2879         int bug = 0;
2880
2881         cancel_delayed_work_sync(&dm_bufio_cleanup_old_work);
2882         destroy_workqueue(dm_bufio_wq);
2883
2884         if (dm_bufio_client_count) {
2885                 DMCRIT("%s: dm_bufio_client_count leaked: %d",
2886                         __func__, dm_bufio_client_count);
2887                 bug = 1;
2888         }
2889
2890         if (dm_bufio_current_allocated) {
2891                 DMCRIT("%s: dm_bufio_current_allocated leaked: %lu",
2892                         __func__, dm_bufio_current_allocated);
2893                 bug = 1;
2894         }
2895
2896         if (dm_bufio_allocated_get_free_pages) {
2897                 DMCRIT("%s: dm_bufio_allocated_get_free_pages leaked: %lu",
2898                        __func__, dm_bufio_allocated_get_free_pages);
2899                 bug = 1;
2900         }
2901
2902         if (dm_bufio_allocated_vmalloc) {
2903                 DMCRIT("%s: dm_bufio_vmalloc leaked: %lu",
2904                        __func__, dm_bufio_allocated_vmalloc);
2905                 bug = 1;
2906         }
2907
2908         WARN_ON(bug); /* leaks are not worth crashing the system */
2909 }
2910
2911 module_init(dm_bufio_init)
2912 module_exit(dm_bufio_exit)
2913
2914 module_param_named(max_cache_size_bytes, dm_bufio_cache_size, ulong, 0644);
2915 MODULE_PARM_DESC(max_cache_size_bytes, "Size of metadata cache");
2916
2917 module_param_named(max_age_seconds, dm_bufio_max_age, uint, 0644);
2918 MODULE_PARM_DESC(max_age_seconds, "Max age of a buffer in seconds");
2919
2920 module_param_named(retain_bytes, dm_bufio_retain_bytes, ulong, 0644);
2921 MODULE_PARM_DESC(retain_bytes, "Try to keep at least this many bytes cached in memory");
2922
2923 module_param_named(peak_allocated_bytes, dm_bufio_peak_allocated, ulong, 0644);
2924 MODULE_PARM_DESC(peak_allocated_bytes, "Tracks the maximum allocated memory");
2925
2926 module_param_named(allocated_kmem_cache_bytes, dm_bufio_allocated_kmem_cache, ulong, 0444);
2927 MODULE_PARM_DESC(allocated_kmem_cache_bytes, "Memory allocated with kmem_cache_alloc");
2928
2929 module_param_named(allocated_get_free_pages_bytes, dm_bufio_allocated_get_free_pages, ulong, 0444);
2930 MODULE_PARM_DESC(allocated_get_free_pages_bytes, "Memory allocated with get_free_pages");
2931
2932 module_param_named(allocated_vmalloc_bytes, dm_bufio_allocated_vmalloc, ulong, 0444);
2933 MODULE_PARM_DESC(allocated_vmalloc_bytes, "Memory allocated with vmalloc");
2934
2935 module_param_named(current_allocated_bytes, dm_bufio_current_allocated, ulong, 0444);
2936 MODULE_PARM_DESC(current_allocated_bytes, "Memory currently used by the cache");
2937
2938 MODULE_AUTHOR("Mikulas Patocka <dm-devel@redhat.com>");
2939 MODULE_DESCRIPTION(DM_NAME " buffered I/O library");
2940 MODULE_LICENSE("GPL");