// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2011-2012 Red Hat, Inc.
 *
 * This file is released under the GPL.
 */

#include "dm-thin-metadata.h"
#include "persistent-data/dm-btree.h"
#include "persistent-data/dm-space-map.h"
#include "persistent-data/dm-space-map-disk.h"
#include "persistent-data/dm-transaction-manager.h"

#include <linux/list.h>
#include <linux/device-mapper.h>
#include <linux/workqueue.h>

/*
 *--------------------------------------------------------------------------
 * As far as the metadata goes, there is:
 *
 * - A superblock in block zero, taking up fewer than 512 bytes for
 *   atomic writes.
 *
 * - A space map managing the metadata blocks.
 *
 * - A space map managing the data blocks.
 *
 * - A btree mapping our internal thin dev ids onto struct disk_device_details.
 *
 * - A hierarchical btree, with 2 levels, which effectively maps (thin
 *   dev id, virtual block) -> block_time.  Block time is a 64-bit
 *   field holding the time in the low 24 bits, and the block in the top
 *   40 bits.
 *
 * BTrees consist solely of btree_nodes, each of which fills a block.
 * Some are internal nodes, so their values are a __le64 pointing to
 * other nodes.  Leaf nodes can store data of any reasonable size (i.e.
 * much smaller than the block size).  The nodes consist of the header,
 * followed by an array of keys, followed by an array of values.  We
 * binary search on the keys, so they're all held together to help the
 * cpu cache.
 *
 * Space maps have 2 btrees:
 *
 * - One maps a uint64_t onto a struct index_entry, which points to a
 *   bitmap block and has some details such as how many free entries
 *   there are.
 *
 * - The bitmap blocks have a header (for the checksum).  The rest of
 *   the block is pairs of bits, with the following meaning:
 *
 *   0 - ref count is 0
 *   1 - ref count is 1
 *   2 - ref count is 2
 *   3 - ref count is higher than 2
 *
 * - If the count is higher than 2 then the ref count is entered in a
 *   second btree that directly maps the block_address to a uint32_t ref
 *   count.
 *
 * The space map metadata variant doesn't have a bitmaps btree.  Instead
 * it has a single block's worth of index_entries.  This avoids
 * recursive issues with the bitmap btree needing to allocate space in
 * order to insert.  With a small data block size such as 64k the
 * metadata can support data devices that are hundreds of terabytes.
 *
 * The space maps allocate space linearly from front to back.  Space that
 * is freed in a transaction is never recycled within that transaction.
 * To try to avoid fragmenting _free_ space the allocator always goes
 * back and fills in gaps.
 *
 * All metadata io is in THIN_METADATA_BLOCK_SIZE sized/aligned chunks
 * from the block manager.
 *--------------------------------------------------------------------------
 */
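
/*
 * Illustrative sketch only (the real bitmap code lives under
 * persistent-data/): decoding one of the 2-bit reference count pairs
 * described above.  The helper name is hypothetical; a result of 3
 * means the real count must be looked up in the overflow btree.
 */
static inline unsigned int __maybe_unused
example_unpack_bitmap_pair(uint64_t word, unsigned int index)
{
	/* Two bits per entry, so 32 entries fit in a 64-bit word. */
	return (word >> (index * 2)) & 0x3;
}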

#define DM_MSG_PREFIX   "thin metadata"

#define THIN_SUPERBLOCK_MAGIC 27022010
#define THIN_SUPERBLOCK_LOCATION 0
#define THIN_VERSION 2
#define SECTOR_TO_BLOCK_SHIFT 3

/*
 * For a btree insert:
 *  3 for the btree insert +
 *  2 for the btree lookup used within the space map
 * For a btree remove:
 *  2 for the shadow spine +
 *  4 for rebalancing 3 child nodes
 * Hence max(3 + 2, 2 + 4) = 6.
 */
#define THIN_MAX_CONCURRENT_LOCKS 6

/* This should be plenty */
#define SPACE_MAP_ROOT_SIZE 128

/*
 * Little endian on-disk superblock and device details.
 */
struct thin_disk_superblock {
	__le32 csum;	/* Checksum of superblock except for this field. */
	__le32 flags;
	__le64 blocknr;	/* This block number, dm_block_t. */

	__u8 uuid[16];
	__le64 magic;
	__le32 version;
	__le32 time;

	__le64 trans_id;

	/*
	 * Root held by userspace transactions.
	 */
	__le64 held_root;

	__u8 data_space_map_root[SPACE_MAP_ROOT_SIZE];
	__u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE];

	/*
	 * 2-level btree mapping (dev_id, (dev block, time)) -> data block
	 */
	__le64 data_mapping_root;

	/*
	 * Device detail root mapping dev_id -> device_details
	 */
	__le64 device_details_root;

	__le32 data_block_size;		/* In 512-byte sectors. */

	__le32 metadata_block_size;	/* In 512-byte sectors. */
	__le64 metadata_nr_blocks;

	__le32 compat_flags;
	__le32 compat_ro_flags;
	__le32 incompat_flags;
} __packed;

struct disk_device_details {
	__le64 mapped_blocks;
	__le64 transaction_id;		/* When created. */
	__le32 creation_time;
	__le32 snapshotted_time;
} __packed;

struct dm_pool_metadata {
	struct hlist_node hash;

	struct block_device *bdev;
	struct dm_block_manager *bm;
	struct dm_space_map *metadata_sm;
	struct dm_space_map *data_sm;
	struct dm_transaction_manager *tm;
	struct dm_transaction_manager *nb_tm;

	/*
	 * Two-level btree.
	 * First level holds thin_dev_t.
	 * Second level holds mappings.
	 */
	struct dm_btree_info info;

	/*
	 * Non-blocking version of the above.
	 */
	struct dm_btree_info nb_info;

	/*
	 * Just the top level for deleting whole devices.
	 */
	struct dm_btree_info tl_info;

	/*
	 * Just the bottom level for creating new devices.
	 */
	struct dm_btree_info bl_info;

	/*
	 * Describes the device details btree.
	 */
	struct dm_btree_info details_info;

	struct rw_semaphore root_lock;
	uint32_t time;
	dm_block_t root;
	dm_block_t details_root;
	struct list_head thin_devices;
	uint64_t trans_id;
	unsigned long flags;
	sector_t data_block_size;

	/*
	 * Pre-commit callback.
	 *
	 * This allows the thin provisioning target to run a callback before
	 * the metadata are committed.
	 */
	dm_pool_pre_commit_fn pre_commit_fn;
	void *pre_commit_context;

	/*
	 * We reserve a section of the metadata for commit overhead.
	 * All reported space does *not* include this.
	 */
	dm_block_t metadata_reserve;

	/*
	 * Set if a transaction has to be aborted but the attempt to roll back
	 * to the previous (good) transaction failed.  The only pool metadata
	 * operation possible in this state is the closing of the device.
	 */
	bool fail_io:1;

	/*
	 * Set once a thin-pool has been accessed through one of the interfaces
	 * that imply the pool is in-service (e.g. thin devices created/deleted,
	 * thin-pool message, metadata snapshots, etc).
	 */
	bool in_service:1;

	/*
	 * Reading the space map roots can fail, so we read it into these
	 * buffers before the superblock is locked and updated.
	 */
	__u8 data_space_map_root[SPACE_MAP_ROOT_SIZE];
	__u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE];
};
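
/*
 * Lookups and inserts in the two-level mapping tree take a two element
 * key array, for example:
 *
 *	dm_block_t keys[2] = { td->id, virtual_block };
 *
 * as used by __find_block() and __insert() below.
 */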

struct dm_thin_device {
	struct list_head list;
	struct dm_pool_metadata *pmd;
	dm_thin_id id;

	int open_count;
	bool changed:1;
	bool aborted_with_changes:1;
	uint64_t mapped_blocks;
	uint64_t transaction_id;
	uint32_t creation_time;
	uint32_t snapshotted_time;
};

/*
 *--------------------------------------------------------------
 * superblock validator
 *--------------------------------------------------------------
 */
#define SUPERBLOCK_CSUM_XOR 160774

static void sb_prepare_for_write(struct dm_block_validator *v,
				 struct dm_block *b,
				 size_t block_size)
{
	struct thin_disk_superblock *disk_super = dm_block_data(b);

	disk_super->blocknr = cpu_to_le64(dm_block_location(b));
	disk_super->csum = cpu_to_le32(dm_bm_checksum(&disk_super->flags,
						      block_size - sizeof(__le32),
						      SUPERBLOCK_CSUM_XOR));
}

static int sb_check(struct dm_block_validator *v,
		    struct dm_block *b,
		    size_t block_size)
{
	struct thin_disk_superblock *disk_super = dm_block_data(b);
	__le32 csum_le;

	if (dm_block_location(b) != le64_to_cpu(disk_super->blocknr)) {
		DMERR("%s failed: blocknr %llu: wanted %llu",
		      __func__, le64_to_cpu(disk_super->blocknr),
		      (unsigned long long)dm_block_location(b));
		return -ENOTBLK;
	}

	if (le64_to_cpu(disk_super->magic) != THIN_SUPERBLOCK_MAGIC) {
		DMERR("%s failed: magic %llu: wanted %llu",
		      __func__, le64_to_cpu(disk_super->magic),
		      (unsigned long long)THIN_SUPERBLOCK_MAGIC);
		return -EILSEQ;
	}

	csum_le = cpu_to_le32(dm_bm_checksum(&disk_super->flags,
					     block_size - sizeof(__le32),
					     SUPERBLOCK_CSUM_XOR));
	if (csum_le != disk_super->csum) {
		DMERR("%s failed: csum %u: wanted %u",
		      __func__, le32_to_cpu(csum_le), le32_to_cpu(disk_super->csum));
		return -EILSEQ;
	}

	return 0;
}

static struct dm_block_validator sb_validator = {
	.name = "superblock",
	.prepare_for_write = sb_prepare_for_write,
	.check = sb_check
};

/*
 *--------------------------------------------------------------
 * Methods for the btree value types
 *--------------------------------------------------------------
 */
static uint64_t pack_block_time(dm_block_t b, uint32_t t)
{
	return (b << 24) | t;
}

static void unpack_block_time(uint64_t v, dm_block_t *b, uint32_t *t)
{
	*b = v >> 24;
	*t = v & ((1 << 24) - 1);
}
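
/*
 * Worked example (values made up): packing data block 1000 at time 3
 * gives (1000ULL << 24) | 3 == 0x3e8000003, and unpack_block_time()
 * recovers b == 1000 and t == 3.  Since the time lives in the low 24
 * bits it wraps after roughly 16.7 million snapshots.
 */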

/*
 * It's more efficient to call dm_sm_{inc,dec}_blocks as few times as
 * possible.  'with_runs' reads contiguous runs of blocks, and calls the
 * given sm function.
 */
typedef int (*run_fn)(struct dm_space_map *, dm_block_t, dm_block_t);

static void with_runs(struct dm_space_map *sm, const __le64 *value_le, unsigned int count, run_fn fn)
{
	uint64_t b, begin, end;
	uint32_t t;
	bool in_run = false;
	unsigned int i;

	for (i = 0; i < count; i++, value_le++) {
		/* We know value_le is 8 byte aligned */
		unpack_block_time(le64_to_cpu(*value_le), &b, &t);

		if (in_run) {
			if (b == end) {
				end++;
			} else {
				fn(sm, begin, end);
				begin = b;
				end = b + 1;
			}
		} else {
			in_run = true;
			begin = b;
			end = b + 1;
		}
	}

	if (in_run)
		fn(sm, begin, end);
}
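
/*
 * For example (made-up values): if a leaf maps to data blocks 5, 6, 7
 * and 9, with_runs() issues fn(sm, 5, 8) followed by fn(sm, 9, 10)
 * rather than four separate space map calls.
 */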

static void data_block_inc(void *context, const void *value_le, unsigned int count)
{
	with_runs((struct dm_space_map *) context,
		  (const __le64 *) value_le, count, dm_sm_inc_blocks);
}

static void data_block_dec(void *context, const void *value_le, unsigned int count)
{
	with_runs((struct dm_space_map *) context,
		  (const __le64 *) value_le, count, dm_sm_dec_blocks);
}

static int data_block_equal(void *context, const void *value1_le, const void *value2_le)
{
	__le64 v1_le, v2_le;
	uint64_t b1, b2;
	uint32_t t;

	memcpy(&v1_le, value1_le, sizeof(v1_le));
	memcpy(&v2_le, value2_le, sizeof(v2_le));
	unpack_block_time(le64_to_cpu(v1_le), &b1, &t);
	unpack_block_time(le64_to_cpu(v2_le), &b2, &t);

	return b1 == b2;
}

static void subtree_inc(void *context, const void *value, unsigned int count)
{
	struct dm_btree_info *info = context;
	const __le64 *root_le = value;
	unsigned int i;

	for (i = 0; i < count; i++, root_le++)
		dm_tm_inc(info->tm, le64_to_cpu(*root_le));
}

static void subtree_dec(void *context, const void *value, unsigned int count)
{
	struct dm_btree_info *info = context;
	const __le64 *root_le = value;
	unsigned int i;

	for (i = 0; i < count; i++, root_le++)
		if (dm_btree_del(info, le64_to_cpu(*root_le)))
			DMERR("btree delete failed");
}

static int subtree_equal(void *context, const void *value1_le, const void *value2_le)
{
	__le64 v1_le, v2_le;

	memcpy(&v1_le, value1_le, sizeof(v1_le));
	memcpy(&v2_le, value2_le, sizeof(v2_le));

	return v1_le == v2_le;
}

/*----------------------------------------------------------------*/

/*
 * Variant that is used for in-core only changes or code that
 * shouldn't put the pool in service on its own (e.g. commit).
 */
static inline void pmd_write_lock_in_core(struct dm_pool_metadata *pmd)
	__acquires(pmd->root_lock)
{
	down_write(&pmd->root_lock);
}

static inline void pmd_write_lock(struct dm_pool_metadata *pmd)
{
	pmd_write_lock_in_core(pmd);
	if (unlikely(!pmd->in_service))
		pmd->in_service = true;
}

static inline void pmd_write_unlock(struct dm_pool_metadata *pmd)
	__releases(pmd->root_lock)
{
	up_write(&pmd->root_lock);
}
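
/*
 * The write-side helpers above are typically used as follows (see
 * dm_pool_create_thin() below for a real instance):
 *
 *	pmd_write_lock(pmd);
 *	if (!pmd->fail_io)
 *		r = __create_thin(pmd, dev);
 *	pmd_write_unlock(pmd);
 */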

/*----------------------------------------------------------------*/

static int superblock_lock_zero(struct dm_pool_metadata *pmd,
				struct dm_block **sblock)
{
	return dm_bm_write_lock_zero(pmd->bm, THIN_SUPERBLOCK_LOCATION,
				     &sb_validator, sblock);
}

static int superblock_lock(struct dm_pool_metadata *pmd,
			   struct dm_block **sblock)
{
	return dm_bm_write_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
				&sb_validator, sblock);
}

static int __superblock_all_zeroes(struct dm_block_manager *bm, int *result)
{
	int r;
	unsigned int i;
	struct dm_block *b;
	__le64 *data_le, zero = cpu_to_le64(0);
	unsigned int block_size = dm_bm_block_size(bm) / sizeof(__le64);

	/*
	 * We can't use a validator here - it may be all zeroes.
	 */
	r = dm_bm_read_lock(bm, THIN_SUPERBLOCK_LOCATION, NULL, &b);
	if (r)
		return r;

	data_le = dm_block_data(b);
	*result = 1;
	for (i = 0; i < block_size; i++) {
		if (data_le[i] != zero) {
			*result = 0;
			break;
		}
	}

	dm_bm_unlock(b);

	return 0;
}

static void __setup_btree_details(struct dm_pool_metadata *pmd)
{
	pmd->info.tm = pmd->tm;
	pmd->info.levels = 2;
	pmd->info.value_type.context = pmd->data_sm;
	pmd->info.value_type.size = sizeof(__le64);
	pmd->info.value_type.inc = data_block_inc;
	pmd->info.value_type.dec = data_block_dec;
	pmd->info.value_type.equal = data_block_equal;

	memcpy(&pmd->nb_info, &pmd->info, sizeof(pmd->nb_info));
	pmd->nb_info.tm = pmd->nb_tm;

	pmd->tl_info.tm = pmd->tm;
	pmd->tl_info.levels = 1;
	pmd->tl_info.value_type.context = &pmd->bl_info;
	pmd->tl_info.value_type.size = sizeof(__le64);
	pmd->tl_info.value_type.inc = subtree_inc;
	pmd->tl_info.value_type.dec = subtree_dec;
	pmd->tl_info.value_type.equal = subtree_equal;

	pmd->bl_info.tm = pmd->tm;
	pmd->bl_info.levels = 1;
	pmd->bl_info.value_type.context = pmd->data_sm;
	pmd->bl_info.value_type.size = sizeof(__le64);
	pmd->bl_info.value_type.inc = data_block_inc;
	pmd->bl_info.value_type.dec = data_block_dec;
	pmd->bl_info.value_type.equal = data_block_equal;

	pmd->details_info.tm = pmd->tm;
	pmd->details_info.levels = 1;
	pmd->details_info.value_type.context = NULL;
	pmd->details_info.value_type.size = sizeof(struct disk_device_details);
	pmd->details_info.value_type.inc = NULL;
	pmd->details_info.value_type.dec = NULL;
	pmd->details_info.value_type.equal = NULL;
}

static int save_sm_roots(struct dm_pool_metadata *pmd)
{
	int r;
	size_t len;

	r = dm_sm_root_size(pmd->metadata_sm, &len);
	if (r < 0)
		return r;

	r = dm_sm_copy_root(pmd->metadata_sm, &pmd->metadata_space_map_root, len);
	if (r < 0)
		return r;

	r = dm_sm_root_size(pmd->data_sm, &len);
	if (r < 0)
		return r;

	return dm_sm_copy_root(pmd->data_sm, &pmd->data_space_map_root, len);
}

static void copy_sm_roots(struct dm_pool_metadata *pmd,
			  struct thin_disk_superblock *disk)
{
	memcpy(&disk->metadata_space_map_root,
	       &pmd->metadata_space_map_root,
	       sizeof(pmd->metadata_space_map_root));

	memcpy(&disk->data_space_map_root,
	       &pmd->data_space_map_root,
	       sizeof(pmd->data_space_map_root));
}

static int __write_initial_superblock(struct dm_pool_metadata *pmd)
{
	int r;
	struct dm_block *sblock;
	struct thin_disk_superblock *disk_super;
	sector_t bdev_size = bdev_nr_sectors(pmd->bdev);

	if (bdev_size > THIN_METADATA_MAX_SECTORS)
		bdev_size = THIN_METADATA_MAX_SECTORS;

	r = dm_sm_commit(pmd->data_sm);
	if (r < 0)
		return r;

	r = dm_tm_pre_commit(pmd->tm);
	if (r < 0)
		return r;

	r = save_sm_roots(pmd);
	if (r < 0)
		return r;

	r = superblock_lock_zero(pmd, &sblock);
	if (r)
		return r;

	disk_super = dm_block_data(sblock);
	disk_super->flags = 0;
	memset(disk_super->uuid, 0, sizeof(disk_super->uuid));
	disk_super->magic = cpu_to_le64(THIN_SUPERBLOCK_MAGIC);
	disk_super->version = cpu_to_le32(THIN_VERSION);
	disk_super->time = 0;
	disk_super->trans_id = 0;
	disk_super->held_root = 0;

	copy_sm_roots(pmd, disk_super);

	disk_super->data_mapping_root = cpu_to_le64(pmd->root);
	disk_super->device_details_root = cpu_to_le64(pmd->details_root);
	disk_super->metadata_block_size = cpu_to_le32(THIN_METADATA_BLOCK_SIZE);
	disk_super->metadata_nr_blocks = cpu_to_le64(bdev_size >> SECTOR_TO_BLOCK_SHIFT);
	disk_super->data_block_size = cpu_to_le32(pmd->data_block_size);

	return dm_tm_commit(pmd->tm, sblock);
}

static int __format_metadata(struct dm_pool_metadata *pmd)
{
	int r;

	r = dm_tm_create_with_sm(pmd->bm, THIN_SUPERBLOCK_LOCATION,
				 &pmd->tm, &pmd->metadata_sm);
	if (r < 0) {
		DMERR("tm_create_with_sm failed");
		return r;
	}

	pmd->data_sm = dm_sm_disk_create(pmd->tm, 0);
	if (IS_ERR(pmd->data_sm)) {
		DMERR("sm_disk_create failed");
		r = PTR_ERR(pmd->data_sm);
		goto bad_cleanup_tm;
	}

	pmd->nb_tm = dm_tm_create_non_blocking_clone(pmd->tm);
	if (!pmd->nb_tm) {
		DMERR("could not create non-blocking clone tm");
		r = -ENOMEM;
		goto bad_cleanup_data_sm;
	}

	__setup_btree_details(pmd);

	r = dm_btree_empty(&pmd->info, &pmd->root);
	if (r < 0)
		goto bad_cleanup_nb_tm;

	r = dm_btree_empty(&pmd->details_info, &pmd->details_root);
	if (r < 0) {
		DMERR("couldn't create devices root");
		goto bad_cleanup_nb_tm;
	}

	r = __write_initial_superblock(pmd);
	if (r)
		goto bad_cleanup_nb_tm;

	return 0;

bad_cleanup_nb_tm:
	dm_tm_destroy(pmd->nb_tm);
bad_cleanup_data_sm:
	dm_sm_destroy(pmd->data_sm);
bad_cleanup_tm:
	dm_tm_destroy(pmd->tm);
	dm_sm_destroy(pmd->metadata_sm);

	return r;
}

static int __check_incompat_features(struct thin_disk_superblock *disk_super,
				     struct dm_pool_metadata *pmd)
{
	uint32_t features;

	features = le32_to_cpu(disk_super->incompat_flags) & ~THIN_FEATURE_INCOMPAT_SUPP;
	if (features) {
		DMERR("could not access metadata due to unsupported optional features (%lx).",
		      (unsigned long)features);
		return -EINVAL;
	}

	/*
	 * Check for read-only metadata to skip the following RDWR checks.
	 */
	if (bdev_read_only(pmd->bdev))
		return 0;

	features = le32_to_cpu(disk_super->compat_ro_flags) & ~THIN_FEATURE_COMPAT_RO_SUPP;
	if (features) {
		DMERR("could not access metadata RDWR due to unsupported optional features (%lx).",
		      (unsigned long)features);
		return -EINVAL;
	}

	return 0;
}

static int __open_metadata(struct dm_pool_metadata *pmd)
{
	int r;
	struct dm_block *sblock;
	struct thin_disk_superblock *disk_super;

	r = dm_bm_read_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
			    &sb_validator, &sblock);
	if (r < 0) {
		DMERR("couldn't read superblock");
		return r;
	}

	disk_super = dm_block_data(sblock);

	/* Verify the data block size hasn't changed */
	if (le32_to_cpu(disk_super->data_block_size) != pmd->data_block_size) {
		DMERR("changing the data block size (from %u to %llu) is not supported",
		      le32_to_cpu(disk_super->data_block_size),
		      (unsigned long long)pmd->data_block_size);
		r = -EINVAL;
		goto bad_unlock_sblock;
	}

	r = __check_incompat_features(disk_super, pmd);
	if (r < 0)
		goto bad_unlock_sblock;

	r = dm_tm_open_with_sm(pmd->bm, THIN_SUPERBLOCK_LOCATION,
			       disk_super->metadata_space_map_root,
			       sizeof(disk_super->metadata_space_map_root),
			       &pmd->tm, &pmd->metadata_sm);
	if (r < 0) {
		DMERR("tm_open_with_sm failed");
		goto bad_unlock_sblock;
	}

	pmd->data_sm = dm_sm_disk_open(pmd->tm, disk_super->data_space_map_root,
				       sizeof(disk_super->data_space_map_root));
	if (IS_ERR(pmd->data_sm)) {
		DMERR("sm_disk_open failed");
		r = PTR_ERR(pmd->data_sm);
		goto bad_cleanup_tm;
	}

	pmd->nb_tm = dm_tm_create_non_blocking_clone(pmd->tm);
	if (!pmd->nb_tm) {
		DMERR("could not create non-blocking clone tm");
		r = -ENOMEM;
		goto bad_cleanup_data_sm;
	}

	/*
	 * When opening the pool metadata, setting these roots is redundant
	 * because they are set again in __begin_transaction().  But the
	 * pool-abort path really does need the last transaction's roots,
	 * so that it can avoid accessing a broken btree.
	 */
	pmd->root = le64_to_cpu(disk_super->data_mapping_root);
	pmd->details_root = le64_to_cpu(disk_super->device_details_root);

	__setup_btree_details(pmd);
	dm_bm_unlock(sblock);

	return 0;

bad_cleanup_data_sm:
	dm_sm_destroy(pmd->data_sm);
bad_cleanup_tm:
	dm_tm_destroy(pmd->tm);
	dm_sm_destroy(pmd->metadata_sm);
bad_unlock_sblock:
	dm_bm_unlock(sblock);

	return r;
}

static int __open_or_format_metadata(struct dm_pool_metadata *pmd, bool format_device)
{
	int r, unformatted;

	r = __superblock_all_zeroes(pmd->bm, &unformatted);
	if (r)
		return r;

	if (unformatted)
		return format_device ? __format_metadata(pmd) : -EPERM;

	return __open_metadata(pmd);
}

static int __create_persistent_data_objects(struct dm_pool_metadata *pmd, bool format_device)
{
	int r;

	pmd->bm = dm_block_manager_create(pmd->bdev, THIN_METADATA_BLOCK_SIZE << SECTOR_SHIFT,
					  THIN_MAX_CONCURRENT_LOCKS);
	if (IS_ERR(pmd->bm)) {
		DMERR("could not create block manager");
		r = PTR_ERR(pmd->bm);
		pmd->bm = NULL;
		return r;
	}

	r = __open_or_format_metadata(pmd, format_device);
	if (r) {
		dm_block_manager_destroy(pmd->bm);
		pmd->bm = NULL;
	}

	return r;
}

static void __destroy_persistent_data_objects(struct dm_pool_metadata *pmd,
					      bool destroy_bm)
{
	dm_sm_destroy(pmd->data_sm);
	dm_sm_destroy(pmd->metadata_sm);
	dm_tm_destroy(pmd->nb_tm);
	dm_tm_destroy(pmd->tm);
	if (destroy_bm)
		dm_block_manager_destroy(pmd->bm);
}

static int __begin_transaction(struct dm_pool_metadata *pmd)
{
	int r;
	struct thin_disk_superblock *disk_super;
	struct dm_block *sblock;

	/*
	 * We re-read the superblock every time.  Shouldn't need to do this
	 * really.
	 */
	r = dm_bm_read_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
			    &sb_validator, &sblock);
	if (r)
		return r;

	disk_super = dm_block_data(sblock);
	pmd->time = le32_to_cpu(disk_super->time);
	pmd->root = le64_to_cpu(disk_super->data_mapping_root);
	pmd->details_root = le64_to_cpu(disk_super->device_details_root);
	pmd->trans_id = le64_to_cpu(disk_super->trans_id);
	pmd->flags = le32_to_cpu(disk_super->flags);
	pmd->data_block_size = le32_to_cpu(disk_super->data_block_size);

	dm_bm_unlock(sblock);
	return 0;
}

static int __write_changed_details(struct dm_pool_metadata *pmd)
{
	int r;
	struct dm_thin_device *td, *tmp;
	struct disk_device_details details;
	uint64_t key;

	list_for_each_entry_safe(td, tmp, &pmd->thin_devices, list) {
		if (!td->changed)
			continue;

		key = td->id;

		details.mapped_blocks = cpu_to_le64(td->mapped_blocks);
		details.transaction_id = cpu_to_le64(td->transaction_id);
		details.creation_time = cpu_to_le32(td->creation_time);
		details.snapshotted_time = cpu_to_le32(td->snapshotted_time);
		__dm_bless_for_disk(&details);

		r = dm_btree_insert(&pmd->details_info, pmd->details_root,
				    &key, &details, &pmd->details_root);
		if (r)
			return r;

		if (td->open_count)
			td->changed = false;
		else {
			list_del(&td->list);
			kfree(td);
		}
	}

	return 0;
}

static int __commit_transaction(struct dm_pool_metadata *pmd)
{
	int r;
	struct thin_disk_superblock *disk_super;
	struct dm_block *sblock;

	/*
	 * We need to know if the thin_disk_superblock exceeds a 512-byte sector.
	 */
	BUILD_BUG_ON(sizeof(struct thin_disk_superblock) > 512);
	BUG_ON(!rwsem_is_locked(&pmd->root_lock));

	if (unlikely(!pmd->in_service))
		return 0;

	if (pmd->pre_commit_fn) {
		r = pmd->pre_commit_fn(pmd->pre_commit_context);
		if (r < 0) {
			DMERR("pre-commit callback failed");
			return r;
		}
	}

	r = __write_changed_details(pmd);
	if (r < 0)
		return r;

	r = dm_sm_commit(pmd->data_sm);
	if (r < 0)
		return r;

	r = dm_tm_pre_commit(pmd->tm);
	if (r < 0)
		return r;

	r = save_sm_roots(pmd);
	if (r < 0)
		return r;

	r = superblock_lock(pmd, &sblock);
	if (r)
		return r;

	disk_super = dm_block_data(sblock);
	disk_super->time = cpu_to_le32(pmd->time);
	disk_super->data_mapping_root = cpu_to_le64(pmd->root);
	disk_super->device_details_root = cpu_to_le64(pmd->details_root);
	disk_super->trans_id = cpu_to_le64(pmd->trans_id);
	disk_super->flags = cpu_to_le32(pmd->flags);

	copy_sm_roots(pmd, disk_super);

	return dm_tm_commit(pmd->tm, sblock);
}

static void __set_metadata_reserve(struct dm_pool_metadata *pmd)
{
	int r;
	dm_block_t total;
	dm_block_t max_blocks = 4096; /* 4096 * 4KiB metadata blocks = 16MiB */

	r = dm_sm_get_nr_blocks(pmd->metadata_sm, &total);
	if (r) {
		DMERR("could not get size of metadata device");
		pmd->metadata_reserve = max_blocks;
	} else
		pmd->metadata_reserve = min(max_blocks, div_u64(total, 10));
}

struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev,
					       sector_t data_block_size,
					       bool format_device)
{
	int r;
	struct dm_pool_metadata *pmd;

	pmd = kmalloc(sizeof(*pmd), GFP_KERNEL);
	if (!pmd) {
		DMERR("could not allocate metadata struct");
		return ERR_PTR(-ENOMEM);
	}

	init_rwsem(&pmd->root_lock);
	pmd->time = 0;
	INIT_LIST_HEAD(&pmd->thin_devices);
	pmd->fail_io = false;
	pmd->in_service = false;
	pmd->bdev = bdev;
	pmd->data_block_size = data_block_size;
	pmd->pre_commit_fn = NULL;
	pmd->pre_commit_context = NULL;

	r = __create_persistent_data_objects(pmd, format_device);
	if (r) {
		kfree(pmd);
		return ERR_PTR(r);
	}

	r = __begin_transaction(pmd);
	if (r < 0) {
		if (dm_pool_metadata_close(pmd) < 0)
			DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
		return ERR_PTR(r);
	}

	__set_metadata_reserve(pmd);

	return pmd;
}

int dm_pool_metadata_close(struct dm_pool_metadata *pmd)
{
	int r;
	unsigned int open_devices = 0;
	struct dm_thin_device *td, *tmp;

	down_read(&pmd->root_lock);
	list_for_each_entry_safe(td, tmp, &pmd->thin_devices, list) {
		if (td->open_count)
			open_devices++;
		else {
			list_del(&td->list);
			kfree(td);
		}
	}
	up_read(&pmd->root_lock);

	if (open_devices) {
		DMERR("attempt to close pmd when %u device(s) are still open",
		       open_devices);
		return -EBUSY;
	}

	pmd_write_lock_in_core(pmd);
	if (!pmd->fail_io && !dm_bm_is_read_only(pmd->bm)) {
		r = __commit_transaction(pmd);
		if (r < 0)
			DMWARN("%s: __commit_transaction() failed, error = %d",
			       __func__, r);
	}
	pmd_write_unlock(pmd);
	if (!pmd->fail_io)
		__destroy_persistent_data_objects(pmd, true);

	kfree(pmd);
	return 0;
}

/*
 * __open_device: Returns @td corresponding to device with id @dev,
 * creating it if @create is set and incrementing @td->open_count.
 * On failure, @td is undefined.
 */
static int __open_device(struct dm_pool_metadata *pmd,
			 dm_thin_id dev, int create,
			 struct dm_thin_device **td)
{
	int r, changed = 0;
	struct dm_thin_device *td2;
	uint64_t key = dev;
	struct disk_device_details details_le;

	/*
	 * If the device is already open, return it.
	 */
	list_for_each_entry(td2, &pmd->thin_devices, list)
		if (td2->id == dev) {
			/*
			 * May not create an already-open device.
			 */
			if (create)
				return -EEXIST;

			td2->open_count++;
			*td = td2;
			return 0;
		}

	/*
	 * Check the device exists.
	 */
	r = dm_btree_lookup(&pmd->details_info, pmd->details_root,
			    &key, &details_le);
	if (r) {
		if (r != -ENODATA || !create)
			return r;

		/*
		 * Create new device.
		 */
		changed = 1;
		details_le.mapped_blocks = 0;
		details_le.transaction_id = cpu_to_le64(pmd->trans_id);
		details_le.creation_time = cpu_to_le32(pmd->time);
		details_le.snapshotted_time = cpu_to_le32(pmd->time);
	}

	*td = kmalloc(sizeof(**td), GFP_NOIO);
	if (!*td)
		return -ENOMEM;

	(*td)->pmd = pmd;
	(*td)->id = dev;
	(*td)->open_count = 1;
	(*td)->changed = changed;
	(*td)->aborted_with_changes = false;
	(*td)->mapped_blocks = le64_to_cpu(details_le.mapped_blocks);
	(*td)->transaction_id = le64_to_cpu(details_le.transaction_id);
	(*td)->creation_time = le32_to_cpu(details_le.creation_time);
	(*td)->snapshotted_time = le32_to_cpu(details_le.snapshotted_time);

	list_add(&(*td)->list, &pmd->thin_devices);

	return 0;
}

static void __close_device(struct dm_thin_device *td)
{
	--td->open_count;
}

static int __create_thin(struct dm_pool_metadata *pmd,
			 dm_thin_id dev)
{
	int r;
	dm_block_t dev_root;
	uint64_t key = dev;
	struct dm_thin_device *td;
	__le64 value;

	r = dm_btree_lookup(&pmd->details_info, pmd->details_root,
			    &key, NULL);
	if (!r)
		return -EEXIST;

	/*
	 * Create an empty btree for the mappings.
	 */
	r = dm_btree_empty(&pmd->bl_info, &dev_root);
	if (r)
		return r;

	/*
	 * Insert it into the main mapping tree.
	 */
	value = cpu_to_le64(dev_root);
	__dm_bless_for_disk(&value);
	r = dm_btree_insert(&pmd->tl_info, pmd->root, &key, &value, &pmd->root);
	if (r) {
		dm_btree_del(&pmd->bl_info, dev_root);
		return r;
	}

	r = __open_device(pmd, dev, 1, &td);
	if (r) {
		dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root);
		dm_btree_del(&pmd->bl_info, dev_root);
		return r;
	}
	__close_device(td);

	return r;
}

int dm_pool_create_thin(struct dm_pool_metadata *pmd, dm_thin_id dev)
{
	int r = -EINVAL;

	pmd_write_lock(pmd);
	if (!pmd->fail_io)
		r = __create_thin(pmd, dev);
	pmd_write_unlock(pmd);

	return r;
}

static int __set_snapshot_details(struct dm_pool_metadata *pmd,
				  struct dm_thin_device *snap,
				  dm_thin_id origin, uint32_t time)
{
	int r;
	struct dm_thin_device *td;

	r = __open_device(pmd, origin, 0, &td);
	if (r)
		return r;

	td->changed = true;
	td->snapshotted_time = time;

	snap->mapped_blocks = td->mapped_blocks;
	snap->snapshotted_time = time;
	__close_device(td);

	return 0;
}

static int __create_snap(struct dm_pool_metadata *pmd,
			 dm_thin_id dev, dm_thin_id origin)
{
	int r;
	dm_block_t origin_root;
	uint64_t key = origin, dev_key = dev;
	struct dm_thin_device *td;
	__le64 value;

	/* check this device is unused */
	r = dm_btree_lookup(&pmd->details_info, pmd->details_root,
			    &dev_key, NULL);
	if (!r)
		return -EEXIST;

	/* find the mapping tree for the origin */
	r = dm_btree_lookup(&pmd->tl_info, pmd->root, &key, &value);
	if (r)
		return r;
	origin_root = le64_to_cpu(value);

	/* clone the origin, an inc will do */
	dm_tm_inc(pmd->tm, origin_root);

	/* insert into the main mapping tree */
	value = cpu_to_le64(origin_root);
	__dm_bless_for_disk(&value);
	key = dev;
	r = dm_btree_insert(&pmd->tl_info, pmd->root, &key, &value, &pmd->root);
	if (r) {
		dm_tm_dec(pmd->tm, origin_root);
		return r;
	}

	pmd->time++;

	r = __open_device(pmd, dev, 1, &td);
	if (r)
		goto bad;

	r = __set_snapshot_details(pmd, td, origin, pmd->time);
	__close_device(td);

	if (r)
		goto bad;

	return 0;

bad:
	dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root);
	dm_btree_remove(&pmd->details_info, pmd->details_root,
			&key, &pmd->details_root);
	return r;
}

int dm_pool_create_snap(struct dm_pool_metadata *pmd,
			dm_thin_id dev,
			dm_thin_id origin)
{
	int r = -EINVAL;

	pmd_write_lock(pmd);
	if (!pmd->fail_io)
		r = __create_snap(pmd, dev, origin);
	pmd_write_unlock(pmd);

	return r;
}

static int __delete_device(struct dm_pool_metadata *pmd, dm_thin_id dev)
{
	int r;
	uint64_t key = dev;
	struct dm_thin_device *td;

	/* TODO: failure should mark the transaction invalid */
	r = __open_device(pmd, dev, 0, &td);
	if (r)
		return r;

	if (td->open_count > 1) {
		__close_device(td);
		return -EBUSY;
	}

	list_del(&td->list);
	kfree(td);
	r = dm_btree_remove(&pmd->details_info, pmd->details_root,
			    &key, &pmd->details_root);
	if (r)
		return r;

	r = dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root);
	if (r)
		return r;

	return 0;
}

int dm_pool_delete_thin_device(struct dm_pool_metadata *pmd,
			       dm_thin_id dev)
{
	int r = -EINVAL;

	pmd_write_lock(pmd);
	if (!pmd->fail_io)
		r = __delete_device(pmd, dev);
	pmd_write_unlock(pmd);

	return r;
}

int dm_pool_set_metadata_transaction_id(struct dm_pool_metadata *pmd,
					uint64_t current_id,
					uint64_t new_id)
{
	int r = -EINVAL;

	pmd_write_lock(pmd);

	if (pmd->fail_io)
		goto out;

	if (pmd->trans_id != current_id) {
		DMERR("mismatched transaction id");
		goto out;
	}

	pmd->trans_id = new_id;
	r = 0;

out:
	pmd_write_unlock(pmd);

	return r;
}

int dm_pool_get_metadata_transaction_id(struct dm_pool_metadata *pmd,
					uint64_t *result)
{
	int r = -EINVAL;

	down_read(&pmd->root_lock);
	if (!pmd->fail_io) {
		*result = pmd->trans_id;
		r = 0;
	}
	up_read(&pmd->root_lock);

	return r;
}

static int __reserve_metadata_snap(struct dm_pool_metadata *pmd)
{
	int r, inc;
	struct thin_disk_superblock *disk_super;
	struct dm_block *copy, *sblock;
	dm_block_t held_root;

	/*
	 * We commit to ensure the btree roots which we increment in a
	 * moment are up to date.
	 */
	r = __commit_transaction(pmd);
	if (r < 0) {
		DMWARN("%s: __commit_transaction() failed, error = %d",
		       __func__, r);
		return r;
	}

	/*
	 * Copy the superblock.
	 */
	dm_sm_inc_block(pmd->metadata_sm, THIN_SUPERBLOCK_LOCATION);
	r = dm_tm_shadow_block(pmd->tm, THIN_SUPERBLOCK_LOCATION,
			       &sb_validator, &copy, &inc);
	if (r)
		return r;

	BUG_ON(!inc);

	held_root = dm_block_location(copy);
	disk_super = dm_block_data(copy);

	if (le64_to_cpu(disk_super->held_root)) {
		DMWARN("Pool metadata snapshot already exists: release this before taking another.");

		dm_tm_dec(pmd->tm, held_root);
		dm_tm_unlock(pmd->tm, copy);
		return -EBUSY;
	}

	/*
	 * Wipe the spacemap since we're not publishing this.
	 */
	memset(&disk_super->data_space_map_root, 0,
	       sizeof(disk_super->data_space_map_root));
	memset(&disk_super->metadata_space_map_root, 0,
	       sizeof(disk_super->metadata_space_map_root));

	/*
	 * Increment the data structures that need to be preserved.
	 */
	dm_tm_inc(pmd->tm, le64_to_cpu(disk_super->data_mapping_root));
	dm_tm_inc(pmd->tm, le64_to_cpu(disk_super->device_details_root));
	dm_tm_unlock(pmd->tm, copy);

	/*
	 * Write the held root into the superblock.
	 */
	r = superblock_lock(pmd, &sblock);
	if (r) {
		dm_tm_dec(pmd->tm, held_root);
		return r;
	}

	disk_super = dm_block_data(sblock);
	disk_super->held_root = cpu_to_le64(held_root);
	dm_bm_unlock(sblock);
	return 0;
}

int dm_pool_reserve_metadata_snap(struct dm_pool_metadata *pmd)
{
	int r = -EINVAL;

	pmd_write_lock(pmd);
	if (!pmd->fail_io)
		r = __reserve_metadata_snap(pmd);
	pmd_write_unlock(pmd);

	return r;
}

static int __release_metadata_snap(struct dm_pool_metadata *pmd)
{
	int r;
	struct thin_disk_superblock *disk_super;
	struct dm_block *sblock, *copy;
	dm_block_t held_root;

	r = superblock_lock(pmd, &sblock);
	if (r)
		return r;

	disk_super = dm_block_data(sblock);
	held_root = le64_to_cpu(disk_super->held_root);
	disk_super->held_root = cpu_to_le64(0);

	dm_bm_unlock(sblock);

	if (!held_root) {
		DMWARN("No pool metadata snapshot found: nothing to release.");
		return -EINVAL;
	}

	r = dm_tm_read_lock(pmd->tm, held_root, &sb_validator, &copy);
	if (r)
		return r;

	disk_super = dm_block_data(copy);
	dm_btree_del(&pmd->info, le64_to_cpu(disk_super->data_mapping_root));
	dm_btree_del(&pmd->details_info, le64_to_cpu(disk_super->device_details_root));
	dm_sm_dec_block(pmd->metadata_sm, held_root);

	dm_tm_unlock(pmd->tm, copy);

	return 0;
}

int dm_pool_release_metadata_snap(struct dm_pool_metadata *pmd)
{
	int r = -EINVAL;

	pmd_write_lock(pmd);
	if (!pmd->fail_io)
		r = __release_metadata_snap(pmd);
	pmd_write_unlock(pmd);

	return r;
}

static int __get_metadata_snap(struct dm_pool_metadata *pmd,
			       dm_block_t *result)
{
	int r;
	struct thin_disk_superblock *disk_super;
	struct dm_block *sblock;

	r = dm_bm_read_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
			    &sb_validator, &sblock);
	if (r)
		return r;

	disk_super = dm_block_data(sblock);
	*result = le64_to_cpu(disk_super->held_root);

	dm_bm_unlock(sblock);

	return 0;
}

int dm_pool_get_metadata_snap(struct dm_pool_metadata *pmd,
			      dm_block_t *result)
{
	int r = -EINVAL;

	down_read(&pmd->root_lock);
	if (!pmd->fail_io)
		r = __get_metadata_snap(pmd, result);
	up_read(&pmd->root_lock);

	return r;
}

int dm_pool_open_thin_device(struct dm_pool_metadata *pmd, dm_thin_id dev,
			     struct dm_thin_device **td)
{
	int r = -EINVAL;

	pmd_write_lock_in_core(pmd);
	if (!pmd->fail_io)
		r = __open_device(pmd, dev, 0, td);
	pmd_write_unlock(pmd);

	return r;
}

int dm_pool_close_thin_device(struct dm_thin_device *td)
{
	pmd_write_lock_in_core(td->pmd);
	__close_device(td);
	pmd_write_unlock(td->pmd);

	return 0;
}

dm_thin_id dm_thin_dev_id(struct dm_thin_device *td)
{
	return td->id;
}

/*
 * Check whether @time (of block creation) is older than @td's last snapshot.
 * If so then the associated block is shared with the last snapshot device.
 * Any block on a device created *after* the device last got snapshotted is
 * necessarily not shared.
 */
static bool __snapshotted_since(struct dm_thin_device *td, uint32_t time)
{
	return td->snapshotted_time > time;
}
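
/*
 * For example: a block created at time 2 on a device whose last
 * snapshot was taken at time 5 predates that snapshot (5 > 2), so it
 * is reported as shared.
 */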

static void unpack_lookup_result(struct dm_thin_device *td, __le64 value,
				 struct dm_thin_lookup_result *result)
{
	uint64_t block_time = 0;
	dm_block_t exception_block;
	uint32_t exception_time;

	block_time = le64_to_cpu(value);
	unpack_block_time(block_time, &exception_block, &exception_time);
	result->block = exception_block;
	result->shared = __snapshotted_since(td, exception_time);
}

static int __find_block(struct dm_thin_device *td, dm_block_t block,
			int can_issue_io, struct dm_thin_lookup_result *result)
{
	int r;
	__le64 value;
	struct dm_pool_metadata *pmd = td->pmd;
	dm_block_t keys[2] = { td->id, block };
	struct dm_btree_info *info;

	if (can_issue_io)
		info = &pmd->info;
	else
		info = &pmd->nb_info;

	r = dm_btree_lookup(info, pmd->root, keys, &value);
	if (!r)
		unpack_lookup_result(td, value, result);

	return r;
}

int dm_thin_find_block(struct dm_thin_device *td, dm_block_t block,
		       int can_issue_io, struct dm_thin_lookup_result *result)
{
	int r;
	struct dm_pool_metadata *pmd = td->pmd;

	down_read(&pmd->root_lock);
	if (pmd->fail_io) {
		up_read(&pmd->root_lock);
		return -EINVAL;
	}

	r = __find_block(td, block, can_issue_io, result);

	up_read(&pmd->root_lock);
	return r;
}

static int __find_next_mapped_block(struct dm_thin_device *td, dm_block_t block,
				    dm_block_t *vblock,
				    struct dm_thin_lookup_result *result)
{
	int r;
	__le64 value;
	struct dm_pool_metadata *pmd = td->pmd;
	dm_block_t keys[2] = { td->id, block };

	r = dm_btree_lookup_next(&pmd->info, pmd->root, keys, vblock, &value);
	if (!r)
		unpack_lookup_result(td, value, result);

	return r;
}

static int __find_mapped_range(struct dm_thin_device *td,
			       dm_block_t begin, dm_block_t end,
			       dm_block_t *thin_begin, dm_block_t *thin_end,
			       dm_block_t *pool_begin, bool *maybe_shared)
{
	int r;
	dm_block_t pool_end;
	struct dm_thin_lookup_result lookup;

	if (end < begin)
		return -ENODATA;

	r = __find_next_mapped_block(td, begin, &begin, &lookup);
	if (r)
		return r;

	if (begin >= end)
		return -ENODATA;

	*thin_begin = begin;
	*pool_begin = lookup.block;
	*maybe_shared = lookup.shared;

	begin++;
	pool_end = *pool_begin + 1;
	while (begin != end) {
		r = __find_block(td, begin, true, &lookup);
		if (r) {
			if (r == -ENODATA)
				break;

			return r;
		}

		if ((lookup.block != pool_end) ||
		    (lookup.shared != *maybe_shared))
			break;

		pool_end++;
		begin++;
	}

	*thin_end = begin;
	return 0;
}
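
/*
 * For example (made-up values): if thin blocks 10, 11 and 12 map to
 * pool blocks 100, 101 and 102 with the same shared flag, a single
 * call reports thin [10, 13) starting at pool block 100.
 */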
1630
1631 int dm_thin_find_mapped_range(struct dm_thin_device *td,
1632                               dm_block_t begin, dm_block_t end,
1633                               dm_block_t *thin_begin, dm_block_t *thin_end,
1634                               dm_block_t *pool_begin, bool *maybe_shared)
1635 {
1636         int r = -EINVAL;
1637         struct dm_pool_metadata *pmd = td->pmd;
1638
1639         down_read(&pmd->root_lock);
1640         if (!pmd->fail_io) {
1641                 r = __find_mapped_range(td, begin, end, thin_begin, thin_end,
1642                                         pool_begin, maybe_shared);
1643         }
1644         up_read(&pmd->root_lock);
1645
1646         return r;
1647 }
1648
static int __insert(struct dm_thin_device *td, dm_block_t block,
		    dm_block_t data_block)
{
	int r, inserted;
	__le64 value;
	struct dm_pool_metadata *pmd = td->pmd;
	dm_block_t keys[2] = { td->id, block };

	value = cpu_to_le64(pack_block_time(data_block, pmd->time));
	__dm_bless_for_disk(&value);

	r = dm_btree_insert_notify(&pmd->info, pmd->root, keys, &value,
				   &pmd->root, &inserted);
	if (r)
		return r;

	td->changed = true;
	if (inserted)
		td->mapped_blocks++;

	return 0;
}

int dm_thin_insert_block(struct dm_thin_device *td, dm_block_t block,
			 dm_block_t data_block)
{
	int r = -EINVAL;

	pmd_write_lock(td->pmd);
	if (!td->pmd->fail_io)
		r = __insert(td, block, data_block);
	pmd_write_unlock(td->pmd);

	return r;
}

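/*
 * Unmap [begin, end) in three steps:
 *
 * 1) Detach this device's mapping subtree from the top level btree,
 *    taking an extra reference so the subtree isn't reaped.
 *
 * 2) Repeatedly call dm_btree_remove_leaves() on the detached subtree.
 *
 * 3) Reinsert the (possibly changed) subtree root under the device id.
 */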
static int __remove_range(struct dm_thin_device *td, dm_block_t begin, dm_block_t end)
{
	int r;
	unsigned int count, total_count = 0;
	struct dm_pool_metadata *pmd = td->pmd;
	dm_block_t keys[1] = { td->id };
	__le64 value;
	dm_block_t mapping_root;

	/*
	 * Find the mapping tree.
	 */
	r = dm_btree_lookup(&pmd->tl_info, pmd->root, keys, &value);
	if (r)
		return r;

	/*
	 * Remove the mapping tree from the top level btree, taking care
	 * to increment its reference count first so it doesn't get
	 * deleted.
	 */
	mapping_root = le64_to_cpu(value);
	dm_tm_inc(pmd->tm, mapping_root);
	r = dm_btree_remove(&pmd->tl_info, pmd->root, keys, &pmd->root);
	if (r)
		return r;

	/*
	 * dm_btree_remove_leaves() stops at the first unmapped entry, so
	 * we have to loop around, finding mapped ranges.
	 */
	while (begin < end) {
		r = dm_btree_lookup_next(&pmd->bl_info, mapping_root, &begin, &begin, &value);
		if (r == -ENODATA)
			break;

		if (r)
			return r;

		if (begin >= end)
			break;

		r = dm_btree_remove_leaves(&pmd->bl_info, mapping_root, &begin, end, &mapping_root, &count);
		if (r)
			return r;

		total_count += count;
	}

	td->mapped_blocks -= total_count;
	td->changed = true;

	/*
	 * Reinsert the mapping tree.
	 */
	value = cpu_to_le64(mapping_root);
	__dm_bless_for_disk(&value);
	return dm_btree_insert(&pmd->tl_info, pmd->root, keys, &value, &pmd->root);
}

int dm_thin_remove_range(struct dm_thin_device *td,
			 dm_block_t begin, dm_block_t end)
{
	int r = -EINVAL;

	pmd_write_lock(td->pmd);
	if (!td->pmd->fail_io)
		r = __remove_range(td, begin, end);
	pmd_write_unlock(td->pmd);

	return r;
}

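/*
 * A data block is shared if its reference count in the data space map
 * is greater than one, i.e. more than one thin device or snapshot
 * currently maps it.
 */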
int dm_pool_block_is_shared(struct dm_pool_metadata *pmd, dm_block_t b, bool *result)
{
	int r;
	uint32_t ref_count;

	down_read(&pmd->root_lock);
	r = dm_sm_get_count(pmd->data_sm, b, &ref_count);
	if (!r)
		*result = (ref_count > 1);
	up_read(&pmd->root_lock);

	return r;
}

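/*
 * The two helpers below adjust the data space map's reference counts
 * for every block in [b, e) - e.g. when mappings to those blocks are
 * duplicated (inc) or removed (dec).
 */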
int dm_pool_inc_data_range(struct dm_pool_metadata *pmd, dm_block_t b, dm_block_t e)
{
	int r = 0;

	pmd_write_lock(pmd);
	r = dm_sm_inc_blocks(pmd->data_sm, b, e);
	pmd_write_unlock(pmd);

	return r;
}

int dm_pool_dec_data_range(struct dm_pool_metadata *pmd, dm_block_t b, dm_block_t e)
{
	int r = 0;

	pmd_write_lock(pmd);
	r = dm_sm_dec_blocks(pmd->data_sm, b, e);
	pmd_write_unlock(pmd);

	return r;
}

bool dm_thin_changed_this_transaction(struct dm_thin_device *td)
{
	bool r;

	down_read(&td->pmd->root_lock);
	r = td->changed;
	up_read(&td->pmd->root_lock);

	return r;
}

bool dm_pool_changed_this_transaction(struct dm_pool_metadata *pmd)
{
	bool r = false;
	struct dm_thin_device *td;

	down_read(&pmd->root_lock);
	list_for_each_entry(td, &pmd->thin_devices, list) {
		if (td->changed) {
			r = true;
			break;
		}
	}
	up_read(&pmd->root_lock);

	return r;
}

bool dm_thin_aborted_changes(struct dm_thin_device *td)
{
	bool r;

	down_read(&td->pmd->root_lock);
	r = td->aborted_with_changes;
	up_read(&td->pmd->root_lock);

	return r;
}

int dm_pool_alloc_data_block(struct dm_pool_metadata *pmd, dm_block_t *result)
{
	int r = -EINVAL;

	pmd_write_lock(pmd);
	if (!pmd->fail_io)
		r = dm_sm_new_block(pmd->data_sm, result);
	pmd_write_unlock(pmd);

	return r;
}

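/*
 * Commit the current transaction and immediately open a new one.
 * Nothing changed through the functions above is guaranteed to survive
 * a crash until this has returned successfully.
 */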
int dm_pool_commit_metadata(struct dm_pool_metadata *pmd)
{
	int r = -EINVAL;

	/*
	 * Care is taken to never have a commit be what first puts the
	 * thin-pool in service: hence pmd_write_lock_in_core() rather
	 * than pmd_write_lock().
	 */
	pmd_write_lock_in_core(pmd);
	if (pmd->fail_io)
		goto out;

	r = __commit_transaction(pmd);
	if (r < 0)
		goto out;

	/*
	 * Open the next transaction.
	 */
	r = __begin_transaction(pmd);
out:
	pmd_write_unlock(pmd);
	return r;
}

static void __set_abort_with_changes_flags(struct dm_pool_metadata *pmd)
{
	struct dm_thin_device *td;

	list_for_each_entry(td, &pmd->thin_devices, list)
		td->aborted_with_changes = td->changed;
}

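/*
 * Throw away all uncommitted changes: tear down the in-core data
 * structures and reopen the metadata from the last commit.  Devices
 * with uncommitted changes are flagged first, so that
 * dm_thin_aborted_changes() can report them afterwards.
 */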
int dm_pool_abort_metadata(struct dm_pool_metadata *pmd)
{
	int r = -EINVAL;
	struct dm_block_manager *old_bm = NULL, *new_bm = NULL;

	/* fail_io is double-checked with pmd->root_lock held below */
	if (unlikely(pmd->fail_io))
		return r;

	/*
	 * The replacement block manager (new_bm) is created, and old_bm
	 * destroyed, outside of pmd->root_lock to avoid the ABBA deadlock
	 * that would otherwise result (due to the life-cycle of the
	 * shrinker associated with the block manager's bufio client vs
	 * pmd->root_lock).
	 * - must take shrinker_rwsem without holding pmd->root_lock
	 */
	new_bm = dm_block_manager_create(pmd->bdev, THIN_METADATA_BLOCK_SIZE << SECTOR_SHIFT,
					 THIN_MAX_CONCURRENT_LOCKS);

	pmd_write_lock(pmd);
	if (pmd->fail_io) {
		pmd_write_unlock(pmd);
		goto out;
	}

	__set_abort_with_changes_flags(pmd);
	__destroy_persistent_data_objects(pmd, false);
	old_bm = pmd->bm;
	if (IS_ERR(new_bm)) {
		DMERR("could not create block manager during abort");
		pmd->bm = NULL;
		r = PTR_ERR(new_bm);
		goto out_unlock;
	}

	pmd->bm = new_bm;
	r = __open_or_format_metadata(pmd, false);
	if (r) {
		pmd->bm = NULL;
		goto out_unlock;
	}
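	/* ownership of new_bm has passed to pmd; don't destroy it below */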
	new_bm = NULL;
out_unlock:
	if (r)
		pmd->fail_io = true;
	pmd_write_unlock(pmd);
	dm_block_manager_destroy(old_bm);
out:
	if (new_bm && !IS_ERR(new_bm))
		dm_block_manager_destroy(new_bm);

	return r;
}

int dm_pool_get_free_block_count(struct dm_pool_metadata *pmd, dm_block_t *result)
{
	int r = -EINVAL;

	down_read(&pmd->root_lock);
	if (!pmd->fail_io)
		r = dm_sm_get_nr_free(pmd->data_sm, result);
	up_read(&pmd->root_lock);

	return r;
}

int dm_pool_get_free_metadata_block_count(struct dm_pool_metadata *pmd,
					  dm_block_t *result)
{
	int r = -EINVAL;

	down_read(&pmd->root_lock);
	if (!pmd->fail_io)
		r = dm_sm_get_nr_free(pmd->metadata_sm, result);

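	/* Don't report blocks held back as the metadata reserve as free. */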
	if (!r) {
		if (*result < pmd->metadata_reserve)
			*result = 0;
		else
			*result -= pmd->metadata_reserve;
	}
	up_read(&pmd->root_lock);

	return r;
}

int dm_pool_get_metadata_dev_size(struct dm_pool_metadata *pmd,
				  dm_block_t *result)
{
	int r = -EINVAL;

	down_read(&pmd->root_lock);
	if (!pmd->fail_io)
		r = dm_sm_get_nr_blocks(pmd->metadata_sm, result);
	up_read(&pmd->root_lock);

	return r;
}

int dm_pool_get_data_dev_size(struct dm_pool_metadata *pmd, dm_block_t *result)
{
	int r = -EINVAL;

	down_read(&pmd->root_lock);
	if (!pmd->fail_io)
		r = dm_sm_get_nr_blocks(pmd->data_sm, result);
	up_read(&pmd->root_lock);

	return r;
}

int dm_thin_get_mapped_count(struct dm_thin_device *td, dm_block_t *result)
{
	int r = -EINVAL;
	struct dm_pool_metadata *pmd = td->pmd;

	down_read(&pmd->root_lock);
	if (!pmd->fail_io) {
		*result = td->mapped_blocks;
		r = 0;
	}
	up_read(&pmd->root_lock);

	return r;
}

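/*
 * Look up this device's mapping subtree, then ask the btree for its
 * highest populated key, i.e. the last mapped virtual block.
 */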
static int __highest_block(struct dm_thin_device *td, dm_block_t *result)
{
	int r;
	__le64 value_le;
	dm_block_t thin_root;
	struct dm_pool_metadata *pmd = td->pmd;

	r = dm_btree_lookup(&pmd->tl_info, pmd->root, &td->id, &value_le);
	if (r)
		return r;

	thin_root = le64_to_cpu(value_le);

	return dm_btree_find_highest_key(&pmd->bl_info, thin_root, result);
}

int dm_thin_get_highest_mapped_block(struct dm_thin_device *td,
				     dm_block_t *result)
{
	int r = -EINVAL;
	struct dm_pool_metadata *pmd = td->pmd;

	down_read(&pmd->root_lock);
	if (!pmd->fail_io)
		r = __highest_block(td, result);
	up_read(&pmd->root_lock);

	return r;
}

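/*
 * Space maps only ever grow; attempts to shrink are rejected (reducing
 * would require first proving the discarded tail holds no allocated
 * blocks, which isn't supported).
 */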
static int __resize_space_map(struct dm_space_map *sm, dm_block_t new_count)
{
	int r;
	dm_block_t old_count;

	r = dm_sm_get_nr_blocks(sm, &old_count);
	if (r)
		return r;

	if (new_count == old_count)
		return 0;

	if (new_count < old_count) {
		DMERR("cannot reduce size of space map");
		return -EINVAL;
	}

	return dm_sm_extend(sm, new_count - old_count);
}

int dm_pool_resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_count)
{
	int r = -EINVAL;

	pmd_write_lock(pmd);
	if (!pmd->fail_io)
		r = __resize_space_map(pmd->data_sm, new_count);
	pmd_write_unlock(pmd);

	return r;
}

int dm_pool_resize_metadata_dev(struct dm_pool_metadata *pmd, dm_block_t new_count)
{
	int r = -EINVAL;

	pmd_write_lock(pmd);
	if (!pmd->fail_io) {
		r = __resize_space_map(pmd->metadata_sm, new_count);
		if (!r)
			__set_metadata_reserve(pmd);
	}
	pmd_write_unlock(pmd);

	return r;
}

void dm_pool_metadata_read_only(struct dm_pool_metadata *pmd)
{
	pmd_write_lock_in_core(pmd);
	dm_bm_set_read_only(pmd->bm);
	pmd_write_unlock(pmd);
}

void dm_pool_metadata_read_write(struct dm_pool_metadata *pmd)
{
	pmd_write_lock_in_core(pmd);
	dm_bm_set_read_write(pmd->bm);
	pmd_write_unlock(pmd);
}

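/*
 * Arrange for 'fn' to be called back once the number of free metadata
 * blocks drops below 'threshold', e.g. so the pool target can warn
 * before metadata space runs out.
 */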
int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd,
					dm_block_t threshold,
					dm_sm_threshold_fn fn,
					void *context)
{
	int r = -EINVAL;

	pmd_write_lock_in_core(pmd);
	if (!pmd->fail_io) {
		r = dm_sm_register_threshold_callback(pmd->metadata_sm,
						      threshold, fn, context);
	}
	pmd_write_unlock(pmd);

	return r;
}

void dm_pool_register_pre_commit_callback(struct dm_pool_metadata *pmd,
					  dm_pool_pre_commit_fn fn,
					  void *context)
{
	pmd_write_lock_in_core(pmd);
	pmd->pre_commit_fn = fn;
	pmd->pre_commit_context = context;
	pmd_write_unlock(pmd);
}

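/*
 * Persistently set THIN_METADATA_NEEDS_CHECK_FLAG in the superblock.
 * Userspace is expected to run thin_check (and possibly thin_repair)
 * before the metadata is trusted again.
 */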
int dm_pool_metadata_set_needs_check(struct dm_pool_metadata *pmd)
{
	int r = -EINVAL;
	struct dm_block *sblock;
	struct thin_disk_superblock *disk_super;

	pmd_write_lock(pmd);
	if (pmd->fail_io)
		goto out;

	pmd->flags |= THIN_METADATA_NEEDS_CHECK_FLAG;

	r = superblock_lock(pmd, &sblock);
	if (r) {
		DMERR("couldn't lock superblock");
		goto out;
	}

	disk_super = dm_block_data(sblock);
	disk_super->flags = cpu_to_le32(pmd->flags);

	dm_bm_unlock(sblock);
out:
	pmd_write_unlock(pmd);
	return r;
}

bool dm_pool_metadata_needs_check(struct dm_pool_metadata *pmd)
{
	bool needs_check;

	down_read(&pmd->root_lock);
	needs_check = pmd->flags & THIN_METADATA_NEEDS_CHECK_FLAG;
	up_read(&pmd->root_lock);

	return needs_check;
}

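/*
 * Issue reads for metadata blocks that recent btree operations flagged
 * as likely to be needed soon, warming the block manager's cache.
 */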
void dm_pool_issue_prefetches(struct dm_pool_metadata *pmd)
{
	down_read(&pmd->root_lock);
	if (!pmd->fail_io)
		dm_tm_issue_prefetches(pmd->tm);
	up_read(&pmd->root_lock);
}