dm clone: replace spin_lock_irqsave with spin_lock_irq
[linux-block.git] / drivers / md / dm-clone-target.c
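As a quick illustration of the change named in the commit title: spin_lock_irqsave() saves the caller's interrupt state and restores it on unlock, while spin_lock_irq() unconditionally disables and then re-enables interrupts. The lighter variant is only appropriate when the lock is always taken from process context with interrupts enabled, which is the assumption behind this commit. A minimal before/after sketch of the pattern, using the deferred-flush list from issue_bio() below:

	/* Before: save and restore the IRQ state around the critical section. */
	unsigned long flags;

	spin_lock_irqsave(&clone->lock, flags);
	bio_list_add(&clone->deferred_flush_bios, bio);
	spin_unlock_irqrestore(&clone->lock, flags);

	/*
	 * After: simply disable/re-enable IRQs. This is valid only because no
	 * caller takes this lock from interrupt context or with interrupts
	 * already disabled.
	 */
	spin_lock_irq(&clone->lock);
	bio_list_add(&clone->deferred_flush_bios, bio);
	spin_unlock_irq(&clone->lock);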
1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * Copyright (C) 2019 Arrikto, Inc. All Rights Reserved.
4 */
5
6#include <linux/mm.h>
7#include <linux/bio.h>
8#include <linux/err.h>
9#include <linux/hash.h>
10#include <linux/list.h>
11#include <linux/log2.h>
12#include <linux/init.h>
13#include <linux/slab.h>
14#include <linux/wait.h>
15#include <linux/dm-io.h>
16#include <linux/mutex.h>
17#include <linux/atomic.h>
18#include <linux/bitops.h>
19#include <linux/blkdev.h>
20#include <linux/kdev_t.h>
21#include <linux/kernel.h>
22#include <linux/module.h>
23#include <linux/jiffies.h>
24#include <linux/mempool.h>
25#include <linux/spinlock.h>
26#include <linux/blk_types.h>
27#include <linux/dm-kcopyd.h>
28#include <linux/workqueue.h>
29#include <linux/backing-dev.h>
30#include <linux/device-mapper.h>
31
32#include "dm.h"
33#include "dm-clone-metadata.h"
34
35#define DM_MSG_PREFIX "clone"
36
37/*
38 * Minimum and maximum allowed region sizes
39 */
40#define MIN_REGION_SIZE (1 << 3) /* 4KB */
41#define MAX_REGION_SIZE (1 << 21) /* 1GB */
42
43#define MIN_HYDRATIONS 256 /* Size of hydration mempool */
44#define DEFAULT_HYDRATION_THRESHOLD 1 /* 1 region */
45#define DEFAULT_HYDRATION_BATCH_SIZE 1 /* Hydrate in batches of 1 region */
46
47#define COMMIT_PERIOD HZ /* 1 sec */
48
49/*
50 * Hydration hash table size: 1 << HASH_TABLE_BITS
51 */
52#define HASH_TABLE_BITS 15
53
54DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(clone_hydration_throttle,
55 "A percentage of time allocated for hydrating regions");
56
57/* Slab cache for struct dm_clone_region_hydration */
58static struct kmem_cache *_hydration_cache;
59
60/* dm-clone metadata modes */
61enum clone_metadata_mode {
62 CM_WRITE, /* metadata may be changed */
63 CM_READ_ONLY, /* metadata may not be changed */
64 CM_FAIL, /* all metadata I/O fails */
65};
66
67struct hash_table_bucket;
68
69struct clone {
70 struct dm_target *ti;
71 struct dm_target_callbacks callbacks;
72
73 struct dm_dev *metadata_dev;
74 struct dm_dev *dest_dev;
75 struct dm_dev *source_dev;
76
77 unsigned long nr_regions;
78 sector_t region_size;
79 unsigned int region_shift;
80
81 /*
82 * A metadata commit and the actions taken in case it fails should run
83 * as a single atomic step.
84 */
85 struct mutex commit_lock;
86
87 struct dm_clone_metadata *cmd;
88
89 /* Region hydration hash table */
90 struct hash_table_bucket *ht;
91
92 atomic_t ios_in_flight;
93
94 wait_queue_head_t hydration_stopped;
95
96 mempool_t hydration_pool;
97
98 unsigned long last_commit_jiffies;
99
100 /*
101 * We defer incoming WRITE bios for regions that are not hydrated,
102 * until after these regions have been hydrated.
103 *
104 * Also, we defer REQ_FUA and REQ_PREFLUSH bios, until after the
105 * metadata have been committed.
106 */
107 spinlock_t lock;
108 struct bio_list deferred_bios;
109 struct bio_list deferred_discard_bios;
110 struct bio_list deferred_flush_bios;
111 struct bio_list deferred_flush_completions;
112
113 /* Maximum number of regions being copied during background hydration. */
114 unsigned int hydration_threshold;
115
116 /* Number of regions to batch together during background hydration. */
117 unsigned int hydration_batch_size;
118
119 /* Which region to hydrate next */
120 unsigned long hydration_offset;
121
122 atomic_t hydrations_in_flight;
123
124 /*
125 * Save a copy of the table line rather than reconstructing it for the
126 * status.
127 */
128 unsigned int nr_ctr_args;
129 const char **ctr_args;
130
131 struct workqueue_struct *wq;
132 struct work_struct worker;
133 struct delayed_work waker;
134
135 struct dm_kcopyd_client *kcopyd_client;
136
137 enum clone_metadata_mode mode;
138 unsigned long flags;
139};
140
141/*
142 * dm-clone flags
143 */
144#define DM_CLONE_DISCARD_PASSDOWN 0
145#define DM_CLONE_HYDRATION_ENABLED 1
146#define DM_CLONE_HYDRATION_SUSPENDED 2
147
148/*---------------------------------------------------------------------------*/
149
150/*
151 * Metadata failure handling.
152 */
153static enum clone_metadata_mode get_clone_mode(struct clone *clone)
154{
155 return READ_ONCE(clone->mode);
156}
157
158static const char *clone_device_name(struct clone *clone)
159{
160 return dm_table_device_name(clone->ti->table);
161}
162
163static void __set_clone_mode(struct clone *clone, enum clone_metadata_mode new_mode)
164{
165 const char *descs[] = {
166 "read-write",
167 "read-only",
168 "fail"
169 };
170
171 enum clone_metadata_mode old_mode = get_clone_mode(clone);
172
173 /* Never move out of fail mode */
174 if (old_mode == CM_FAIL)
175 new_mode = CM_FAIL;
176
177 switch (new_mode) {
178 case CM_FAIL:
179 case CM_READ_ONLY:
180 dm_clone_metadata_set_read_only(clone->cmd);
181 break;
182
183 case CM_WRITE:
184 dm_clone_metadata_set_read_write(clone->cmd);
185 break;
186 }
187
188 WRITE_ONCE(clone->mode, new_mode);
189
190 if (new_mode != old_mode) {
191 dm_table_event(clone->ti->table);
192 DMINFO("%s: Switching to %s mode", clone_device_name(clone),
193 descs[(int)new_mode]);
194 }
195}
196
197static void __abort_transaction(struct clone *clone)
198{
199 const char *dev_name = clone_device_name(clone);
200
201 if (get_clone_mode(clone) >= CM_READ_ONLY)
202 return;
203
204 DMERR("%s: Aborting current metadata transaction", dev_name);
205 if (dm_clone_metadata_abort(clone->cmd)) {
206 DMERR("%s: Failed to abort metadata transaction", dev_name);
207 __set_clone_mode(clone, CM_FAIL);
208 }
209}
210
211static void __reload_in_core_bitset(struct clone *clone)
212{
213 const char *dev_name = clone_device_name(clone);
214
215 if (get_clone_mode(clone) == CM_FAIL)
216 return;
217
218 /* Reload the on-disk bitset */
219 DMINFO("%s: Reloading on-disk bitmap", dev_name);
220 if (dm_clone_reload_in_core_bitset(clone->cmd)) {
221 DMERR("%s: Failed to reload on-disk bitmap", dev_name);
222 __set_clone_mode(clone, CM_FAIL);
223 }
224}
225
226static void __metadata_operation_failed(struct clone *clone, const char *op, int r)
227{
228 DMERR("%s: Metadata operation `%s' failed: error = %d",
229 clone_device_name(clone), op, r);
230
231 __abort_transaction(clone);
232 __set_clone_mode(clone, CM_READ_ONLY);
233
234 /*
235 * dm_clone_reload_in_core_bitset() may run concurrently with either
236 * dm_clone_set_region_hydrated() or dm_clone_cond_set_range(), but
237 * it's safe as we have already set the metadata to read-only mode.
238 */
239 __reload_in_core_bitset(clone);
240}
241
242/*---------------------------------------------------------------------------*/
243
244/* Wake up anyone waiting for region hydrations to stop */
245static inline void wakeup_hydration_waiters(struct clone *clone)
246{
247 wake_up_all(&clone->hydration_stopped);
248}
249
250static inline void wake_worker(struct clone *clone)
251{
252 queue_work(clone->wq, &clone->worker);
253}
254
255/*---------------------------------------------------------------------------*/
256
257/*
258 * bio helper functions.
259 */
260static inline void remap_to_source(struct clone *clone, struct bio *bio)
261{
262 bio_set_dev(bio, clone->source_dev->bdev);
263}
264
265static inline void remap_to_dest(struct clone *clone, struct bio *bio)
266{
267 bio_set_dev(bio, clone->dest_dev->bdev);
268}
269
270static bool bio_triggers_commit(struct clone *clone, struct bio *bio)
271{
272 return op_is_flush(bio->bi_opf) &&
273 dm_clone_changed_this_transaction(clone->cmd);
274}
275
276/* Get the address of the region in sectors */
277static inline sector_t region_to_sector(struct clone *clone, unsigned long region_nr)
278{
279 return (region_nr << clone->region_shift);
280}
281
282/* Get the region number of the bio */
283static inline unsigned long bio_to_region(struct clone *clone, struct bio *bio)
284{
285 return (bio->bi_iter.bi_sector >> clone->region_shift);
286}
287
288/* Get the region range covered by the bio */
289static void bio_region_range(struct clone *clone, struct bio *bio,
290 unsigned long *rs, unsigned long *re)
291{
292 *rs = dm_sector_div_up(bio->bi_iter.bi_sector, clone->region_size);
293 *re = bio_end_sector(bio) >> clone->region_shift;
294}
295
296/* Check whether a bio overwrites a region */
297static inline bool is_overwrite_bio(struct clone *clone, struct bio *bio)
298{
299 return (bio_data_dir(bio) == WRITE && bio_sectors(bio) == clone->region_size);
300}
301
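/* Complete every bio in @bios with the given error @status. */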
302static void fail_bios(struct bio_list *bios, blk_status_t status)
303{
304 struct bio *bio;
305
306 while ((bio = bio_list_pop(bios))) {
307 bio->bi_status = status;
308 bio_endio(bio);
309 }
310}
311
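/* Submit all bios in @bios, using a blk plug to batch the submissions. */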
312static void submit_bios(struct bio_list *bios)
313{
314 struct bio *bio;
315 struct blk_plug plug;
316
317 blk_start_plug(&plug);
318
319 while ((bio = bio_list_pop(bios)))
320 generic_make_request(bio);
321
322 blk_finish_plug(&plug);
323}
324
325/*
326 * Submit bio to the underlying device.
327 *
328 * If the bio triggers a commit, delay it, until after the metadata have been
329 * committed.
330 *
331 * NOTE: The bio remapping must be performed by the caller.
332 */
333static void issue_bio(struct clone *clone, struct bio *bio)
334{
335 if (!bio_triggers_commit(clone, bio)) {
336 generic_make_request(bio);
337 return;
338 }
339
340 /*
341 * If the metadata mode is RO or FAIL we won't be able to commit the
342 * metadata, so we complete the bio with an error.
343 */
344 if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) {
345 bio_io_error(bio);
346 return;
347 }
348
349 /*
350 * Batch together any bios that trigger commits and then issue a single
351 * commit for them in process_deferred_flush_bios().
352 */
 353	spin_lock_irq(&clone->lock);
 354	bio_list_add(&clone->deferred_flush_bios, bio);
 355	spin_unlock_irq(&clone->lock);
356
357 wake_worker(clone);
358}
359
360/*
361 * Remap bio to the destination device and submit it.
362 *
363 * If the bio triggers a commit, delay it, until after the metadata have been
364 * committed.
365 */
366static void remap_and_issue(struct clone *clone, struct bio *bio)
367{
368 remap_to_dest(clone, bio);
369 issue_bio(clone, bio);
370}
371
372/*
373 * Issue bios that have been deferred until after their region has finished
374 * hydrating.
375 *
376 * We delegate the bio submission to the worker thread, so this is safe to call
377 * from interrupt context.
378 */
379static void issue_deferred_bios(struct clone *clone, struct bio_list *bios)
380{
381 struct bio *bio;
382 unsigned long flags;
383 struct bio_list flush_bios = BIO_EMPTY_LIST;
384 struct bio_list normal_bios = BIO_EMPTY_LIST;
385
386 if (bio_list_empty(bios))
387 return;
388
389 while ((bio = bio_list_pop(bios))) {
390 if (bio_triggers_commit(clone, bio))
391 bio_list_add(&flush_bios, bio);
392 else
393 bio_list_add(&normal_bios, bio);
394 }
395
396 spin_lock_irqsave(&clone->lock, flags);
397 bio_list_merge(&clone->deferred_bios, &normal_bios);
398 bio_list_merge(&clone->deferred_flush_bios, &flush_bios);
399 spin_unlock_irqrestore(&clone->lock, flags);
400
401 wake_worker(clone);
402}
403
404static void complete_overwrite_bio(struct clone *clone, struct bio *bio)
405{
406 unsigned long flags;
407
408 /*
409 * If the bio has the REQ_FUA flag set we must commit the metadata
410 * before signaling its completion.
411 *
412 * complete_overwrite_bio() is only called by hydration_complete(),
413 * after having successfully updated the metadata. This means we don't
414 * need to call dm_clone_changed_this_transaction() to check if the
415 * metadata has changed and thus we can avoid taking the metadata spin
416 * lock.
417 */
418 if (!(bio->bi_opf & REQ_FUA)) {
419 bio_endio(bio);
420 return;
421 }
422
423 /*
424 * If the metadata mode is RO or FAIL we won't be able to commit the
425 * metadata, so we complete the bio with an error.
426 */
427 if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) {
428 bio_io_error(bio);
429 return;
430 }
431
432 /*
433 * Batch together any bios that trigger commits and then issue a single
434 * commit for them in process_deferred_flush_bios().
435 */
436 spin_lock_irqsave(&clone->lock, flags);
437 bio_list_add(&clone->deferred_flush_completions, bio);
438 spin_unlock_irqrestore(&clone->lock, flags);
439
440 wake_worker(clone);
441}
442
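/* Trim @bio so that it starts at @sector and spans @len sectors. */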
443static void trim_bio(struct bio *bio, sector_t sector, unsigned int len)
444{
445 bio->bi_iter.bi_sector = sector;
446 bio->bi_iter.bi_size = to_bytes(len);
447}
448
449static void complete_discard_bio(struct clone *clone, struct bio *bio, bool success)
450{
451 unsigned long rs, re;
452
453 /*
454 * If the destination device supports discards, remap and trim the
455 * discard bio and pass it down. Otherwise complete the bio
456 * immediately.
457 */
458 if (test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags) && success) {
459 remap_to_dest(clone, bio);
460 bio_region_range(clone, bio, &rs, &re);
461 trim_bio(bio, rs << clone->region_shift,
462 (re - rs) << clone->region_shift);
463 generic_make_request(bio);
464 } else
465 bio_endio(bio);
466}
467
468static void process_discard_bio(struct clone *clone, struct bio *bio)
469{
 470	unsigned long rs, re;
471
472 bio_region_range(clone, bio, &rs, &re);
473 BUG_ON(re > clone->nr_regions);
474
475 if (unlikely(rs == re)) {
476 bio_endio(bio);
477 return;
478 }
479
480 /*
481 * The covered regions are already hydrated so we just need to pass
482 * down the discard.
483 */
484 if (dm_clone_is_range_hydrated(clone->cmd, rs, re - rs)) {
485 complete_discard_bio(clone, bio, true);
486 return;
487 }
488
489 /*
490 * If the metadata mode is RO or FAIL we won't be able to update the
491 * metadata for the regions covered by the discard so we just ignore
492 * it.
493 */
494 if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) {
495 bio_endio(bio);
496 return;
497 }
498
499 /*
500 * Defer discard processing.
501 */
 502	spin_lock_irq(&clone->lock);
 503	bio_list_add(&clone->deferred_discard_bios, bio);
 504	spin_unlock_irq(&clone->lock);
505
506 wake_worker(clone);
507}
508
509/*---------------------------------------------------------------------------*/
510
511/*
512 * dm-clone region hydrations.
513 */
514struct dm_clone_region_hydration {
515 struct clone *clone;
516 unsigned long region_nr;
517
518 struct bio *overwrite_bio;
519 bio_end_io_t *overwrite_bio_end_io;
520
521 struct bio_list deferred_bios;
522
523 blk_status_t status;
524
525 /* Used by hydration batching */
526 struct list_head list;
527
528 /* Used by hydration hash table */
529 struct hlist_node h;
530};
531
532/*
533 * Hydration hash table implementation.
534 *
535 * Ideally we would like to use list_bl, which uses bit spin locks and employs
536 * the least significant bit of the list head to lock the corresponding bucket,
537 * reducing the memory overhead for the locks. But, currently, list_bl and bit
538 * spin locks don't support IRQ safe versions. Since we have to take the lock
539 * in both process and interrupt context, we must fall back to using regular
540 * spin locks; one per hash table bucket.
541 */
542struct hash_table_bucket {
543 struct hlist_head head;
544
545 /* Spinlock protecting the bucket */
546 spinlock_t lock;
547};
548
549#define bucket_lock_irqsave(bucket, flags) \
550 spin_lock_irqsave(&(bucket)->lock, flags)
551
552#define bucket_unlock_irqrestore(bucket, flags) \
553 spin_unlock_irqrestore(&(bucket)->lock, flags)
554
555static int hash_table_init(struct clone *clone)
556{
557 unsigned int i, sz;
558 struct hash_table_bucket *bucket;
559
560 sz = 1 << HASH_TABLE_BITS;
561
562 clone->ht = kvmalloc(sz * sizeof(struct hash_table_bucket), GFP_KERNEL);
563 if (!clone->ht)
564 return -ENOMEM;
565
566 for (i = 0; i < sz; i++) {
567 bucket = clone->ht + i;
568
569 INIT_HLIST_HEAD(&bucket->head);
570 spin_lock_init(&bucket->lock);
571 }
572
573 return 0;
574}
575
576static void hash_table_exit(struct clone *clone)
577{
578 kvfree(clone->ht);
579}
580
581static struct hash_table_bucket *get_hash_table_bucket(struct clone *clone,
582 unsigned long region_nr)
583{
584 return &clone->ht[hash_long(region_nr, HASH_TABLE_BITS)];
585}
586
587/*
588 * Search hash table for a hydration with hd->region_nr == region_nr
589 *
590 * NOTE: Must be called with the bucket lock held
591 */
592static struct dm_clone_region_hydration *__hash_find(struct hash_table_bucket *bucket,
593 unsigned long region_nr)
594{
595 struct dm_clone_region_hydration *hd;
596
597 hlist_for_each_entry(hd, &bucket->head, h) {
598 if (hd->region_nr == region_nr)
599 return hd;
600 }
601
602 return NULL;
603}
604
605/*
606 * Insert a hydration into the hash table.
607 *
608 * NOTE: Must be called with the bucket lock held.
609 */
610static inline void __insert_region_hydration(struct hash_table_bucket *bucket,
611 struct dm_clone_region_hydration *hd)
612{
613 hlist_add_head(&hd->h, &bucket->head);
614}
615
616/*
617 * This function inserts a hydration into the hash table, unless someone else
618 * managed to insert a hydration for the same region first. In the latter case
619 * it returns the existing hydration descriptor for this region.
620 *
621 * NOTE: Must be called with the hydration hash table lock held.
622 */
623static struct dm_clone_region_hydration *
624__find_or_insert_region_hydration(struct hash_table_bucket *bucket,
625 struct dm_clone_region_hydration *hd)
626{
627 struct dm_clone_region_hydration *hd2;
628
629 hd2 = __hash_find(bucket, hd->region_nr);
630 if (hd2)
631 return hd2;
632
633 __insert_region_hydration(bucket, hd);
634
635 return hd;
636}
637
638/*---------------------------------------------------------------------------*/
639
640/* Allocate a hydration */
641static struct dm_clone_region_hydration *alloc_hydration(struct clone *clone)
642{
643 struct dm_clone_region_hydration *hd;
644
645 /*
646 * Allocate a hydration from the hydration mempool.
647 * This might block but it can't fail.
648 */
649 hd = mempool_alloc(&clone->hydration_pool, GFP_NOIO);
650 hd->clone = clone;
651
652 return hd;
653}
654
655static inline void free_hydration(struct dm_clone_region_hydration *hd)
656{
657 mempool_free(hd, &hd->clone->hydration_pool);
658}
659
660/* Initialize a hydration */
661static void hydration_init(struct dm_clone_region_hydration *hd, unsigned long region_nr)
662{
663 hd->region_nr = region_nr;
664 hd->overwrite_bio = NULL;
665 bio_list_init(&hd->deferred_bios);
666 hd->status = 0;
667
668 INIT_LIST_HEAD(&hd->list);
669 INIT_HLIST_NODE(&hd->h);
670}
671
672/*---------------------------------------------------------------------------*/
673
674/*
675 * Update dm-clone's metadata after a region has finished hydrating and remove
676 * hydration from the hash table.
677 */
678static int hydration_update_metadata(struct dm_clone_region_hydration *hd)
679{
680 int r = 0;
681 unsigned long flags;
682 struct hash_table_bucket *bucket;
683 struct clone *clone = hd->clone;
684
685 if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY))
686 r = -EPERM;
687
688 /* Update the metadata */
689 if (likely(!r) && hd->status == BLK_STS_OK)
690 r = dm_clone_set_region_hydrated(clone->cmd, hd->region_nr);
691
692 bucket = get_hash_table_bucket(clone, hd->region_nr);
693
694 /* Remove hydration from hash table */
695 bucket_lock_irqsave(bucket, flags);
696 hlist_del(&hd->h);
697 bucket_unlock_irqrestore(bucket, flags);
698
699 return r;
700}
701
702/*
703 * Complete a region's hydration:
704 *
705 * 1. Update dm-clone's metadata.
706 * 2. Remove hydration from hash table.
707 * 3. Complete overwrite bio.
708 * 4. Issue deferred bios.
709 * 5. If this was the last hydration, wake up anyone waiting for
710 * hydrations to finish.
711 */
712static void hydration_complete(struct dm_clone_region_hydration *hd)
713{
714 int r;
715 blk_status_t status;
716 struct clone *clone = hd->clone;
717
718 r = hydration_update_metadata(hd);
719
720 if (hd->status == BLK_STS_OK && likely(!r)) {
721 if (hd->overwrite_bio)
722 complete_overwrite_bio(clone, hd->overwrite_bio);
723
724 issue_deferred_bios(clone, &hd->deferred_bios);
725 } else {
726 status = r ? BLK_STS_IOERR : hd->status;
727
728 if (hd->overwrite_bio)
729 bio_list_add(&hd->deferred_bios, hd->overwrite_bio);
730
731 fail_bios(&hd->deferred_bios, status);
732 }
733
734 free_hydration(hd);
735
736 if (atomic_dec_and_test(&clone->hydrations_in_flight))
737 wakeup_hydration_waiters(clone);
738}
739
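/*
 * kcopyd completion callback: propagate the copy result to the leading
 * hydration and to any hydrations batched behind it, then complete them.
 */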
740static void hydration_kcopyd_callback(int read_err, unsigned long write_err, void *context)
741{
742 blk_status_t status;
743
744 struct dm_clone_region_hydration *tmp, *hd = context;
745 struct clone *clone = hd->clone;
746
747 LIST_HEAD(batched_hydrations);
748
749 if (read_err || write_err) {
750 DMERR_LIMIT("%s: hydration failed", clone_device_name(clone));
751 status = BLK_STS_IOERR;
752 } else {
753 status = BLK_STS_OK;
754 }
755 list_splice_tail(&hd->list, &batched_hydrations);
756
757 hd->status = status;
758 hydration_complete(hd);
759
760 /* Complete batched hydrations */
761 list_for_each_entry_safe(hd, tmp, &batched_hydrations, list) {
762 hd->status = status;
763 hydration_complete(hd);
764 }
765
766 /* Continue background hydration, if there is no I/O in-flight */
767 if (test_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags) &&
768 !atomic_read(&clone->ios_in_flight))
769 wake_worker(clone);
770}
771
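/*
 * Use kcopyd to copy @nr_regions regions, starting at hd->region_nr, from the
 * source device to the destination device.
 */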
772static void hydration_copy(struct dm_clone_region_hydration *hd, unsigned int nr_regions)
773{
774 unsigned long region_start, region_end;
775 sector_t tail_size, region_size, total_size;
776 struct dm_io_region from, to;
777 struct clone *clone = hd->clone;
778
779 region_size = clone->region_size;
780 region_start = hd->region_nr;
781 region_end = region_start + nr_regions - 1;
782
783 total_size = (nr_regions - 1) << clone->region_shift;
784
785 if (region_end == clone->nr_regions - 1) {
786 /*
787 * The last region of the target might be smaller than
788 * region_size.
789 */
790 tail_size = clone->ti->len & (region_size - 1);
791 if (!tail_size)
792 tail_size = region_size;
793 } else {
794 tail_size = region_size;
795 }
796
797 total_size += tail_size;
798
799 from.bdev = clone->source_dev->bdev;
800 from.sector = region_to_sector(clone, region_start);
801 from.count = total_size;
802
803 to.bdev = clone->dest_dev->bdev;
804 to.sector = from.sector;
805 to.count = from.count;
806
807 /* Issue copy */
808 atomic_add(nr_regions, &clone->hydrations_in_flight);
809 dm_kcopyd_copy(clone->kcopyd_client, &from, 1, &to, 0,
810 hydration_kcopyd_callback, hd);
811}
812
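/* Completion handler for overwrite bios: restore the original bi_end_io and complete the region hydration. */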
813static void overwrite_endio(struct bio *bio)
814{
815 struct dm_clone_region_hydration *hd = bio->bi_private;
816
817 bio->bi_end_io = hd->overwrite_bio_end_io;
818 hd->status = bio->bi_status;
819
820 hydration_complete(hd);
821}
822
823static void hydration_overwrite(struct dm_clone_region_hydration *hd, struct bio *bio)
824{
825 /*
826 * We don't need to save and restore bio->bi_private because device
827 * mapper core generates a new bio for us to use, with clean
828 * bi_private.
829 */
830 hd->overwrite_bio = bio;
831 hd->overwrite_bio_end_io = bio->bi_end_io;
832
833 bio->bi_end_io = overwrite_endio;
834 bio->bi_private = hd;
835
836 atomic_inc(&hd->clone->hydrations_in_flight);
837 generic_make_request(bio);
838}
839
840/*
841 * Hydrate bio's region.
842 *
843 * This function starts the hydration of the bio's region and puts the bio in
 844 * the list of deferred bios for this region. If, by the time this function
 845 * is called, the region has already finished hydrating, the bio is submitted
 846 * to the destination device.
847 *
848 * NOTE: The bio remapping must be performed by the caller.
849 */
850static void hydrate_bio_region(struct clone *clone, struct bio *bio)
851{
852 unsigned long flags;
853 unsigned long region_nr;
854 struct hash_table_bucket *bucket;
855 struct dm_clone_region_hydration *hd, *hd2;
856
857 region_nr = bio_to_region(clone, bio);
858 bucket = get_hash_table_bucket(clone, region_nr);
859
860 bucket_lock_irqsave(bucket, flags);
861
862 hd = __hash_find(bucket, region_nr);
863 if (hd) {
864 /* Someone else is hydrating the region */
865 bio_list_add(&hd->deferred_bios, bio);
866 bucket_unlock_irqrestore(bucket, flags);
867 return;
868 }
869
870 if (dm_clone_is_region_hydrated(clone->cmd, region_nr)) {
871 /* The region has been hydrated */
872 bucket_unlock_irqrestore(bucket, flags);
873 issue_bio(clone, bio);
874 return;
875 }
876
877 /*
878 * We must allocate a hydration descriptor and start the hydration of
879 * the corresponding region.
880 */
881 bucket_unlock_irqrestore(bucket, flags);
882
883 hd = alloc_hydration(clone);
884 hydration_init(hd, region_nr);
885
886 bucket_lock_irqsave(bucket, flags);
887
888 /* Check if the region has been hydrated in the meantime. */
889 if (dm_clone_is_region_hydrated(clone->cmd, region_nr)) {
890 bucket_unlock_irqrestore(bucket, flags);
891 free_hydration(hd);
892 issue_bio(clone, bio);
893 return;
894 }
895
896 hd2 = __find_or_insert_region_hydration(bucket, hd);
897 if (hd2 != hd) {
898 /* Someone else started the region's hydration. */
899 bio_list_add(&hd2->deferred_bios, bio);
900 bucket_unlock_irqrestore(bucket, flags);
901 free_hydration(hd);
902 return;
903 }
904
905 /*
906 * If the metadata mode is RO or FAIL then there is no point starting a
907 * hydration, since we will not be able to update the metadata when the
908 * hydration finishes.
909 */
910 if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) {
911 hlist_del(&hd->h);
912 bucket_unlock_irqrestore(bucket, flags);
913 free_hydration(hd);
914 bio_io_error(bio);
915 return;
916 }
917
918 /*
919 * Start region hydration.
920 *
921 * If a bio overwrites a region, i.e., its size is equal to the
922 * region's size, then we don't need to copy the region from the source
923 * to the destination device.
924 */
925 if (is_overwrite_bio(clone, bio)) {
926 bucket_unlock_irqrestore(bucket, flags);
927 hydration_overwrite(hd, bio);
928 } else {
929 bio_list_add(&hd->deferred_bios, bio);
930 bucket_unlock_irqrestore(bucket, flags);
931 hydration_copy(hd, 1);
932 }
933}
934
935/*---------------------------------------------------------------------------*/
936
937/*
938 * Background hydrations.
939 */
940
941/*
942 * Batch region hydrations.
943 *
944 * To better utilize device bandwidth we batch together the hydration of
945 * adjacent regions. This allows us to use small region sizes, e.g., 4KB, which
946 * is good for small, random write performance (because of the overwriting of
947 * un-hydrated regions) and at the same time issue big copy requests to kcopyd
948 * to achieve high hydration bandwidth.
949 */
950struct batch_info {
951 struct dm_clone_region_hydration *head;
952 unsigned int nr_batched_regions;
953};
954
955static void __batch_hydration(struct batch_info *batch,
956 struct dm_clone_region_hydration *hd)
957{
958 struct clone *clone = hd->clone;
959 unsigned int max_batch_size = READ_ONCE(clone->hydration_batch_size);
960
961 if (batch->head) {
962 /* Try to extend the current batch */
963 if (batch->nr_batched_regions < max_batch_size &&
964 (batch->head->region_nr + batch->nr_batched_regions) == hd->region_nr) {
965 list_add_tail(&hd->list, &batch->head->list);
966 batch->nr_batched_regions++;
967 hd = NULL;
968 }
969
970 /* Check if we should issue the current batch */
971 if (batch->nr_batched_regions >= max_batch_size || hd) {
972 hydration_copy(batch->head, batch->nr_batched_regions);
973 batch->head = NULL;
974 batch->nr_batched_regions = 0;
975 }
976 }
977
978 if (!hd)
979 return;
980
981 /* We treat max batch sizes of zero and one equivalently */
982 if (max_batch_size <= 1) {
983 hydration_copy(hd, 1);
984 return;
985 }
986
987 /* Start a new batch */
988 BUG_ON(!list_empty(&hd->list));
989 batch->head = hd;
990 batch->nr_batched_regions = 1;
991}
992
993static unsigned long __start_next_hydration(struct clone *clone,
994 unsigned long offset,
995 struct batch_info *batch)
996{
997 unsigned long flags;
998 struct hash_table_bucket *bucket;
999 struct dm_clone_region_hydration *hd;
1000 unsigned long nr_regions = clone->nr_regions;
1001
1002 hd = alloc_hydration(clone);
1003
1004 /* Try to find a region to hydrate. */
1005 do {
1006 offset = dm_clone_find_next_unhydrated_region(clone->cmd, offset);
1007 if (offset == nr_regions)
1008 break;
1009
1010 bucket = get_hash_table_bucket(clone, offset);
1011 bucket_lock_irqsave(bucket, flags);
1012
1013 if (!dm_clone_is_region_hydrated(clone->cmd, offset) &&
1014 !__hash_find(bucket, offset)) {
1015 hydration_init(hd, offset);
1016 __insert_region_hydration(bucket, hd);
1017 bucket_unlock_irqrestore(bucket, flags);
1018
1019 /* Batch hydration */
1020 __batch_hydration(batch, hd);
1021
1022 return (offset + 1);
1023 }
1024
1025 bucket_unlock_irqrestore(bucket, flags);
1026
1027 } while (++offset < nr_regions);
1028
1029 if (hd)
1030 free_hydration(hd);
1031
1032 return offset;
1033}
1034
1035/*
1036 * This function searches for regions that still reside in the source device
1037 * and starts their hydration.
1038 */
1039static void do_hydration(struct clone *clone)
1040{
1041 unsigned int current_volume;
1042 unsigned long offset, nr_regions = clone->nr_regions;
1043
1044 struct batch_info batch = {
1045 .head = NULL,
1046 .nr_batched_regions = 0,
1047 };
1048
1049 if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY))
1050 return;
1051
1052 if (dm_clone_is_hydration_done(clone->cmd))
1053 return;
1054
1055 /*
1056 * Avoid race with device suspension.
1057 */
1058 atomic_inc(&clone->hydrations_in_flight);
1059
1060 /*
1061 * Make sure atomic_inc() is ordered before test_bit(), otherwise we
1062 * might race with clone_postsuspend() and start a region hydration
1063 * after the target has been suspended.
1064 *
1065 * This is paired with the smp_mb__after_atomic() in
1066 * clone_postsuspend().
1067 */
1068 smp_mb__after_atomic();
1069
1070 offset = clone->hydration_offset;
1071 while (likely(!test_bit(DM_CLONE_HYDRATION_SUSPENDED, &clone->flags)) &&
1072 !atomic_read(&clone->ios_in_flight) &&
1073 test_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags) &&
1074 offset < nr_regions) {
1075 current_volume = atomic_read(&clone->hydrations_in_flight);
1076 current_volume += batch.nr_batched_regions;
1077
1078 if (current_volume > READ_ONCE(clone->hydration_threshold))
1079 break;
1080
1081 offset = __start_next_hydration(clone, offset, &batch);
1082 }
1083
1084 if (batch.head)
1085 hydration_copy(batch.head, batch.nr_batched_regions);
1086
1087 if (offset >= nr_regions)
1088 offset = 0;
1089
1090 clone->hydration_offset = offset;
1091
1092 if (atomic_dec_and_test(&clone->hydrations_in_flight))
1093 wakeup_hydration_waiters(clone);
1094}
1095
1096/*---------------------------------------------------------------------------*/
1097
1098static bool need_commit_due_to_time(struct clone *clone)
1099{
1100 return !time_in_range(jiffies, clone->last_commit_jiffies,
1101 clone->last_commit_jiffies + COMMIT_PERIOD);
1102}
1103
1104/*
1105 * A non-zero return indicates read-only or fail mode.
1106 */
1107static int commit_metadata(struct clone *clone)
1108{
1109 int r = 0;
1110
1111 mutex_lock(&clone->commit_lock);
1112
1113 if (!dm_clone_changed_this_transaction(clone->cmd))
1114 goto out;
1115
1116 if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) {
1117 r = -EPERM;
1118 goto out;
1119 }
1120
1121 r = dm_clone_metadata_commit(clone->cmd);
1122
1123 if (unlikely(r)) {
1124 __metadata_operation_failed(clone, "dm_clone_metadata_commit", r);
1125 goto out;
1126 }
1127
1128 if (dm_clone_is_hydration_done(clone->cmd))
1129 dm_table_event(clone->ti->table);
1130out:
1131 mutex_unlock(&clone->commit_lock);
1132
1133 return r;
1134}
1135
1136static void process_deferred_discards(struct clone *clone)
1137{
1138 int r = -EPERM;
1139 struct bio *bio;
1140 struct blk_plug plug;
 1141	unsigned long rs, re;
1142 struct bio_list discards = BIO_EMPTY_LIST;
1143
 1144	spin_lock_irq(&clone->lock);
1145 bio_list_merge(&discards, &clone->deferred_discard_bios);
1146 bio_list_init(&clone->deferred_discard_bios);
 1147	spin_unlock_irq(&clone->lock);
1148
1149 if (bio_list_empty(&discards))
1150 return;
1151
1152 if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY))
1153 goto out;
1154
1155 /* Update the metadata */
1156 bio_list_for_each(bio, &discards) {
1157 bio_region_range(clone, bio, &rs, &re);
1158 /*
1159 * A discard request might cover regions that have been already
1160 * hydrated. There is no need to update the metadata for these
1161 * regions.
1162 */
1163 r = dm_clone_cond_set_range(clone->cmd, rs, re - rs);
1164
1165 if (unlikely(r))
1166 break;
1167 }
1168out:
1169 blk_start_plug(&plug);
1170 while ((bio = bio_list_pop(&discards)))
1171 complete_discard_bio(clone, bio, r == 0);
1172 blk_finish_plug(&plug);
1173}
1174
1175static void process_deferred_bios(struct clone *clone)
1176{
1177 struct bio_list bios = BIO_EMPTY_LIST;
1178
 1179	spin_lock_irq(&clone->lock);
1180 bio_list_merge(&bios, &clone->deferred_bios);
1181 bio_list_init(&clone->deferred_bios);
 1182	spin_unlock_irq(&clone->lock);
1183
1184 if (bio_list_empty(&bios))
1185 return;
1186
1187 submit_bios(&bios);
1188}
1189
1190static void process_deferred_flush_bios(struct clone *clone)
1191{
1192 struct bio *bio;
1193 struct bio_list bios = BIO_EMPTY_LIST;
1194 struct bio_list bio_completions = BIO_EMPTY_LIST;
1195
1196 /*
1197 * If there are any deferred flush bios, we must commit the metadata
1198 * before issuing them or signaling their completion.
1199 */
 1200	spin_lock_irq(&clone->lock);
1201 bio_list_merge(&bios, &clone->deferred_flush_bios);
1202 bio_list_init(&clone->deferred_flush_bios);
1203
1204 bio_list_merge(&bio_completions, &clone->deferred_flush_completions);
1205 bio_list_init(&clone->deferred_flush_completions);
 1206	spin_unlock_irq(&clone->lock);
1207
1208 if (bio_list_empty(&bios) && bio_list_empty(&bio_completions) &&
1209 !(dm_clone_changed_this_transaction(clone->cmd) && need_commit_due_to_time(clone)))
1210 return;
1211
1212 if (commit_metadata(clone)) {
1213 bio_list_merge(&bios, &bio_completions);
1214
1215 while ((bio = bio_list_pop(&bios)))
1216 bio_io_error(bio);
1217
1218 return;
1219 }
1220
1221 clone->last_commit_jiffies = jiffies;
1222
1223 while ((bio = bio_list_pop(&bio_completions)))
1224 bio_endio(bio);
1225
1226 while ((bio = bio_list_pop(&bios)))
1227 generic_make_request(bio);
1228}
1229
1230static void do_worker(struct work_struct *work)
1231{
1232 struct clone *clone = container_of(work, typeof(*clone), worker);
1233
1234 process_deferred_bios(clone);
1235 process_deferred_discards(clone);
1236
1237 /*
1238 * process_deferred_flush_bios():
1239 *
1240 * - Commit metadata
1241 *
1242 * - Process deferred REQ_FUA completions
1243 *
1244 * - Process deferred REQ_PREFLUSH bios
1245 */
1246 process_deferred_flush_bios(clone);
1247
1248 /* Background hydration */
1249 do_hydration(clone);
1250}
1251
1252/*
1253 * Commit periodically so that not too much unwritten data builds up.
1254 *
1255 * Also, restart background hydration, if it has been stopped by in-flight I/O.
1256 */
1257static void do_waker(struct work_struct *work)
1258{
1259 struct clone *clone = container_of(to_delayed_work(work), struct clone, waker);
1260
1261 wake_worker(clone);
1262 queue_delayed_work(clone->wq, &clone->waker, COMMIT_PERIOD);
1263}
1264
1265/*---------------------------------------------------------------------------*/
1266
1267/*
1268 * Target methods
1269 */
1270static int clone_map(struct dm_target *ti, struct bio *bio)
1271{
1272 struct clone *clone = ti->private;
1273 unsigned long region_nr;
1274
1275 atomic_inc(&clone->ios_in_flight);
1276
1277 if (unlikely(get_clone_mode(clone) == CM_FAIL))
1278 return DM_MAPIO_KILL;
1279
1280 /*
1281 * REQ_PREFLUSH bios carry no data:
1282 *
1283 * - Commit metadata, if changed
1284 *
1285 * - Pass down to destination device
1286 */
1287 if (bio->bi_opf & REQ_PREFLUSH) {
1288 remap_and_issue(clone, bio);
1289 return DM_MAPIO_SUBMITTED;
1290 }
1291
1292 bio->bi_iter.bi_sector = dm_target_offset(ti, bio->bi_iter.bi_sector);
1293
1294 /*
1295 * dm-clone interprets discards and performs a fast hydration of the
1296 * discarded regions, i.e., we skip the copy from the source device and
1297 * just mark the regions as hydrated.
1298 */
1299 if (bio_op(bio) == REQ_OP_DISCARD) {
1300 process_discard_bio(clone, bio);
1301 return DM_MAPIO_SUBMITTED;
1302 }
1303
1304 /*
1305 * If the bio's region is hydrated, redirect it to the destination
1306 * device.
1307 *
1308 * If the region is not hydrated and the bio is a READ, redirect it to
1309 * the source device.
1310 *
1311 * Else, defer WRITE bio until after its region has been hydrated and
1312 * start the region's hydration immediately.
1313 */
1314 region_nr = bio_to_region(clone, bio);
1315 if (dm_clone_is_region_hydrated(clone->cmd, region_nr)) {
1316 remap_and_issue(clone, bio);
1317 return DM_MAPIO_SUBMITTED;
1318 } else if (bio_data_dir(bio) == READ) {
1319 remap_to_source(clone, bio);
1320 return DM_MAPIO_REMAPPED;
1321 }
1322
1323 remap_to_dest(clone, bio);
1324 hydrate_bio_region(clone, bio);
1325
1326 return DM_MAPIO_SUBMITTED;
1327}
1328
1329static int clone_endio(struct dm_target *ti, struct bio *bio, blk_status_t *error)
1330{
1331 struct clone *clone = ti->private;
1332
1333 atomic_dec(&clone->ios_in_flight);
1334
1335 return DM_ENDIO_DONE;
1336}
1337
1338static void emit_flags(struct clone *clone, char *result, unsigned int maxlen,
1339 ssize_t *sz_ptr)
1340{
1341 ssize_t sz = *sz_ptr;
1342 unsigned int count;
1343
1344 count = !test_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags);
1345 count += !test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags);
1346
1347 DMEMIT("%u ", count);
1348
1349 if (!test_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags))
1350 DMEMIT("no_hydration ");
1351
1352 if (!test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags))
1353 DMEMIT("no_discard_passdown ");
1354
1355 *sz_ptr = sz;
1356}
1357
1358static void emit_core_args(struct clone *clone, char *result,
1359 unsigned int maxlen, ssize_t *sz_ptr)
1360{
1361 ssize_t sz = *sz_ptr;
1362 unsigned int count = 4;
1363
1364 DMEMIT("%u hydration_threshold %u hydration_batch_size %u ", count,
1365 READ_ONCE(clone->hydration_threshold),
1366 READ_ONCE(clone->hydration_batch_size));
1367
1368 *sz_ptr = sz;
1369}
1370
1371/*
1372 * Status format:
1373 *
1374 * <metadata block size> <#used metadata blocks>/<#total metadata blocks>
1375 * <clone region size> <#hydrated regions>/<#total regions> <#hydrating regions>
1376 * <#features> <features>* <#core args> <core args>* <clone metadata mode>
1377 */
1378static void clone_status(struct dm_target *ti, status_type_t type,
1379 unsigned int status_flags, char *result,
1380 unsigned int maxlen)
1381{
1382 int r;
1383 unsigned int i;
1384 ssize_t sz = 0;
1385 dm_block_t nr_free_metadata_blocks = 0;
1386 dm_block_t nr_metadata_blocks = 0;
1387 char buf[BDEVNAME_SIZE];
1388 struct clone *clone = ti->private;
1389
1390 switch (type) {
1391 case STATUSTYPE_INFO:
1392 if (get_clone_mode(clone) == CM_FAIL) {
1393 DMEMIT("Fail");
1394 break;
1395 }
1396
1397 /* Commit to ensure statistics aren't out-of-date */
1398 if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti))
1399 (void) commit_metadata(clone);
1400
1401 r = dm_clone_get_free_metadata_block_count(clone->cmd, &nr_free_metadata_blocks);
1402
1403 if (r) {
1404 DMERR("%s: dm_clone_get_free_metadata_block_count returned %d",
1405 clone_device_name(clone), r);
1406 goto error;
1407 }
1408
1409 r = dm_clone_get_metadata_dev_size(clone->cmd, &nr_metadata_blocks);
1410
1411 if (r) {
1412 DMERR("%s: dm_clone_get_metadata_dev_size returned %d",
1413 clone_device_name(clone), r);
1414 goto error;
1415 }
1416
1417 DMEMIT("%u %llu/%llu %llu %lu/%lu %u ",
1418 DM_CLONE_METADATA_BLOCK_SIZE,
1419 (unsigned long long)(nr_metadata_blocks - nr_free_metadata_blocks),
1420 (unsigned long long)nr_metadata_blocks,
1421 (unsigned long long)clone->region_size,
1422 dm_clone_nr_of_hydrated_regions(clone->cmd),
1423 clone->nr_regions,
1424 atomic_read(&clone->hydrations_in_flight));
1425
1426 emit_flags(clone, result, maxlen, &sz);
1427 emit_core_args(clone, result, maxlen, &sz);
1428
1429 switch (get_clone_mode(clone)) {
1430 case CM_WRITE:
1431 DMEMIT("rw");
1432 break;
1433 case CM_READ_ONLY:
1434 DMEMIT("ro");
1435 break;
1436 case CM_FAIL:
1437 DMEMIT("Fail");
1438 }
1439
1440 break;
1441
1442 case STATUSTYPE_TABLE:
1443 format_dev_t(buf, clone->metadata_dev->bdev->bd_dev);
1444 DMEMIT("%s ", buf);
1445
1446 format_dev_t(buf, clone->dest_dev->bdev->bd_dev);
1447 DMEMIT("%s ", buf);
1448
1449 format_dev_t(buf, clone->source_dev->bdev->bd_dev);
1450 DMEMIT("%s", buf);
1451
1452 for (i = 0; i < clone->nr_ctr_args; i++)
1453 DMEMIT(" %s", clone->ctr_args[i]);
1454 }
1455
1456 return;
1457
1458error:
1459 DMEMIT("Error");
1460}
1461
1462static int clone_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
1463{
1464 struct request_queue *dest_q, *source_q;
1465 struct clone *clone = container_of(cb, struct clone, callbacks);
1466
1467 source_q = bdev_get_queue(clone->source_dev->bdev);
1468 dest_q = bdev_get_queue(clone->dest_dev->bdev);
1469
1470 return (bdi_congested(dest_q->backing_dev_info, bdi_bits) |
1471 bdi_congested(source_q->backing_dev_info, bdi_bits));
1472}
1473
1474static sector_t get_dev_size(struct dm_dev *dev)
1475{
1476 return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT;
1477}
1478
1479/*---------------------------------------------------------------------------*/
1480
1481/*
1482 * Construct a clone device mapping:
1483 *
1484 * clone <metadata dev> <destination dev> <source dev> <region size>
1485 * [<#feature args> [<feature arg>]* [<#core args> [key value]*]]
1486 *
1487 * metadata dev: Fast device holding the persistent metadata
1488 * destination dev: The destination device, which will become a clone of the
1489 * source device
1490 * source dev: The read-only source device that gets cloned
1491 * region size: dm-clone unit size in sectors
1492 *
1493 * #feature args: Number of feature arguments passed
1494 * feature args: E.g. no_hydration, no_discard_passdown
1495 *
1496 * #core arguments: An even number of core arguments
1497 * core arguments: Key/value pairs for tuning the core
1498 * E.g. 'hydration_threshold 256'
1499 */
1500static int parse_feature_args(struct dm_arg_set *as, struct clone *clone)
1501{
1502 int r;
1503 unsigned int argc;
1504 const char *arg_name;
1505 struct dm_target *ti = clone->ti;
1506
1507 const struct dm_arg args = {
1508 .min = 0,
1509 .max = 2,
1510 .error = "Invalid number of feature arguments"
1511 };
1512
1513 /* No feature arguments supplied */
1514 if (!as->argc)
1515 return 0;
1516
1517 r = dm_read_arg_group(&args, as, &argc, &ti->error);
1518 if (r)
1519 return r;
1520
1521 while (argc) {
1522 arg_name = dm_shift_arg(as);
1523 argc--;
1524
1525 if (!strcasecmp(arg_name, "no_hydration")) {
1526 __clear_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags);
1527 } else if (!strcasecmp(arg_name, "no_discard_passdown")) {
1528 __clear_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags);
1529 } else {
1530 ti->error = "Invalid feature argument";
1531 return -EINVAL;
1532 }
1533 }
1534
1535 return 0;
1536}
1537
1538static int parse_core_args(struct dm_arg_set *as, struct clone *clone)
1539{
1540 int r;
1541 unsigned int argc;
1542 unsigned int value;
1543 const char *arg_name;
1544 struct dm_target *ti = clone->ti;
1545
1546 const struct dm_arg args = {
1547 .min = 0,
1548 .max = 4,
1549 .error = "Invalid number of core arguments"
1550 };
1551
1552 /* Initialize core arguments */
1553 clone->hydration_batch_size = DEFAULT_HYDRATION_BATCH_SIZE;
1554 clone->hydration_threshold = DEFAULT_HYDRATION_THRESHOLD;
1555
1556 /* No core arguments supplied */
1557 if (!as->argc)
1558 return 0;
1559
1560 r = dm_read_arg_group(&args, as, &argc, &ti->error);
1561 if (r)
1562 return r;
1563
1564 if (argc & 1) {
1565 ti->error = "Number of core arguments must be even";
1566 return -EINVAL;
1567 }
1568
1569 while (argc) {
1570 arg_name = dm_shift_arg(as);
1571 argc -= 2;
1572
1573 if (!strcasecmp(arg_name, "hydration_threshold")) {
1574 if (kstrtouint(dm_shift_arg(as), 10, &value)) {
1575 ti->error = "Invalid value for argument `hydration_threshold'";
1576 return -EINVAL;
1577 }
1578 clone->hydration_threshold = value;
1579 } else if (!strcasecmp(arg_name, "hydration_batch_size")) {
1580 if (kstrtouint(dm_shift_arg(as), 10, &value)) {
1581 ti->error = "Invalid value for argument `hydration_batch_size'";
1582 return -EINVAL;
1583 }
1584 clone->hydration_batch_size = value;
1585 } else {
1586 ti->error = "Invalid core argument";
1587 return -EINVAL;
1588 }
1589 }
1590
1591 return 0;
1592}
1593
1594static int parse_region_size(struct clone *clone, struct dm_arg_set *as, char **error)
1595{
1596 int r;
1597 unsigned int region_size;
1598 struct dm_arg arg;
1599
1600 arg.min = MIN_REGION_SIZE;
1601 arg.max = MAX_REGION_SIZE;
1602 arg.error = "Invalid region size";
1603
1604 r = dm_read_arg(&arg, as, &region_size, error);
1605 if (r)
1606 return r;
1607
1608 /* Check region size is a power of 2 */
1609 if (!is_power_of_2(region_size)) {
1610 *error = "Region size is not a power of 2";
1611 return -EINVAL;
1612 }
1613
1614 /* Validate the region size against the device logical block size */
1615 if (region_size % (bdev_logical_block_size(clone->source_dev->bdev) >> 9) ||
1616 region_size % (bdev_logical_block_size(clone->dest_dev->bdev) >> 9)) {
1617 *error = "Region size is not a multiple of device logical block size";
1618 return -EINVAL;
1619 }
1620
1621 clone->region_size = region_size;
1622
1623 return 0;
1624}
1625
1626static int validate_nr_regions(unsigned long n, char **error)
1627{
1628 /*
1629 * dm_bitset restricts us to 2^32 regions. test_bit & co. restrict us
1630 * further to 2^31 regions.
1631 */
1632 if (n > (1UL << 31)) {
1633 *error = "Too many regions. Consider increasing the region size";
1634 return -EINVAL;
1635 }
1636
1637 return 0;
1638}
1639
1640static int parse_metadata_dev(struct clone *clone, struct dm_arg_set *as, char **error)
1641{
1642 int r;
1643 sector_t metadata_dev_size;
1644 char b[BDEVNAME_SIZE];
1645
1646 r = dm_get_device(clone->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
1647 &clone->metadata_dev);
1648 if (r) {
1649 *error = "Error opening metadata device";
1650 return r;
1651 }
1652
1653 metadata_dev_size = get_dev_size(clone->metadata_dev);
1654 if (metadata_dev_size > DM_CLONE_METADATA_MAX_SECTORS_WARNING)
1655 DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
1656 bdevname(clone->metadata_dev->bdev, b), DM_CLONE_METADATA_MAX_SECTORS);
1657
1658 return 0;
1659}
1660
1661static int parse_dest_dev(struct clone *clone, struct dm_arg_set *as, char **error)
1662{
1663 int r;
1664 sector_t dest_dev_size;
1665
1666 r = dm_get_device(clone->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
1667 &clone->dest_dev);
1668 if (r) {
1669 *error = "Error opening destination device";
1670 return r;
1671 }
1672
1673 dest_dev_size = get_dev_size(clone->dest_dev);
1674 if (dest_dev_size < clone->ti->len) {
1675 dm_put_device(clone->ti, clone->dest_dev);
1676 *error = "Device size larger than destination device";
1677 return -EINVAL;
1678 }
1679
1680 return 0;
1681}
1682
1683static int parse_source_dev(struct clone *clone, struct dm_arg_set *as, char **error)
1684{
1685 int r;
1686 sector_t source_dev_size;
1687
1688 r = dm_get_device(clone->ti, dm_shift_arg(as), FMODE_READ,
1689 &clone->source_dev);
1690 if (r) {
1691 *error = "Error opening source device";
1692 return r;
1693 }
1694
1695 source_dev_size = get_dev_size(clone->source_dev);
1696 if (source_dev_size < clone->ti->len) {
1697 dm_put_device(clone->ti, clone->source_dev);
1698 *error = "Device size larger than source device";
1699 return -EINVAL;
1700 }
1701
1702 return 0;
1703}
1704
1705static int copy_ctr_args(struct clone *clone, int argc, const char **argv, char **error)
1706{
1707 unsigned int i;
1708 const char **copy;
1709
1710 copy = kcalloc(argc, sizeof(*copy), GFP_KERNEL);
1711 if (!copy)
1712 goto error;
1713
1714 for (i = 0; i < argc; i++) {
1715 copy[i] = kstrdup(argv[i], GFP_KERNEL);
1716
1717 if (!copy[i]) {
1718 while (i--)
1719 kfree(copy[i]);
1720 kfree(copy);
1721 goto error;
1722 }
1723 }
1724
1725 clone->nr_ctr_args = argc;
1726 clone->ctr_args = copy;
1727 return 0;
1728
1729error:
1730 *error = "Failed to allocate memory for table line";
1731 return -ENOMEM;
1732}
1733
1734static int clone_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1735{
1736 int r;
1737 struct clone *clone;
1738 struct dm_arg_set as;
1739
1740 if (argc < 4) {
1741 ti->error = "Invalid number of arguments";
1742 return -EINVAL;
1743 }
1744
1745 as.argc = argc;
1746 as.argv = argv;
1747
1748 clone = kzalloc(sizeof(*clone), GFP_KERNEL);
1749 if (!clone) {
1750 ti->error = "Failed to allocate clone structure";
1751 return -ENOMEM;
1752 }
1753
1754 clone->ti = ti;
1755
1756 /* Initialize dm-clone flags */
1757 __set_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags);
1758 __set_bit(DM_CLONE_HYDRATION_SUSPENDED, &clone->flags);
1759 __set_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags);
1760
1761 r = parse_metadata_dev(clone, &as, &ti->error);
1762 if (r)
1763 goto out_with_clone;
1764
1765 r = parse_dest_dev(clone, &as, &ti->error);
1766 if (r)
1767 goto out_with_meta_dev;
1768
1769 r = parse_source_dev(clone, &as, &ti->error);
1770 if (r)
1771 goto out_with_dest_dev;
1772
1773 r = parse_region_size(clone, &as, &ti->error);
1774 if (r)
1775 goto out_with_source_dev;
1776
1777 clone->region_shift = __ffs(clone->region_size);
1778 clone->nr_regions = dm_sector_div_up(ti->len, clone->region_size);
1779
1780 r = validate_nr_regions(clone->nr_regions, &ti->error);
1781 if (r)
1782 goto out_with_source_dev;
1783
1784 r = dm_set_target_max_io_len(ti, clone->region_size);
1785 if (r) {
1786 ti->error = "Failed to set max io len";
1787 goto out_with_source_dev;
1788 }
1789
1790 r = parse_feature_args(&as, clone);
1791 if (r)
1792 goto out_with_source_dev;
1793
1794 r = parse_core_args(&as, clone);
1795 if (r)
1796 goto out_with_source_dev;
1797
1798 /* Load metadata */
1799 clone->cmd = dm_clone_metadata_open(clone->metadata_dev->bdev, ti->len,
1800 clone->region_size);
1801 if (IS_ERR(clone->cmd)) {
1802 ti->error = "Failed to load metadata";
1803 r = PTR_ERR(clone->cmd);
1804 goto out_with_source_dev;
1805 }
1806
1807 __set_clone_mode(clone, CM_WRITE);
1808
1809 if (get_clone_mode(clone) != CM_WRITE) {
1810 ti->error = "Unable to get write access to metadata, please check/repair metadata";
1811 r = -EPERM;
1812 goto out_with_metadata;
1813 }
1814
1815 clone->last_commit_jiffies = jiffies;
1816
1817 /* Allocate hydration hash table */
1818 r = hash_table_init(clone);
1819 if (r) {
1820 ti->error = "Failed to allocate hydration hash table";
1821 goto out_with_metadata;
1822 }
1823
1824 atomic_set(&clone->ios_in_flight, 0);
1825 init_waitqueue_head(&clone->hydration_stopped);
1826 spin_lock_init(&clone->lock);
1827 bio_list_init(&clone->deferred_bios);
1828 bio_list_init(&clone->deferred_discard_bios);
1829 bio_list_init(&clone->deferred_flush_bios);
1830 bio_list_init(&clone->deferred_flush_completions);
1831 clone->hydration_offset = 0;
1832 atomic_set(&clone->hydrations_in_flight, 0);
1833
1834 clone->wq = alloc_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM, 0);
1835 if (!clone->wq) {
1836 ti->error = "Failed to allocate workqueue";
1837 r = -ENOMEM;
1838 goto out_with_ht;
1839 }
1840
1841 INIT_WORK(&clone->worker, do_worker);
1842 INIT_DELAYED_WORK(&clone->waker, do_waker);
1843
1844 clone->kcopyd_client = dm_kcopyd_client_create(&dm_kcopyd_throttle);
1845 if (IS_ERR(clone->kcopyd_client)) {
1846 r = PTR_ERR(clone->kcopyd_client);
1847 goto out_with_wq;
1848 }
1849
1850 r = mempool_init_slab_pool(&clone->hydration_pool, MIN_HYDRATIONS,
1851 _hydration_cache);
1852 if (r) {
1853 ti->error = "Failed to create dm_clone_region_hydration memory pool";
1854 goto out_with_kcopyd;
1855 }
1856
1857 /* Save a copy of the table line */
1858 r = copy_ctr_args(clone, argc - 3, (const char **)argv + 3, &ti->error);
1859 if (r)
1860 goto out_with_mempool;
1861
1862 mutex_init(&clone->commit_lock);
1863 clone->callbacks.congested_fn = clone_is_congested;
1864 dm_table_add_target_callbacks(ti->table, &clone->callbacks);
1865
1866 /* Enable flushes */
1867 ti->num_flush_bios = 1;
1868 ti->flush_supported = true;
1869
1870 /* Enable discards */
1871 ti->discards_supported = true;
1872 ti->num_discard_bios = 1;
1873
1874 ti->private = clone;
1875
1876 return 0;
1877
1878out_with_mempool:
1879 mempool_exit(&clone->hydration_pool);
1880out_with_kcopyd:
1881 dm_kcopyd_client_destroy(clone->kcopyd_client);
1882out_with_wq:
1883 destroy_workqueue(clone->wq);
1884out_with_ht:
1885 hash_table_exit(clone);
1886out_with_metadata:
1887 dm_clone_metadata_close(clone->cmd);
1888out_with_source_dev:
1889 dm_put_device(ti, clone->source_dev);
1890out_with_dest_dev:
1891 dm_put_device(ti, clone->dest_dev);
1892out_with_meta_dev:
1893 dm_put_device(ti, clone->metadata_dev);
1894out_with_clone:
1895 kfree(clone);
1896
1897 return r;
1898}
1899
1900static void clone_dtr(struct dm_target *ti)
1901{
1902 unsigned int i;
1903 struct clone *clone = ti->private;
1904
1905 mutex_destroy(&clone->commit_lock);
1906
1907 for (i = 0; i < clone->nr_ctr_args; i++)
1908 kfree(clone->ctr_args[i]);
1909 kfree(clone->ctr_args);
1910
1911 mempool_exit(&clone->hydration_pool);
1912 dm_kcopyd_client_destroy(clone->kcopyd_client);
1913 destroy_workqueue(clone->wq);
1914 hash_table_exit(clone);
1915 dm_clone_metadata_close(clone->cmd);
1916 dm_put_device(ti, clone->source_dev);
1917 dm_put_device(ti, clone->dest_dev);
1918 dm_put_device(ti, clone->metadata_dev);
1919
1920 kfree(clone);
1921}
1922
1923/*---------------------------------------------------------------------------*/
1924
1925static void clone_postsuspend(struct dm_target *ti)
1926{
1927 struct clone *clone = ti->private;
1928
1929 /*
1930 * To successfully suspend the device:
1931 *
1932 * - We cancel the delayed work for periodic commits and wait for
1933 * it to finish.
1934 *
1935 * - We stop the background hydration, i.e. we prevent new region
1936 * hydrations from starting.
1937 *
1938 * - We wait for any in-flight hydrations to finish.
1939 *
1940 * - We flush the workqueue.
1941 *
1942 * - We commit the metadata.
1943 */
1944 cancel_delayed_work_sync(&clone->waker);
1945
1946 set_bit(DM_CLONE_HYDRATION_SUSPENDED, &clone->flags);
1947
1948 /*
1949 * Make sure set_bit() is ordered before atomic_read(), otherwise we
1950 * might race with do_hydration() and miss some started region
1951 * hydrations.
1952 *
1953 * This is paired with smp_mb__after_atomic() in do_hydration().
1954 */
1955 smp_mb__after_atomic();
1956
1957 wait_event(clone->hydration_stopped, !atomic_read(&clone->hydrations_in_flight));
1958 flush_workqueue(clone->wq);
1959
1960 (void) commit_metadata(clone);
1961}
1962
1963static void clone_resume(struct dm_target *ti)
1964{
1965 struct clone *clone = ti->private;
1966
1967 clear_bit(DM_CLONE_HYDRATION_SUSPENDED, &clone->flags);
1968 do_waker(&clone->waker.work);
1969}
1970
1971static bool bdev_supports_discards(struct block_device *bdev)
1972{
1973 struct request_queue *q = bdev_get_queue(bdev);
1974
1975 return (q && blk_queue_discard(q));
1976}
1977
1978/*
1979 * If discard_passdown was enabled verify that the destination device supports
1980 * discards. Disable discard_passdown if not.
1981 */
1982static void disable_passdown_if_not_supported(struct clone *clone)
1983{
1984 struct block_device *dest_dev = clone->dest_dev->bdev;
1985 struct queue_limits *dest_limits = &bdev_get_queue(dest_dev)->limits;
1986 const char *reason = NULL;
1987 char buf[BDEVNAME_SIZE];
1988
1989 if (!test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags))
1990 return;
1991
1992 if (!bdev_supports_discards(dest_dev))
1993 reason = "discard unsupported";
1994 else if (dest_limits->max_discard_sectors < clone->region_size)
1995 reason = "max discard sectors smaller than a region";
1996
1997 if (reason) {
1998 DMWARN("Destination device (%s) %s: Disabling discard passdown.",
1999 bdevname(dest_dev, buf), reason);
2000 clear_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags);
2001 }
2002}
2003
2004static void set_discard_limits(struct clone *clone, struct queue_limits *limits)
2005{
2006 struct block_device *dest_bdev = clone->dest_dev->bdev;
2007 struct queue_limits *dest_limits = &bdev_get_queue(dest_bdev)->limits;
2008
2009 if (!test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags)) {
2010 /* No passdown is done so we set our own virtual limits */
2011 limits->discard_granularity = clone->region_size << SECTOR_SHIFT;
2012 limits->max_discard_sectors = round_down(UINT_MAX >> SECTOR_SHIFT, clone->region_size);
2013 return;
2014 }
2015
2016 /*
2017 * clone_iterate_devices() is stacking both the source and destination
2018 * device limits but discards aren't passed to the source device, so
2019 * inherit destination's limits.
2020 */
2021 limits->max_discard_sectors = dest_limits->max_discard_sectors;
2022 limits->max_hw_discard_sectors = dest_limits->max_hw_discard_sectors;
2023 limits->discard_granularity = dest_limits->discard_granularity;
2024 limits->discard_alignment = dest_limits->discard_alignment;
2025 limits->discard_misaligned = dest_limits->discard_misaligned;
2026 limits->max_discard_segments = dest_limits->max_discard_segments;
2027}
2028
2029static void clone_io_hints(struct dm_target *ti, struct queue_limits *limits)
2030{
2031 struct clone *clone = ti->private;
2032 u64 io_opt_sectors = limits->io_opt >> SECTOR_SHIFT;
2033
2034 /*
2035 * If the system-determined stacked limits are compatible with
2036 * dm-clone's region size (io_opt is a factor) do not override them.
2037 */
2038 if (io_opt_sectors < clone->region_size ||
2039 do_div(io_opt_sectors, clone->region_size)) {
2040 blk_limits_io_min(limits, clone->region_size << SECTOR_SHIFT);
2041 blk_limits_io_opt(limits, clone->region_size << SECTOR_SHIFT);
2042 }
2043
2044 disable_passdown_if_not_supported(clone);
2045 set_discard_limits(clone, limits);
2046}
2047
2048static int clone_iterate_devices(struct dm_target *ti,
2049 iterate_devices_callout_fn fn, void *data)
2050{
2051 int ret;
2052 struct clone *clone = ti->private;
2053 struct dm_dev *dest_dev = clone->dest_dev;
2054 struct dm_dev *source_dev = clone->source_dev;
2055
2056 ret = fn(ti, source_dev, 0, ti->len, data);
2057 if (!ret)
2058 ret = fn(ti, dest_dev, 0, ti->len, data);
2059 return ret;
2060}
2061
2062/*
2063 * dm-clone message functions.
2064 */
2065static void set_hydration_threshold(struct clone *clone, unsigned int nr_regions)
2066{
2067 WRITE_ONCE(clone->hydration_threshold, nr_regions);
2068
2069 /*
2070 * If user space sets hydration_threshold to zero then the hydration
2071 * will stop. If at a later time the hydration_threshold is increased
2072 * we must restart the hydration process by waking up the worker.
2073 */
2074 wake_worker(clone);
2075}
2076
2077static void set_hydration_batch_size(struct clone *clone, unsigned int nr_regions)
2078{
2079 WRITE_ONCE(clone->hydration_batch_size, nr_regions);
2080}
2081
2082static void enable_hydration(struct clone *clone)
2083{
2084 if (!test_and_set_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags))
2085 wake_worker(clone);
2086}
2087
2088static void disable_hydration(struct clone *clone)
2089{
2090 clear_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags);
2091}
2092
2093static int clone_message(struct dm_target *ti, unsigned int argc, char **argv,
2094 char *result, unsigned int maxlen)
2095{
2096 struct clone *clone = ti->private;
2097 unsigned int value;
2098
2099 if (!argc)
2100 return -EINVAL;
2101
2102 if (!strcasecmp(argv[0], "enable_hydration")) {
2103 enable_hydration(clone);
2104 return 0;
2105 }
2106
2107 if (!strcasecmp(argv[0], "disable_hydration")) {
2108 disable_hydration(clone);
2109 return 0;
2110 }
2111
2112 if (argc != 2)
2113 return -EINVAL;
2114
2115 if (!strcasecmp(argv[0], "hydration_threshold")) {
2116 if (kstrtouint(argv[1], 10, &value))
2117 return -EINVAL;
2118
2119 set_hydration_threshold(clone, value);
2120
2121 return 0;
2122 }
2123
2124 if (!strcasecmp(argv[0], "hydration_batch_size")) {
2125 if (kstrtouint(argv[1], 10, &value))
2126 return -EINVAL;
2127
2128 set_hydration_batch_size(clone, value);
2129
2130 return 0;
2131 }
2132
2133 DMERR("%s: Unsupported message `%s'", clone_device_name(clone), argv[0]);
2134 return -EINVAL;
2135}
2136
2137static struct target_type clone_target = {
2138 .name = "clone",
2139 .version = {1, 0, 0},
2140 .module = THIS_MODULE,
2141 .ctr = clone_ctr,
2142 .dtr = clone_dtr,
2143 .map = clone_map,
2144 .end_io = clone_endio,
2145 .postsuspend = clone_postsuspend,
2146 .resume = clone_resume,
2147 .status = clone_status,
2148 .message = clone_message,
2149 .io_hints = clone_io_hints,
2150 .iterate_devices = clone_iterate_devices,
2151};
2152
2153/*---------------------------------------------------------------------------*/
2154
2155/* Module functions */
2156static int __init dm_clone_init(void)
2157{
2158 int r;
2159
2160 _hydration_cache = KMEM_CACHE(dm_clone_region_hydration, 0);
2161 if (!_hydration_cache)
2162 return -ENOMEM;
2163
2164 r = dm_register_target(&clone_target);
2165 if (r < 0) {
2166 DMERR("Failed to register clone target");
2167 return r;
2168 }
2169
2170 return 0;
2171}
2172
2173static void __exit dm_clone_exit(void)
2174{
2175 dm_unregister_target(&clone_target);
2176
2177 kmem_cache_destroy(_hydration_cache);
2178 _hydration_cache = NULL;
2179}
2180
2181/* Module hooks */
2182module_init(dm_clone_init);
2183module_exit(dm_clone_exit);
2184
2185MODULE_DESCRIPTION(DM_NAME " clone target");
2186MODULE_AUTHOR("Nikos Tsironis <ntsironis@arrikto.com>");
2187MODULE_LICENSE("GPL");