Commit | Line | Data |
---|---|---|
7431b783 NT |
1 | // SPDX-License-Identifier: GPL-2.0-only |
2 | /* | |
3 | * Copyright (C) 2019 Arrikto, Inc. All Rights Reserved. | |
4 | */ | |
5 | ||
6 | #include <linux/mm.h> | |
7 | #include <linux/bio.h> | |
8 | #include <linux/err.h> | |
9 | #include <linux/hash.h> | |
10 | #include <linux/list.h> | |
11 | #include <linux/log2.h> | |
12 | #include <linux/init.h> | |
13 | #include <linux/slab.h> | |
14 | #include <linux/wait.h> | |
15 | #include <linux/dm-io.h> | |
16 | #include <linux/mutex.h> | |
17 | #include <linux/atomic.h> | |
18 | #include <linux/bitops.h> | |
19 | #include <linux/blkdev.h> | |
20 | #include <linux/kdev_t.h> | |
21 | #include <linux/kernel.h> | |
22 | #include <linux/module.h> | |
23 | #include <linux/jiffies.h> | |
24 | #include <linux/mempool.h> | |
25 | #include <linux/spinlock.h> | |
26 | #include <linux/blk_types.h> | |
27 | #include <linux/dm-kcopyd.h> | |
28 | #include <linux/workqueue.h> | |
29 | #include <linux/backing-dev.h> | |
30 | #include <linux/device-mapper.h> | |
31 | ||
32 | #include "dm.h" | |
33 | #include "dm-clone-metadata.h" | |
34 | ||
35 | #define DM_MSG_PREFIX "clone" | |
36 | ||
37 | /* | |
38 | * Minimum and maximum allowed region sizes | |
39 | */ | |
40 | #define MIN_REGION_SIZE (1 << 3) /* 4KB */ | |
41 | #define MAX_REGION_SIZE (1 << 21) /* 1GB */ | |
42 | ||
43 | #define MIN_HYDRATIONS 256 /* Size of hydration mempool */ | |
44 | #define DEFAULT_HYDRATION_THRESHOLD 1 /* 1 region */ | |
45 | #define DEFAULT_HYDRATION_BATCH_SIZE 1 /* Hydrate in batches of 1 region */ | |
46 | ||
47 | #define COMMIT_PERIOD HZ /* 1 sec */ | |
48 | ||
49 | /* | |
50 | * Hydration hash table size: 1 << HASH_TABLE_BITS | |
51 | */ | |
52 | #define HASH_TABLE_BITS 15 | |
53 | ||
54 | DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(clone_hydration_throttle, | |
55 | "A percentage of time allocated for hydrating regions"); | |
56 | ||
57 | /* Slab cache for struct dm_clone_region_hydration */ | |
58 | static struct kmem_cache *_hydration_cache; | |
59 | ||
/*
 * dm-clone metadata modes.
 *
 * The numeric order matters: callers test for "read-only or worse" with
 * `mode >= CM_READ_ONLY`, and __set_clone_mode() indexes its description
 * table with the mode value.
 */
enum clone_metadata_mode {
	CM_WRITE = 0,		/* metadata may be changed */
	CM_READ_ONLY = 1,	/* metadata may not be changed */
	CM_FAIL = 2,		/* all metadata I/O fails */
};
66 | ||
67 | struct hash_table_bucket; | |
68 | ||
69 | struct clone { | |
70 | struct dm_target *ti; | |
7431b783 NT |
71 | |
72 | struct dm_dev *metadata_dev; | |
73 | struct dm_dev *dest_dev; | |
74 | struct dm_dev *source_dev; | |
75 | ||
76 | unsigned long nr_regions; | |
77 | sector_t region_size; | |
78 | unsigned int region_shift; | |
79 | ||
80 | /* | |
81 | * A metadata commit and the actions taken in case it fails should run | |
82 | * as a single atomic step. | |
83 | */ | |
84 | struct mutex commit_lock; | |
85 | ||
86 | struct dm_clone_metadata *cmd; | |
87 | ||
88 | /* Region hydration hash table */ | |
89 | struct hash_table_bucket *ht; | |
90 | ||
91 | atomic_t ios_in_flight; | |
92 | ||
93 | wait_queue_head_t hydration_stopped; | |
94 | ||
95 | mempool_t hydration_pool; | |
96 | ||
97 | unsigned long last_commit_jiffies; | |
98 | ||
99 | /* | |
100 | * We defer incoming WRITE bios for regions that are not hydrated, | |
101 | * until after these regions have been hydrated. | |
102 | * | |
103 | * Also, we defer REQ_FUA and REQ_PREFLUSH bios, until after the | |
104 | * metadata have been committed. | |
105 | */ | |
106 | spinlock_t lock; | |
107 | struct bio_list deferred_bios; | |
108 | struct bio_list deferred_discard_bios; | |
109 | struct bio_list deferred_flush_bios; | |
110 | struct bio_list deferred_flush_completions; | |
111 | ||
112 | /* Maximum number of regions being copied during background hydration. */ | |
113 | unsigned int hydration_threshold; | |
114 | ||
115 | /* Number of regions to batch together during background hydration. */ | |
116 | unsigned int hydration_batch_size; | |
117 | ||
118 | /* Which region to hydrate next */ | |
119 | unsigned long hydration_offset; | |
120 | ||
121 | atomic_t hydrations_in_flight; | |
122 | ||
123 | /* | |
124 | * Save a copy of the table line rather than reconstructing it for the | |
125 | * status. | |
126 | */ | |
127 | unsigned int nr_ctr_args; | |
128 | const char **ctr_args; | |
129 | ||
130 | struct workqueue_struct *wq; | |
131 | struct work_struct worker; | |
132 | struct delayed_work waker; | |
133 | ||
134 | struct dm_kcopyd_client *kcopyd_client; | |
135 | ||
136 | enum clone_metadata_mode mode; | |
137 | unsigned long flags; | |
138 | }; | |
139 | ||
/*
 * Bit numbers used with the bitops on clone->flags.
 */
/* Discards are passed down to the destination device */
#define DM_CLONE_DISCARD_PASSDOWN 0
/* Background hydration is enabled */
#define DM_CLONE_HYDRATION_ENABLED 1
/* NOTE(review): presumably set while hydration is suspended; not used in this part of the file */
#define DM_CLONE_HYDRATION_SUSPENDED 2
146 | ||
147 | /*---------------------------------------------------------------------------*/ | |
148 | ||
149 | /* | |
150 | * Metadata failure handling. | |
151 | */ | |
152 | static enum clone_metadata_mode get_clone_mode(struct clone *clone) | |
153 | { | |
154 | return READ_ONCE(clone->mode); | |
155 | } | |
156 | ||
157 | static const char *clone_device_name(struct clone *clone) | |
158 | { | |
159 | return dm_table_device_name(clone->ti->table); | |
160 | } | |
161 | ||
162 | static void __set_clone_mode(struct clone *clone, enum clone_metadata_mode new_mode) | |
163 | { | |
ec132ef2 | 164 | static const char * const descs[] = { |
7431b783 NT |
165 | "read-write", |
166 | "read-only", | |
167 | "fail" | |
168 | }; | |
169 | ||
170 | enum clone_metadata_mode old_mode = get_clone_mode(clone); | |
171 | ||
172 | /* Never move out of fail mode */ | |
173 | if (old_mode == CM_FAIL) | |
174 | new_mode = CM_FAIL; | |
175 | ||
176 | switch (new_mode) { | |
177 | case CM_FAIL: | |
178 | case CM_READ_ONLY: | |
179 | dm_clone_metadata_set_read_only(clone->cmd); | |
180 | break; | |
181 | ||
182 | case CM_WRITE: | |
183 | dm_clone_metadata_set_read_write(clone->cmd); | |
184 | break; | |
185 | } | |
186 | ||
187 | WRITE_ONCE(clone->mode, new_mode); | |
188 | ||
189 | if (new_mode != old_mode) { | |
190 | dm_table_event(clone->ti->table); | |
191 | DMINFO("%s: Switching to %s mode", clone_device_name(clone), | |
192 | descs[(int)new_mode]); | |
193 | } | |
194 | } | |
195 | ||
196 | static void __abort_transaction(struct clone *clone) | |
197 | { | |
198 | const char *dev_name = clone_device_name(clone); | |
199 | ||
200 | if (get_clone_mode(clone) >= CM_READ_ONLY) | |
201 | return; | |
202 | ||
203 | DMERR("%s: Aborting current metadata transaction", dev_name); | |
204 | if (dm_clone_metadata_abort(clone->cmd)) { | |
205 | DMERR("%s: Failed to abort metadata transaction", dev_name); | |
206 | __set_clone_mode(clone, CM_FAIL); | |
207 | } | |
208 | } | |
209 | ||
210 | static void __reload_in_core_bitset(struct clone *clone) | |
211 | { | |
212 | const char *dev_name = clone_device_name(clone); | |
213 | ||
214 | if (get_clone_mode(clone) == CM_FAIL) | |
215 | return; | |
216 | ||
217 | /* Reload the on-disk bitset */ | |
218 | DMINFO("%s: Reloading on-disk bitmap", dev_name); | |
219 | if (dm_clone_reload_in_core_bitset(clone->cmd)) { | |
220 | DMERR("%s: Failed to reload on-disk bitmap", dev_name); | |
221 | __set_clone_mode(clone, CM_FAIL); | |
222 | } | |
223 | } | |
224 | ||
225 | static void __metadata_operation_failed(struct clone *clone, const char *op, int r) | |
226 | { | |
227 | DMERR("%s: Metadata operation `%s' failed: error = %d", | |
228 | clone_device_name(clone), op, r); | |
229 | ||
230 | __abort_transaction(clone); | |
231 | __set_clone_mode(clone, CM_READ_ONLY); | |
232 | ||
233 | /* | |
234 | * dm_clone_reload_in_core_bitset() may run concurrently with either | |
235 | * dm_clone_set_region_hydrated() or dm_clone_cond_set_range(), but | |
236 | * it's safe as we have already set the metadata to read-only mode. | |
237 | */ | |
238 | __reload_in_core_bitset(clone); | |
239 | } | |
240 | ||
241 | /*---------------------------------------------------------------------------*/ | |
242 | ||
243 | /* Wake up anyone waiting for region hydrations to stop */ | |
244 | static inline void wakeup_hydration_waiters(struct clone *clone) | |
245 | { | |
246 | wake_up_all(&clone->hydration_stopped); | |
247 | } | |
248 | ||
249 | static inline void wake_worker(struct clone *clone) | |
250 | { | |
251 | queue_work(clone->wq, &clone->worker); | |
252 | } | |
253 | ||
254 | /*---------------------------------------------------------------------------*/ | |
255 | ||
256 | /* | |
257 | * bio helper functions. | |
258 | */ | |
259 | static inline void remap_to_source(struct clone *clone, struct bio *bio) | |
260 | { | |
261 | bio_set_dev(bio, clone->source_dev->bdev); | |
262 | } | |
263 | ||
264 | static inline void remap_to_dest(struct clone *clone, struct bio *bio) | |
265 | { | |
266 | bio_set_dev(bio, clone->dest_dev->bdev); | |
267 | } | |
268 | ||
269 | static bool bio_triggers_commit(struct clone *clone, struct bio *bio) | |
270 | { | |
271 | return op_is_flush(bio->bi_opf) && | |
272 | dm_clone_changed_this_transaction(clone->cmd); | |
273 | } | |
274 | ||
275 | /* Get the address of the region in sectors */ | |
276 | static inline sector_t region_to_sector(struct clone *clone, unsigned long region_nr) | |
277 | { | |
9fc06ff5 | 278 | return ((sector_t)region_nr << clone->region_shift); |
7431b783 NT |
279 | } |
280 | ||
281 | /* Get the region number of the bio */ | |
282 | static inline unsigned long bio_to_region(struct clone *clone, struct bio *bio) | |
283 | { | |
284 | return (bio->bi_iter.bi_sector >> clone->region_shift); | |
285 | } | |
286 | ||
287 | /* Get the region range covered by the bio */ | |
288 | static void bio_region_range(struct clone *clone, struct bio *bio, | |
4b514290 | 289 | unsigned long *rs, unsigned long *nr_regions) |
7431b783 | 290 | { |
4b514290 NT |
291 | unsigned long end; |
292 | ||
7431b783 | 293 | *rs = dm_sector_div_up(bio->bi_iter.bi_sector, clone->region_size); |
4b514290 NT |
294 | end = bio_end_sector(bio) >> clone->region_shift; |
295 | ||
296 | if (*rs >= end) | |
297 | *nr_regions = 0; | |
298 | else | |
299 | *nr_regions = end - *rs; | |
7431b783 NT |
300 | } |
301 | ||
302 | /* Check whether a bio overwrites a region */ | |
303 | static inline bool is_overwrite_bio(struct clone *clone, struct bio *bio) | |
304 | { | |
305 | return (bio_data_dir(bio) == WRITE && bio_sectors(bio) == clone->region_size); | |
306 | } | |
307 | ||
308 | static void fail_bios(struct bio_list *bios, blk_status_t status) | |
309 | { | |
310 | struct bio *bio; | |
311 | ||
312 | while ((bio = bio_list_pop(bios))) { | |
313 | bio->bi_status = status; | |
314 | bio_endio(bio); | |
315 | } | |
316 | } | |
317 | ||
318 | static void submit_bios(struct bio_list *bios) | |
319 | { | |
320 | struct bio *bio; | |
321 | struct blk_plug plug; | |
322 | ||
323 | blk_start_plug(&plug); | |
324 | ||
325 | while ((bio = bio_list_pop(bios))) | |
ed00aabd | 326 | submit_bio_noacct(bio); |
7431b783 NT |
327 | |
328 | blk_finish_plug(&plug); | |
329 | } | |
330 | ||
331 | /* | |
332 | * Submit bio to the underlying device. | |
333 | * | |
334 | * If the bio triggers a commit, delay it, until after the metadata have been | |
335 | * committed. | |
336 | * | |
337 | * NOTE: The bio remapping must be performed by the caller. | |
338 | */ | |
339 | static void issue_bio(struct clone *clone, struct bio *bio) | |
340 | { | |
7431b783 | 341 | if (!bio_triggers_commit(clone, bio)) { |
ed00aabd | 342 | submit_bio_noacct(bio); |
7431b783 NT |
343 | return; |
344 | } | |
345 | ||
346 | /* | |
347 | * If the metadata mode is RO or FAIL we won't be able to commit the | |
348 | * metadata, so we complete the bio with an error. | |
349 | */ | |
350 | if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) { | |
351 | bio_io_error(bio); | |
352 | return; | |
353 | } | |
354 | ||
355 | /* | |
356 | * Batch together any bios that trigger commits and then issue a single | |
357 | * commit for them in process_deferred_flush_bios(). | |
358 | */ | |
6ca43ed8 | 359 | spin_lock_irq(&clone->lock); |
7431b783 | 360 | bio_list_add(&clone->deferred_flush_bios, bio); |
6ca43ed8 | 361 | spin_unlock_irq(&clone->lock); |
7431b783 NT |
362 | |
363 | wake_worker(clone); | |
364 | } | |
365 | ||
366 | /* | |
367 | * Remap bio to the destination device and submit it. | |
368 | * | |
369 | * If the bio triggers a commit, delay it, until after the metadata have been | |
370 | * committed. | |
371 | */ | |
372 | static void remap_and_issue(struct clone *clone, struct bio *bio) | |
373 | { | |
374 | remap_to_dest(clone, bio); | |
375 | issue_bio(clone, bio); | |
376 | } | |
377 | ||
378 | /* | |
379 | * Issue bios that have been deferred until after their region has finished | |
380 | * hydrating. | |
381 | * | |
382 | * We delegate the bio submission to the worker thread, so this is safe to call | |
383 | * from interrupt context. | |
384 | */ | |
385 | static void issue_deferred_bios(struct clone *clone, struct bio_list *bios) | |
386 | { | |
387 | struct bio *bio; | |
388 | unsigned long flags; | |
389 | struct bio_list flush_bios = BIO_EMPTY_LIST; | |
390 | struct bio_list normal_bios = BIO_EMPTY_LIST; | |
391 | ||
392 | if (bio_list_empty(bios)) | |
393 | return; | |
394 | ||
395 | while ((bio = bio_list_pop(bios))) { | |
396 | if (bio_triggers_commit(clone, bio)) | |
397 | bio_list_add(&flush_bios, bio); | |
398 | else | |
399 | bio_list_add(&normal_bios, bio); | |
400 | } | |
401 | ||
402 | spin_lock_irqsave(&clone->lock, flags); | |
403 | bio_list_merge(&clone->deferred_bios, &normal_bios); | |
404 | bio_list_merge(&clone->deferred_flush_bios, &flush_bios); | |
405 | spin_unlock_irqrestore(&clone->lock, flags); | |
406 | ||
407 | wake_worker(clone); | |
408 | } | |
409 | ||
410 | static void complete_overwrite_bio(struct clone *clone, struct bio *bio) | |
411 | { | |
412 | unsigned long flags; | |
413 | ||
414 | /* | |
415 | * If the bio has the REQ_FUA flag set we must commit the metadata | |
416 | * before signaling its completion. | |
417 | * | |
418 | * complete_overwrite_bio() is only called by hydration_complete(), | |
419 | * after having successfully updated the metadata. This means we don't | |
420 | * need to call dm_clone_changed_this_transaction() to check if the | |
421 | * metadata has changed and thus we can avoid taking the metadata spin | |
422 | * lock. | |
423 | */ | |
424 | if (!(bio->bi_opf & REQ_FUA)) { | |
425 | bio_endio(bio); | |
426 | return; | |
427 | } | |
428 | ||
429 | /* | |
430 | * If the metadata mode is RO or FAIL we won't be able to commit the | |
431 | * metadata, so we complete the bio with an error. | |
432 | */ | |
433 | if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) { | |
434 | bio_io_error(bio); | |
435 | return; | |
436 | } | |
437 | ||
438 | /* | |
439 | * Batch together any bios that trigger commits and then issue a single | |
440 | * commit for them in process_deferred_flush_bios(). | |
441 | */ | |
442 | spin_lock_irqsave(&clone->lock, flags); | |
443 | bio_list_add(&clone->deferred_flush_completions, bio); | |
444 | spin_unlock_irqrestore(&clone->lock, flags); | |
445 | ||
446 | wake_worker(clone); | |
447 | } | |
448 | ||
449 | static void trim_bio(struct bio *bio, sector_t sector, unsigned int len) | |
450 | { | |
451 | bio->bi_iter.bi_sector = sector; | |
452 | bio->bi_iter.bi_size = to_bytes(len); | |
453 | } | |
454 | ||
455 | static void complete_discard_bio(struct clone *clone, struct bio *bio, bool success) | |
456 | { | |
4b514290 | 457 | unsigned long rs, nr_regions; |
7431b783 NT |
458 | |
459 | /* | |
460 | * If the destination device supports discards, remap and trim the | |
461 | * discard bio and pass it down. Otherwise complete the bio | |
462 | * immediately. | |
463 | */ | |
464 | if (test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags) && success) { | |
465 | remap_to_dest(clone, bio); | |
4b514290 | 466 | bio_region_range(clone, bio, &rs, &nr_regions); |
9fc06ff5 | 467 | trim_bio(bio, region_to_sector(clone, rs), |
4b514290 | 468 | nr_regions << clone->region_shift); |
ed00aabd | 469 | submit_bio_noacct(bio); |
7431b783 NT |
470 | } else |
471 | bio_endio(bio); | |
472 | } | |
473 | ||
474 | static void process_discard_bio(struct clone *clone, struct bio *bio) | |
475 | { | |
4b514290 | 476 | unsigned long rs, nr_regions; |
7431b783 | 477 | |
4b514290 NT |
478 | bio_region_range(clone, bio, &rs, &nr_regions); |
479 | if (!nr_regions) { | |
480 | bio_endio(bio); | |
481 | return; | |
482 | } | |
7431b783 | 483 | |
4b514290 NT |
484 | if (WARN_ON(rs >= clone->nr_regions || (rs + nr_regions) < rs || |
485 | (rs + nr_regions) > clone->nr_regions)) { | |
486 | DMERR("%s: Invalid range (%lu + %lu, total regions %lu) for discard (%llu + %u)", | |
487 | clone_device_name(clone), rs, nr_regions, | |
488 | clone->nr_regions, | |
489 | (unsigned long long)bio->bi_iter.bi_sector, | |
490 | bio_sectors(bio)); | |
7431b783 NT |
491 | bio_endio(bio); |
492 | return; | |
493 | } | |
494 | ||
495 | /* | |
496 | * The covered regions are already hydrated so we just need to pass | |
497 | * down the discard. | |
498 | */ | |
4b514290 | 499 | if (dm_clone_is_range_hydrated(clone->cmd, rs, nr_regions)) { |
7431b783 NT |
500 | complete_discard_bio(clone, bio, true); |
501 | return; | |
502 | } | |
503 | ||
504 | /* | |
505 | * If the metadata mode is RO or FAIL we won't be able to update the | |
506 | * metadata for the regions covered by the discard so we just ignore | |
507 | * it. | |
508 | */ | |
509 | if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) { | |
510 | bio_endio(bio); | |
511 | return; | |
512 | } | |
513 | ||
514 | /* | |
515 | * Defer discard processing. | |
516 | */ | |
6ca43ed8 | 517 | spin_lock_irq(&clone->lock); |
7431b783 | 518 | bio_list_add(&clone->deferred_discard_bios, bio); |
6ca43ed8 | 519 | spin_unlock_irq(&clone->lock); |
7431b783 NT |
520 | |
521 | wake_worker(clone); | |
522 | } | |
523 | ||
524 | /*---------------------------------------------------------------------------*/ | |
525 | ||
526 | /* | |
527 | * dm-clone region hydrations. | |
528 | */ | |
529 | struct dm_clone_region_hydration { | |
530 | struct clone *clone; | |
531 | unsigned long region_nr; | |
532 | ||
533 | struct bio *overwrite_bio; | |
534 | bio_end_io_t *overwrite_bio_end_io; | |
535 | ||
536 | struct bio_list deferred_bios; | |
537 | ||
538 | blk_status_t status; | |
539 | ||
540 | /* Used by hydration batching */ | |
541 | struct list_head list; | |
542 | ||
543 | /* Used by hydration hash table */ | |
544 | struct hlist_node h; | |
545 | }; | |
546 | ||
547 | /* | |
548 | * Hydration hash table implementation. | |
549 | * | |
550 | * Ideally we would like to use list_bl, which uses bit spin locks and employs | |
551 | * the least significant bit of the list head to lock the corresponding bucket, | |
552 | * reducing the memory overhead for the locks. But, currently, list_bl and bit | |
553 | * spin locks don't support IRQ safe versions. Since we have to take the lock | |
554 | * in both process and interrupt context, we must fall back to using regular | |
555 | * spin locks; one per hash table bucket. | |
556 | */ | |
557 | struct hash_table_bucket { | |
558 | struct hlist_head head; | |
559 | ||
560 | /* Spinlock protecting the bucket */ | |
561 | spinlock_t lock; | |
562 | }; | |
563 | ||
/* Lock/unlock a bucket, saving and restoring the IRQ state in @flags */
#define bucket_lock_irqsave(bucket, flags) \
	spin_lock_irqsave(&(bucket)->lock, flags)

#define bucket_unlock_irqrestore(bucket, flags) \
	spin_unlock_irqrestore(&(bucket)->lock, flags)

/* Lock/unlock a bucket with the spin_lock_irq()/spin_unlock_irq() variants */
#define bucket_lock_irq(bucket) \
	spin_lock_irq(&(bucket)->lock)

#define bucket_unlock_irq(bucket) \
	spin_unlock_irq(&(bucket)->lock)
575 | ||
7431b783 NT |
576 | static int hash_table_init(struct clone *clone) |
577 | { | |
578 | unsigned int i, sz; | |
579 | struct hash_table_bucket *bucket; | |
580 | ||
581 | sz = 1 << HASH_TABLE_BITS; | |
582 | ||
583 | clone->ht = kvmalloc(sz * sizeof(struct hash_table_bucket), GFP_KERNEL); | |
584 | if (!clone->ht) | |
585 | return -ENOMEM; | |
586 | ||
587 | for (i = 0; i < sz; i++) { | |
588 | bucket = clone->ht + i; | |
589 | ||
590 | INIT_HLIST_HEAD(&bucket->head); | |
591 | spin_lock_init(&bucket->lock); | |
592 | } | |
593 | ||
594 | return 0; | |
595 | } | |
596 | ||
597 | static void hash_table_exit(struct clone *clone) | |
598 | { | |
599 | kvfree(clone->ht); | |
600 | } | |
601 | ||
602 | static struct hash_table_bucket *get_hash_table_bucket(struct clone *clone, | |
603 | unsigned long region_nr) | |
604 | { | |
605 | return &clone->ht[hash_long(region_nr, HASH_TABLE_BITS)]; | |
606 | } | |
607 | ||
608 | /* | |
609 | * Search hash table for a hydration with hd->region_nr == region_nr | |
610 | * | |
611 | * NOTE: Must be called with the bucket lock held | |
612 | */ | |
0a005856 Y |
613 | static struct dm_clone_region_hydration *__hash_find(struct hash_table_bucket *bucket, |
614 | unsigned long region_nr) | |
7431b783 NT |
615 | { |
616 | struct dm_clone_region_hydration *hd; | |
617 | ||
618 | hlist_for_each_entry(hd, &bucket->head, h) { | |
619 | if (hd->region_nr == region_nr) | |
620 | return hd; | |
621 | } | |
622 | ||
623 | return NULL; | |
624 | } | |
625 | ||
626 | /* | |
627 | * Insert a hydration into the hash table. | |
628 | * | |
629 | * NOTE: Must be called with the bucket lock held. | |
630 | */ | |
631 | static inline void __insert_region_hydration(struct hash_table_bucket *bucket, | |
632 | struct dm_clone_region_hydration *hd) | |
633 | { | |
634 | hlist_add_head(&hd->h, &bucket->head); | |
635 | } | |
636 | ||
637 | /* | |
638 | * This function inserts a hydration into the hash table, unless someone else | |
639 | * managed to insert a hydration for the same region first. In the latter case | |
640 | * it returns the existing hydration descriptor for this region. | |
641 | * | |
642 | * NOTE: Must be called with the hydration hash table lock held. | |
643 | */ | |
644 | static struct dm_clone_region_hydration * | |
645 | __find_or_insert_region_hydration(struct hash_table_bucket *bucket, | |
646 | struct dm_clone_region_hydration *hd) | |
647 | { | |
648 | struct dm_clone_region_hydration *hd2; | |
649 | ||
650 | hd2 = __hash_find(bucket, hd->region_nr); | |
651 | if (hd2) | |
652 | return hd2; | |
653 | ||
654 | __insert_region_hydration(bucket, hd); | |
655 | ||
656 | return hd; | |
657 | } | |
658 | ||
659 | /*---------------------------------------------------------------------------*/ | |
660 | ||
661 | /* Allocate a hydration */ | |
662 | static struct dm_clone_region_hydration *alloc_hydration(struct clone *clone) | |
663 | { | |
664 | struct dm_clone_region_hydration *hd; | |
665 | ||
666 | /* | |
667 | * Allocate a hydration from the hydration mempool. | |
668 | * This might block but it can't fail. | |
669 | */ | |
670 | hd = mempool_alloc(&clone->hydration_pool, GFP_NOIO); | |
671 | hd->clone = clone; | |
672 | ||
673 | return hd; | |
674 | } | |
675 | ||
676 | static inline void free_hydration(struct dm_clone_region_hydration *hd) | |
677 | { | |
678 | mempool_free(hd, &hd->clone->hydration_pool); | |
679 | } | |
680 | ||
681 | /* Initialize a hydration */ | |
682 | static void hydration_init(struct dm_clone_region_hydration *hd, unsigned long region_nr) | |
683 | { | |
684 | hd->region_nr = region_nr; | |
685 | hd->overwrite_bio = NULL; | |
686 | bio_list_init(&hd->deferred_bios); | |
687 | hd->status = 0; | |
688 | ||
689 | INIT_LIST_HEAD(&hd->list); | |
690 | INIT_HLIST_NODE(&hd->h); | |
691 | } | |
692 | ||
693 | /*---------------------------------------------------------------------------*/ | |
694 | ||
695 | /* | |
696 | * Update dm-clone's metadata after a region has finished hydrating and remove | |
697 | * hydration from the hash table. | |
698 | */ | |
699 | static int hydration_update_metadata(struct dm_clone_region_hydration *hd) | |
700 | { | |
701 | int r = 0; | |
702 | unsigned long flags; | |
703 | struct hash_table_bucket *bucket; | |
704 | struct clone *clone = hd->clone; | |
705 | ||
706 | if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) | |
707 | r = -EPERM; | |
708 | ||
709 | /* Update the metadata */ | |
710 | if (likely(!r) && hd->status == BLK_STS_OK) | |
711 | r = dm_clone_set_region_hydrated(clone->cmd, hd->region_nr); | |
712 | ||
713 | bucket = get_hash_table_bucket(clone, hd->region_nr); | |
714 | ||
715 | /* Remove hydration from hash table */ | |
716 | bucket_lock_irqsave(bucket, flags); | |
717 | hlist_del(&hd->h); | |
718 | bucket_unlock_irqrestore(bucket, flags); | |
719 | ||
720 | return r; | |
721 | } | |
722 | ||
723 | /* | |
724 | * Complete a region's hydration: | |
725 | * | |
726 | * 1. Update dm-clone's metadata. | |
727 | * 2. Remove hydration from hash table. | |
728 | * 3. Complete overwrite bio. | |
729 | * 4. Issue deferred bios. | |
730 | * 5. If this was the last hydration, wake up anyone waiting for | |
731 | * hydrations to finish. | |
732 | */ | |
733 | static void hydration_complete(struct dm_clone_region_hydration *hd) | |
734 | { | |
735 | int r; | |
736 | blk_status_t status; | |
737 | struct clone *clone = hd->clone; | |
738 | ||
739 | r = hydration_update_metadata(hd); | |
740 | ||
741 | if (hd->status == BLK_STS_OK && likely(!r)) { | |
742 | if (hd->overwrite_bio) | |
743 | complete_overwrite_bio(clone, hd->overwrite_bio); | |
744 | ||
745 | issue_deferred_bios(clone, &hd->deferred_bios); | |
746 | } else { | |
747 | status = r ? BLK_STS_IOERR : hd->status; | |
748 | ||
749 | if (hd->overwrite_bio) | |
750 | bio_list_add(&hd->deferred_bios, hd->overwrite_bio); | |
751 | ||
752 | fail_bios(&hd->deferred_bios, status); | |
753 | } | |
754 | ||
755 | free_hydration(hd); | |
756 | ||
757 | if (atomic_dec_and_test(&clone->hydrations_in_flight)) | |
758 | wakeup_hydration_waiters(clone); | |
759 | } | |
760 | ||
761 | static void hydration_kcopyd_callback(int read_err, unsigned long write_err, void *context) | |
762 | { | |
763 | blk_status_t status; | |
764 | ||
765 | struct dm_clone_region_hydration *tmp, *hd = context; | |
766 | struct clone *clone = hd->clone; | |
767 | ||
768 | LIST_HEAD(batched_hydrations); | |
769 | ||
770 | if (read_err || write_err) { | |
771 | DMERR_LIMIT("%s: hydration failed", clone_device_name(clone)); | |
772 | status = BLK_STS_IOERR; | |
773 | } else { | |
774 | status = BLK_STS_OK; | |
775 | } | |
776 | list_splice_tail(&hd->list, &batched_hydrations); | |
777 | ||
778 | hd->status = status; | |
779 | hydration_complete(hd); | |
780 | ||
781 | /* Complete batched hydrations */ | |
782 | list_for_each_entry_safe(hd, tmp, &batched_hydrations, list) { | |
783 | hd->status = status; | |
784 | hydration_complete(hd); | |
785 | } | |
786 | ||
787 | /* Continue background hydration, if there is no I/O in-flight */ | |
788 | if (test_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags) && | |
789 | !atomic_read(&clone->ios_in_flight)) | |
790 | wake_worker(clone); | |
791 | } | |
792 | ||
793 | static void hydration_copy(struct dm_clone_region_hydration *hd, unsigned int nr_regions) | |
794 | { | |
795 | unsigned long region_start, region_end; | |
796 | sector_t tail_size, region_size, total_size; | |
797 | struct dm_io_region from, to; | |
798 | struct clone *clone = hd->clone; | |
799 | ||
9fc06ff5 NT |
800 | if (WARN_ON(!nr_regions)) |
801 | return; | |
802 | ||
7431b783 NT |
803 | region_size = clone->region_size; |
804 | region_start = hd->region_nr; | |
805 | region_end = region_start + nr_regions - 1; | |
806 | ||
9fc06ff5 | 807 | total_size = region_to_sector(clone, nr_regions - 1); |
7431b783 NT |
808 | |
809 | if (region_end == clone->nr_regions - 1) { | |
810 | /* | |
811 | * The last region of the target might be smaller than | |
812 | * region_size. | |
813 | */ | |
814 | tail_size = clone->ti->len & (region_size - 1); | |
815 | if (!tail_size) | |
816 | tail_size = region_size; | |
817 | } else { | |
818 | tail_size = region_size; | |
819 | } | |
820 | ||
821 | total_size += tail_size; | |
822 | ||
823 | from.bdev = clone->source_dev->bdev; | |
824 | from.sector = region_to_sector(clone, region_start); | |
825 | from.count = total_size; | |
826 | ||
827 | to.bdev = clone->dest_dev->bdev; | |
828 | to.sector = from.sector; | |
829 | to.count = from.count; | |
830 | ||
831 | /* Issue copy */ | |
832 | atomic_add(nr_regions, &clone->hydrations_in_flight); | |
833 | dm_kcopyd_copy(clone->kcopyd_client, &from, 1, &to, 0, | |
834 | hydration_kcopyd_callback, hd); | |
835 | } | |
836 | ||
837 | static void overwrite_endio(struct bio *bio) | |
838 | { | |
839 | struct dm_clone_region_hydration *hd = bio->bi_private; | |
840 | ||
841 | bio->bi_end_io = hd->overwrite_bio_end_io; | |
842 | hd->status = bio->bi_status; | |
843 | ||
844 | hydration_complete(hd); | |
845 | } | |
846 | ||
847 | static void hydration_overwrite(struct dm_clone_region_hydration *hd, struct bio *bio) | |
848 | { | |
849 | /* | |
850 | * We don't need to save and restore bio->bi_private because device | |
851 | * mapper core generates a new bio for us to use, with clean | |
852 | * bi_private. | |
853 | */ | |
854 | hd->overwrite_bio = bio; | |
855 | hd->overwrite_bio_end_io = bio->bi_end_io; | |
856 | ||
857 | bio->bi_end_io = overwrite_endio; | |
858 | bio->bi_private = hd; | |
859 | ||
860 | atomic_inc(&hd->clone->hydrations_in_flight); | |
ed00aabd | 861 | submit_bio_noacct(bio); |
7431b783 NT |
862 | } |
863 | ||
864 | /* | |
865 | * Hydrate bio's region. | |
866 | * | |
867 | * This function starts the hydration of the bio's region and puts the bio in | |
868 | * the list of deferred bios for this region. In case, by the time this | |
869 | * function is called, the region has finished hydrating it's submitted to the | |
870 | * destination device. | |
871 | * | |
872 | * NOTE: The bio remapping must be performed by the caller. | |
873 | */ | |
874 | static void hydrate_bio_region(struct clone *clone, struct bio *bio) | |
875 | { | |
7431b783 NT |
876 | unsigned long region_nr; |
877 | struct hash_table_bucket *bucket; | |
878 | struct dm_clone_region_hydration *hd, *hd2; | |
879 | ||
880 | region_nr = bio_to_region(clone, bio); | |
881 | bucket = get_hash_table_bucket(clone, region_nr); | |
882 | ||
52c67d41 | 883 | bucket_lock_irq(bucket); |
7431b783 NT |
884 | |
885 | hd = __hash_find(bucket, region_nr); | |
886 | if (hd) { | |
887 | /* Someone else is hydrating the region */ | |
888 | bio_list_add(&hd->deferred_bios, bio); | |
52c67d41 | 889 | bucket_unlock_irq(bucket); |
7431b783 NT |
890 | return; |
891 | } | |
892 | ||
893 | if (dm_clone_is_region_hydrated(clone->cmd, region_nr)) { | |
894 | /* The region has been hydrated */ | |
52c67d41 | 895 | bucket_unlock_irq(bucket); |
7431b783 NT |
896 | issue_bio(clone, bio); |
897 | return; | |
898 | } | |
899 | ||
900 | /* | |
901 | * We must allocate a hydration descriptor and start the hydration of | |
902 | * the corresponding region. | |
903 | */ | |
52c67d41 | 904 | bucket_unlock_irq(bucket); |
7431b783 NT |
905 | |
906 | hd = alloc_hydration(clone); | |
907 | hydration_init(hd, region_nr); | |
908 | ||
52c67d41 | 909 | bucket_lock_irq(bucket); |
7431b783 NT |
910 | |
911 | /* Check if the region has been hydrated in the meantime. */ | |
912 | if (dm_clone_is_region_hydrated(clone->cmd, region_nr)) { | |
52c67d41 | 913 | bucket_unlock_irq(bucket); |
7431b783 NT |
914 | free_hydration(hd); |
915 | issue_bio(clone, bio); | |
916 | return; | |
917 | } | |
918 | ||
919 | hd2 = __find_or_insert_region_hydration(bucket, hd); | |
920 | if (hd2 != hd) { | |
921 | /* Someone else started the region's hydration. */ | |
922 | bio_list_add(&hd2->deferred_bios, bio); | |
52c67d41 | 923 | bucket_unlock_irq(bucket); |
7431b783 NT |
924 | free_hydration(hd); |
925 | return; | |
926 | } | |
927 | ||
928 | /* | |
929 | * If the metadata mode is RO or FAIL then there is no point starting a | |
930 | * hydration, since we will not be able to update the metadata when the | |
931 | * hydration finishes. | |
932 | */ | |
933 | if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) { | |
934 | hlist_del(&hd->h); | |
52c67d41 | 935 | bucket_unlock_irq(bucket); |
7431b783 NT |
936 | free_hydration(hd); |
937 | bio_io_error(bio); | |
938 | return; | |
939 | } | |
940 | ||
941 | /* | |
942 | * Start region hydration. | |
943 | * | |
944 | * If a bio overwrites a region, i.e., its size is equal to the | |
945 | * region's size, then we don't need to copy the region from the source | |
946 | * to the destination device. | |
947 | */ | |
948 | if (is_overwrite_bio(clone, bio)) { | |
52c67d41 | 949 | bucket_unlock_irq(bucket); |
7431b783 NT |
950 | hydration_overwrite(hd, bio); |
951 | } else { | |
952 | bio_list_add(&hd->deferred_bios, bio); | |
52c67d41 | 953 | bucket_unlock_irq(bucket); |
7431b783 NT |
954 | hydration_copy(hd, 1); |
955 | } | |
956 | } | |
957 | ||
958 | /*---------------------------------------------------------------------------*/ | |
959 | ||
960 | /* | |
961 | * Background hydrations. | |
962 | */ | |
963 | ||
964 | /* | |
965 | * Batch region hydrations. | |
966 | * | |
967 | * To better utilize device bandwidth we batch together the hydration of | |
968 | * adjacent regions. This allows us to use small region sizes, e.g., 4KB, which | |
969 | * is good for small, random write performance (because of the overwriting of | |
970 | * un-hydrated regions) and at the same time issue big copy requests to kcopyd | |
971 | * to achieve high hydration bandwidth. | |
972 | */ | |
/*
 * Bookkeeping for a run of consecutive region hydrations that are collected
 * by __batch_hydration() and later passed to hydration_copy() together.
 */
struct batch_info {
	/* First hydration of the open batch; NULL while no batch is open */
	struct dm_clone_region_hydration *head;
	/* Number of regions in the open batch, the head included */
	unsigned int nr_batched_regions;
};
977 | ||
978 | static void __batch_hydration(struct batch_info *batch, | |
979 | struct dm_clone_region_hydration *hd) | |
980 | { | |
981 | struct clone *clone = hd->clone; | |
982 | unsigned int max_batch_size = READ_ONCE(clone->hydration_batch_size); | |
983 | ||
984 | if (batch->head) { | |
985 | /* Try to extend the current batch */ | |
986 | if (batch->nr_batched_regions < max_batch_size && | |
987 | (batch->head->region_nr + batch->nr_batched_regions) == hd->region_nr) { | |
988 | list_add_tail(&hd->list, &batch->head->list); | |
989 | batch->nr_batched_regions++; | |
990 | hd = NULL; | |
991 | } | |
992 | ||
993 | /* Check if we should issue the current batch */ | |
994 | if (batch->nr_batched_regions >= max_batch_size || hd) { | |
995 | hydration_copy(batch->head, batch->nr_batched_regions); | |
996 | batch->head = NULL; | |
997 | batch->nr_batched_regions = 0; | |
998 | } | |
999 | } | |
1000 | ||
1001 | if (!hd) | |
1002 | return; | |
1003 | ||
1004 | /* We treat max batch sizes of zero and one equivalently */ | |
1005 | if (max_batch_size <= 1) { | |
1006 | hydration_copy(hd, 1); | |
1007 | return; | |
1008 | } | |
1009 | ||
1010 | /* Start a new batch */ | |
1011 | BUG_ON(!list_empty(&hd->list)); | |
1012 | batch->head = hd; | |
1013 | batch->nr_batched_regions = 1; | |
1014 | } | |
1015 | ||
1016 | static unsigned long __start_next_hydration(struct clone *clone, | |
1017 | unsigned long offset, | |
1018 | struct batch_info *batch) | |
1019 | { | |
7431b783 NT |
1020 | struct hash_table_bucket *bucket; |
1021 | struct dm_clone_region_hydration *hd; | |
1022 | unsigned long nr_regions = clone->nr_regions; | |
1023 | ||
1024 | hd = alloc_hydration(clone); | |
1025 | ||
1026 | /* Try to find a region to hydrate. */ | |
1027 | do { | |
1028 | offset = dm_clone_find_next_unhydrated_region(clone->cmd, offset); | |
1029 | if (offset == nr_regions) | |
1030 | break; | |
1031 | ||
1032 | bucket = get_hash_table_bucket(clone, offset); | |
52c67d41 | 1033 | bucket_lock_irq(bucket); |
7431b783 NT |
1034 | |
1035 | if (!dm_clone_is_region_hydrated(clone->cmd, offset) && | |
1036 | !__hash_find(bucket, offset)) { | |
1037 | hydration_init(hd, offset); | |
1038 | __insert_region_hydration(bucket, hd); | |
52c67d41 | 1039 | bucket_unlock_irq(bucket); |
7431b783 NT |
1040 | |
1041 | /* Batch hydration */ | |
1042 | __batch_hydration(batch, hd); | |
1043 | ||
1044 | return (offset + 1); | |
1045 | } | |
1046 | ||
52c67d41 | 1047 | bucket_unlock_irq(bucket); |
7431b783 NT |
1048 | |
1049 | } while (++offset < nr_regions); | |
1050 | ||
1051 | if (hd) | |
1052 | free_hydration(hd); | |
1053 | ||
1054 | return offset; | |
1055 | } | |
1056 | ||
1057 | /* | |
1058 | * This function searches for regions that still reside in the source device | |
1059 | * and starts their hydration. | |
1060 | */ | |
1061 | static void do_hydration(struct clone *clone) | |
1062 | { | |
1063 | unsigned int current_volume; | |
1064 | unsigned long offset, nr_regions = clone->nr_regions; | |
1065 | ||
1066 | struct batch_info batch = { | |
1067 | .head = NULL, | |
1068 | .nr_batched_regions = 0, | |
1069 | }; | |
1070 | ||
1071 | if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) | |
1072 | return; | |
1073 | ||
1074 | if (dm_clone_is_hydration_done(clone->cmd)) | |
1075 | return; | |
1076 | ||
1077 | /* | |
1078 | * Avoid race with device suspension. | |
1079 | */ | |
1080 | atomic_inc(&clone->hydrations_in_flight); | |
1081 | ||
1082 | /* | |
1083 | * Make sure atomic_inc() is ordered before test_bit(), otherwise we | |
1084 | * might race with clone_postsuspend() and start a region hydration | |
1085 | * after the target has been suspended. | |
1086 | * | |
1087 | * This is paired with the smp_mb__after_atomic() in | |
1088 | * clone_postsuspend(). | |
1089 | */ | |
1090 | smp_mb__after_atomic(); | |
1091 | ||
1092 | offset = clone->hydration_offset; | |
1093 | while (likely(!test_bit(DM_CLONE_HYDRATION_SUSPENDED, &clone->flags)) && | |
1094 | !atomic_read(&clone->ios_in_flight) && | |
1095 | test_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags) && | |
1096 | offset < nr_regions) { | |
1097 | current_volume = atomic_read(&clone->hydrations_in_flight); | |
1098 | current_volume += batch.nr_batched_regions; | |
1099 | ||
1100 | if (current_volume > READ_ONCE(clone->hydration_threshold)) | |
1101 | break; | |
1102 | ||
1103 | offset = __start_next_hydration(clone, offset, &batch); | |
1104 | } | |
1105 | ||
1106 | if (batch.head) | |
1107 | hydration_copy(batch.head, batch.nr_batched_regions); | |
1108 | ||
1109 | if (offset >= nr_regions) | |
1110 | offset = 0; | |
1111 | ||
1112 | clone->hydration_offset = offset; | |
1113 | ||
1114 | if (atomic_dec_and_test(&clone->hydrations_in_flight)) | |
1115 | wakeup_hydration_waiters(clone); | |
1116 | } | |
1117 | ||
1118 | /*---------------------------------------------------------------------------*/ | |
1119 | ||
1120 | static bool need_commit_due_to_time(struct clone *clone) | |
1121 | { | |
1122 | return !time_in_range(jiffies, clone->last_commit_jiffies, | |
1123 | clone->last_commit_jiffies + COMMIT_PERIOD); | |
1124 | } | |
1125 | ||
1126 | /* | |
1127 | * A non-zero return indicates read-only or fail mode. | |
1128 | */ | |
8b3fd1f5 | 1129 | static int commit_metadata(struct clone *clone, bool *dest_dev_flushed) |
7431b783 NT |
1130 | { |
1131 | int r = 0; | |
1132 | ||
8b3fd1f5 NT |
1133 | if (dest_dev_flushed) |
1134 | *dest_dev_flushed = false; | |
1135 | ||
7431b783 NT |
1136 | mutex_lock(&clone->commit_lock); |
1137 | ||
1138 | if (!dm_clone_changed_this_transaction(clone->cmd)) | |
1139 | goto out; | |
1140 | ||
1141 | if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) { | |
1142 | r = -EPERM; | |
1143 | goto out; | |
1144 | } | |
1145 | ||
8fdbfe8d NT |
1146 | r = dm_clone_metadata_pre_commit(clone->cmd); |
1147 | if (unlikely(r)) { | |
1148 | __metadata_operation_failed(clone, "dm_clone_metadata_pre_commit", r); | |
1149 | goto out; | |
1150 | } | |
7431b783 | 1151 | |
a587daa0 | 1152 | r = blkdev_issue_flush(clone->dest_dev->bdev); |
8b3fd1f5 NT |
1153 | if (unlikely(r)) { |
1154 | __metadata_operation_failed(clone, "flush destination device", r); | |
1155 | goto out; | |
1156 | } | |
1157 | ||
1158 | if (dest_dev_flushed) | |
1159 | *dest_dev_flushed = true; | |
1160 | ||
8fdbfe8d | 1161 | r = dm_clone_metadata_commit(clone->cmd); |
7431b783 NT |
1162 | if (unlikely(r)) { |
1163 | __metadata_operation_failed(clone, "dm_clone_metadata_commit", r); | |
1164 | goto out; | |
1165 | } | |
1166 | ||
1167 | if (dm_clone_is_hydration_done(clone->cmd)) | |
1168 | dm_table_event(clone->ti->table); | |
1169 | out: | |
1170 | mutex_unlock(&clone->commit_lock); | |
1171 | ||
1172 | return r; | |
1173 | } | |
1174 | ||
1175 | static void process_deferred_discards(struct clone *clone) | |
1176 | { | |
1177 | int r = -EPERM; | |
1178 | struct bio *bio; | |
1179 | struct blk_plug plug; | |
4b514290 | 1180 | unsigned long rs, nr_regions; |
7431b783 NT |
1181 | struct bio_list discards = BIO_EMPTY_LIST; |
1182 | ||
6ca43ed8 | 1183 | spin_lock_irq(&clone->lock); |
7431b783 NT |
1184 | bio_list_merge(&discards, &clone->deferred_discard_bios); |
1185 | bio_list_init(&clone->deferred_discard_bios); | |
6ca43ed8 | 1186 | spin_unlock_irq(&clone->lock); |
7431b783 NT |
1187 | |
1188 | if (bio_list_empty(&discards)) | |
1189 | return; | |
1190 | ||
1191 | if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) | |
1192 | goto out; | |
1193 | ||
1194 | /* Update the metadata */ | |
1195 | bio_list_for_each(bio, &discards) { | |
4b514290 | 1196 | bio_region_range(clone, bio, &rs, &nr_regions); |
7431b783 NT |
1197 | /* |
1198 | * A discard request might cover regions that have been already | |
1199 | * hydrated. There is no need to update the metadata for these | |
1200 | * regions. | |
1201 | */ | |
4b514290 | 1202 | r = dm_clone_cond_set_range(clone->cmd, rs, nr_regions); |
7431b783 NT |
1203 | if (unlikely(r)) |
1204 | break; | |
1205 | } | |
1206 | out: | |
1207 | blk_start_plug(&plug); | |
1208 | while ((bio = bio_list_pop(&discards))) | |
1209 | complete_discard_bio(clone, bio, r == 0); | |
1210 | blk_finish_plug(&plug); | |
1211 | } | |
1212 | ||
1213 | static void process_deferred_bios(struct clone *clone) | |
1214 | { | |
7431b783 NT |
1215 | struct bio_list bios = BIO_EMPTY_LIST; |
1216 | ||
6ca43ed8 | 1217 | spin_lock_irq(&clone->lock); |
7431b783 NT |
1218 | bio_list_merge(&bios, &clone->deferred_bios); |
1219 | bio_list_init(&clone->deferred_bios); | |
6ca43ed8 | 1220 | spin_unlock_irq(&clone->lock); |
7431b783 NT |
1221 | |
1222 | if (bio_list_empty(&bios)) | |
1223 | return; | |
1224 | ||
1225 | submit_bios(&bios); | |
1226 | } | |
1227 | ||
1228 | static void process_deferred_flush_bios(struct clone *clone) | |
1229 | { | |
1230 | struct bio *bio; | |
8b3fd1f5 | 1231 | bool dest_dev_flushed; |
7431b783 NT |
1232 | struct bio_list bios = BIO_EMPTY_LIST; |
1233 | struct bio_list bio_completions = BIO_EMPTY_LIST; | |
1234 | ||
1235 | /* | |
1236 | * If there are any deferred flush bios, we must commit the metadata | |
1237 | * before issuing them or signaling their completion. | |
1238 | */ | |
6ca43ed8 | 1239 | spin_lock_irq(&clone->lock); |
7431b783 NT |
1240 | bio_list_merge(&bios, &clone->deferred_flush_bios); |
1241 | bio_list_init(&clone->deferred_flush_bios); | |
1242 | ||
1243 | bio_list_merge(&bio_completions, &clone->deferred_flush_completions); | |
1244 | bio_list_init(&clone->deferred_flush_completions); | |
6ca43ed8 | 1245 | spin_unlock_irq(&clone->lock); |
7431b783 NT |
1246 | |
1247 | if (bio_list_empty(&bios) && bio_list_empty(&bio_completions) && | |
1248 | !(dm_clone_changed_this_transaction(clone->cmd) && need_commit_due_to_time(clone))) | |
1249 | return; | |
1250 | ||
8b3fd1f5 | 1251 | if (commit_metadata(clone, &dest_dev_flushed)) { |
7431b783 NT |
1252 | bio_list_merge(&bios, &bio_completions); |
1253 | ||
1254 | while ((bio = bio_list_pop(&bios))) | |
1255 | bio_io_error(bio); | |
1256 | ||
1257 | return; | |
1258 | } | |
1259 | ||
1260 | clone->last_commit_jiffies = jiffies; | |
1261 | ||
1262 | while ((bio = bio_list_pop(&bio_completions))) | |
1263 | bio_endio(bio); | |
1264 | ||
8b3fd1f5 NT |
1265 | while ((bio = bio_list_pop(&bios))) { |
1266 | if ((bio->bi_opf & REQ_PREFLUSH) && dest_dev_flushed) { | |
1267 | /* We just flushed the destination device as part of | |
1268 | * the metadata commit, so there is no reason to send | |
1269 | * another flush. | |
1270 | */ | |
1271 | bio_endio(bio); | |
1272 | } else { | |
ed00aabd | 1273 | submit_bio_noacct(bio); |
8b3fd1f5 NT |
1274 | } |
1275 | } | |
7431b783 NT |
1276 | } |
1277 | ||
1278 | static void do_worker(struct work_struct *work) | |
1279 | { | |
1280 | struct clone *clone = container_of(work, typeof(*clone), worker); | |
1281 | ||
1282 | process_deferred_bios(clone); | |
1283 | process_deferred_discards(clone); | |
1284 | ||
1285 | /* | |
1286 | * process_deferred_flush_bios(): | |
1287 | * | |
1288 | * - Commit metadata | |
1289 | * | |
1290 | * - Process deferred REQ_FUA completions | |
1291 | * | |
1292 | * - Process deferred REQ_PREFLUSH bios | |
1293 | */ | |
1294 | process_deferred_flush_bios(clone); | |
1295 | ||
1296 | /* Background hydration */ | |
1297 | do_hydration(clone); | |
1298 | } | |
1299 | ||
1300 | /* | |
1301 | * Commit periodically so that not too much unwritten data builds up. | |
1302 | * | |
1303 | * Also, restart background hydration, if it has been stopped by in-flight I/O. | |
1304 | */ | |
1305 | static void do_waker(struct work_struct *work) | |
1306 | { | |
1307 | struct clone *clone = container_of(to_delayed_work(work), struct clone, waker); | |
1308 | ||
1309 | wake_worker(clone); | |
1310 | queue_delayed_work(clone->wq, &clone->waker, COMMIT_PERIOD); | |
1311 | } | |
1312 | ||
1313 | /*---------------------------------------------------------------------------*/ | |
1314 | ||
1315 | /* | |
1316 | * Target methods | |
1317 | */ | |
1318 | static int clone_map(struct dm_target *ti, struct bio *bio) | |
1319 | { | |
1320 | struct clone *clone = ti->private; | |
1321 | unsigned long region_nr; | |
1322 | ||
1323 | atomic_inc(&clone->ios_in_flight); | |
1324 | ||
1325 | if (unlikely(get_clone_mode(clone) == CM_FAIL)) | |
1326 | return DM_MAPIO_KILL; | |
1327 | ||
1328 | /* | |
1329 | * REQ_PREFLUSH bios carry no data: | |
1330 | * | |
1331 | * - Commit metadata, if changed | |
1332 | * | |
1333 | * - Pass down to destination device | |
1334 | */ | |
1335 | if (bio->bi_opf & REQ_PREFLUSH) { | |
1336 | remap_and_issue(clone, bio); | |
1337 | return DM_MAPIO_SUBMITTED; | |
1338 | } | |
1339 | ||
1340 | bio->bi_iter.bi_sector = dm_target_offset(ti, bio->bi_iter.bi_sector); | |
1341 | ||
1342 | /* | |
1343 | * dm-clone interprets discards and performs a fast hydration of the | |
1344 | * discarded regions, i.e., we skip the copy from the source device and | |
1345 | * just mark the regions as hydrated. | |
1346 | */ | |
1347 | if (bio_op(bio) == REQ_OP_DISCARD) { | |
1348 | process_discard_bio(clone, bio); | |
1349 | return DM_MAPIO_SUBMITTED; | |
1350 | } | |
1351 | ||
1352 | /* | |
1353 | * If the bio's region is hydrated, redirect it to the destination | |
1354 | * device. | |
1355 | * | |
1356 | * If the region is not hydrated and the bio is a READ, redirect it to | |
1357 | * the source device. | |
1358 | * | |
1359 | * Else, defer WRITE bio until after its region has been hydrated and | |
1360 | * start the region's hydration immediately. | |
1361 | */ | |
1362 | region_nr = bio_to_region(clone, bio); | |
1363 | if (dm_clone_is_region_hydrated(clone->cmd, region_nr)) { | |
1364 | remap_and_issue(clone, bio); | |
1365 | return DM_MAPIO_SUBMITTED; | |
1366 | } else if (bio_data_dir(bio) == READ) { | |
1367 | remap_to_source(clone, bio); | |
1368 | return DM_MAPIO_REMAPPED; | |
1369 | } | |
1370 | ||
1371 | remap_to_dest(clone, bio); | |
1372 | hydrate_bio_region(clone, bio); | |
1373 | ||
1374 | return DM_MAPIO_SUBMITTED; | |
1375 | } | |
1376 | ||
1377 | static int clone_endio(struct dm_target *ti, struct bio *bio, blk_status_t *error) | |
1378 | { | |
1379 | struct clone *clone = ti->private; | |
1380 | ||
1381 | atomic_dec(&clone->ios_in_flight); | |
1382 | ||
1383 | return DM_ENDIO_DONE; | |
1384 | } | |
1385 | ||
1386 | static void emit_flags(struct clone *clone, char *result, unsigned int maxlen, | |
1387 | ssize_t *sz_ptr) | |
1388 | { | |
1389 | ssize_t sz = *sz_ptr; | |
1390 | unsigned int count; | |
1391 | ||
1392 | count = !test_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags); | |
1393 | count += !test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags); | |
1394 | ||
1395 | DMEMIT("%u ", count); | |
1396 | ||
1397 | if (!test_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags)) | |
1398 | DMEMIT("no_hydration "); | |
1399 | ||
1400 | if (!test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags)) | |
1401 | DMEMIT("no_discard_passdown "); | |
1402 | ||
1403 | *sz_ptr = sz; | |
1404 | } | |
1405 | ||
1406 | static void emit_core_args(struct clone *clone, char *result, | |
1407 | unsigned int maxlen, ssize_t *sz_ptr) | |
1408 | { | |
1409 | ssize_t sz = *sz_ptr; | |
1410 | unsigned int count = 4; | |
1411 | ||
1412 | DMEMIT("%u hydration_threshold %u hydration_batch_size %u ", count, | |
1413 | READ_ONCE(clone->hydration_threshold), | |
1414 | READ_ONCE(clone->hydration_batch_size)); | |
1415 | ||
1416 | *sz_ptr = sz; | |
1417 | } | |
1418 | ||
1419 | /* | |
1420 | * Status format: | |
1421 | * | |
1422 | * <metadata block size> <#used metadata blocks>/<#total metadata blocks> | |
1423 | * <clone region size> <#hydrated regions>/<#total regions> <#hydrating regions> | |
1424 | * <#features> <features>* <#core args> <core args>* <clone metadata mode> | |
1425 | */ | |
1426 | static void clone_status(struct dm_target *ti, status_type_t type, | |
1427 | unsigned int status_flags, char *result, | |
1428 | unsigned int maxlen) | |
1429 | { | |
1430 | int r; | |
1431 | unsigned int i; | |
1432 | ssize_t sz = 0; | |
1433 | dm_block_t nr_free_metadata_blocks = 0; | |
1434 | dm_block_t nr_metadata_blocks = 0; | |
1435 | char buf[BDEVNAME_SIZE]; | |
1436 | struct clone *clone = ti->private; | |
1437 | ||
1438 | switch (type) { | |
1439 | case STATUSTYPE_INFO: | |
1440 | if (get_clone_mode(clone) == CM_FAIL) { | |
1441 | DMEMIT("Fail"); | |
1442 | break; | |
1443 | } | |
1444 | ||
1445 | /* Commit to ensure statistics aren't out-of-date */ | |
1446 | if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti)) | |
8b3fd1f5 | 1447 | (void) commit_metadata(clone, NULL); |
7431b783 NT |
1448 | |
1449 | r = dm_clone_get_free_metadata_block_count(clone->cmd, &nr_free_metadata_blocks); | |
1450 | ||
1451 | if (r) { | |
1452 | DMERR("%s: dm_clone_get_free_metadata_block_count returned %d", | |
1453 | clone_device_name(clone), r); | |
1454 | goto error; | |
1455 | } | |
1456 | ||
1457 | r = dm_clone_get_metadata_dev_size(clone->cmd, &nr_metadata_blocks); | |
1458 | ||
1459 | if (r) { | |
1460 | DMERR("%s: dm_clone_get_metadata_dev_size returned %d", | |
1461 | clone_device_name(clone), r); | |
1462 | goto error; | |
1463 | } | |
1464 | ||
81d5553d | 1465 | DMEMIT("%u %llu/%llu %llu %u/%lu %u ", |
7431b783 NT |
1466 | DM_CLONE_METADATA_BLOCK_SIZE, |
1467 | (unsigned long long)(nr_metadata_blocks - nr_free_metadata_blocks), | |
1468 | (unsigned long long)nr_metadata_blocks, | |
1469 | (unsigned long long)clone->region_size, | |
1470 | dm_clone_nr_of_hydrated_regions(clone->cmd), | |
1471 | clone->nr_regions, | |
1472 | atomic_read(&clone->hydrations_in_flight)); | |
1473 | ||
1474 | emit_flags(clone, result, maxlen, &sz); | |
1475 | emit_core_args(clone, result, maxlen, &sz); | |
1476 | ||
1477 | switch (get_clone_mode(clone)) { | |
1478 | case CM_WRITE: | |
1479 | DMEMIT("rw"); | |
1480 | break; | |
1481 | case CM_READ_ONLY: | |
1482 | DMEMIT("ro"); | |
1483 | break; | |
1484 | case CM_FAIL: | |
1485 | DMEMIT("Fail"); | |
1486 | } | |
1487 | ||
1488 | break; | |
1489 | ||
1490 | case STATUSTYPE_TABLE: | |
1491 | format_dev_t(buf, clone->metadata_dev->bdev->bd_dev); | |
1492 | DMEMIT("%s ", buf); | |
1493 | ||
1494 | format_dev_t(buf, clone->dest_dev->bdev->bd_dev); | |
1495 | DMEMIT("%s ", buf); | |
1496 | ||
1497 | format_dev_t(buf, clone->source_dev->bdev->bd_dev); | |
1498 | DMEMIT("%s", buf); | |
1499 | ||
1500 | for (i = 0; i < clone->nr_ctr_args; i++) | |
1501 | DMEMIT(" %s", clone->ctr_args[i]); | |
8ec45662 TS |
1502 | break; |
1503 | ||
1504 | case STATUSTYPE_IMA: | |
1505 | *result = '\0'; | |
1506 | break; | |
7431b783 NT |
1507 | } |
1508 | ||
1509 | return; | |
1510 | ||
1511 | error: | |
1512 | DMEMIT("Error"); | |
1513 | } | |
1514 | ||
7431b783 NT |
1515 | static sector_t get_dev_size(struct dm_dev *dev) |
1516 | { | |
6dcbb52c | 1517 | return bdev_nr_sectors(dev->bdev); |
7431b783 NT |
1518 | } |
1519 | ||
1520 | /*---------------------------------------------------------------------------*/ | |
1521 | ||
1522 | /* | |
1523 | * Construct a clone device mapping: | |
1524 | * | |
1525 | * clone <metadata dev> <destination dev> <source dev> <region size> | |
1526 | * [<#feature args> [<feature arg>]* [<#core args> [key value]*]] | |
1527 | * | |
1528 | * metadata dev: Fast device holding the persistent metadata | |
1529 | * destination dev: The destination device, which will become a clone of the | |
1530 | * source device | |
1531 | * source dev: The read-only source device that gets cloned | |
1532 | * region size: dm-clone unit size in sectors | |
1533 | * | |
1534 | * #feature args: Number of feature arguments passed | |
1535 | * feature args: E.g. no_hydration, no_discard_passdown | |
1536 | * | |
1537 | * #core arguments: An even number of core arguments | |
1538 | * core arguments: Key/value pairs for tuning the core | |
1539 | * E.g. 'hydration_threshold 256' | |
1540 | */ | |
1541 | static int parse_feature_args(struct dm_arg_set *as, struct clone *clone) | |
1542 | { | |
1543 | int r; | |
1544 | unsigned int argc; | |
1545 | const char *arg_name; | |
1546 | struct dm_target *ti = clone->ti; | |
1547 | ||
1548 | const struct dm_arg args = { | |
1549 | .min = 0, | |
1550 | .max = 2, | |
1551 | .error = "Invalid number of feature arguments" | |
1552 | }; | |
1553 | ||
1554 | /* No feature arguments supplied */ | |
1555 | if (!as->argc) | |
1556 | return 0; | |
1557 | ||
1558 | r = dm_read_arg_group(&args, as, &argc, &ti->error); | |
1559 | if (r) | |
1560 | return r; | |
1561 | ||
1562 | while (argc) { | |
1563 | arg_name = dm_shift_arg(as); | |
1564 | argc--; | |
1565 | ||
1566 | if (!strcasecmp(arg_name, "no_hydration")) { | |
1567 | __clear_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags); | |
1568 | } else if (!strcasecmp(arg_name, "no_discard_passdown")) { | |
1569 | __clear_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags); | |
1570 | } else { | |
1571 | ti->error = "Invalid feature argument"; | |
1572 | return -EINVAL; | |
1573 | } | |
1574 | } | |
1575 | ||
1576 | return 0; | |
1577 | } | |
1578 | ||
1579 | static int parse_core_args(struct dm_arg_set *as, struct clone *clone) | |
1580 | { | |
1581 | int r; | |
1582 | unsigned int argc; | |
1583 | unsigned int value; | |
1584 | const char *arg_name; | |
1585 | struct dm_target *ti = clone->ti; | |
1586 | ||
1587 | const struct dm_arg args = { | |
1588 | .min = 0, | |
1589 | .max = 4, | |
1590 | .error = "Invalid number of core arguments" | |
1591 | }; | |
1592 | ||
1593 | /* Initialize core arguments */ | |
1594 | clone->hydration_batch_size = DEFAULT_HYDRATION_BATCH_SIZE; | |
1595 | clone->hydration_threshold = DEFAULT_HYDRATION_THRESHOLD; | |
1596 | ||
1597 | /* No core arguments supplied */ | |
1598 | if (!as->argc) | |
1599 | return 0; | |
1600 | ||
1601 | r = dm_read_arg_group(&args, as, &argc, &ti->error); | |
1602 | if (r) | |
1603 | return r; | |
1604 | ||
1605 | if (argc & 1) { | |
1606 | ti->error = "Number of core arguments must be even"; | |
1607 | return -EINVAL; | |
1608 | } | |
1609 | ||
1610 | while (argc) { | |
1611 | arg_name = dm_shift_arg(as); | |
1612 | argc -= 2; | |
1613 | ||
1614 | if (!strcasecmp(arg_name, "hydration_threshold")) { | |
1615 | if (kstrtouint(dm_shift_arg(as), 10, &value)) { | |
1616 | ti->error = "Invalid value for argument `hydration_threshold'"; | |
1617 | return -EINVAL; | |
1618 | } | |
1619 | clone->hydration_threshold = value; | |
1620 | } else if (!strcasecmp(arg_name, "hydration_batch_size")) { | |
1621 | if (kstrtouint(dm_shift_arg(as), 10, &value)) { | |
1622 | ti->error = "Invalid value for argument `hydration_batch_size'"; | |
1623 | return -EINVAL; | |
1624 | } | |
1625 | clone->hydration_batch_size = value; | |
1626 | } else { | |
1627 | ti->error = "Invalid core argument"; | |
1628 | return -EINVAL; | |
1629 | } | |
1630 | } | |
1631 | ||
1632 | return 0; | |
1633 | } | |
1634 | ||
1635 | static int parse_region_size(struct clone *clone, struct dm_arg_set *as, char **error) | |
1636 | { | |
1637 | int r; | |
1638 | unsigned int region_size; | |
1639 | struct dm_arg arg; | |
1640 | ||
1641 | arg.min = MIN_REGION_SIZE; | |
1642 | arg.max = MAX_REGION_SIZE; | |
1643 | arg.error = "Invalid region size"; | |
1644 | ||
1645 | r = dm_read_arg(&arg, as, ®ion_size, error); | |
1646 | if (r) | |
1647 | return r; | |
1648 | ||
1649 | /* Check region size is a power of 2 */ | |
1650 | if (!is_power_of_2(region_size)) { | |
1651 | *error = "Region size is not a power of 2"; | |
1652 | return -EINVAL; | |
1653 | } | |
1654 | ||
1655 | /* Validate the region size against the device logical block size */ | |
1656 | if (region_size % (bdev_logical_block_size(clone->source_dev->bdev) >> 9) || | |
1657 | region_size % (bdev_logical_block_size(clone->dest_dev->bdev) >> 9)) { | |
1658 | *error = "Region size is not a multiple of device logical block size"; | |
1659 | return -EINVAL; | |
1660 | } | |
1661 | ||
1662 | clone->region_size = region_size; | |
1663 | ||
1664 | return 0; | |
1665 | } | |
1666 | ||
/*
 * Check that the number of regions @n does not exceed 2^31.
 *
 * dm_bitset restricts us to 2^32 regions. test_bit & co. restrict us
 * further to 2^31 regions.
 *
 * Returns 0 if @n is acceptable, otherwise sets *error and returns -EINVAL.
 */
static int validate_nr_regions(unsigned long n, char **error)
{
	if (n <= (1UL << 31))
		return 0;

	*error = "Too many regions. Consider increasing the region size";
	return -EINVAL;
}
1680 | ||
1681 | static int parse_metadata_dev(struct clone *clone, struct dm_arg_set *as, char **error) | |
1682 | { | |
1683 | int r; | |
1684 | sector_t metadata_dev_size; | |
7431b783 NT |
1685 | |
1686 | r = dm_get_device(clone->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE, | |
1687 | &clone->metadata_dev); | |
1688 | if (r) { | |
1689 | *error = "Error opening metadata device"; | |
1690 | return r; | |
1691 | } | |
1692 | ||
1693 | metadata_dev_size = get_dev_size(clone->metadata_dev); | |
1694 | if (metadata_dev_size > DM_CLONE_METADATA_MAX_SECTORS_WARNING) | |
385411ff CH |
1695 | DMWARN("Metadata device %pg is larger than %u sectors: excess space will not be used.", |
1696 | clone->metadata_dev->bdev, DM_CLONE_METADATA_MAX_SECTORS); | |
7431b783 NT |
1697 | |
1698 | return 0; | |
1699 | } | |
1700 | ||
1701 | static int parse_dest_dev(struct clone *clone, struct dm_arg_set *as, char **error) | |
1702 | { | |
1703 | int r; | |
1704 | sector_t dest_dev_size; | |
1705 | ||
1706 | r = dm_get_device(clone->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE, | |
1707 | &clone->dest_dev); | |
1708 | if (r) { | |
1709 | *error = "Error opening destination device"; | |
1710 | return r; | |
1711 | } | |
1712 | ||
1713 | dest_dev_size = get_dev_size(clone->dest_dev); | |
1714 | if (dest_dev_size < clone->ti->len) { | |
1715 | dm_put_device(clone->ti, clone->dest_dev); | |
1716 | *error = "Device size larger than destination device"; | |
1717 | return -EINVAL; | |
1718 | } | |
1719 | ||
1720 | return 0; | |
1721 | } | |
1722 | ||
1723 | static int parse_source_dev(struct clone *clone, struct dm_arg_set *as, char **error) | |
1724 | { | |
1725 | int r; | |
1726 | sector_t source_dev_size; | |
1727 | ||
1728 | r = dm_get_device(clone->ti, dm_shift_arg(as), FMODE_READ, | |
1729 | &clone->source_dev); | |
1730 | if (r) { | |
1731 | *error = "Error opening source device"; | |
1732 | return r; | |
1733 | } | |
1734 | ||
1735 | source_dev_size = get_dev_size(clone->source_dev); | |
1736 | if (source_dev_size < clone->ti->len) { | |
1737 | dm_put_device(clone->ti, clone->source_dev); | |
1738 | *error = "Device size larger than source device"; | |
1739 | return -EINVAL; | |
1740 | } | |
1741 | ||
1742 | return 0; | |
1743 | } | |
1744 | ||
1745 | static int copy_ctr_args(struct clone *clone, int argc, const char **argv, char **error) | |
1746 | { | |
1747 | unsigned int i; | |
1748 | const char **copy; | |
1749 | ||
1750 | copy = kcalloc(argc, sizeof(*copy), GFP_KERNEL); | |
1751 | if (!copy) | |
1752 | goto error; | |
1753 | ||
1754 | for (i = 0; i < argc; i++) { | |
1755 | copy[i] = kstrdup(argv[i], GFP_KERNEL); | |
1756 | ||
1757 | if (!copy[i]) { | |
1758 | while (i--) | |
1759 | kfree(copy[i]); | |
1760 | kfree(copy); | |
1761 | goto error; | |
1762 | } | |
1763 | } | |
1764 | ||
1765 | clone->nr_ctr_args = argc; | |
1766 | clone->ctr_args = copy; | |
1767 | return 0; | |
1768 | ||
1769 | error: | |
1770 | *error = "Failed to allocate memory for table line"; | |
1771 | return -ENOMEM; | |
1772 | } | |
1773 | ||
1774 | static int clone_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |
1775 | { | |
1776 | int r; | |
cd481c12 | 1777 | sector_t nr_regions; |
7431b783 NT |
1778 | struct clone *clone; |
1779 | struct dm_arg_set as; | |
1780 | ||
1781 | if (argc < 4) { | |
1782 | ti->error = "Invalid number of arguments"; | |
1783 | return -EINVAL; | |
1784 | } | |
1785 | ||
1786 | as.argc = argc; | |
1787 | as.argv = argv; | |
1788 | ||
1789 | clone = kzalloc(sizeof(*clone), GFP_KERNEL); | |
1790 | if (!clone) { | |
1791 | ti->error = "Failed to allocate clone structure"; | |
1792 | return -ENOMEM; | |
1793 | } | |
1794 | ||
1795 | clone->ti = ti; | |
1796 | ||
1797 | /* Initialize dm-clone flags */ | |
1798 | __set_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags); | |
1799 | __set_bit(DM_CLONE_HYDRATION_SUSPENDED, &clone->flags); | |
1800 | __set_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags); | |
1801 | ||
1802 | r = parse_metadata_dev(clone, &as, &ti->error); | |
1803 | if (r) | |
1804 | goto out_with_clone; | |
1805 | ||
1806 | r = parse_dest_dev(clone, &as, &ti->error); | |
1807 | if (r) | |
1808 | goto out_with_meta_dev; | |
1809 | ||
1810 | r = parse_source_dev(clone, &as, &ti->error); | |
1811 | if (r) | |
1812 | goto out_with_dest_dev; | |
1813 | ||
1814 | r = parse_region_size(clone, &as, &ti->error); | |
1815 | if (r) | |
1816 | goto out_with_source_dev; | |
1817 | ||
1818 | clone->region_shift = __ffs(clone->region_size); | |
cd481c12 NT |
1819 | nr_regions = dm_sector_div_up(ti->len, clone->region_size); |
1820 | ||
1821 | /* Check for overflow */ | |
1822 | if (nr_regions != (unsigned long)nr_regions) { | |
1823 | ti->error = "Too many regions. Consider increasing the region size"; | |
1824 | r = -EOVERFLOW; | |
1825 | goto out_with_source_dev; | |
1826 | } | |
1827 | ||
1828 | clone->nr_regions = nr_regions; | |
7431b783 NT |
1829 | |
1830 | r = validate_nr_regions(clone->nr_regions, &ti->error); | |
1831 | if (r) | |
1832 | goto out_with_source_dev; | |
1833 | ||
1834 | r = dm_set_target_max_io_len(ti, clone->region_size); | |
1835 | if (r) { | |
1836 | ti->error = "Failed to set max io len"; | |
1837 | goto out_with_source_dev; | |
1838 | } | |
1839 | ||
1840 | r = parse_feature_args(&as, clone); | |
1841 | if (r) | |
1842 | goto out_with_source_dev; | |
1843 | ||
1844 | r = parse_core_args(&as, clone); | |
1845 | if (r) | |
1846 | goto out_with_source_dev; | |
1847 | ||
1848 | /* Load metadata */ | |
1849 | clone->cmd = dm_clone_metadata_open(clone->metadata_dev->bdev, ti->len, | |
1850 | clone->region_size); | |
1851 | if (IS_ERR(clone->cmd)) { | |
1852 | ti->error = "Failed to load metadata"; | |
1853 | r = PTR_ERR(clone->cmd); | |
1854 | goto out_with_source_dev; | |
1855 | } | |
1856 | ||
1857 | __set_clone_mode(clone, CM_WRITE); | |
1858 | ||
1859 | if (get_clone_mode(clone) != CM_WRITE) { | |
1860 | ti->error = "Unable to get write access to metadata, please check/repair metadata"; | |
1861 | r = -EPERM; | |
1862 | goto out_with_metadata; | |
1863 | } | |
1864 | ||
1865 | clone->last_commit_jiffies = jiffies; | |
1866 | ||
1867 | /* Allocate hydration hash table */ | |
1868 | r = hash_table_init(clone); | |
1869 | if (r) { | |
1870 | ti->error = "Failed to allocate hydration hash table"; | |
1871 | goto out_with_metadata; | |
1872 | } | |
1873 | ||
1874 | atomic_set(&clone->ios_in_flight, 0); | |
1875 | init_waitqueue_head(&clone->hydration_stopped); | |
1876 | spin_lock_init(&clone->lock); | |
1877 | bio_list_init(&clone->deferred_bios); | |
1878 | bio_list_init(&clone->deferred_discard_bios); | |
1879 | bio_list_init(&clone->deferred_flush_bios); | |
1880 | bio_list_init(&clone->deferred_flush_completions); | |
1881 | clone->hydration_offset = 0; | |
1882 | atomic_set(&clone->hydrations_in_flight, 0); | |
1883 | ||
1884 | clone->wq = alloc_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM, 0); | |
1885 | if (!clone->wq) { | |
1886 | ti->error = "Failed to allocate workqueue"; | |
1887 | r = -ENOMEM; | |
1888 | goto out_with_ht; | |
1889 | } | |
1890 | ||
1891 | INIT_WORK(&clone->worker, do_worker); | |
1892 | INIT_DELAYED_WORK(&clone->waker, do_waker); | |
1893 | ||
1894 | clone->kcopyd_client = dm_kcopyd_client_create(&dm_kcopyd_throttle); | |
1895 | if (IS_ERR(clone->kcopyd_client)) { | |
1896 | r = PTR_ERR(clone->kcopyd_client); | |
1897 | goto out_with_wq; | |
1898 | } | |
1899 | ||
1900 | r = mempool_init_slab_pool(&clone->hydration_pool, MIN_HYDRATIONS, | |
1901 | _hydration_cache); | |
1902 | if (r) { | |
1903 | ti->error = "Failed to create dm_clone_region_hydration memory pool"; | |
1904 | goto out_with_kcopyd; | |
1905 | } | |
1906 | ||
1907 | /* Save a copy of the table line */ | |
1908 | r = copy_ctr_args(clone, argc - 3, (const char **)argv + 3, &ti->error); | |
1909 | if (r) | |
1910 | goto out_with_mempool; | |
1911 | ||
1912 | mutex_init(&clone->commit_lock); | |
7431b783 NT |
1913 | |
1914 | /* Enable flushes */ | |
1915 | ti->num_flush_bios = 1; | |
1916 | ti->flush_supported = true; | |
1917 | ||
1918 | /* Enable discards */ | |
1919 | ti->discards_supported = true; | |
1920 | ti->num_discard_bios = 1; | |
1921 | ||
1922 | ti->private = clone; | |
1923 | ||
1924 | return 0; | |
1925 | ||
1926 | out_with_mempool: | |
1927 | mempool_exit(&clone->hydration_pool); | |
1928 | out_with_kcopyd: | |
1929 | dm_kcopyd_client_destroy(clone->kcopyd_client); | |
1930 | out_with_wq: | |
1931 | destroy_workqueue(clone->wq); | |
1932 | out_with_ht: | |
1933 | hash_table_exit(clone); | |
1934 | out_with_metadata: | |
1935 | dm_clone_metadata_close(clone->cmd); | |
1936 | out_with_source_dev: | |
1937 | dm_put_device(ti, clone->source_dev); | |
1938 | out_with_dest_dev: | |
1939 | dm_put_device(ti, clone->dest_dev); | |
1940 | out_with_meta_dev: | |
1941 | dm_put_device(ti, clone->metadata_dev); | |
1942 | out_with_clone: | |
1943 | kfree(clone); | |
1944 | ||
1945 | return r; | |
1946 | } | |
1947 | ||
1948 | static void clone_dtr(struct dm_target *ti) | |
1949 | { | |
1950 | unsigned int i; | |
1951 | struct clone *clone = ti->private; | |
1952 | ||
1953 | mutex_destroy(&clone->commit_lock); | |
1954 | ||
1955 | for (i = 0; i < clone->nr_ctr_args; i++) | |
1956 | kfree(clone->ctr_args[i]); | |
1957 | kfree(clone->ctr_args); | |
1958 | ||
1959 | mempool_exit(&clone->hydration_pool); | |
1960 | dm_kcopyd_client_destroy(clone->kcopyd_client); | |
e4b5957c | 1961 | cancel_delayed_work_sync(&clone->waker); |
7431b783 NT |
1962 | destroy_workqueue(clone->wq); |
1963 | hash_table_exit(clone); | |
1964 | dm_clone_metadata_close(clone->cmd); | |
1965 | dm_put_device(ti, clone->source_dev); | |
1966 | dm_put_device(ti, clone->dest_dev); | |
1967 | dm_put_device(ti, clone->metadata_dev); | |
1968 | ||
1969 | kfree(clone); | |
1970 | } | |
1971 | ||
1972 | /*---------------------------------------------------------------------------*/ | |
1973 | ||
1974 | static void clone_postsuspend(struct dm_target *ti) | |
1975 | { | |
1976 | struct clone *clone = ti->private; | |
1977 | ||
1978 | /* | |
1979 | * To successfully suspend the device: | |
1980 | * | |
1981 | * - We cancel the delayed work for periodic commits and wait for | |
1982 | * it to finish. | |
1983 | * | |
1984 | * - We stop the background hydration, i.e. we prevent new region | |
1985 | * hydrations from starting. | |
1986 | * | |
1987 | * - We wait for any in-flight hydrations to finish. | |
1988 | * | |
1989 | * - We flush the workqueue. | |
1990 | * | |
1991 | * - We commit the metadata. | |
1992 | */ | |
1993 | cancel_delayed_work_sync(&clone->waker); | |
1994 | ||
1995 | set_bit(DM_CLONE_HYDRATION_SUSPENDED, &clone->flags); | |
1996 | ||
1997 | /* | |
1998 | * Make sure set_bit() is ordered before atomic_read(), otherwise we | |
1999 | * might race with do_hydration() and miss some started region | |
2000 | * hydrations. | |
2001 | * | |
2002 | * This is paired with smp_mb__after_atomic() in do_hydration(). | |
2003 | */ | |
2004 | smp_mb__after_atomic(); | |
2005 | ||
2006 | wait_event(clone->hydration_stopped, !atomic_read(&clone->hydrations_in_flight)); | |
2007 | flush_workqueue(clone->wq); | |
2008 | ||
8b3fd1f5 | 2009 | (void) commit_metadata(clone, NULL); |
7431b783 NT |
2010 | } |
2011 | ||
2012 | static void clone_resume(struct dm_target *ti) | |
2013 | { | |
2014 | struct clone *clone = ti->private; | |
2015 | ||
2016 | clear_bit(DM_CLONE_HYDRATION_SUSPENDED, &clone->flags); | |
2017 | do_waker(&clone->waker.work); | |
2018 | } | |
2019 | ||
7431b783 NT |
2020 | /* |
2021 | * If discard_passdown was enabled verify that the destination device supports | |
2022 | * discards. Disable discard_passdown if not. | |
2023 | */ | |
2024 | static void disable_passdown_if_not_supported(struct clone *clone) | |
2025 | { | |
2026 | struct block_device *dest_dev = clone->dest_dev->bdev; | |
2027 | struct queue_limits *dest_limits = &bdev_get_queue(dest_dev)->limits; | |
2028 | const char *reason = NULL; | |
7431b783 NT |
2029 | |
2030 | if (!test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags)) | |
2031 | return; | |
2032 | ||
70200574 | 2033 | if (!bdev_max_discard_sectors(dest_dev)) |
7431b783 NT |
2034 | reason = "discard unsupported"; |
2035 | else if (dest_limits->max_discard_sectors < clone->region_size) | |
2036 | reason = "max discard sectors smaller than a region"; | |
2037 | ||
2038 | if (reason) { | |
5434ee8d | 2039 | DMWARN("Destination device (%pg) %s: Disabling discard passdown.", |
385411ff | 2040 | dest_dev, reason); |
7431b783 NT |
2041 | clear_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags); |
2042 | } | |
2043 | } | |
2044 | ||
2045 | static void set_discard_limits(struct clone *clone, struct queue_limits *limits) | |
2046 | { | |
2047 | struct block_device *dest_bdev = clone->dest_dev->bdev; | |
2048 | struct queue_limits *dest_limits = &bdev_get_queue(dest_bdev)->limits; | |
2049 | ||
2050 | if (!test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags)) { | |
2051 | /* No passdown is done so we set our own virtual limits */ | |
2052 | limits->discard_granularity = clone->region_size << SECTOR_SHIFT; | |
2053 | limits->max_discard_sectors = round_down(UINT_MAX >> SECTOR_SHIFT, clone->region_size); | |
2054 | return; | |
2055 | } | |
2056 | ||
2057 | /* | |
2058 | * clone_iterate_devices() is stacking both the source and destination | |
2059 | * device limits but discards aren't passed to the source device, so | |
2060 | * inherit destination's limits. | |
2061 | */ | |
2062 | limits->max_discard_sectors = dest_limits->max_discard_sectors; | |
2063 | limits->max_hw_discard_sectors = dest_limits->max_hw_discard_sectors; | |
2064 | limits->discard_granularity = dest_limits->discard_granularity; | |
2065 | limits->discard_alignment = dest_limits->discard_alignment; | |
2066 | limits->discard_misaligned = dest_limits->discard_misaligned; | |
2067 | limits->max_discard_segments = dest_limits->max_discard_segments; | |
2068 | } | |
2069 | ||
2070 | static void clone_io_hints(struct dm_target *ti, struct queue_limits *limits) | |
2071 | { | |
2072 | struct clone *clone = ti->private; | |
2073 | u64 io_opt_sectors = limits->io_opt >> SECTOR_SHIFT; | |
2074 | ||
2075 | /* | |
2076 | * If the system-determined stacked limits are compatible with | |
2077 | * dm-clone's region size (io_opt is a factor) do not override them. | |
2078 | */ | |
2079 | if (io_opt_sectors < clone->region_size || | |
2080 | do_div(io_opt_sectors, clone->region_size)) { | |
2081 | blk_limits_io_min(limits, clone->region_size << SECTOR_SHIFT); | |
2082 | blk_limits_io_opt(limits, clone->region_size << SECTOR_SHIFT); | |
2083 | } | |
2084 | ||
2085 | disable_passdown_if_not_supported(clone); | |
2086 | set_discard_limits(clone, limits); | |
2087 | } | |
2088 | ||
2089 | static int clone_iterate_devices(struct dm_target *ti, | |
2090 | iterate_devices_callout_fn fn, void *data) | |
2091 | { | |
2092 | int ret; | |
2093 | struct clone *clone = ti->private; | |
2094 | struct dm_dev *dest_dev = clone->dest_dev; | |
2095 | struct dm_dev *source_dev = clone->source_dev; | |
2096 | ||
2097 | ret = fn(ti, source_dev, 0, ti->len, data); | |
2098 | if (!ret) | |
2099 | ret = fn(ti, dest_dev, 0, ti->len, data); | |
2100 | return ret; | |
2101 | } | |
2102 | ||
2103 | /* | |
2104 | * dm-clone message functions. | |
2105 | */ | |
2106 | static void set_hydration_threshold(struct clone *clone, unsigned int nr_regions) | |
2107 | { | |
2108 | WRITE_ONCE(clone->hydration_threshold, nr_regions); | |
2109 | ||
2110 | /* | |
2111 | * If user space sets hydration_threshold to zero then the hydration | |
2112 | * will stop. If at a later time the hydration_threshold is increased | |
2113 | * we must restart the hydration process by waking up the worker. | |
2114 | */ | |
2115 | wake_worker(clone); | |
2116 | } | |
2117 | ||
2118 | static void set_hydration_batch_size(struct clone *clone, unsigned int nr_regions) | |
2119 | { | |
2120 | WRITE_ONCE(clone->hydration_batch_size, nr_regions); | |
2121 | } | |
2122 | ||
2123 | static void enable_hydration(struct clone *clone) | |
2124 | { | |
2125 | if (!test_and_set_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags)) | |
2126 | wake_worker(clone); | |
2127 | } | |
2128 | ||
2129 | static void disable_hydration(struct clone *clone) | |
2130 | { | |
2131 | clear_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags); | |
2132 | } | |
2133 | ||
2134 | static int clone_message(struct dm_target *ti, unsigned int argc, char **argv, | |
2135 | char *result, unsigned int maxlen) | |
2136 | { | |
2137 | struct clone *clone = ti->private; | |
2138 | unsigned int value; | |
2139 | ||
2140 | if (!argc) | |
2141 | return -EINVAL; | |
2142 | ||
2143 | if (!strcasecmp(argv[0], "enable_hydration")) { | |
2144 | enable_hydration(clone); | |
2145 | return 0; | |
2146 | } | |
2147 | ||
2148 | if (!strcasecmp(argv[0], "disable_hydration")) { | |
2149 | disable_hydration(clone); | |
2150 | return 0; | |
2151 | } | |
2152 | ||
2153 | if (argc != 2) | |
2154 | return -EINVAL; | |
2155 | ||
2156 | if (!strcasecmp(argv[0], "hydration_threshold")) { | |
2157 | if (kstrtouint(argv[1], 10, &value)) | |
2158 | return -EINVAL; | |
2159 | ||
2160 | set_hydration_threshold(clone, value); | |
2161 | ||
2162 | return 0; | |
2163 | } | |
2164 | ||
2165 | if (!strcasecmp(argv[0], "hydration_batch_size")) { | |
2166 | if (kstrtouint(argv[1], 10, &value)) | |
2167 | return -EINVAL; | |
2168 | ||
2169 | set_hydration_batch_size(clone, value); | |
2170 | ||
2171 | return 0; | |
2172 | } | |
2173 | ||
2174 | DMERR("%s: Unsupported message `%s'", clone_device_name(clone), argv[0]); | |
2175 | return -EINVAL; | |
2176 | } | |
2177 | ||
2178 | static struct target_type clone_target = { | |
2179 | .name = "clone", | |
2180 | .version = {1, 0, 0}, | |
2181 | .module = THIS_MODULE, | |
2182 | .ctr = clone_ctr, | |
2183 | .dtr = clone_dtr, | |
2184 | .map = clone_map, | |
2185 | .end_io = clone_endio, | |
2186 | .postsuspend = clone_postsuspend, | |
2187 | .resume = clone_resume, | |
2188 | .status = clone_status, | |
2189 | .message = clone_message, | |
2190 | .io_hints = clone_io_hints, | |
2191 | .iterate_devices = clone_iterate_devices, | |
2192 | }; | |
2193 | ||
2194 | /*---------------------------------------------------------------------------*/ | |
2195 | ||
2196 | /* Module functions */ | |
2197 | static int __init dm_clone_init(void) | |
2198 | { | |
2199 | int r; | |
2200 | ||
2201 | _hydration_cache = KMEM_CACHE(dm_clone_region_hydration, 0); | |
2202 | if (!_hydration_cache) | |
2203 | return -ENOMEM; | |
2204 | ||
2205 | r = dm_register_target(&clone_target); | |
2206 | if (r < 0) { | |
2207 | DMERR("Failed to register clone target"); | |
2208 | return r; | |
2209 | } | |
2210 | ||
2211 | return 0; | |
2212 | } | |
2213 | ||
2214 | static void __exit dm_clone_exit(void) | |
2215 | { | |
2216 | dm_unregister_target(&clone_target); | |
2217 | ||
2218 | kmem_cache_destroy(_hydration_cache); | |
2219 | _hydration_cache = NULL; | |
2220 | } | |
2221 | ||
2222 | /* Module hooks */ | |
2223 | module_init(dm_clone_init); | |
2224 | module_exit(dm_clone_exit); | |
2225 | ||
2226 | MODULE_DESCRIPTION(DM_NAME " clone target"); | |
2227 | MODULE_AUTHOR("Nikos Tsironis <ntsironis@arrikto.com>"); | |
2228 | MODULE_LICENSE("GPL"); |