Commit | Line | Data |
---|---|---|
7431b783 NT |
1 | // SPDX-License-Identifier: GPL-2.0-only |
2 | /* | |
3 | * Copyright (C) 2019 Arrikto, Inc. All Rights Reserved. | |
4 | */ | |
5 | ||
6 | #include <linux/mm.h> | |
7 | #include <linux/bio.h> | |
8 | #include <linux/err.h> | |
9 | #include <linux/hash.h> | |
10 | #include <linux/list.h> | |
11 | #include <linux/log2.h> | |
12 | #include <linux/init.h> | |
13 | #include <linux/slab.h> | |
14 | #include <linux/wait.h> | |
15 | #include <linux/dm-io.h> | |
16 | #include <linux/mutex.h> | |
17 | #include <linux/atomic.h> | |
18 | #include <linux/bitops.h> | |
19 | #include <linux/blkdev.h> | |
20 | #include <linux/kdev_t.h> | |
21 | #include <linux/kernel.h> | |
22 | #include <linux/module.h> | |
23 | #include <linux/jiffies.h> | |
24 | #include <linux/mempool.h> | |
25 | #include <linux/spinlock.h> | |
26 | #include <linux/blk_types.h> | |
27 | #include <linux/dm-kcopyd.h> | |
28 | #include <linux/workqueue.h> | |
29 | #include <linux/backing-dev.h> | |
30 | #include <linux/device-mapper.h> | |
31 | ||
32 | #include "dm.h" | |
33 | #include "dm-clone-metadata.h" | |
34 | ||
35 | #define DM_MSG_PREFIX "clone" | |
36 | ||
37 | /* | |
38 | * Minimum and maximum allowed region sizes | |
39 | */ | |
40 | #define MIN_REGION_SIZE (1 << 3) /* 4KB */ | |
41 | #define MAX_REGION_SIZE (1 << 21) /* 1GB */ | |
42 | ||
43 | #define MIN_HYDRATIONS 256 /* Size of hydration mempool */ | |
44 | #define DEFAULT_HYDRATION_THRESHOLD 1 /* 1 region */ | |
45 | #define DEFAULT_HYDRATION_BATCH_SIZE 1 /* Hydrate in batches of 1 region */ | |
46 | ||
47 | #define COMMIT_PERIOD HZ /* 1 sec */ | |
48 | ||
49 | /* | |
50 | * Hydration hash table size: 1 << HASH_TABLE_BITS | |
51 | */ | |
52 | #define HASH_TABLE_BITS 15 | |
53 | ||
54 | DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(clone_hydration_throttle, | |
55 | "A percentage of time allocated for hydrating regions"); | |
56 | ||
57 | /* Slab cache for struct dm_clone_region_hydration */ | |
58 | static struct kmem_cache *_hydration_cache; | |
59 | ||
60 | /* dm-clone metadata modes */ | |
61 | enum clone_metadata_mode { | |
62 | CM_WRITE, /* metadata may be changed */ | |
63 | CM_READ_ONLY, /* metadata may not be changed */ | |
64 | CM_FAIL, /* all metadata I/O fails */ | |
65 | }; | |
66 | ||
67 | struct hash_table_bucket; | |
68 | ||
69 | struct clone { | |
70 | struct dm_target *ti; | |
71 | struct dm_target_callbacks callbacks; | |
72 | ||
73 | struct dm_dev *metadata_dev; | |
74 | struct dm_dev *dest_dev; | |
75 | struct dm_dev *source_dev; | |
76 | ||
77 | unsigned long nr_regions; | |
78 | sector_t region_size; | |
79 | unsigned int region_shift; | |
80 | ||
81 | /* | |
82 | * A metadata commit and the actions taken in case it fails should run | |
83 | * as a single atomic step. | |
84 | */ | |
85 | struct mutex commit_lock; | |
86 | ||
87 | struct dm_clone_metadata *cmd; | |
88 | ||
89 | /* Region hydration hash table */ | |
90 | struct hash_table_bucket *ht; | |
91 | ||
92 | atomic_t ios_in_flight; | |
93 | ||
94 | wait_queue_head_t hydration_stopped; | |
95 | ||
96 | mempool_t hydration_pool; | |
97 | ||
98 | unsigned long last_commit_jiffies; | |
99 | ||
100 | /* | |
101 | * We defer incoming WRITE bios for regions that are not hydrated, | |
102 | * until after these regions have been hydrated. | |
103 | * | |
104 | * Also, we defer REQ_FUA and REQ_PREFLUSH bios, until after the | |
105 | * metadata have been committed. | |
106 | */ | |
107 | spinlock_t lock; | |
108 | struct bio_list deferred_bios; | |
109 | struct bio_list deferred_discard_bios; | |
110 | struct bio_list deferred_flush_bios; | |
111 | struct bio_list deferred_flush_completions; | |
112 | ||
113 | /* Maximum number of regions being copied during background hydration. */ | |
114 | unsigned int hydration_threshold; | |
115 | ||
116 | /* Number of regions to batch together during background hydration. */ | |
117 | unsigned int hydration_batch_size; | |
118 | ||
119 | /* Which region to hydrate next */ | |
120 | unsigned long hydration_offset; | |
121 | ||
122 | atomic_t hydrations_in_flight; | |
123 | ||
124 | /* | |
125 | * Save a copy of the table line rather than reconstructing it for the | |
126 | * status. | |
127 | */ | |
128 | unsigned int nr_ctr_args; | |
129 | const char **ctr_args; | |
130 | ||
131 | struct workqueue_struct *wq; | |
132 | struct work_struct worker; | |
133 | struct delayed_work waker; | |
134 | ||
135 | struct dm_kcopyd_client *kcopyd_client; | |
136 | ||
137 | enum clone_metadata_mode mode; | |
138 | unsigned long flags; | |
139 | }; | |
140 | ||
141 | /* | |
142 | * dm-clone flags | |
143 | */ | |
144 | #define DM_CLONE_DISCARD_PASSDOWN 0 | |
145 | #define DM_CLONE_HYDRATION_ENABLED 1 | |
146 | #define DM_CLONE_HYDRATION_SUSPENDED 2 | |
147 | ||
148 | /*---------------------------------------------------------------------------*/ | |
149 | ||
150 | /* | |
151 | * Metadata failure handling. | |
152 | */ | |
static enum clone_metadata_mode get_clone_mode(struct clone *clone)
{
	/* Lockless read; pairs with the WRITE_ONCE() in __set_clone_mode() */
	return READ_ONCE(clone->mode);
}
157 | ||
/* Device-mapper name of this target's table, used for log messages. */
static const char *clone_device_name(struct clone *clone)
{
	return dm_table_device_name(clone->ti->table);
}
162 | ||
/*
 * Transition the target to a new metadata mode.
 *
 * CM_FAIL is sticky: once entered it is never left. Entering CM_READ_ONLY
 * or CM_FAIL also flips the metadata object to read-only. An actual mode
 * change raises a dm table event and is logged.
 */
static void __set_clone_mode(struct clone *clone, enum clone_metadata_mode new_mode)
{
	/* Indexed by enum clone_metadata_mode */
	const char *descs[] = {
		"read-write",
		"read-only",
		"fail"
	};

	enum clone_metadata_mode old_mode = get_clone_mode(clone);

	/* Never move out of fail mode */
	if (old_mode == CM_FAIL)
		new_mode = CM_FAIL;

	switch (new_mode) {
	case CM_FAIL:
	case CM_READ_ONLY:
		dm_clone_metadata_set_read_only(clone->cmd);
		break;

	case CM_WRITE:
		dm_clone_metadata_set_read_write(clone->cmd);
		break;
	}

	/* Pairs with READ_ONCE() in get_clone_mode() */
	WRITE_ONCE(clone->mode, new_mode);

	if (new_mode != old_mode) {
		dm_table_event(clone->ti->table);
		DMINFO("%s: Switching to %s mode", clone_device_name(clone),
		       descs[(int)new_mode]);
	}
}
196 | ||
/*
 * Abort the current metadata transaction.
 *
 * No-op in read-only or fail mode. If the abort itself fails there is no
 * recovery left, so enter CM_FAIL.
 */
static void __abort_transaction(struct clone *clone)
{
	const char *dev_name = clone_device_name(clone);

	if (get_clone_mode(clone) >= CM_READ_ONLY)
		return;

	DMERR("%s: Aborting current metadata transaction", dev_name);
	if (dm_clone_metadata_abort(clone->cmd)) {
		DMERR("%s: Failed to abort metadata transaction", dev_name);
		__set_clone_mode(clone, CM_FAIL);
	}
}
210 | ||
/*
 * Re-read the on-disk hydration bitmap into the in-core bitset, e.g. after
 * an aborted transaction. On failure enter CM_FAIL. No-op in fail mode.
 */
static void __reload_in_core_bitset(struct clone *clone)
{
	const char *dev_name = clone_device_name(clone);

	if (get_clone_mode(clone) == CM_FAIL)
		return;

	/* Reload the on-disk bitset */
	DMINFO("%s: Reloading on-disk bitmap", dev_name);
	if (dm_clone_reload_in_core_bitset(clone->cmd)) {
		DMERR("%s: Failed to reload on-disk bitmap", dev_name);
		__set_clone_mode(clone, CM_FAIL);
	}
}
225 | ||
/*
 * Handle a failed metadata operation @op (error @r): log it, abort the open
 * transaction, degrade to read-only mode and resync the in-core bitset with
 * what is on disk.
 */
static void __metadata_operation_failed(struct clone *clone, const char *op, int r)
{
	DMERR("%s: Metadata operation `%s' failed: error = %d",
	      clone_device_name(clone), op, r);

	__abort_transaction(clone);
	__set_clone_mode(clone, CM_READ_ONLY);

	/*
	 * dm_clone_reload_in_core_bitset() may run concurrently with either
	 * dm_clone_set_region_hydrated() or dm_clone_cond_set_range(), but
	 * it's safe as we have already set the metadata to read-only mode.
	 */
	__reload_in_core_bitset(clone);
}
241 | ||
242 | /*---------------------------------------------------------------------------*/ | |
243 | ||
244 | /* Wake up anyone waiting for region hydrations to stop */ | |
static inline void wakeup_hydration_waiters(struct clone *clone)
{
	/* Pairs with waiters sleeping on clone->hydration_stopped */
	wake_up_all(&clone->hydration_stopped);
}
249 | ||
/* Kick the deferred-bio worker on the clone's private workqueue. */
static inline void wake_worker(struct clone *clone)
{
	queue_work(clone->wq, &clone->worker);
}
254 | ||
255 | /*---------------------------------------------------------------------------*/ | |
256 | ||
257 | /* | |
258 | * bio helper functions. | |
259 | */ | |
/* Redirect @bio to the source device. */
static inline void remap_to_source(struct clone *clone, struct bio *bio)
{
	bio_set_dev(bio, clone->source_dev->bdev);
}
264 | ||
/* Redirect @bio to the destination (clone) device. */
static inline void remap_to_dest(struct clone *clone, struct bio *bio)
{
	bio_set_dev(bio, clone->dest_dev->bdev);
}
269 | ||
/*
 * A bio requires a metadata commit first iff it carries a flush semantic
 * (REQ_PREFLUSH/REQ_FUA) and the current transaction has uncommitted
 * changes.
 */
static bool bio_triggers_commit(struct clone *clone, struct bio *bio)
{
	return op_is_flush(bio->bi_opf) &&
		dm_clone_changed_this_transaction(clone->cmd);
}
275 | ||
276 | /* Get the address of the region in sectors */ | |
277 | static inline sector_t region_to_sector(struct clone *clone, unsigned long region_nr) | |
278 | { | |
279 | return (region_nr << clone->region_shift); | |
280 | } | |
281 | ||
282 | /* Get the region number of the bio */ | |
283 | static inline unsigned long bio_to_region(struct clone *clone, struct bio *bio) | |
284 | { | |
285 | return (bio->bi_iter.bi_sector >> clone->region_shift); | |
286 | } | |
287 | ||
/*
 * Get the region range covered by the bio.
 *
 * *rs is rounded up and *re rounded down, so only regions fully covered by
 * the bio are included. NOTE(review): a bio lying entirely inside a single,
 * unaligned region produces *rs > *re — callers must treat rs >= re as an
 * empty range before computing (re - rs), which would otherwise underflow.
 */
static void bio_region_range(struct clone *clone, struct bio *bio,
			     unsigned long *rs, unsigned long *re)
{
	*rs = dm_sector_div_up(bio->bi_iter.bi_sector, clone->region_size);
	*re = bio_end_sector(bio) >> clone->region_shift;
}
295 | ||
/* Check whether a bio overwrites a whole region (WRITE spanning exactly region_size sectors) */
static inline bool is_overwrite_bio(struct clone *clone, struct bio *bio)
{
	return (bio_data_dir(bio) == WRITE && bio_sectors(bio) == clone->region_size);
}
301 | ||
302 | static void fail_bios(struct bio_list *bios, blk_status_t status) | |
303 | { | |
304 | struct bio *bio; | |
305 | ||
306 | while ((bio = bio_list_pop(bios))) { | |
307 | bio->bi_status = status; | |
308 | bio_endio(bio); | |
309 | } | |
310 | } | |
311 | ||
312 | static void submit_bios(struct bio_list *bios) | |
313 | { | |
314 | struct bio *bio; | |
315 | struct blk_plug plug; | |
316 | ||
317 | blk_start_plug(&plug); | |
318 | ||
319 | while ((bio = bio_list_pop(bios))) | |
320 | generic_make_request(bio); | |
321 | ||
322 | blk_finish_plug(&plug); | |
323 | } | |
324 | ||
325 | /* | |
326 | * Submit bio to the underlying device. | |
327 | * | |
328 | * If the bio triggers a commit, delay it, until after the metadata have been | |
329 | * committed. | |
330 | * | |
331 | * NOTE: The bio remapping must be performed by the caller. | |
332 | */ | |
static void issue_bio(struct clone *clone, struct bio *bio)
{
	/* Fast path: no commit required, pass the bio straight down */
	if (!bio_triggers_commit(clone, bio)) {
		generic_make_request(bio);
		return;
	}

	/*
	 * If the metadata mode is RO or FAIL we won't be able to commit the
	 * metadata, so we complete the bio with an error.
	 */
	if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) {
		bio_io_error(bio);
		return;
	}

	/*
	 * Batch together any bios that trigger commits and then issue a single
	 * commit for them in process_deferred_flush_bios().
	 */
	spin_lock_irq(&clone->lock);
	bio_list_add(&clone->deferred_flush_bios, bio);
	spin_unlock_irq(&clone->lock);

	wake_worker(clone);
}
359 | ||
360 | /* | |
361 | * Remap bio to the destination device and submit it. | |
362 | * | |
363 | * If the bio triggers a commit, delay it, until after the metadata have been | |
364 | * committed. | |
365 | */ | |
static void remap_and_issue(struct clone *clone, struct bio *bio)
{
	/* Point the bio at the destination device, then submit (or defer) it */
	remap_to_dest(clone, bio);
	issue_bio(clone, bio);
}
371 | ||
372 | /* | |
373 | * Issue bios that have been deferred until after their region has finished | |
374 | * hydrating. | |
375 | * | |
376 | * We delegate the bio submission to the worker thread, so this is safe to call | |
377 | * from interrupt context. | |
378 | */ | |
static void issue_deferred_bios(struct clone *clone, struct bio_list *bios)
{
	struct bio *bio;
	unsigned long flags;
	struct bio_list flush_bios = BIO_EMPTY_LIST;
	struct bio_list normal_bios = BIO_EMPTY_LIST;

	if (bio_list_empty(bios))
		return;

	/* Split into bios that need a metadata commit first and those that don't */
	while ((bio = bio_list_pop(bios))) {
		if (bio_triggers_commit(clone, bio))
			bio_list_add(&flush_bios, bio);
		else
			bio_list_add(&normal_bios, bio);
	}

	/* irqsave variant: this function may be called from interrupt context */
	spin_lock_irqsave(&clone->lock, flags);
	bio_list_merge(&clone->deferred_bios, &normal_bios);
	bio_list_merge(&clone->deferred_flush_bios, &flush_bios);
	spin_unlock_irqrestore(&clone->lock, flags);

	wake_worker(clone);
}
403 | ||
/*
 * Complete an overwrite bio whose region hydration just finished.
 * Called from hydration_complete(), which may run from bio-completion
 * (interrupt) context — hence the irqsave locking below.
 */
static void complete_overwrite_bio(struct clone *clone, struct bio *bio)
{
	unsigned long flags;

	/*
	 * If the bio has the REQ_FUA flag set we must commit the metadata
	 * before signaling its completion.
	 *
	 * complete_overwrite_bio() is only called by hydration_complete(),
	 * after having successfully updated the metadata. This means we don't
	 * need to call dm_clone_changed_this_transaction() to check if the
	 * metadata has changed and thus we can avoid taking the metadata spin
	 * lock.
	 */
	if (!(bio->bi_opf & REQ_FUA)) {
		bio_endio(bio);
		return;
	}

	/*
	 * If the metadata mode is RO or FAIL we won't be able to commit the
	 * metadata, so we complete the bio with an error.
	 */
	if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) {
		bio_io_error(bio);
		return;
	}

	/*
	 * Batch together any bios that trigger commits and then issue a single
	 * commit for them in process_deferred_flush_bios().
	 */
	spin_lock_irqsave(&clone->lock, flags);
	bio_list_add(&clone->deferred_flush_completions, bio);
	spin_unlock_irqrestore(&clone->lock, flags);

	wake_worker(clone);
}
442 | ||
/* Shrink @bio so it starts at @sector and spans @len sectors */
static void trim_bio(struct bio *bio, sector_t sector, unsigned int len)
{
	bio->bi_iter.bi_sector = sector;
	bio->bi_iter.bi_size = to_bytes(len);
}
448 | ||
449 | static void complete_discard_bio(struct clone *clone, struct bio *bio, bool success) | |
450 | { | |
451 | unsigned long rs, re; | |
452 | ||
453 | /* | |
454 | * If the destination device supports discards, remap and trim the | |
455 | * discard bio and pass it down. Otherwise complete the bio | |
456 | * immediately. | |
457 | */ | |
458 | if (test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags) && success) { | |
459 | remap_to_dest(clone, bio); | |
460 | bio_region_range(clone, bio, &rs, &re); | |
461 | trim_bio(bio, rs << clone->region_shift, | |
462 | (re - rs) << clone->region_shift); | |
463 | generic_make_request(bio); | |
464 | } else | |
465 | bio_endio(bio); | |
466 | } | |
467 | ||
468 | static void process_discard_bio(struct clone *clone, struct bio *bio) | |
469 | { | |
6ca43ed8 | 470 | unsigned long rs, re; |
7431b783 NT |
471 | |
472 | bio_region_range(clone, bio, &rs, &re); | |
473 | BUG_ON(re > clone->nr_regions); | |
474 | ||
475 | if (unlikely(rs == re)) { | |
476 | bio_endio(bio); | |
477 | return; | |
478 | } | |
479 | ||
480 | /* | |
481 | * The covered regions are already hydrated so we just need to pass | |
482 | * down the discard. | |
483 | */ | |
484 | if (dm_clone_is_range_hydrated(clone->cmd, rs, re - rs)) { | |
485 | complete_discard_bio(clone, bio, true); | |
486 | return; | |
487 | } | |
488 | ||
489 | /* | |
490 | * If the metadata mode is RO or FAIL we won't be able to update the | |
491 | * metadata for the regions covered by the discard so we just ignore | |
492 | * it. | |
493 | */ | |
494 | if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) { | |
495 | bio_endio(bio); | |
496 | return; | |
497 | } | |
498 | ||
499 | /* | |
500 | * Defer discard processing. | |
501 | */ | |
6ca43ed8 | 502 | spin_lock_irq(&clone->lock); |
7431b783 | 503 | bio_list_add(&clone->deferred_discard_bios, bio); |
6ca43ed8 | 504 | spin_unlock_irq(&clone->lock); |
7431b783 NT |
505 | |
506 | wake_worker(clone); | |
507 | } | |
508 | ||
509 | /*---------------------------------------------------------------------------*/ | |
510 | ||
511 | /* | |
512 | * dm-clone region hydrations. | |
513 | */ | |
/*
 * Tracks the hydration (copy from source to destination) of one region.
 * Lives in the hydration hash table for the duration of the operation.
 */
struct dm_clone_region_hydration {
	struct clone *clone;
	unsigned long region_nr;

	/* A WRITE covering the whole region, used instead of a kcopyd copy */
	struct bio *overwrite_bio;
	/* Original bi_end_io of overwrite_bio, restored by overwrite_endio() */
	bio_end_io_t *overwrite_bio_end_io;

	/* Bios waiting for this region's hydration to finish */
	struct bio_list deferred_bios;

	blk_status_t status;

	/* Used by hydration batching */
	struct list_head list;

	/* Used by hydration hash table */
	struct hlist_node h;
};
531 | ||
532 | /* | |
533 | * Hydration hash table implementation. | |
534 | * | |
535 | * Ideally we would like to use list_bl, which uses bit spin locks and employs | |
536 | * the least significant bit of the list head to lock the corresponding bucket, | |
537 | * reducing the memory overhead for the locks. But, currently, list_bl and bit | |
538 | * spin locks don't support IRQ safe versions. Since we have to take the lock | |
539 | * in both process and interrupt context, we must fall back to using regular | |
540 | * spin locks; one per hash table bucket. | |
541 | */ | |
struct hash_table_bucket {
	struct hlist_head head;

	/* Spinlock protecting the bucket */
	spinlock_t lock;
};

/* IRQ-safe bucket locking; see the list_bl discussion above */
#define bucket_lock_irqsave(bucket, flags) \
	spin_lock_irqsave(&(bucket)->lock, flags)

#define bucket_unlock_irqrestore(bucket, flags) \
	spin_unlock_irqrestore(&(bucket)->lock, flags)
554 | ||
555 | static int hash_table_init(struct clone *clone) | |
556 | { | |
557 | unsigned int i, sz; | |
558 | struct hash_table_bucket *bucket; | |
559 | ||
560 | sz = 1 << HASH_TABLE_BITS; | |
561 | ||
562 | clone->ht = kvmalloc(sz * sizeof(struct hash_table_bucket), GFP_KERNEL); | |
563 | if (!clone->ht) | |
564 | return -ENOMEM; | |
565 | ||
566 | for (i = 0; i < sz; i++) { | |
567 | bucket = clone->ht + i; | |
568 | ||
569 | INIT_HLIST_HEAD(&bucket->head); | |
570 | spin_lock_init(&bucket->lock); | |
571 | } | |
572 | ||
573 | return 0; | |
574 | } | |
575 | ||
static void hash_table_exit(struct clone *clone)
{
	/* kvfree() matches the kvmalloc() in hash_table_init() */
	kvfree(clone->ht);
}
580 | ||
/* Map a region number to its hydration hash table bucket */
static struct hash_table_bucket *get_hash_table_bucket(struct clone *clone,
						       unsigned long region_nr)
{
	return &clone->ht[hash_long(region_nr, HASH_TABLE_BITS)];
}
586 | ||
587 | /* | |
588 | * Search hash table for a hydration with hd->region_nr == region_nr | |
589 | * | |
590 | * NOTE: Must be called with the bucket lock held | |
591 | */ | |
0a005856 Y |
static struct dm_clone_region_hydration *__hash_find(struct hash_table_bucket *bucket,
						     unsigned long region_nr)
{
	struct dm_clone_region_hydration *hd;

	/* Linear scan of the bucket chain; returns NULL when not found */
	hlist_for_each_entry(hd, &bucket->head, h) {
		if (hd->region_nr == region_nr)
			return hd;
	}

	return NULL;
}
604 | ||
605 | /* | |
606 | * Insert a hydration into the hash table. | |
607 | * | |
608 | * NOTE: Must be called with the bucket lock held. | |
609 | */ | |
static inline void __insert_region_hydration(struct hash_table_bucket *bucket,
					     struct dm_clone_region_hydration *hd)
{
	/* No duplicate check here; use __find_or_insert_region_hydration() for that */
	hlist_add_head(&hd->h, &bucket->head);
}
615 | ||
616 | /* | |
617 | * This function inserts a hydration into the hash table, unless someone else | |
618 | * managed to insert a hydration for the same region first. In the latter case | |
619 | * it returns the existing hydration descriptor for this region. | |
620 | * | |
621 | * NOTE: Must be called with the hydration hash table lock held. | |
622 | */ | |
623 | static struct dm_clone_region_hydration * | |
624 | __find_or_insert_region_hydration(struct hash_table_bucket *bucket, | |
625 | struct dm_clone_region_hydration *hd) | |
626 | { | |
627 | struct dm_clone_region_hydration *hd2; | |
628 | ||
629 | hd2 = __hash_find(bucket, hd->region_nr); | |
630 | if (hd2) | |
631 | return hd2; | |
632 | ||
633 | __insert_region_hydration(bucket, hd); | |
634 | ||
635 | return hd; | |
636 | } | |
637 | ||
638 | /*---------------------------------------------------------------------------*/ | |
639 | ||
640 | /* Allocate a hydration */ | |
static struct dm_clone_region_hydration *alloc_hydration(struct clone *clone)
{
	struct dm_clone_region_hydration *hd;

	/*
	 * Allocate a hydration from the hydration mempool.
	 * This might block but it can't fail.
	 */
	hd = mempool_alloc(&clone->hydration_pool, GFP_NOIO);
	/* Back-pointer required by free_hydration() */
	hd->clone = clone;

	return hd;
}
654 | ||
static inline void free_hydration(struct dm_clone_region_hydration *hd)
{
	/* Return to the owning clone's mempool (hd->clone is set in alloc_hydration()) */
	mempool_free(hd, &hd->clone->hydration_pool);
}
659 | ||
/* Initialize a hydration for @region_nr (hd->clone is already set by alloc_hydration()) */
static void hydration_init(struct dm_clone_region_hydration *hd, unsigned long region_nr)
{
	hd->region_nr = region_nr;
	hd->overwrite_bio = NULL;
	bio_list_init(&hd->deferred_bios);
	hd->status = 0;

	INIT_LIST_HEAD(&hd->list);
	INIT_HLIST_NODE(&hd->h);
}
671 | ||
672 | /*---------------------------------------------------------------------------*/ | |
673 | ||
674 | /* | |
675 | * Update dm-clone's metadata after a region has finished hydrating and remove | |
676 | * hydration from the hash table. | |
677 | */ | |
static int hydration_update_metadata(struct dm_clone_region_hydration *hd)
{
	int r = 0;
	unsigned long flags;
	struct hash_table_bucket *bucket;
	struct clone *clone = hd->clone;

	/* In RO/FAIL mode the metadata cannot be changed */
	if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY))
		r = -EPERM;

	/* Update the metadata */
	if (likely(!r) && hd->status == BLK_STS_OK)
		r = dm_clone_set_region_hydrated(clone->cmd, hd->region_nr);

	bucket = get_hash_table_bucket(clone, hd->region_nr);

	/* Remove hydration from hash table */
	bucket_lock_irqsave(bucket, flags);
	hlist_del(&hd->h);
	bucket_unlock_irqrestore(bucket, flags);

	return r;
}
701 | ||
702 | /* | |
703 | * Complete a region's hydration: | |
704 | * | |
705 | * 1. Update dm-clone's metadata. | |
706 | * 2. Remove hydration from hash table. | |
707 | * 3. Complete overwrite bio. | |
708 | * 4. Issue deferred bios. | |
709 | * 5. If this was the last hydration, wake up anyone waiting for | |
710 | * hydrations to finish. | |
711 | */ | |
static void hydration_complete(struct dm_clone_region_hydration *hd)
{
	int r;
	blk_status_t status;
	struct clone *clone = hd->clone;

	r = hydration_update_metadata(hd);

	if (hd->status == BLK_STS_OK && likely(!r)) {
		if (hd->overwrite_bio)
			complete_overwrite_bio(clone, hd->overwrite_bio);

		issue_deferred_bios(clone, &hd->deferred_bios);
	} else {
		/* A metadata update failure trumps a successful copy */
		status = r ? BLK_STS_IOERR : hd->status;

		/* Fail the overwrite bio together with the deferred ones */
		if (hd->overwrite_bio)
			bio_list_add(&hd->deferred_bios, hd->overwrite_bio);

		fail_bios(&hd->deferred_bios, status);
	}

	free_hydration(hd);

	if (atomic_dec_and_test(&clone->hydrations_in_flight))
		wakeup_hydration_waiters(clone);
}
739 | ||
/* kcopyd completion callback: finish the lead hydration and any batched ones */
static void hydration_kcopyd_callback(int read_err, unsigned long write_err, void *context)
{
	blk_status_t status;

	struct dm_clone_region_hydration *tmp, *hd = context;
	struct clone *clone = hd->clone;

	LIST_HEAD(batched_hydrations);

	if (read_err || write_err) {
		DMERR_LIMIT("%s: hydration failed", clone_device_name(clone));
		status = BLK_STS_IOERR;
	} else {
		status = BLK_STS_OK;
	}
	/* Detach the batch before hydration_complete() frees the head descriptor */
	list_splice_tail(&hd->list, &batched_hydrations);

	hd->status = status;
	hydration_complete(hd);

	/* Complete batched hydrations */
	list_for_each_entry_safe(hd, tmp, &batched_hydrations, list) {
		hd->status = status;
		hydration_complete(hd);
	}

	/* Continue background hydration, if there is no I/O in-flight */
	if (test_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags) &&
	    !atomic_read(&clone->ios_in_flight))
		wake_worker(clone);
}
771 | ||
772 | static void hydration_copy(struct dm_clone_region_hydration *hd, unsigned int nr_regions) | |
773 | { | |
774 | unsigned long region_start, region_end; | |
775 | sector_t tail_size, region_size, total_size; | |
776 | struct dm_io_region from, to; | |
777 | struct clone *clone = hd->clone; | |
778 | ||
779 | region_size = clone->region_size; | |
780 | region_start = hd->region_nr; | |
781 | region_end = region_start + nr_regions - 1; | |
782 | ||
783 | total_size = (nr_regions - 1) << clone->region_shift; | |
784 | ||
785 | if (region_end == clone->nr_regions - 1) { | |
786 | /* | |
787 | * The last region of the target might be smaller than | |
788 | * region_size. | |
789 | */ | |
790 | tail_size = clone->ti->len & (region_size - 1); | |
791 | if (!tail_size) | |
792 | tail_size = region_size; | |
793 | } else { | |
794 | tail_size = region_size; | |
795 | } | |
796 | ||
797 | total_size += tail_size; | |
798 | ||
799 | from.bdev = clone->source_dev->bdev; | |
800 | from.sector = region_to_sector(clone, region_start); | |
801 | from.count = total_size; | |
802 | ||
803 | to.bdev = clone->dest_dev->bdev; | |
804 | to.sector = from.sector; | |
805 | to.count = from.count; | |
806 | ||
807 | /* Issue copy */ | |
808 | atomic_add(nr_regions, &clone->hydrations_in_flight); | |
809 | dm_kcopyd_copy(clone->kcopyd_client, &from, 1, &to, 0, | |
810 | hydration_kcopyd_callback, hd); | |
811 | } | |
812 | ||
/* Endio of a region-overwriting bio: restore its endio and finish the hydration */
static void overwrite_endio(struct bio *bio)
{
	struct dm_clone_region_hydration *hd = bio->bi_private;

	/* Restore the original bi_end_io saved in hydration_overwrite() */
	bio->bi_end_io = hd->overwrite_bio_end_io;
	hd->status = bio->bi_status;

	hydration_complete(hd);
}
822 | ||
/*
 * Hydrate a region by letting a full-region WRITE carry the data instead of
 * copying with kcopyd. The bio's endio is hijacked so completion flows
 * through overwrite_endio() -> hydration_complete().
 */
static void hydration_overwrite(struct dm_clone_region_hydration *hd, struct bio *bio)
{
	/*
	 * We don't need to save and restore bio->bi_private because device
	 * mapper core generates a new bio for us to use, with clean
	 * bi_private.
	 */
	hd->overwrite_bio = bio;
	hd->overwrite_bio_end_io = bio->bi_end_io;

	bio->bi_end_io = overwrite_endio;
	bio->bi_private = hd;

	atomic_inc(&hd->clone->hydrations_in_flight);
	generic_make_request(bio);
}
839 | ||
840 | /* | |
841 | * Hydrate bio's region. | |
842 | * | |
843 | * This function starts the hydration of the bio's region and puts the bio in | |
844 | * the list of deferred bios for this region. In case, by the time this | |
845 | * function is called, the region has finished hydrating it's submitted to the | |
846 | * destination device. | |
847 | * | |
848 | * NOTE: The bio remapping must be performed by the caller. | |
849 | */ | |
static void hydrate_bio_region(struct clone *clone, struct bio *bio)
{
	unsigned long flags;
	unsigned long region_nr;
	struct hash_table_bucket *bucket;
	struct dm_clone_region_hydration *hd, *hd2;

	region_nr = bio_to_region(clone, bio);
	bucket = get_hash_table_bucket(clone, region_nr);

	bucket_lock_irqsave(bucket, flags);

	hd = __hash_find(bucket, region_nr);
	if (hd) {
		/* Someone else is hydrating the region */
		bio_list_add(&hd->deferred_bios, bio);
		bucket_unlock_irqrestore(bucket, flags);
		return;
	}

	if (dm_clone_is_region_hydrated(clone->cmd, region_nr)) {
		/* The region has been hydrated */
		bucket_unlock_irqrestore(bucket, flags);
		issue_bio(clone, bio);
		return;
	}

	/*
	 * We must allocate a hydration descriptor and start the hydration of
	 * the corresponding region.
	 *
	 * Drop the bucket lock first: alloc_hydration() might block on the
	 * mempool, and we cannot sleep holding a spinlock.
	 */
	bucket_unlock_irqrestore(bucket, flags);

	hd = alloc_hydration(clone);
	hydration_init(hd, region_nr);

	bucket_lock_irqsave(bucket, flags);

	/* Check if the region has been hydrated in the meantime. */
	if (dm_clone_is_region_hydrated(clone->cmd, region_nr)) {
		bucket_unlock_irqrestore(bucket, flags);
		free_hydration(hd);
		issue_bio(clone, bio);
		return;
	}

	hd2 = __find_or_insert_region_hydration(bucket, hd);
	if (hd2 != hd) {
		/* Someone else started the region's hydration. */
		bio_list_add(&hd2->deferred_bios, bio);
		bucket_unlock_irqrestore(bucket, flags);
		free_hydration(hd);
		return;
	}

	/*
	 * If the metadata mode is RO or FAIL then there is no point starting a
	 * hydration, since we will not be able to update the metadata when the
	 * hydration finishes.
	 */
	if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) {
		/* Undo the insertion done just above before bailing out */
		hlist_del(&hd->h);
		bucket_unlock_irqrestore(bucket, flags);
		free_hydration(hd);
		bio_io_error(bio);
		return;
	}

	/*
	 * Start region hydration.
	 *
	 * If a bio overwrites a region, i.e., its size is equal to the
	 * region's size, then we don't need to copy the region from the source
	 * to the destination device.
	 */
	if (is_overwrite_bio(clone, bio)) {
		bucket_unlock_irqrestore(bucket, flags);
		hydration_overwrite(hd, bio);
	} else {
		/* Queue the bio while still holding the lock, before the copy starts */
		bio_list_add(&hd->deferred_bios, bio);
		bucket_unlock_irqrestore(bucket, flags);
		hydration_copy(hd, 1);
	}
}
934 | ||
935 | /*---------------------------------------------------------------------------*/ | |
936 | ||
937 | /* | |
938 | * Background hydrations. | |
939 | */ | |
940 | ||
941 | /* | |
942 | * Batch region hydrations. | |
943 | * | |
944 | * To better utilize device bandwidth we batch together the hydration of | |
945 | * adjacent regions. This allows us to use small region sizes, e.g., 4KB, which | |
946 | * is good for small, random write performance (because of the overwriting of | |
947 | * un-hydrated regions) and at the same time issue big copy requests to kcopyd | |
948 | * to achieve high hydration bandwidth. | |
949 | */ | |
struct batch_info {
	struct dm_clone_region_hydration *head;	/* First hydration of the batch; rest are chained on head->list */
	unsigned int nr_batched_regions;	/* Number of (adjacent) regions currently in the batch */
};
954 | ||
/*
 * Add region hydration @hd to @batch, or issue it immediately.
 *
 * A batch only grows by appending the region directly adjacent to its tail
 * (head->region_nr + nr_batched_regions == hd->region_nr). When @hd cannot
 * extend the batch, the current batch (if any) is issued and @hd either
 * starts a new batch or, when batching is effectively disabled
 * (max_batch_size <= 1), is copied on its own.
 */
static void __batch_hydration(struct batch_info *batch,
			      struct dm_clone_region_hydration *hd)
{
	struct clone *clone = hd->clone;
	unsigned int max_batch_size = READ_ONCE(clone->hydration_batch_size);

	if (batch->head) {
		/* Try to extend the current batch */
		if (batch->nr_batched_regions < max_batch_size &&
		    (batch->head->region_nr + batch->nr_batched_regions) == hd->region_nr) {
			list_add_tail(&hd->list, &batch->head->list);
			batch->nr_batched_regions++;
			hd = NULL;	/* hd consumed by the batch */
		}

		/* Check if we should issue the current batch */
		if (batch->nr_batched_regions >= max_batch_size || hd) {
			hydration_copy(batch->head, batch->nr_batched_regions);
			batch->head = NULL;
			batch->nr_batched_regions = 0;
		}
	}

	if (!hd)
		return;

	/* We treat max batch sizes of zero and one equivalently */
	if (max_batch_size <= 1) {
		hydration_copy(hd, 1);
		return;
	}

	/* Start a new batch */
	BUG_ON(!list_empty(&hd->list));
	batch->head = hd;
	batch->nr_batched_regions = 1;
}
992 | ||
/*
 * Find the next unhydrated region at or after @offset and start (or batch)
 * its hydration.
 *
 * Returns the offset at which the search for the next region should resume:
 * one past the region whose hydration was started, or clone->nr_regions if
 * no unhydrated region was found.
 *
 * The hydration is allocated up front so it can be inserted into the hash
 * table bucket without dropping the bucket lock.
 */
static unsigned long __start_next_hydration(struct clone *clone,
					    unsigned long offset,
					    struct batch_info *batch)
{
	unsigned long flags;
	struct hash_table_bucket *bucket;
	struct dm_clone_region_hydration *hd;
	unsigned long nr_regions = clone->nr_regions;

	hd = alloc_hydration(clone);

	/* Try to find a region to hydrate. */
	do {
		offset = dm_clone_find_next_unhydrated_region(clone->cmd, offset);
		if (offset == nr_regions)
			break;

		bucket = get_hash_table_bucket(clone, offset);
		bucket_lock_irqsave(bucket, flags);

		/*
		 * Re-check under the bucket lock: the region might have been
		 * hydrated, or its hydration started, in the meantime.
		 */
		if (!dm_clone_is_region_hydrated(clone->cmd, offset) &&
		    !__hash_find(bucket, offset)) {
			hydration_init(hd, offset);
			__insert_region_hydration(bucket, hd);
			bucket_unlock_irqrestore(bucket, flags);

			/* Batch hydration */
			__batch_hydration(batch, hd);

			return (offset + 1);
		}

		bucket_unlock_irqrestore(bucket, flags);

	} while (++offset < nr_regions);

	/* No region was claimed; release the pre-allocated hydration. */
	if (hd)
		free_hydration(hd);

	return offset;
}
1034 | ||
1035 | /* | |
1036 | * This function searches for regions that still reside in the source device | |
1037 | * and starts their hydration. | |
1038 | */ | |
static void do_hydration(struct clone *clone)
{
	unsigned int current_volume;
	unsigned long offset, nr_regions = clone->nr_regions;

	struct batch_info batch = {
		.head = NULL,
		.nr_batched_regions = 0,
	};

	/* No metadata updates are possible in RO/FAIL mode; don't hydrate. */
	if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY))
		return;

	if (dm_clone_is_hydration_done(clone->cmd))
		return;

	/*
	 * Avoid race with device suspension.
	 */
	atomic_inc(&clone->hydrations_in_flight);

	/*
	 * Make sure atomic_inc() is ordered before test_bit(), otherwise we
	 * might race with clone_postsuspend() and start a region hydration
	 * after the target has been suspended.
	 *
	 * This is paired with the smp_mb__after_atomic() in
	 * clone_postsuspend().
	 */
	smp_mb__after_atomic();

	offset = clone->hydration_offset;
	/*
	 * Back off while there is in-flight user I/O (ios_in_flight), and
	 * stop altogether if hydration is disabled or the device is
	 * suspending.
	 */
	while (likely(!test_bit(DM_CLONE_HYDRATION_SUSPENDED, &clone->flags)) &&
	       !atomic_read(&clone->ios_in_flight) &&
	       test_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags) &&
	       offset < nr_regions) {
		/*
		 * Throttle: count both issued hydrations and regions batched
		 * but not yet issued against the hydration threshold.
		 */
		current_volume = atomic_read(&clone->hydrations_in_flight);
		current_volume += batch.nr_batched_regions;

		if (current_volume > READ_ONCE(clone->hydration_threshold))
			break;

		offset = __start_next_hydration(clone, offset, &batch);
	}

	/* Issue any partially-filled batch left over from the loop. */
	if (batch.head)
		hydration_copy(batch.head, batch.nr_batched_regions);

	/* Wrap around, so the next pass resumes from the device start. */
	if (offset >= nr_regions)
		offset = 0;

	clone->hydration_offset = offset;

	if (atomic_dec_and_test(&clone->hydrations_in_flight))
		wakeup_hydration_waiters(clone);
}
1095 | ||
1096 | /*---------------------------------------------------------------------------*/ | |
1097 | ||
1098 | static bool need_commit_due_to_time(struct clone *clone) | |
1099 | { | |
1100 | return !time_in_range(jiffies, clone->last_commit_jiffies, | |
1101 | clone->last_commit_jiffies + COMMIT_PERIOD); | |
1102 | } | |
1103 | ||
1104 | /* | |
1105 | * A non-zero return indicates read-only or fail mode. | |
1106 | */ | |
/*
 * Commit any outstanding metadata changes.
 *
 * A non-zero return indicates read-only or fail mode.
 */
static int commit_metadata(struct clone *clone)
{
	int r = 0;

	mutex_lock(&clone->commit_lock);

	/* Nothing to commit if the current transaction is clean. */
	if (!dm_clone_changed_this_transaction(clone->cmd))
		goto out;

	/* In RO/FAIL mode we must not write to the metadata device. */
	if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) {
		r = -EPERM;
		goto out;
	}

	r = dm_clone_metadata_commit(clone->cmd);

	if (unlikely(r)) {
		__metadata_operation_failed(clone, "dm_clone_metadata_commit", r);
		goto out;
	}

	/* Raise a device-mapper event once the clone is fully hydrated. */
	if (dm_clone_is_hydration_done(clone->cmd))
		dm_table_event(clone->ti->table);
out:
	mutex_unlock(&clone->commit_lock);

	return r;
}
1135 | ||
/*
 * Process deferred discard bios: mark the regions they cover as hydrated in
 * the metadata (a discarded region needs no copy from the source) and then
 * complete them. On metadata failure, or in RO/FAIL mode, the bios are
 * completed with r != 0.
 */
static void process_deferred_discards(struct clone *clone)
{
	int r = -EPERM;
	struct bio *bio;
	struct blk_plug plug;
	unsigned long rs, re;
	struct bio_list discards = BIO_EMPTY_LIST;

	/* Atomically grab all deferred discards, leaving the shared list empty. */
	spin_lock_irq(&clone->lock);
	bio_list_merge(&discards, &clone->deferred_discard_bios);
	bio_list_init(&clone->deferred_discard_bios);
	spin_unlock_irq(&clone->lock);

	if (bio_list_empty(&discards))
		return;

	/* r stays -EPERM, so the bios below are completed as failed updates. */
	if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY))
		goto out;

	/* Update the metadata */
	bio_list_for_each(bio, &discards) {
		/* NOTE(review): assumes bio_region_range() yields [rs, re) — see its definition. */
		bio_region_range(clone, bio, &rs, &re);
		/*
		 * A discard request might cover regions that have been already
		 * hydrated. There is no need to update the metadata for these
		 * regions.
		 */
		r = dm_clone_cond_set_range(clone->cmd, rs, re - rs);

		if (unlikely(r))
			break;
	}
out:
	/* Plug while completing, so any passed-down bios can be merged. */
	blk_start_plug(&plug);
	while ((bio = bio_list_pop(&discards)))
		complete_discard_bio(clone, bio, r == 0);
	blk_finish_plug(&plug);
}
1174 | ||
1175 | static void process_deferred_bios(struct clone *clone) | |
1176 | { | |
7431b783 NT |
1177 | struct bio_list bios = BIO_EMPTY_LIST; |
1178 | ||
6ca43ed8 | 1179 | spin_lock_irq(&clone->lock); |
7431b783 NT |
1180 | bio_list_merge(&bios, &clone->deferred_bios); |
1181 | bio_list_init(&clone->deferred_bios); | |
6ca43ed8 | 1182 | spin_unlock_irq(&clone->lock); |
7431b783 NT |
1183 | |
1184 | if (bio_list_empty(&bios)) | |
1185 | return; | |
1186 | ||
1187 | submit_bios(&bios); | |
1188 | } | |
1189 | ||
static void process_deferred_flush_bios(struct clone *clone)
{
	struct bio *bio;
	struct bio_list bios = BIO_EMPTY_LIST;
	struct bio_list bio_completions = BIO_EMPTY_LIST;

	/*
	 * If there are any deferred flush bios, we must commit the metadata
	 * before issuing them or signaling their completion.
	 */
	spin_lock_irq(&clone->lock);
	bio_list_merge(&bios, &clone->deferred_flush_bios);
	bio_list_init(&clone->deferred_flush_bios);

	bio_list_merge(&bio_completions, &clone->deferred_flush_completions);
	bio_list_init(&clone->deferred_flush_completions);
	spin_unlock_irq(&clone->lock);

	/*
	 * Nothing to flush and no periodic commit due: bail out without
	 * touching the metadata.
	 */
	if (bio_list_empty(&bios) && bio_list_empty(&bio_completions) &&
	    !(dm_clone_changed_this_transaction(clone->cmd) && need_commit_due_to_time(clone)))
		return;

	/* If the commit fails, every waiting bio must be errored. */
	if (commit_metadata(clone)) {
		bio_list_merge(&bios, &bio_completions);

		while ((bio = bio_list_pop(&bios)))
			bio_io_error(bio);

		return;
	}

	clone->last_commit_jiffies = jiffies;

	/* Metadata is durable: complete the deferred REQ_FUA bios... */
	while ((bio = bio_list_pop(&bio_completions)))
		bio_endio(bio);

	/* ...and pass the deferred REQ_PREFLUSH bios down. */
	while ((bio = bio_list_pop(&bios)))
		generic_make_request(bio);
}
1229 | ||
1230 | static void do_worker(struct work_struct *work) | |
1231 | { | |
1232 | struct clone *clone = container_of(work, typeof(*clone), worker); | |
1233 | ||
1234 | process_deferred_bios(clone); | |
1235 | process_deferred_discards(clone); | |
1236 | ||
1237 | /* | |
1238 | * process_deferred_flush_bios(): | |
1239 | * | |
1240 | * - Commit metadata | |
1241 | * | |
1242 | * - Process deferred REQ_FUA completions | |
1243 | * | |
1244 | * - Process deferred REQ_PREFLUSH bios | |
1245 | */ | |
1246 | process_deferred_flush_bios(clone); | |
1247 | ||
1248 | /* Background hydration */ | |
1249 | do_hydration(clone); | |
1250 | } | |
1251 | ||
1252 | /* | |
1253 | * Commit periodically so that not too much unwritten data builds up. | |
1254 | * | |
1255 | * Also, restart background hydration, if it has been stopped by in-flight I/O. | |
1256 | */ | |
1257 | static void do_waker(struct work_struct *work) | |
1258 | { | |
1259 | struct clone *clone = container_of(to_delayed_work(work), struct clone, waker); | |
1260 | ||
1261 | wake_worker(clone); | |
1262 | queue_delayed_work(clone->wq, &clone->waker, COMMIT_PERIOD); | |
1263 | } | |
1264 | ||
1265 | /*---------------------------------------------------------------------------*/ | |
1266 | ||
1267 | /* | |
1268 | * Target methods | |
1269 | */ | |
/*
 * Map a bio submitted to the clone target.
 *
 * The ios_in_flight counter incremented here is decremented in clone_endio();
 * background hydration backs off while it is non-zero (see do_hydration()).
 */
static int clone_map(struct dm_target *ti, struct bio *bio)
{
	struct clone *clone = ti->private;
	unsigned long region_nr;

	atomic_inc(&clone->ios_in_flight);

	/* In fail mode, error everything immediately. */
	if (unlikely(get_clone_mode(clone) == CM_FAIL))
		return DM_MAPIO_KILL;

	/*
	 * REQ_PREFLUSH bios carry no data:
	 *
	 * - Commit metadata, if changed
	 *
	 * - Pass down to destination device
	 */
	if (bio->bi_opf & REQ_PREFLUSH) {
		remap_and_issue(clone, bio);
		return DM_MAPIO_SUBMITTED;
	}

	bio->bi_iter.bi_sector = dm_target_offset(ti, bio->bi_iter.bi_sector);

	/*
	 * dm-clone interprets discards and performs a fast hydration of the
	 * discarded regions, i.e., we skip the copy from the source device and
	 * just mark the regions as hydrated.
	 */
	if (bio_op(bio) == REQ_OP_DISCARD) {
		process_discard_bio(clone, bio);
		return DM_MAPIO_SUBMITTED;
	}

	/*
	 * If the bio's region is hydrated, redirect it to the destination
	 * device.
	 *
	 * If the region is not hydrated and the bio is a READ, redirect it to
	 * the source device.
	 *
	 * Else, defer WRITE bio until after its region has been hydrated and
	 * start the region's hydration immediately.
	 */
	region_nr = bio_to_region(clone, bio);
	if (dm_clone_is_region_hydrated(clone->cmd, region_nr)) {
		remap_and_issue(clone, bio);
		return DM_MAPIO_SUBMITTED;
	} else if (bio_data_dir(bio) == READ) {
		remap_to_source(clone, bio);
		return DM_MAPIO_REMAPPED;
	}

	remap_to_dest(clone, bio);
	hydrate_bio_region(clone, bio);

	return DM_MAPIO_SUBMITTED;
}
1328 | ||
1329 | static int clone_endio(struct dm_target *ti, struct bio *bio, blk_status_t *error) | |
1330 | { | |
1331 | struct clone *clone = ti->private; | |
1332 | ||
1333 | atomic_dec(&clone->ios_in_flight); | |
1334 | ||
1335 | return DM_ENDIO_DONE; | |
1336 | } | |
1337 | ||
1338 | static void emit_flags(struct clone *clone, char *result, unsigned int maxlen, | |
1339 | ssize_t *sz_ptr) | |
1340 | { | |
1341 | ssize_t sz = *sz_ptr; | |
1342 | unsigned int count; | |
1343 | ||
1344 | count = !test_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags); | |
1345 | count += !test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags); | |
1346 | ||
1347 | DMEMIT("%u ", count); | |
1348 | ||
1349 | if (!test_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags)) | |
1350 | DMEMIT("no_hydration "); | |
1351 | ||
1352 | if (!test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags)) | |
1353 | DMEMIT("no_discard_passdown "); | |
1354 | ||
1355 | *sz_ptr = sz; | |
1356 | } | |
1357 | ||
1358 | static void emit_core_args(struct clone *clone, char *result, | |
1359 | unsigned int maxlen, ssize_t *sz_ptr) | |
1360 | { | |
1361 | ssize_t sz = *sz_ptr; | |
1362 | unsigned int count = 4; | |
1363 | ||
1364 | DMEMIT("%u hydration_threshold %u hydration_batch_size %u ", count, | |
1365 | READ_ONCE(clone->hydration_threshold), | |
1366 | READ_ONCE(clone->hydration_batch_size)); | |
1367 | ||
1368 | *sz_ptr = sz; | |
1369 | } | |
1370 | ||
1371 | /* | |
1372 | * Status format: | |
1373 | * | |
1374 | * <metadata block size> <#used metadata blocks>/<#total metadata blocks> | |
1375 | * <clone region size> <#hydrated regions>/<#total regions> <#hydrating regions> | |
1376 | * <#features> <features>* <#core args> <core args>* <clone metadata mode> | |
1377 | */ | |
/* Report target status; the output format is documented above. */
static void clone_status(struct dm_target *ti, status_type_t type,
			 unsigned int status_flags, char *result,
			 unsigned int maxlen)
{
	int r;
	unsigned int i;
	ssize_t sz = 0;
	dm_block_t nr_free_metadata_blocks = 0;
	dm_block_t nr_metadata_blocks = 0;
	char buf[BDEVNAME_SIZE];
	struct clone *clone = ti->private;

	switch (type) {
	case STATUSTYPE_INFO:
		if (get_clone_mode(clone) == CM_FAIL) {
			DMEMIT("Fail");
			break;
		}

		/* Commit to ensure statistics aren't out-of-date */
		if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti))
			(void) commit_metadata(clone);

		r = dm_clone_get_free_metadata_block_count(clone->cmd, &nr_free_metadata_blocks);

		if (r) {
			DMERR("%s: dm_clone_get_free_metadata_block_count returned %d",
			      clone_device_name(clone), r);
			goto error;
		}

		r = dm_clone_get_metadata_dev_size(clone->cmd, &nr_metadata_blocks);

		if (r) {
			DMERR("%s: dm_clone_get_metadata_dev_size returned %d",
			      clone_device_name(clone), r);
			goto error;
		}

		DMEMIT("%u %llu/%llu %llu %lu/%lu %u ",
		       DM_CLONE_METADATA_BLOCK_SIZE,
		       (unsigned long long)(nr_metadata_blocks - nr_free_metadata_blocks),
		       (unsigned long long)nr_metadata_blocks,
		       (unsigned long long)clone->region_size,
		       dm_clone_nr_of_hydrated_regions(clone->cmd),
		       clone->nr_regions,
		       atomic_read(&clone->hydrations_in_flight));

		emit_flags(clone, result, maxlen, &sz);
		emit_core_args(clone, result, maxlen, &sz);

		/* Trailing token: the metadata access mode. */
		switch (get_clone_mode(clone)) {
		case CM_WRITE:
			DMEMIT("rw");
			break;
		case CM_READ_ONLY:
			DMEMIT("ro");
			break;
		case CM_FAIL:
			DMEMIT("Fail");
		}

		break;

	case STATUSTYPE_TABLE:
		/* Echo the table line: devices followed by the saved ctr args. */
		format_dev_t(buf, clone->metadata_dev->bdev->bd_dev);
		DMEMIT("%s ", buf);

		format_dev_t(buf, clone->dest_dev->bdev->bd_dev);
		DMEMIT("%s ", buf);

		format_dev_t(buf, clone->source_dev->bdev->bd_dev);
		DMEMIT("%s", buf);

		for (i = 0; i < clone->nr_ctr_args; i++)
			DMEMIT(" %s", clone->ctr_args[i]);
	}

	return;

error:
	DMEMIT("Error");
}
1461 | ||
1462 | static int clone_is_congested(struct dm_target_callbacks *cb, int bdi_bits) | |
1463 | { | |
1464 | struct request_queue *dest_q, *source_q; | |
1465 | struct clone *clone = container_of(cb, struct clone, callbacks); | |
1466 | ||
1467 | source_q = bdev_get_queue(clone->source_dev->bdev); | |
1468 | dest_q = bdev_get_queue(clone->dest_dev->bdev); | |
1469 | ||
1470 | return (bdi_congested(dest_q->backing_dev_info, bdi_bits) | | |
1471 | bdi_congested(source_q->backing_dev_info, bdi_bits)); | |
1472 | } | |
1473 | ||
1474 | static sector_t get_dev_size(struct dm_dev *dev) | |
1475 | { | |
1476 | return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT; | |
1477 | } | |
1478 | ||
1479 | /*---------------------------------------------------------------------------*/ | |
1480 | ||
1481 | /* | |
1482 | * Construct a clone device mapping: | |
1483 | * | |
1484 | * clone <metadata dev> <destination dev> <source dev> <region size> | |
1485 | * [<#feature args> [<feature arg>]* [<#core args> [key value]*]] | |
1486 | * | |
1487 | * metadata dev: Fast device holding the persistent metadata | |
1488 | * destination dev: The destination device, which will become a clone of the | |
1489 | * source device | |
1490 | * source dev: The read-only source device that gets cloned | |
1491 | * region size: dm-clone unit size in sectors | |
1492 | * | |
1493 | * #feature args: Number of feature arguments passed | |
1494 | * feature args: E.g. no_hydration, no_discard_passdown | |
1495 | * | |
1496 | * #core arguments: An even number of core arguments | |
1497 | * core arguments: Key/value pairs for tuning the core | |
1498 | * E.g. 'hydration_threshold 256' | |
1499 | */ | |
1500 | static int parse_feature_args(struct dm_arg_set *as, struct clone *clone) | |
1501 | { | |
1502 | int r; | |
1503 | unsigned int argc; | |
1504 | const char *arg_name; | |
1505 | struct dm_target *ti = clone->ti; | |
1506 | ||
1507 | const struct dm_arg args = { | |
1508 | .min = 0, | |
1509 | .max = 2, | |
1510 | .error = "Invalid number of feature arguments" | |
1511 | }; | |
1512 | ||
1513 | /* No feature arguments supplied */ | |
1514 | if (!as->argc) | |
1515 | return 0; | |
1516 | ||
1517 | r = dm_read_arg_group(&args, as, &argc, &ti->error); | |
1518 | if (r) | |
1519 | return r; | |
1520 | ||
1521 | while (argc) { | |
1522 | arg_name = dm_shift_arg(as); | |
1523 | argc--; | |
1524 | ||
1525 | if (!strcasecmp(arg_name, "no_hydration")) { | |
1526 | __clear_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags); | |
1527 | } else if (!strcasecmp(arg_name, "no_discard_passdown")) { | |
1528 | __clear_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags); | |
1529 | } else { | |
1530 | ti->error = "Invalid feature argument"; | |
1531 | return -EINVAL; | |
1532 | } | |
1533 | } | |
1534 | ||
1535 | return 0; | |
1536 | } | |
1537 | ||
/*
 * Parse the optional core arguments: 'hydration_threshold <n>' and
 * 'hydration_batch_size <n>'. Arguments that are not supplied keep their
 * DEFAULT_* values. Returns 0 on success, negative errno (with ti->error
 * set) otherwise.
 */
static int parse_core_args(struct dm_arg_set *as, struct clone *clone)
{
	int r;
	unsigned int argc;
	unsigned int value;
	const char *arg_name;
	struct dm_target *ti = clone->ti;

	const struct dm_arg args = {
		.min = 0,
		.max = 4,
		.error = "Invalid number of core arguments"
	};

	/* Initialize core arguments */
	clone->hydration_batch_size = DEFAULT_HYDRATION_BATCH_SIZE;
	clone->hydration_threshold = DEFAULT_HYDRATION_THRESHOLD;

	/* No core arguments supplied */
	if (!as->argc)
		return 0;

	r = dm_read_arg_group(&args, as, &argc, &ti->error);
	if (r)
		return r;

	/* Arguments come in key/value pairs, so the count must be even. */
	if (argc & 1) {
		ti->error = "Number of core arguments must be even";
		return -EINVAL;
	}

	while (argc) {
		arg_name = dm_shift_arg(as);
		argc -= 2;	/* each iteration consumes a key and its value */

		if (!strcasecmp(arg_name, "hydration_threshold")) {
			if (kstrtouint(dm_shift_arg(as), 10, &value)) {
				ti->error = "Invalid value for argument `hydration_threshold'";
				return -EINVAL;
			}
			clone->hydration_threshold = value;
		} else if (!strcasecmp(arg_name, "hydration_batch_size")) {
			if (kstrtouint(dm_shift_arg(as), 10, &value)) {
				ti->error = "Invalid value for argument `hydration_batch_size'";
				return -EINVAL;
			}
			clone->hydration_batch_size = value;
		} else {
			ti->error = "Invalid core argument";
			return -EINVAL;
		}
	}

	return 0;
}
1593 | ||
1594 | static int parse_region_size(struct clone *clone, struct dm_arg_set *as, char **error) | |
1595 | { | |
1596 | int r; | |
1597 | unsigned int region_size; | |
1598 | struct dm_arg arg; | |
1599 | ||
1600 | arg.min = MIN_REGION_SIZE; | |
1601 | arg.max = MAX_REGION_SIZE; | |
1602 | arg.error = "Invalid region size"; | |
1603 | ||
1604 | r = dm_read_arg(&arg, as, ®ion_size, error); | |
1605 | if (r) | |
1606 | return r; | |
1607 | ||
1608 | /* Check region size is a power of 2 */ | |
1609 | if (!is_power_of_2(region_size)) { | |
1610 | *error = "Region size is not a power of 2"; | |
1611 | return -EINVAL; | |
1612 | } | |
1613 | ||
1614 | /* Validate the region size against the device logical block size */ | |
1615 | if (region_size % (bdev_logical_block_size(clone->source_dev->bdev) >> 9) || | |
1616 | region_size % (bdev_logical_block_size(clone->dest_dev->bdev) >> 9)) { | |
1617 | *error = "Region size is not a multiple of device logical block size"; | |
1618 | return -EINVAL; | |
1619 | } | |
1620 | ||
1621 | clone->region_size = region_size; | |
1622 | ||
1623 | return 0; | |
1624 | } | |
1625 | ||
/*
 * Reject region counts above 2^31: dm_bitset limits us to 2^32 regions and
 * the test_bit family of functions to 2^31.
 */
static int validate_nr_regions(unsigned long n, char **error)
{
	const unsigned long max_nr_regions = 1UL << 31;

	if (n <= max_nr_regions)
		return 0;

	*error = "Too many regions. Consider increasing the region size";
	return -EINVAL;
}
1639 | ||
1640 | static int parse_metadata_dev(struct clone *clone, struct dm_arg_set *as, char **error) | |
1641 | { | |
1642 | int r; | |
1643 | sector_t metadata_dev_size; | |
1644 | char b[BDEVNAME_SIZE]; | |
1645 | ||
1646 | r = dm_get_device(clone->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE, | |
1647 | &clone->metadata_dev); | |
1648 | if (r) { | |
1649 | *error = "Error opening metadata device"; | |
1650 | return r; | |
1651 | } | |
1652 | ||
1653 | metadata_dev_size = get_dev_size(clone->metadata_dev); | |
1654 | if (metadata_dev_size > DM_CLONE_METADATA_MAX_SECTORS_WARNING) | |
1655 | DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.", | |
1656 | bdevname(clone->metadata_dev->bdev, b), DM_CLONE_METADATA_MAX_SECTORS); | |
1657 | ||
1658 | return 0; | |
1659 | } | |
1660 | ||
1661 | static int parse_dest_dev(struct clone *clone, struct dm_arg_set *as, char **error) | |
1662 | { | |
1663 | int r; | |
1664 | sector_t dest_dev_size; | |
1665 | ||
1666 | r = dm_get_device(clone->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE, | |
1667 | &clone->dest_dev); | |
1668 | if (r) { | |
1669 | *error = "Error opening destination device"; | |
1670 | return r; | |
1671 | } | |
1672 | ||
1673 | dest_dev_size = get_dev_size(clone->dest_dev); | |
1674 | if (dest_dev_size < clone->ti->len) { | |
1675 | dm_put_device(clone->ti, clone->dest_dev); | |
1676 | *error = "Device size larger than destination device"; | |
1677 | return -EINVAL; | |
1678 | } | |
1679 | ||
1680 | return 0; | |
1681 | } | |
1682 | ||
1683 | static int parse_source_dev(struct clone *clone, struct dm_arg_set *as, char **error) | |
1684 | { | |
1685 | int r; | |
1686 | sector_t source_dev_size; | |
1687 | ||
1688 | r = dm_get_device(clone->ti, dm_shift_arg(as), FMODE_READ, | |
1689 | &clone->source_dev); | |
1690 | if (r) { | |
1691 | *error = "Error opening source device"; | |
1692 | return r; | |
1693 | } | |
1694 | ||
1695 | source_dev_size = get_dev_size(clone->source_dev); | |
1696 | if (source_dev_size < clone->ti->len) { | |
1697 | dm_put_device(clone->ti, clone->source_dev); | |
1698 | *error = "Device size larger than source device"; | |
1699 | return -EINVAL; | |
1700 | } | |
1701 | ||
1702 | return 0; | |
1703 | } | |
1704 | ||
1705 | static int copy_ctr_args(struct clone *clone, int argc, const char **argv, char **error) | |
1706 | { | |
1707 | unsigned int i; | |
1708 | const char **copy; | |
1709 | ||
1710 | copy = kcalloc(argc, sizeof(*copy), GFP_KERNEL); | |
1711 | if (!copy) | |
1712 | goto error; | |
1713 | ||
1714 | for (i = 0; i < argc; i++) { | |
1715 | copy[i] = kstrdup(argv[i], GFP_KERNEL); | |
1716 | ||
1717 | if (!copy[i]) { | |
1718 | while (i--) | |
1719 | kfree(copy[i]); | |
1720 | kfree(copy); | |
1721 | goto error; | |
1722 | } | |
1723 | } | |
1724 | ||
1725 | clone->nr_ctr_args = argc; | |
1726 | clone->ctr_args = copy; | |
1727 | return 0; | |
1728 | ||
1729 | error: | |
1730 | *error = "Failed to allocate memory for table line"; | |
1731 | return -ENOMEM; | |
1732 | } | |
1733 | ||
/*
 * Construct a clone mapping: parse the arguments, open the three devices,
 * load the metadata and set up the runtime machinery (hydration hash table,
 * workqueue, kcopyd client, hydration mempool).
 *
 * On failure, resources are released in reverse order of acquisition via
 * the goto chain at the bottom; each out_with_* label undoes one step.
 */
static int clone_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
	int r;
	struct clone *clone;
	struct dm_arg_set as;

	/* At least: metadata dev, dest dev, source dev, region size. */
	if (argc < 4) {
		ti->error = "Invalid number of arguments";
		return -EINVAL;
	}

	as.argc = argc;
	as.argv = argv;

	clone = kzalloc(sizeof(*clone), GFP_KERNEL);
	if (!clone) {
		ti->error = "Failed to allocate clone structure";
		return -ENOMEM;
	}

	clone->ti = ti;

	/* Initialize dm-clone flags */
	__set_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags);
	__set_bit(DM_CLONE_HYDRATION_SUSPENDED, &clone->flags);
	__set_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags);

	r = parse_metadata_dev(clone, &as, &ti->error);
	if (r)
		goto out_with_clone;

	r = parse_dest_dev(clone, &as, &ti->error);
	if (r)
		goto out_with_meta_dev;

	r = parse_source_dev(clone, &as, &ti->error);
	if (r)
		goto out_with_dest_dev;

	r = parse_region_size(clone, &as, &ti->error);
	if (r)
		goto out_with_source_dev;

	/* region_size is a power of 2, so __ffs() gives log2(region_size). */
	clone->region_shift = __ffs(clone->region_size);
	clone->nr_regions = dm_sector_div_up(ti->len, clone->region_size);

	r = validate_nr_regions(clone->nr_regions, &ti->error);
	if (r)
		goto out_with_source_dev;

	/* Ensure bios never straddle a region boundary. */
	r = dm_set_target_max_io_len(ti, clone->region_size);
	if (r) {
		ti->error = "Failed to set max io len";
		goto out_with_source_dev;
	}

	r = parse_feature_args(&as, clone);
	if (r)
		goto out_with_source_dev;

	r = parse_core_args(&as, clone);
	if (r)
		goto out_with_source_dev;

	/* Load metadata */
	clone->cmd = dm_clone_metadata_open(clone->metadata_dev->bdev, ti->len,
					    clone->region_size);
	if (IS_ERR(clone->cmd)) {
		ti->error = "Failed to load metadata";
		r = PTR_ERR(clone->cmd);
		goto out_with_source_dev;
	}

	__set_clone_mode(clone, CM_WRITE);

	if (get_clone_mode(clone) != CM_WRITE) {
		ti->error = "Unable to get write access to metadata, please check/repair metadata";
		r = -EPERM;
		goto out_with_metadata;
	}

	clone->last_commit_jiffies = jiffies;

	/* Allocate hydration hash table */
	r = hash_table_init(clone);
	if (r) {
		ti->error = "Failed to allocate hydration hash table";
		goto out_with_metadata;
	}

	atomic_set(&clone->ios_in_flight, 0);
	init_waitqueue_head(&clone->hydration_stopped);
	spin_lock_init(&clone->lock);
	bio_list_init(&clone->deferred_bios);
	bio_list_init(&clone->deferred_discard_bios);
	bio_list_init(&clone->deferred_flush_bios);
	bio_list_init(&clone->deferred_flush_completions);
	clone->hydration_offset = 0;
	atomic_set(&clone->hydrations_in_flight, 0);

	clone->wq = alloc_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM, 0);
	if (!clone->wq) {
		ti->error = "Failed to allocate workqueue";
		r = -ENOMEM;
		goto out_with_ht;
	}

	INIT_WORK(&clone->worker, do_worker);
	INIT_DELAYED_WORK(&clone->waker, do_waker);

	clone->kcopyd_client = dm_kcopyd_client_create(&dm_kcopyd_throttle);
	if (IS_ERR(clone->kcopyd_client)) {
		/* NOTE(review): ti->error is left unset on this path — intentional? */
		r = PTR_ERR(clone->kcopyd_client);
		goto out_with_wq;
	}

	r = mempool_init_slab_pool(&clone->hydration_pool, MIN_HYDRATIONS,
				   _hydration_cache);
	if (r) {
		ti->error = "Failed to create dm_clone_region_hydration memory pool";
		goto out_with_kcopyd;
	}

	/* Save a copy of the table line */
	r = copy_ctr_args(clone, argc - 3, (const char **)argv + 3, &ti->error);
	if (r)
		goto out_with_mempool;

	mutex_init(&clone->commit_lock);
	clone->callbacks.congested_fn = clone_is_congested;
	dm_table_add_target_callbacks(ti->table, &clone->callbacks);

	/* Enable flushes */
	ti->num_flush_bios = 1;
	ti->flush_supported = true;

	/* Enable discards */
	ti->discards_supported = true;
	ti->num_discard_bios = 1;

	ti->private = clone;

	return 0;

out_with_mempool:
	mempool_exit(&clone->hydration_pool);
out_with_kcopyd:
	dm_kcopyd_client_destroy(clone->kcopyd_client);
out_with_wq:
	destroy_workqueue(clone->wq);
out_with_ht:
	hash_table_exit(clone);
out_with_metadata:
	dm_clone_metadata_close(clone->cmd);
out_with_source_dev:
	dm_put_device(ti, clone->source_dev);
out_with_dest_dev:
	dm_put_device(ti, clone->dest_dev);
out_with_meta_dev:
	dm_put_device(ti, clone->metadata_dev);
out_with_clone:
	kfree(clone);

	return r;
}
1899 | ||
/*
 * Target destructor: tear down everything the constructor set up, in the
 * reverse order of construction.
 */
static void clone_dtr(struct dm_target *ti)
{
	unsigned int i;
	struct clone *clone = ti->private;

	mutex_destroy(&clone->commit_lock);

	/* Free the saved copy of the constructor's table line */
	for (i = 0; i < clone->nr_ctr_args; i++)
		kfree(clone->ctr_args[i]);
	kfree(clone->ctr_args);

	/* Mirror of the ctr error path: pool, kcopyd, wq, ht, metadata, devs */
	mempool_exit(&clone->hydration_pool);
	dm_kcopyd_client_destroy(clone->kcopyd_client);
	destroy_workqueue(clone->wq);
	hash_table_exit(clone);
	dm_clone_metadata_close(clone->cmd);
	dm_put_device(ti, clone->source_dev);
	dm_put_device(ti, clone->dest_dev);
	dm_put_device(ti, clone->metadata_dev);

	kfree(clone);
}
1922 | ||
1923 | /*---------------------------------------------------------------------------*/ | |
1924 | ||
static void clone_postsuspend(struct dm_target *ti)
{
	struct clone *clone = ti->private;

	/*
	 * To successfully suspend the device:
	 *
	 *	- We cancel the delayed work for periodic commits and wait for
	 *	  it to finish.
	 *
	 *	- We stop the background hydration, i.e. we prevent new region
	 *	  hydrations from starting.
	 *
	 *	- We wait for any in-flight hydrations to finish.
	 *
	 *	- We flush the workqueue.
	 *
	 *	- We commit the metadata.
	 */
	cancel_delayed_work_sync(&clone->waker);

	set_bit(DM_CLONE_HYDRATION_SUSPENDED, &clone->flags);

	/*
	 * Make sure set_bit() is ordered before atomic_read(), otherwise we
	 * might race with do_hydration() and miss some started region
	 * hydrations.
	 *
	 * This is paired with smp_mb__after_atomic() in do_hydration().
	 */
	smp_mb__after_atomic();

	/* Sleep until every in-flight hydration has completed */
	wait_event(clone->hydration_stopped, !atomic_read(&clone->hydrations_in_flight));
	flush_workqueue(clone->wq);

	/* Best effort: a failed commit is reported via the target's status */
	(void) commit_metadata(clone);
}
1962 | ||
/*
 * Resume: re-enable background hydration and run the waker immediately,
 * which wakes the worker and re-arms the periodic commit.
 */
static void clone_resume(struct dm_target *ti)
{
	struct clone *clone = ti->private;

	clear_bit(DM_CLONE_HYDRATION_SUSPENDED, &clone->flags);
	do_waker(&clone->waker.work);
}
1970 | ||
1971 | static bool bdev_supports_discards(struct block_device *bdev) | |
1972 | { | |
1973 | struct request_queue *q = bdev_get_queue(bdev); | |
1974 | ||
1975 | return (q && blk_queue_discard(q)); | |
1976 | } | |
1977 | ||
1978 | /* | |
1979 | * If discard_passdown was enabled verify that the destination device supports | |
1980 | * discards. Disable discard_passdown if not. | |
1981 | */ | |
1982 | static void disable_passdown_if_not_supported(struct clone *clone) | |
1983 | { | |
1984 | struct block_device *dest_dev = clone->dest_dev->bdev; | |
1985 | struct queue_limits *dest_limits = &bdev_get_queue(dest_dev)->limits; | |
1986 | const char *reason = NULL; | |
1987 | char buf[BDEVNAME_SIZE]; | |
1988 | ||
1989 | if (!test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags)) | |
1990 | return; | |
1991 | ||
1992 | if (!bdev_supports_discards(dest_dev)) | |
1993 | reason = "discard unsupported"; | |
1994 | else if (dest_limits->max_discard_sectors < clone->region_size) | |
1995 | reason = "max discard sectors smaller than a region"; | |
1996 | ||
1997 | if (reason) { | |
1998 | DMWARN("Destination device (%s) %s: Disabling discard passdown.", | |
1999 | bdevname(dest_dev, buf), reason); | |
2000 | clear_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags); | |
2001 | } | |
2002 | } | |
2003 | ||
2004 | static void set_discard_limits(struct clone *clone, struct queue_limits *limits) | |
2005 | { | |
2006 | struct block_device *dest_bdev = clone->dest_dev->bdev; | |
2007 | struct queue_limits *dest_limits = &bdev_get_queue(dest_bdev)->limits; | |
2008 | ||
2009 | if (!test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags)) { | |
2010 | /* No passdown is done so we set our own virtual limits */ | |
2011 | limits->discard_granularity = clone->region_size << SECTOR_SHIFT; | |
2012 | limits->max_discard_sectors = round_down(UINT_MAX >> SECTOR_SHIFT, clone->region_size); | |
2013 | return; | |
2014 | } | |
2015 | ||
2016 | /* | |
2017 | * clone_iterate_devices() is stacking both the source and destination | |
2018 | * device limits but discards aren't passed to the source device, so | |
2019 | * inherit destination's limits. | |
2020 | */ | |
2021 | limits->max_discard_sectors = dest_limits->max_discard_sectors; | |
2022 | limits->max_hw_discard_sectors = dest_limits->max_hw_discard_sectors; | |
2023 | limits->discard_granularity = dest_limits->discard_granularity; | |
2024 | limits->discard_alignment = dest_limits->discard_alignment; | |
2025 | limits->discard_misaligned = dest_limits->discard_misaligned; | |
2026 | limits->max_discard_segments = dest_limits->max_discard_segments; | |
2027 | } | |
2028 | ||
/* Adjust the target's queue limits (io hints and discard limits). */
static void clone_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
	struct clone *clone = ti->private;
	u64 io_opt_sectors = limits->io_opt >> SECTOR_SHIFT;

	/*
	 * If the system-determined stacked limits are compatible with
	 * dm-clone's region size (io_opt is a factor) do not override them.
	 *
	 * NOTE: do_div() divides io_opt_sectors in place and evaluates to the
	 * remainder, so a non-zero result means io_opt is not a multiple of
	 * the region size.
	 */
	if (io_opt_sectors < clone->region_size ||
	    do_div(io_opt_sectors, clone->region_size)) {
		blk_limits_io_min(limits, clone->region_size << SECTOR_SHIFT);
		blk_limits_io_opt(limits, clone->region_size << SECTOR_SHIFT);
	}

	disable_passdown_if_not_supported(clone);
	set_discard_limits(clone, limits);
}
2047 | ||
2048 | static int clone_iterate_devices(struct dm_target *ti, | |
2049 | iterate_devices_callout_fn fn, void *data) | |
2050 | { | |
2051 | int ret; | |
2052 | struct clone *clone = ti->private; | |
2053 | struct dm_dev *dest_dev = clone->dest_dev; | |
2054 | struct dm_dev *source_dev = clone->source_dev; | |
2055 | ||
2056 | ret = fn(ti, source_dev, 0, ti->len, data); | |
2057 | if (!ret) | |
2058 | ret = fn(ti, dest_dev, 0, ti->len, data); | |
2059 | return ret; | |
2060 | } | |
2061 | ||
/*
 * dm-clone message functions.
 */

/* Update the hydration threshold (in regions) from user space. */
static void set_hydration_threshold(struct clone *clone, unsigned int nr_regions)
{
	/* WRITE_ONCE() pairs with the lockless READ_ONCE() readers */
	WRITE_ONCE(clone->hydration_threshold, nr_regions);

	/*
	 * If user space sets hydration_threshold to zero then the hydration
	 * will stop. If at a later time the hydration_threshold is increased
	 * we must restart the hydration process by waking up the worker.
	 */
	wake_worker(clone);
}
2076 | ||
/* Update the hydration batch size (in regions) from user space. */
static void set_hydration_batch_size(struct clone *clone, unsigned int nr_regions)
{
	/* Lockless update; readers use READ_ONCE() */
	WRITE_ONCE(clone->hydration_batch_size, nr_regions);
}
2081 | ||
2082 | static void enable_hydration(struct clone *clone) | |
2083 | { | |
2084 | if (!test_and_set_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags)) | |
2085 | wake_worker(clone); | |
2086 | } | |
2087 | ||
/* Disable background hydration; in-flight hydrations are not cancelled. */
static void disable_hydration(struct clone *clone)
{
	clear_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags);
}
2092 | ||
2093 | static int clone_message(struct dm_target *ti, unsigned int argc, char **argv, | |
2094 | char *result, unsigned int maxlen) | |
2095 | { | |
2096 | struct clone *clone = ti->private; | |
2097 | unsigned int value; | |
2098 | ||
2099 | if (!argc) | |
2100 | return -EINVAL; | |
2101 | ||
2102 | if (!strcasecmp(argv[0], "enable_hydration")) { | |
2103 | enable_hydration(clone); | |
2104 | return 0; | |
2105 | } | |
2106 | ||
2107 | if (!strcasecmp(argv[0], "disable_hydration")) { | |
2108 | disable_hydration(clone); | |
2109 | return 0; | |
2110 | } | |
2111 | ||
2112 | if (argc != 2) | |
2113 | return -EINVAL; | |
2114 | ||
2115 | if (!strcasecmp(argv[0], "hydration_threshold")) { | |
2116 | if (kstrtouint(argv[1], 10, &value)) | |
2117 | return -EINVAL; | |
2118 | ||
2119 | set_hydration_threshold(clone, value); | |
2120 | ||
2121 | return 0; | |
2122 | } | |
2123 | ||
2124 | if (!strcasecmp(argv[0], "hydration_batch_size")) { | |
2125 | if (kstrtouint(argv[1], 10, &value)) | |
2126 | return -EINVAL; | |
2127 | ||
2128 | set_hydration_batch_size(clone, value); | |
2129 | ||
2130 | return 0; | |
2131 | } | |
2132 | ||
2133 | DMERR("%s: Unsupported message `%s'", clone_device_name(clone), argv[0]); | |
2134 | return -EINVAL; | |
2135 | } | |
2136 | ||
/* Target-type descriptor registered with the device-mapper core */
static struct target_type clone_target = {
	.name = "clone",
	.version = {1, 0, 0},
	.module = THIS_MODULE,
	.ctr = clone_ctr,
	.dtr =  clone_dtr,
	.map = clone_map,
	.end_io = clone_endio,
	.postsuspend = clone_postsuspend,
	.resume = clone_resume,
	.status = clone_status,
	.message = clone_message,
	.io_hints = clone_io_hints,
	.iterate_devices = clone_iterate_devices,
};
2152 | ||
2153 | /*---------------------------------------------------------------------------*/ | |
2154 | ||
2155 | /* Module functions */ | |
2156 | static int __init dm_clone_init(void) | |
2157 | { | |
2158 | int r; | |
2159 | ||
2160 | _hydration_cache = KMEM_CACHE(dm_clone_region_hydration, 0); | |
2161 | if (!_hydration_cache) | |
2162 | return -ENOMEM; | |
2163 | ||
2164 | r = dm_register_target(&clone_target); | |
2165 | if (r < 0) { | |
2166 | DMERR("Failed to register clone target"); | |
2167 | return r; | |
2168 | } | |
2169 | ||
2170 | return 0; | |
2171 | } | |
2172 | ||
/* Module exit: unregister the target, then free the hydration slab cache. */
static void __exit dm_clone_exit(void)
{
	dm_unregister_target(&clone_target);

	/* kmem_cache_destroy(NULL) is a no-op, so this is always safe */
	kmem_cache_destroy(_hydration_cache);
	_hydration_cache = NULL;
}
2180 | ||
2181 | /* Module hooks */ | |
2182 | module_init(dm_clone_init); | |
2183 | module_exit(dm_clone_exit); | |
2184 | ||
2185 | MODULE_DESCRIPTION(DM_NAME " clone target"); | |
2186 | MODULE_AUTHOR("Nikos Tsironis <ntsironis@arrikto.com>"); | |
2187 | MODULE_LICENSE("GPL"); |