Commit | Line | Data |
---|---|---|
0e9cebe7 JB |
1 | /* |
2 | * Copyright (C) 2014 Facebook. All rights reserved. | |
3 | * | |
4 | * This file is released under the GPL. | |
5 | */ | |
6 | ||
7 | #include <linux/device-mapper.h> | |
8 | ||
9 | #include <linux/module.h> | |
10 | #include <linux/init.h> | |
11 | #include <linux/blkdev.h> | |
12 | #include <linux/bio.h> | |
13 | #include <linux/slab.h> | |
14 | #include <linux/kthread.h> | |
15 | #include <linux/freezer.h> | |
16 | ||
17 | #define DM_MSG_PREFIX "log-writes" | |
18 | ||
19 | /* | |
20 | * This target will sequentially log all writes to the target device onto the | |
21 | * log device. This is helpful for replaying writes to check for fs consistency | |
22 | * at all times. This target provides a mechanism to mark specific events to | |
23 | * check data at a later time. So for example you would: | |
24 | * | |
25 | * write data | |
26 | * fsync | |
27 | * dmsetup message /dev/whatever mark mymark | |
28 | * unmount /mnt/test | |
29 | * | |
30 | * Then replay the log up to mymark and check the contents of the replay to | |
31 | * verify it matches what was written. | |
32 | * | |
33 | * We log writes only after they have been flushed, this makes the log describe | |
34 | * close to the order in which the data hits the actual disk, not its cache. So | |
35 | * for example the following sequence (W means write, C means complete) | |
36 | * | |
37 | * Wa,Wb,Wc,Cc,Ca,FLUSH,FUAd,Cb,CFLUSH,CFUAd | |
38 | * | |
39 | * Would result in the log looking like this: | |
40 | * | |
41 | * c,a,flush,fuad,b,<other writes>,<next flush> | |
42 | * | |
43 | * This is meant to help expose problems where file systems do not properly wait | |
44 | * on data being written before invoking a FLUSH. FUA bypasses cache so once it | |
45 | * completes it is added to the log as it should be on disk. | |
46 | * | |
47 | * We treat DISCARDs as if they don't bypass cache so that they are logged in | |
48 | * order of completion along with the normal writes. If we didn't do it this | |
49 | * way we would process all the discards first and then write all the data, when | |
50 | * in fact we want to do the data and the discard in the order that they | |
51 | * completed. | |
52 | */ | |
53 | #define LOG_FLUSH_FLAG (1 << 0) | |
54 | #define LOG_FUA_FLAG (1 << 1) | |
55 | #define LOG_DISCARD_FLAG (1 << 2) | |
56 | #define LOG_MARK_FLAG (1 << 3) | |
57 | ||
f4ad317a GU |
58 | #define WRITE_LOG_VERSION 1ULL |
59 | #define WRITE_LOG_MAGIC 0x6a736677736872ULL | |
0e9cebe7 JB |
60 | |
61 | /* | |
62 | * The disk format for this is braindead simple. | |
63 | * | |
64 | * At byte 0 we have our super, followed by the following sequence for | |
65 | * nr_entries: | |
66 | * | |
67 | * [ 1 sector ][ entry->nr_sectors ] | |
68 | * [log_write_entry][ data written ] | |
69 | * | |
70 | * The log_write_entry takes up a full sector so we can have arbitrary length | |
71 | * marks and it leaves us room for extra content in the future. | |
72 | */ | |
73 | ||
/*
 * Basic info about the log for userspace.
 *
 * Written at sector 0 of the log device; all fields are little-endian
 * (see log_super()).
 */
struct log_write_super {
	__le64 magic;		/* WRITE_LOG_MAGIC, identifies a log device */
	__le64 version;		/* WRITE_LOG_VERSION, on-disk format version */
	__le64 nr_entries;	/* number of log entries following the super */
	__le32 sectorsize;	/* sector size the log was written with */
};
83 | ||
/*
 * Per-entry header, occupying one full log sector (see the disk format
 * comment above).  All fields are little-endian.
 *
 * sector - the sector we wrote.
 * nr_sectors - the number of sectors we wrote.
 * flags - flags for this log entry (LOG_*_FLAG bits).
 * data_len - the size of the data in this log entry, this is for private log
 * entry stuff, the MARK data provided by userspace for example.
 */
struct log_write_entry {
	__le64 sector;
	__le64 nr_sectors;
	__le64 flags;
	__le64 data_len;
};
97 | ||
/* Per-target context. */
struct log_writes_c {
	struct dm_dev *dev;		/* data device writes are remapped to */
	struct dm_dev *logdev;		/* device the log is written to */
	u64 logged_entries;		/* entries written, mirrored in the super */
	u32 sectorsize;			/* log sector size (1 << SECTOR_SHIFT) */
	atomic_t io_blocks;		/* in-flight log-device bios */
	atomic_t pending_blocks;	/* captured blocks not yet logged */
	sector_t next_sector;		/* next free sector on the log device */
	sector_t end_sector;		/* log device size; 0 until known */
	bool logging_enabled;		/* cleared on error or out-of-space */
	bool device_supports_discard;	/* data device discard capability */
	spinlock_t blocks_lock;		/* protects the lists and logging_enabled */
	struct list_head unflushed_blocks; /* completed writes awaiting a flush */
	struct list_head logging_blocks;   /* blocks queued for the log kthread */
	wait_queue_head_t wait;		/* dtr waits here for counters to drain */
	struct task_struct *log_kthread;   /* writes queued blocks to logdev */
};
115 | ||
116 | struct pending_block { | |
117 | int vec_cnt; | |
118 | u64 flags; | |
119 | sector_t sector; | |
120 | sector_t nr_sectors; | |
121 | char *data; | |
122 | u32 datalen; | |
123 | struct list_head list; | |
124 | struct bio_vec vecs[0]; | |
125 | }; | |
126 | ||
/* Per-bio state: links a mapped write bio to its captured pending_block. */
struct per_bio_data {
	struct pending_block *block;	/* NULL if the bio isn't being logged */
};
130 | ||
131 | static void put_pending_block(struct log_writes_c *lc) | |
132 | { | |
133 | if (atomic_dec_and_test(&lc->pending_blocks)) { | |
134 | smp_mb__after_atomic(); | |
135 | if (waitqueue_active(&lc->wait)) | |
136 | wake_up(&lc->wait); | |
137 | } | |
138 | } | |
139 | ||
140 | static void put_io_block(struct log_writes_c *lc) | |
141 | { | |
142 | if (atomic_dec_and_test(&lc->io_blocks)) { | |
143 | smp_mb__after_atomic(); | |
144 | if (waitqueue_active(&lc->wait)) | |
145 | wake_up(&lc->wait); | |
146 | } | |
147 | } | |
148 | ||
/*
 * Completion handler for every bio submitted to the log device.  On error
 * it permanently disables logging; in all cases it frees the pages that
 * were attached for this log bio and drops the io_blocks reference taken
 * before submission.
 */
static void log_end_io(struct bio *bio)
{
	struct log_writes_c *lc = bio->bi_private;
	struct bio_vec *bvec;
	int i;

	if (bio->bi_error) {
		unsigned long flags;

		DMERR("Error writing log block, error=%d", bio->bi_error);
		spin_lock_irqsave(&lc->blocks_lock, flags);
		lc->logging_enabled = false;
		spin_unlock_irqrestore(&lc->blocks_lock, flags);
	}

	/* Every page here was allocated solely for this log bio. */
	bio_for_each_segment_all(bvec, bio, i)
		__free_page(bvec->bv_page);

	put_io_block(lc);
	bio_put(bio);
}
170 | ||
171 | /* | |
172 | * Meant to be called if there is an error, it will free all the pages | |
173 | * associated with the block. | |
174 | */ | |
175 | static void free_pending_block(struct log_writes_c *lc, | |
176 | struct pending_block *block) | |
177 | { | |
178 | int i; | |
179 | ||
180 | for (i = 0; i < block->vec_cnt; i++) { | |
181 | if (block->vecs[i].bv_page) | |
182 | __free_page(block->vecs[i].bv_page); | |
183 | } | |
184 | kfree(block->data); | |
185 | kfree(block); | |
186 | put_pending_block(lc); | |
187 | } | |
188 | ||
/*
 * Write one metadata sector (a log_write_entry or the super) plus an
 * optional inline payload to @sector of the log device.
 *
 * The caller has already taken one io_blocks reference for this write; on
 * success it is released by log_end_io() when the bio completes, on
 * failure it is released here.  Returns 0 on success, -1 on failure.
 */
static int write_metadata(struct log_writes_c *lc, void *entry,
			  size_t entrylen, void *data, size_t datalen,
			  sector_t sector)
{
	struct bio *bio;
	struct page *page;
	void *ptr;
	size_t ret;

	bio = bio_alloc(GFP_KERNEL, 1);
	if (!bio) {
		DMERR("Couldn't alloc log bio");
		goto error;
	}
	bio->bi_iter.bi_size = 0;
	bio->bi_iter.bi_sector = sector;
	bio->bi_bdev = lc->logdev->bdev;
	bio->bi_end_io = log_end_io;
	bio->bi_private = lc;

	page = alloc_page(GFP_KERNEL);
	if (!page) {
		DMERR("Couldn't alloc log page");
		bio_put(bio);
		goto error;
	}

	/* Build the full sector: entry, payload, then zero the remainder. */
	ptr = kmap_atomic(page);
	memcpy(ptr, entry, entrylen);
	if (datalen)
		memcpy(ptr + entrylen, data, datalen);
	memset(ptr + entrylen + datalen, 0,
	       lc->sectorsize - entrylen - datalen);
	kunmap_atomic(ptr);

	ret = bio_add_page(bio, page, lc->sectorsize, 0);
	if (ret != lc->sectorsize) {
		DMERR("Couldn't add page to the log block");
		goto error_bio;
	}
	submit_bio(WRITE, bio);
	return 0;
error_bio:
	bio_put(bio);
	__free_page(page);
error:
	/* Drop the caller's io_blocks reference; no bio will complete it. */
	put_io_block(lc);
	return -1;
}
238 | ||
/*
 * Write one pending block to the log: a one-sector log_write_entry at
 * @sector followed by the block's copied data pages starting at
 * @sector + 1.  Consumes the block (and its pending_blocks reference) in
 * all cases.  Returns 0 on success, -1 on failure.
 */
static int log_one_block(struct log_writes_c *lc,
			 struct pending_block *block, sector_t sector)
{
	struct bio *bio;
	struct log_write_entry entry;
	size_t ret;
	int i;

	entry.sector = cpu_to_le64(block->sector);
	entry.nr_sectors = cpu_to_le64(block->nr_sectors);
	entry.flags = cpu_to_le64(block->flags);
	entry.data_len = cpu_to_le64(block->datalen);
	if (write_metadata(lc, &entry, sizeof(entry), block->data,
			   block->datalen, sector)) {
		free_pending_block(lc, block);
		return -1;
	}

	/* Marks and empty flushes carry no data pages. */
	if (!block->vec_cnt)
		goto out;
	sector++;

	bio = bio_alloc(GFP_KERNEL, block->vec_cnt);
	if (!bio) {
		DMERR("Couldn't alloc log bio");
		goto error;
	}
	/* One io_blocks reference per data bio we submit. */
	atomic_inc(&lc->io_blocks);
	bio->bi_iter.bi_size = 0;
	bio->bi_iter.bi_sector = sector;
	bio->bi_bdev = lc->logdev->bdev;
	bio->bi_end_io = log_end_io;
	bio->bi_private = lc;

	for (i = 0; i < block->vec_cnt; i++) {
		/*
		 * The page offset is always 0 because we allocate a new page
		 * for every bvec in the original bio for simplicity sake.
		 */
		ret = bio_add_page(bio, block->vecs[i].bv_page,
				   block->vecs[i].bv_len, 0);
		if (ret != block->vecs[i].bv_len) {
			/*
			 * This page wouldn't fit: submit what we have (with
			 * its own io_blocks reference) and continue in a
			 * fresh bio at the current log sector.
			 */
			atomic_inc(&lc->io_blocks);
			submit_bio(WRITE, bio);
			bio = bio_alloc(GFP_KERNEL, block->vec_cnt - i);
			if (!bio) {
				DMERR("Couldn't alloc log bio");
				goto error;
			}
			bio->bi_iter.bi_size = 0;
			bio->bi_iter.bi_sector = sector;
			bio->bi_bdev = lc->logdev->bdev;
			bio->bi_end_io = log_end_io;
			bio->bi_private = lc;

			ret = bio_add_page(bio, block->vecs[i].bv_page,
					   block->vecs[i].bv_len, 0);
			if (ret != block->vecs[i].bv_len) {
				DMERR("Couldn't add page on new bio?");
				bio_put(bio);
				goto error;
			}
		}
		sector += block->vecs[i].bv_len >> SECTOR_SHIFT;
	}
	submit_bio(WRITE, bio);
out:
	kfree(block->data);
	kfree(block);
	put_pending_block(lc);
	return 0;
error:
	free_pending_block(lc, block);
	put_io_block(lc);
	return -1;
}
315 | ||
316 | static int log_super(struct log_writes_c *lc) | |
317 | { | |
318 | struct log_write_super super; | |
319 | ||
320 | super.magic = cpu_to_le64(WRITE_LOG_MAGIC); | |
321 | super.version = cpu_to_le64(WRITE_LOG_VERSION); | |
322 | super.nr_entries = cpu_to_le64(lc->logged_entries); | |
323 | super.sectorsize = cpu_to_le32(lc->sectorsize); | |
324 | ||
325 | if (write_metadata(lc, &super, sizeof(super), NULL, 0, 0)) { | |
326 | DMERR("Couldn't write super"); | |
327 | return -1; | |
328 | } | |
329 | ||
330 | return 0; | |
331 | } | |
332 | ||
/*
 * Current size of the log device in 512-byte sectors.  May be 0 early on
 * (see the re-check in log_writes_kthread()).
 */
static inline sector_t logdev_last_sector(struct log_writes_c *lc)
{
	return i_size_read(lc->logdev->bdev->bd_inode) >> SECTOR_SHIFT;
}
337 | ||
/*
 * Per-target log writer thread.  Pops one block at a time off
 * logging_blocks, reserves log space and bumps the counters under
 * blocks_lock, then performs the actual log I/O (plus a super update for
 * FUA/MARK entries) with the lock dropped.  Sleeps when there is nothing
 * pending; woken by normal_end_io()/log_mark()/the destructor.
 */
static int log_writes_kthread(void *arg)
{
	struct log_writes_c *lc = (struct log_writes_c *)arg;
	sector_t sector = 0;

	while (!kthread_should_stop()) {
		bool super = false;
		bool logging_enabled;
		struct pending_block *block = NULL;
		int ret;

		spin_lock_irq(&lc->blocks_lock);
		if (!list_empty(&lc->logging_blocks)) {
			block = list_first_entry(&lc->logging_blocks,
						 struct pending_block, list);
			list_del_init(&block->list);
			if (!lc->logging_enabled)
				goto next;

			/* Reserve log space: 1 sector for the entry, plus data. */
			sector = lc->next_sector;
			if (block->flags & LOG_DISCARD_FLAG)
				lc->next_sector++;
			else
				lc->next_sector += block->nr_sectors + 1;

			/*
			 * Apparently the size of the device may not be known
			 * right away, so handle this properly.
			 */
			if (!lc->end_sector)
				lc->end_sector = logdev_last_sector(lc);
			if (lc->end_sector &&
			    lc->next_sector >= lc->end_sector) {
				DMERR("Ran out of space on the logdev");
				lc->logging_enabled = false;
				goto next;
			}
			lc->logged_entries++;
			atomic_inc(&lc->io_blocks);

			/* FUA and MARK entries also trigger a super rewrite. */
			super = (block->flags & (LOG_FUA_FLAG | LOG_MARK_FLAG));
			if (super)
				atomic_inc(&lc->io_blocks);
		}
next:
		logging_enabled = lc->logging_enabled;
		spin_unlock_irq(&lc->blocks_lock);
		if (block) {
			if (logging_enabled) {
				ret = log_one_block(lc, block, sector);
				if (!ret && super)
					ret = log_super(lc);
				if (ret) {
					spin_lock_irq(&lc->blocks_lock);
					lc->logging_enabled = false;
					spin_unlock_irq(&lc->blocks_lock);
				}
			} else
				free_pending_block(lc, block);
			continue;
		}

		if (!try_to_freeze()) {
			set_current_state(TASK_INTERRUPTIBLE);
			if (!kthread_should_stop() &&
			    !atomic_read(&lc->pending_blocks))
				schedule();
			__set_current_state(TASK_RUNNING);
		}
	}
	return 0;
}
410 | ||
411 | /* | |
412 | * Construct a log-writes mapping: | |
413 | * log-writes <dev_path> <log_dev_path> | |
414 | */ | |
415 | static int log_writes_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |
416 | { | |
417 | struct log_writes_c *lc; | |
418 | struct dm_arg_set as; | |
419 | const char *devname, *logdevname; | |
420 | ||
421 | as.argc = argc; | |
422 | as.argv = argv; | |
423 | ||
424 | if (argc < 2) { | |
425 | ti->error = "Invalid argument count"; | |
426 | return -EINVAL; | |
427 | } | |
428 | ||
429 | lc = kzalloc(sizeof(struct log_writes_c), GFP_KERNEL); | |
430 | if (!lc) { | |
431 | ti->error = "Cannot allocate context"; | |
432 | return -ENOMEM; | |
433 | } | |
434 | spin_lock_init(&lc->blocks_lock); | |
435 | INIT_LIST_HEAD(&lc->unflushed_blocks); | |
436 | INIT_LIST_HEAD(&lc->logging_blocks); | |
437 | init_waitqueue_head(&lc->wait); | |
438 | lc->sectorsize = 1 << SECTOR_SHIFT; | |
439 | atomic_set(&lc->io_blocks, 0); | |
440 | atomic_set(&lc->pending_blocks, 0); | |
441 | ||
442 | devname = dm_shift_arg(&as); | |
443 | if (dm_get_device(ti, devname, dm_table_get_mode(ti->table), &lc->dev)) { | |
444 | ti->error = "Device lookup failed"; | |
445 | goto bad; | |
446 | } | |
447 | ||
448 | logdevname = dm_shift_arg(&as); | |
449 | if (dm_get_device(ti, logdevname, dm_table_get_mode(ti->table), &lc->logdev)) { | |
450 | ti->error = "Log device lookup failed"; | |
451 | dm_put_device(ti, lc->dev); | |
452 | goto bad; | |
453 | } | |
454 | ||
455 | lc->log_kthread = kthread_run(log_writes_kthread, lc, "log-write"); | |
456 | if (!lc->log_kthread) { | |
457 | ti->error = "Couldn't alloc kthread"; | |
458 | dm_put_device(ti, lc->dev); | |
459 | dm_put_device(ti, lc->logdev); | |
460 | goto bad; | |
461 | } | |
462 | ||
463 | /* We put the super at sector 0, start logging at sector 1 */ | |
464 | lc->next_sector = 1; | |
465 | lc->logging_enabled = true; | |
466 | lc->end_sector = logdev_last_sector(lc); | |
467 | lc->device_supports_discard = true; | |
468 | ||
469 | ti->num_flush_bios = 1; | |
470 | ti->flush_supported = true; | |
471 | ti->num_discard_bios = 1; | |
472 | ti->discards_supported = true; | |
473 | ti->per_bio_data_size = sizeof(struct per_bio_data); | |
474 | ti->private = lc; | |
475 | return 0; | |
476 | ||
477 | bad: | |
478 | kfree(lc); | |
479 | return -EINVAL; | |
480 | } | |
481 | ||
482 | static int log_mark(struct log_writes_c *lc, char *data) | |
483 | { | |
484 | struct pending_block *block; | |
485 | size_t maxsize = lc->sectorsize - sizeof(struct log_write_entry); | |
486 | ||
487 | block = kzalloc(sizeof(struct pending_block), GFP_KERNEL); | |
488 | if (!block) { | |
489 | DMERR("Error allocating pending block"); | |
490 | return -ENOMEM; | |
491 | } | |
492 | ||
493 | block->data = kstrndup(data, maxsize, GFP_KERNEL); | |
494 | if (!block->data) { | |
495 | DMERR("Error copying mark data"); | |
496 | kfree(block); | |
497 | return -ENOMEM; | |
498 | } | |
499 | atomic_inc(&lc->pending_blocks); | |
500 | block->datalen = strlen(block->data); | |
501 | block->flags |= LOG_MARK_FLAG; | |
502 | spin_lock_irq(&lc->blocks_lock); | |
503 | list_add_tail(&block->list, &lc->logging_blocks); | |
504 | spin_unlock_irq(&lc->blocks_lock); | |
505 | wake_up_process(lc->log_kthread); | |
506 | return 0; | |
507 | } | |
508 | ||
/*
 * Destructor: move any still-unflushed blocks to the logging list, queue
 * a final mark, wait for all captured blocks and log I/O to drain, then
 * stop the kthread and release both devices.
 */
static void log_writes_dtr(struct dm_target *ti)
{
	struct log_writes_c *lc = ti->private;

	spin_lock_irq(&lc->blocks_lock);
	list_splice_init(&lc->unflushed_blocks, &lc->logging_blocks);
	spin_unlock_irq(&lc->blocks_lock);

	/*
	 * This is just nice to have since it'll update the super to include the
	 * unflushed blocks, if it fails we don't really care.
	 */
	log_mark(lc, "dm-log-writes-end");
	wake_up_process(lc->log_kthread);
	wait_event(lc->wait, !atomic_read(&lc->io_blocks) &&
		   !atomic_read(&lc->pending_blocks));
	kthread_stop(lc->log_kthread);

	WARN_ON(!list_empty(&lc->logging_blocks));
	WARN_ON(!list_empty(&lc->unflushed_blocks));
	dm_put_device(ti, lc->dev);
	dm_put_device(ti, lc->logdev);
	kfree(lc);
}
533 | ||
/* Redirect @bio to the underlying data device; the sector is untouched. */
static void normal_map_bio(struct dm_target *ti, struct bio *bio)
{
	struct log_writes_c *lc = ti->private;

	bio->bi_bdev = lc->dev->bdev;
}
540 | ||
/*
 * Map callback: remap @bio to the data device and, for writes worth
 * logging, capture a pending_block (with a private copy of the payload)
 * so normal_end_io() can queue it for the log kthread once the write
 * completes.
 */
static int log_writes_map(struct dm_target *ti, struct bio *bio)
{
	struct log_writes_c *lc = ti->private;
	struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data));
	struct pending_block *block;
	struct bvec_iter iter;
	struct bio_vec bv;
	size_t alloc_size;
	int i = 0;
	bool flush_bio = (bio->bi_rw & REQ_FLUSH);
	bool fua_bio = (bio->bi_rw & REQ_FUA);
	bool discard_bio = (bio->bi_rw & REQ_DISCARD);

	pb->block = NULL;

	/* Don't bother doing anything if logging has been disabled */
	if (!lc->logging_enabled)
		goto map_bio;

	/*
	 * Map reads as normal.
	 */
	if (bio_data_dir(bio) == READ)
		goto map_bio;

	/* No sectors and not a flush? Don't care */
	if (!bio_sectors(bio) && !flush_bio)
		goto map_bio;

	/*
	 * Discards will have bi_size set but there's no actual data, so just
	 * allocate the size of the pending block.
	 */
	if (discard_bio)
		alloc_size = sizeof(struct pending_block);
	else
		alloc_size = sizeof(struct pending_block) + sizeof(struct bio_vec) * bio_segments(bio);

	block = kzalloc(alloc_size, GFP_NOIO);
	if (!block) {
		DMERR("Error allocating pending block");
		spin_lock_irq(&lc->blocks_lock);
		lc->logging_enabled = false;
		spin_unlock_irq(&lc->blocks_lock);
		return -ENOMEM;
	}
	INIT_LIST_HEAD(&block->list);
	pb->block = block;
	atomic_inc(&lc->pending_blocks);

	if (flush_bio)
		block->flags |= LOG_FLUSH_FLAG;
	if (fua_bio)
		block->flags |= LOG_FUA_FLAG;
	if (discard_bio)
		block->flags |= LOG_DISCARD_FLAG;

	block->sector = bio->bi_iter.bi_sector;
	block->nr_sectors = bio_sectors(bio);

	/* We don't need the data, just submit */
	if (discard_bio) {
		WARN_ON(flush_bio || fua_bio);
		if (lc->device_supports_discard)
			goto map_bio;
		/* Device can't discard: complete the bio ourselves. */
		bio_endio(bio);
		return DM_MAPIO_SUBMITTED;
	}

	/* Flush bio, splice the unflushed blocks onto this list and submit */
	if (flush_bio && !bio_sectors(bio)) {
		spin_lock_irq(&lc->blocks_lock);
		list_splice_init(&lc->unflushed_blocks, &block->list);
		spin_unlock_irq(&lc->blocks_lock);
		goto map_bio;
	}

	/*
	 * We will write this bio somewhere else way later so we need to copy
	 * the actual contents into new pages so we know the data will always be
	 * there.
	 *
	 * We do this because this could be a bio from O_DIRECT in which case we
	 * can't just hold onto the page until some later point, we have to
	 * manually copy the contents.
	 */
	bio_for_each_segment(bv, bio, iter) {
		struct page *page;
		void *src, *dst;

		page = alloc_page(GFP_NOIO);
		if (!page) {
			DMERR("Error allocing page");
			free_pending_block(lc, block);
			spin_lock_irq(&lc->blocks_lock);
			lc->logging_enabled = false;
			spin_unlock_irq(&lc->blocks_lock);
			return -ENOMEM;
		}

		src = kmap_atomic(bv.bv_page);
		dst = kmap_atomic(page);
		memcpy(dst, src + bv.bv_offset, bv.bv_len);
		kunmap_atomic(dst);
		kunmap_atomic(src);
		block->vecs[i].bv_page = page;
		block->vecs[i].bv_len = bv.bv_len;
		block->vec_cnt++;
		i++;
	}

	/* Had a flush with data in it, weird */
	if (flush_bio) {
		spin_lock_irq(&lc->blocks_lock);
		list_splice_init(&lc->unflushed_blocks, &block->list);
		spin_unlock_irq(&lc->blocks_lock);
	}
map_bio:
	normal_map_bio(ti, bio);
	return DM_MAPIO_REMAPPED;
}
662 | ||
/*
 * end_io for bios on the data device.  Once a logged write completes,
 * route its pending_block: FLUSH blocks drag all unflushed blocks into
 * the logging list with them, FUA blocks are logged immediately, and
 * ordinary writes wait on unflushed_blocks for the next flush.
 */
static int normal_end_io(struct dm_target *ti, struct bio *bio, int error)
{
	struct log_writes_c *lc = ti->private;
	struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data));

	if (bio_data_dir(bio) == WRITE && pb->block) {
		struct pending_block *block = pb->block;
		unsigned long flags;

		spin_lock_irqsave(&lc->blocks_lock, flags);
		if (block->flags & LOG_FLUSH_FLAG) {
			/* block->list holds the spliced unflushed blocks. */
			list_splice_tail_init(&block->list, &lc->logging_blocks);
			list_add_tail(&block->list, &lc->logging_blocks);
			wake_up_process(lc->log_kthread);
		} else if (block->flags & LOG_FUA_FLAG) {
			list_add_tail(&block->list, &lc->logging_blocks);
			wake_up_process(lc->log_kthread);
		} else
			list_add_tail(&block->list, &lc->unflushed_blocks);
		spin_unlock_irqrestore(&lc->blocks_lock, flags);
	}

	return error;
}
687 | ||
688 | /* | |
689 | * INFO format: <logged entries> <highest allocated sector> | |
690 | */ | |
691 | static void log_writes_status(struct dm_target *ti, status_type_t type, | |
692 | unsigned status_flags, char *result, | |
693 | unsigned maxlen) | |
694 | { | |
695 | unsigned sz = 0; | |
696 | struct log_writes_c *lc = ti->private; | |
697 | ||
698 | switch (type) { | |
699 | case STATUSTYPE_INFO: | |
700 | DMEMIT("%llu %llu", lc->logged_entries, | |
701 | (unsigned long long)lc->next_sector - 1); | |
702 | if (!lc->logging_enabled) | |
703 | DMEMIT(" logging_disabled"); | |
704 | break; | |
705 | ||
706 | case STATUSTYPE_TABLE: | |
707 | DMEMIT("%s %s", lc->dev->name, lc->logdev->name); | |
708 | break; | |
709 | } | |
710 | } | |
711 | ||
/* Forward ioctls to the underlying data device. */
static int log_writes_ioctl(struct dm_target *ti, unsigned int cmd,
			    unsigned long arg)
{
	struct log_writes_c *lc = ti->private;
	struct dm_dev *dev = lc->dev;
	int r = 0;

	/*
	 * Only pass ioctls through if the device sizes match exactly.
	 */
	if (ti->len != i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT)
		r = scsi_verify_blk_ioctl(NULL, cmd);

	return r ? : __blkdev_driver_ioctl(dev->bdev, dev->mode, cmd, arg);
}
727 | ||
/*
 * Let the data device's merge_bvec_fn (if any) bound how much data may be
 * merged into a bio aimed at this target.
 */
static int log_writes_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
			    struct bio_vec *biovec, int max_size)
{
	struct log_writes_c *lc = ti->private;
	struct request_queue *q = bdev_get_queue(lc->dev->bdev);

	if (!q->merge_bvec_fn)
		return max_size;

	bvm->bi_bdev = lc->dev->bdev;
	bvm->bi_sector = dm_target_offset(ti, bvm->bi_sector);

	return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
}
742 | ||
743 | static int log_writes_iterate_devices(struct dm_target *ti, | |
744 | iterate_devices_callout_fn fn, | |
745 | void *data) | |
746 | { | |
747 | struct log_writes_c *lc = ti->private; | |
748 | ||
749 | return fn(ti, lc->dev, 0, ti->len, data); | |
750 | } | |
751 | ||
752 | /* | |
753 | * Messages supported: | |
754 | * mark <mark data> - specify the marked data. | |
755 | */ | |
756 | static int log_writes_message(struct dm_target *ti, unsigned argc, char **argv) | |
757 | { | |
758 | int r = -EINVAL; | |
759 | struct log_writes_c *lc = ti->private; | |
760 | ||
761 | if (argc != 2) { | |
762 | DMWARN("Invalid log-writes message arguments, expect 2 arguments, got %d", argc); | |
763 | return r; | |
764 | } | |
765 | ||
766 | if (!strcasecmp(argv[0], "mark")) | |
767 | r = log_mark(lc, argv[1]); | |
768 | else | |
769 | DMWARN("Unrecognised log writes target message received: %s", argv[0]); | |
770 | ||
771 | return r; | |
772 | } | |
773 | ||
/*
 * Queue-limits hook.  If the data device lacks discard support, remember
 * that (log_writes_map() then completes discards itself after logging)
 * and publish discard limits of our own — NOTE(review): presumably so
 * discard bios are still accepted for logging; confirm against dm core.
 */
static void log_writes_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
	struct log_writes_c *lc = ti->private;
	struct request_queue *q = bdev_get_queue(lc->dev->bdev);

	if (!q || !blk_queue_discard(q)) {
		lc->device_supports_discard = false;
		limits->discard_granularity = 1 << SECTOR_SHIFT;
		limits->max_discard_sectors = (UINT_MAX >> SECTOR_SHIFT);
	}
}
785 | ||
/* Target registration table: wires the dm hooks implemented above. */
static struct target_type log_writes_target = {
	.name   = "log-writes",
	.version = {1, 0, 0},
	.module = THIS_MODULE,
	.ctr    = log_writes_ctr,
	.dtr    = log_writes_dtr,
	.map    = log_writes_map,
	.end_io = normal_end_io,
	.status = log_writes_status,
	.ioctl  = log_writes_ioctl,
	.merge  = log_writes_merge,
	.message = log_writes_message,
	.iterate_devices = log_writes_iterate_devices,
	.io_hints = log_writes_io_hints,
};
801 | ||
802 | static int __init dm_log_writes_init(void) | |
803 | { | |
804 | int r = dm_register_target(&log_writes_target); | |
805 | ||
806 | if (r < 0) | |
807 | DMERR("register failed %d", r); | |
808 | ||
809 | return r; | |
810 | } | |
811 | ||
/* Module exit: unregister the log-writes target. */
static void __exit dm_log_writes_exit(void)
{
	dm_unregister_target(&log_writes_target);
}
816 | ||
817 | module_init(dm_log_writes_init); | |
818 | module_exit(dm_log_writes_exit); | |
819 | ||
820 | MODULE_DESCRIPTION(DM_NAME " log writes target"); | |
821 | MODULE_AUTHOR("Josef Bacik <jbacik@fb.com>"); | |
822 | MODULE_LICENSE("GPL"); |