Commit | Line | Data |
---|---|---|
0e9cebe7 JB |
1 | /* |
2 | * Copyright (C) 2014 Facebook. All rights reserved. | |
3 | * | |
4 | * This file is released under the GPL. | |
5 | */ | |
6 | ||
7 | #include <linux/device-mapper.h> | |
8 | ||
9 | #include <linux/module.h> | |
10 | #include <linux/init.h> | |
11 | #include <linux/blkdev.h> | |
12 | #include <linux/bio.h> | |
98d82f48 | 13 | #include <linux/dax.h> |
0e9cebe7 JB |
14 | #include <linux/slab.h> |
15 | #include <linux/kthread.h> | |
16 | #include <linux/freezer.h> | |
98d82f48 | 17 | #include <linux/uio.h> |
0e9cebe7 JB |
18 | |
19 | #define DM_MSG_PREFIX "log-writes" | |
20 | ||
21 | /* | |
22 | * This target will sequentially log all writes to the target device onto the | |
23 | * log device. This is helpful for replaying writes to check for fs consistency | |
24 | * at all times. This target provides a mechanism to mark specific events to | |
25 | * check data at a later time. So for example you would: | |
26 | * | |
27 | * write data | |
28 | * fsync | |
29 | * dmsetup message /dev/whatever mark mymark | |
30 | * unmount /mnt/test | |
31 | * | |
32 | * Then replay the log up to mymark and check the contents of the replay to | |
33 | * verify it matches what was written. | |
34 | * | |
35 | * We log writes only after they have been flushed, this makes the log describe | |
36 | * close to the order in which the data hits the actual disk, not its cache. So | |
37 | * for example the following sequence (W means write, C means complete) | |
38 | * | |
39 | * Wa,Wb,Wc,Cc,Ca,FLUSH,FUAd,Cb,CFLUSH,CFUAd | |
40 | * | |
41 | * Would result in the log looking like this: | |
42 | * | |
7537dad7 | 43 | * c,a,b,flush,fuad,<other writes>,<next flush> |
0e9cebe7 JB |
44 | * |
45 | * This is meant to help expose problems where file systems do not properly wait | |
46 | * on data being written before invoking a FLUSH. FUA bypasses cache so once it | |
47 | * completes it is added to the log as it should be on disk. | |
48 | * | |
49 | * We treat DISCARDs as if they don't bypass cache so that they are logged in | |
50 | * order of completion along with the normal writes. If we didn't do it this | |
51 | * way we would process all the discards first and then write all the data, when | |
52 | * in fact we want to do the data and the discard in the order that they | |
53 | * completed. | |
54 | */ | |
e5c4cb9b QW |
/*
 * Flag bits recorded for every log entry (pending_block.flags in memory,
 * log_write_entry.flags on disk).
 */
#define LOG_FLUSH_FLAG		(1 << 0)	/* bio had REQ_PREFLUSH set */
#define LOG_FUA_FLAG		(1 << 1)	/* bio had REQ_FUA set */
#define LOG_DISCARD_FLAG	(1 << 2)	/* bio was a REQ_OP_DISCARD */
#define LOG_MARK_FLAG		(1 << 3)	/* entry is a user supplied mark */
#define LOG_METADATA_FLAG	(1 << 4)	/* bio had REQ_META set */

/* Identification values stored in the on-disk super block. */
#define WRITE_LOG_VERSION	1ULL
#define WRITE_LOG_MAGIC		0x6a736677736872ULL

/* Sector of the log device that holds the super block. */
#define WRITE_LOG_SUPER_SECTOR	0
0e9cebe7 JB |
64 | |
65 | /* | |
66 | * The disk format for this is braindead simple. | |
67 | * | |
68 | * At byte 0 we have our super, followed by the following sequence for | |
69 | * nr_entries: | |
70 | * | |
71 | * [ 1 sector ][ entry->nr_sectors ] | |
72 | * [log_write_entry][ data written ] | |
73 | * | |
74 | * The log_write_entry takes up a full sector so we can have arbitrary length | |
75 | * marks and it leaves us room for extra content in the future. | |
76 | */ | |
77 | ||
78 | /* | |
79 | * Basic info about the log for userspace. | |
80 | */ | |
81 | struct log_write_super { | |
82 | __le64 magic; | |
83 | __le64 version; | |
84 | __le64 nr_entries; | |
85 | __le32 sectorsize; | |
86 | }; | |
87 | ||
88 | /* | |
89 | * sector - the sector we wrote. | |
90 | * nr_sectors - the number of sectors we wrote. | |
91 | * flags - flags for this log entry. | |
92 | * data_len - the size of the data in this log entry, this is for private log | |
93 | * entry stuff, the MARK data provided by userspace for example. | |
94 | */ | |
95 | struct log_write_entry { | |
96 | __le64 sector; | |
97 | __le64 nr_sectors; | |
98 | __le64 flags; | |
99 | __le64 data_len; | |
100 | }; | |
101 | ||
102 | struct log_writes_c { | |
103 | struct dm_dev *dev; | |
104 | struct dm_dev *logdev; | |
105 | u64 logged_entries; | |
106 | u32 sectorsize; | |
228bb5b2 | 107 | u32 sectorshift; |
0e9cebe7 JB |
108 | atomic_t io_blocks; |
109 | atomic_t pending_blocks; | |
110 | sector_t next_sector; | |
111 | sector_t end_sector; | |
112 | bool logging_enabled; | |
113 | bool device_supports_discard; | |
114 | spinlock_t blocks_lock; | |
115 | struct list_head unflushed_blocks; | |
116 | struct list_head logging_blocks; | |
117 | wait_queue_head_t wait; | |
118 | struct task_struct *log_kthread; | |
211ad4b7 | 119 | struct completion super_done; |
0e9cebe7 JB |
120 | }; |
121 | ||
122 | struct pending_block { | |
123 | int vec_cnt; | |
124 | u64 flags; | |
125 | sector_t sector; | |
126 | sector_t nr_sectors; | |
127 | char *data; | |
128 | u32 datalen; | |
129 | struct list_head list; | |
b18ae8dd | 130 | struct bio_vec vecs[]; |
0e9cebe7 JB |
131 | }; |
132 | ||
/* Per-bio private data: the pending block created for this bio, if any. */
struct per_bio_data {
	struct pending_block *block;
};
136 | ||
228bb5b2 JB |
137 | static inline sector_t bio_to_dev_sectors(struct log_writes_c *lc, |
138 | sector_t sectors) | |
139 | { | |
140 | return sectors >> (lc->sectorshift - SECTOR_SHIFT); | |
141 | } | |
142 | ||
143 | static inline sector_t dev_to_bio_sectors(struct log_writes_c *lc, | |
144 | sector_t sectors) | |
145 | { | |
146 | return sectors << (lc->sectorshift - SECTOR_SHIFT); | |
147 | } | |
148 | ||
0e9cebe7 JB |
149 | static void put_pending_block(struct log_writes_c *lc) |
150 | { | |
151 | if (atomic_dec_and_test(&lc->pending_blocks)) { | |
152 | smp_mb__after_atomic(); | |
153 | if (waitqueue_active(&lc->wait)) | |
154 | wake_up(&lc->wait); | |
155 | } | |
156 | } | |
157 | ||
158 | static void put_io_block(struct log_writes_c *lc) | |
159 | { | |
160 | if (atomic_dec_and_test(&lc->io_blocks)) { | |
161 | smp_mb__after_atomic(); | |
162 | if (waitqueue_active(&lc->wait)) | |
163 | wake_up(&lc->wait); | |
164 | } | |
165 | } | |
166 | ||
4246a0b6 | 167 | static void log_end_io(struct bio *bio) |
0e9cebe7 JB |
168 | { |
169 | struct log_writes_c *lc = bio->bi_private; | |
0e9cebe7 | 170 | |
4e4cbee9 | 171 | if (bio->bi_status) { |
0e9cebe7 JB |
172 | unsigned long flags; |
173 | ||
4e4cbee9 | 174 | DMERR("Error writing log block, error=%d", bio->bi_status); |
0e9cebe7 JB |
175 | spin_lock_irqsave(&lc->blocks_lock, flags); |
176 | lc->logging_enabled = false; | |
177 | spin_unlock_irqrestore(&lc->blocks_lock, flags); | |
178 | } | |
179 | ||
491221f8 | 180 | bio_free_pages(bio); |
0e9cebe7 JB |
181 | put_io_block(lc); |
182 | bio_put(bio); | |
183 | } | |
184 | ||
211ad4b7 | 185 | static void log_end_super(struct bio *bio) |
186 | { | |
187 | struct log_writes_c *lc = bio->bi_private; | |
188 | ||
189 | complete(&lc->super_done); | |
190 | log_end_io(bio); | |
191 | } | |
192 | ||
0e9cebe7 JB |
193 | /* |
194 | * Meant to be called if there is an error, it will free all the pages | |
195 | * associated with the block. | |
196 | */ | |
197 | static void free_pending_block(struct log_writes_c *lc, | |
198 | struct pending_block *block) | |
199 | { | |
200 | int i; | |
201 | ||
202 | for (i = 0; i < block->vec_cnt; i++) { | |
203 | if (block->vecs[i].bv_page) | |
204 | __free_page(block->vecs[i].bv_page); | |
205 | } | |
206 | kfree(block->data); | |
207 | kfree(block); | |
208 | put_pending_block(lc); | |
209 | } | |
210 | ||
211 | static int write_metadata(struct log_writes_c *lc, void *entry, | |
212 | size_t entrylen, void *data, size_t datalen, | |
213 | sector_t sector) | |
214 | { | |
215 | struct bio *bio; | |
216 | struct page *page; | |
217 | void *ptr; | |
218 | size_t ret; | |
219 | ||
220 | bio = bio_alloc(GFP_KERNEL, 1); | |
221 | if (!bio) { | |
222 | DMERR("Couldn't alloc log bio"); | |
223 | goto error; | |
224 | } | |
225 | bio->bi_iter.bi_size = 0; | |
226 | bio->bi_iter.bi_sector = sector; | |
74d46992 | 227 | bio_set_dev(bio, lc->logdev->bdev); |
211ad4b7 | 228 | bio->bi_end_io = (sector == WRITE_LOG_SUPER_SECTOR) ? |
229 | log_end_super : log_end_io; | |
0e9cebe7 | 230 | bio->bi_private = lc; |
e6047149 | 231 | bio_set_op_attrs(bio, REQ_OP_WRITE, 0); |
0e9cebe7 JB |
232 | |
233 | page = alloc_page(GFP_KERNEL); | |
234 | if (!page) { | |
235 | DMERR("Couldn't alloc log page"); | |
236 | bio_put(bio); | |
237 | goto error; | |
238 | } | |
239 | ||
240 | ptr = kmap_atomic(page); | |
241 | memcpy(ptr, entry, entrylen); | |
242 | if (datalen) | |
243 | memcpy(ptr + entrylen, data, datalen); | |
244 | memset(ptr + entrylen + datalen, 0, | |
245 | lc->sectorsize - entrylen - datalen); | |
246 | kunmap_atomic(ptr); | |
247 | ||
248 | ret = bio_add_page(bio, page, lc->sectorsize, 0); | |
249 | if (ret != lc->sectorsize) { | |
250 | DMERR("Couldn't add page to the log block"); | |
251 | goto error_bio; | |
252 | } | |
4e49ea4a | 253 | submit_bio(bio); |
0e9cebe7 JB |
254 | return 0; |
255 | error_bio: | |
256 | bio_put(bio); | |
257 | __free_page(page); | |
258 | error: | |
259 | put_io_block(lc); | |
260 | return -1; | |
261 | } | |
262 | ||
e5a20660 RZ |
263 | static int write_inline_data(struct log_writes_c *lc, void *entry, |
264 | size_t entrylen, void *data, size_t datalen, | |
265 | sector_t sector) | |
266 | { | |
5f7136db | 267 | int bio_pages, pg_datalen, pg_sectorlen, i; |
e5a20660 RZ |
268 | struct page *page; |
269 | struct bio *bio; | |
270 | size_t ret; | |
271 | void *ptr; | |
272 | ||
273 | while (datalen) { | |
5f7136db | 274 | bio_pages = bio_max_segs(DIV_ROUND_UP(datalen, PAGE_SIZE)); |
e5a20660 RZ |
275 | |
276 | atomic_inc(&lc->io_blocks); | |
277 | ||
278 | bio = bio_alloc(GFP_KERNEL, bio_pages); | |
279 | if (!bio) { | |
280 | DMERR("Couldn't alloc inline data bio"); | |
281 | goto error; | |
282 | } | |
283 | ||
284 | bio->bi_iter.bi_size = 0; | |
285 | bio->bi_iter.bi_sector = sector; | |
286 | bio_set_dev(bio, lc->logdev->bdev); | |
287 | bio->bi_end_io = log_end_io; | |
288 | bio->bi_private = lc; | |
289 | bio_set_op_attrs(bio, REQ_OP_WRITE, 0); | |
290 | ||
291 | for (i = 0; i < bio_pages; i++) { | |
292 | pg_datalen = min_t(int, datalen, PAGE_SIZE); | |
293 | pg_sectorlen = ALIGN(pg_datalen, lc->sectorsize); | |
294 | ||
295 | page = alloc_page(GFP_KERNEL); | |
296 | if (!page) { | |
297 | DMERR("Couldn't alloc inline data page"); | |
298 | goto error_bio; | |
299 | } | |
300 | ||
301 | ptr = kmap_atomic(page); | |
302 | memcpy(ptr, data, pg_datalen); | |
303 | if (pg_sectorlen > pg_datalen) | |
304 | memset(ptr + pg_datalen, 0, pg_sectorlen - pg_datalen); | |
305 | kunmap_atomic(ptr); | |
306 | ||
307 | ret = bio_add_page(bio, page, pg_sectorlen, 0); | |
308 | if (ret != pg_sectorlen) { | |
309 | DMERR("Couldn't add page of inline data"); | |
310 | __free_page(page); | |
311 | goto error_bio; | |
312 | } | |
313 | ||
314 | datalen -= pg_datalen; | |
315 | data += pg_datalen; | |
316 | } | |
317 | submit_bio(bio); | |
318 | ||
319 | sector += bio_pages * PAGE_SECTORS; | |
320 | } | |
321 | return 0; | |
322 | error_bio: | |
323 | bio_free_pages(bio); | |
324 | bio_put(bio); | |
325 | error: | |
326 | put_io_block(lc); | |
327 | return -1; | |
328 | } | |
329 | ||
0e9cebe7 JB |
330 | static int log_one_block(struct log_writes_c *lc, |
331 | struct pending_block *block, sector_t sector) | |
332 | { | |
333 | struct bio *bio; | |
334 | struct log_write_entry entry; | |
e5a20660 | 335 | size_t metadatalen, ret; |
0e9cebe7 JB |
336 | int i; |
337 | ||
338 | entry.sector = cpu_to_le64(block->sector); | |
339 | entry.nr_sectors = cpu_to_le64(block->nr_sectors); | |
340 | entry.flags = cpu_to_le64(block->flags); | |
341 | entry.data_len = cpu_to_le64(block->datalen); | |
e5a20660 RZ |
342 | |
343 | metadatalen = (block->flags & LOG_MARK_FLAG) ? block->datalen : 0; | |
0e9cebe7 | 344 | if (write_metadata(lc, &entry, sizeof(entry), block->data, |
e5a20660 | 345 | metadatalen, sector)) { |
0e9cebe7 JB |
346 | free_pending_block(lc, block); |
347 | return -1; | |
348 | } | |
349 | ||
e5a20660 RZ |
350 | sector += dev_to_bio_sectors(lc, 1); |
351 | ||
352 | if (block->datalen && metadatalen == 0) { | |
353 | if (write_inline_data(lc, &entry, sizeof(entry), block->data, | |
354 | block->datalen, sector)) { | |
355 | free_pending_block(lc, block); | |
356 | return -1; | |
357 | } | |
358 | /* we don't support both inline data & bio data */ | |
359 | goto out; | |
360 | } | |
361 | ||
0e9cebe7 JB |
362 | if (!block->vec_cnt) |
363 | goto out; | |
0e9cebe7 | 364 | |
a5d60783 | 365 | atomic_inc(&lc->io_blocks); |
5f7136db | 366 | bio = bio_alloc(GFP_KERNEL, bio_max_segs(block->vec_cnt)); |
0e9cebe7 JB |
367 | if (!bio) { |
368 | DMERR("Couldn't alloc log bio"); | |
369 | goto error; | |
370 | } | |
0e9cebe7 JB |
371 | bio->bi_iter.bi_size = 0; |
372 | bio->bi_iter.bi_sector = sector; | |
74d46992 | 373 | bio_set_dev(bio, lc->logdev->bdev); |
0e9cebe7 JB |
374 | bio->bi_end_io = log_end_io; |
375 | bio->bi_private = lc; | |
e6047149 | 376 | bio_set_op_attrs(bio, REQ_OP_WRITE, 0); |
0e9cebe7 JB |
377 | |
378 | for (i = 0; i < block->vec_cnt; i++) { | |
379 | /* | |
380 | * The page offset is always 0 because we allocate a new page | |
381 | * for every bvec in the original bio for simplicity sake. | |
382 | */ | |
383 | ret = bio_add_page(bio, block->vecs[i].bv_page, | |
384 | block->vecs[i].bv_len, 0); | |
385 | if (ret != block->vecs[i].bv_len) { | |
386 | atomic_inc(&lc->io_blocks); | |
4e49ea4a | 387 | submit_bio(bio); |
5f7136db MWO |
388 | bio = bio_alloc(GFP_KERNEL, |
389 | bio_max_segs(block->vec_cnt - i)); | |
0e9cebe7 JB |
390 | if (!bio) { |
391 | DMERR("Couldn't alloc log bio"); | |
392 | goto error; | |
393 | } | |
394 | bio->bi_iter.bi_size = 0; | |
395 | bio->bi_iter.bi_sector = sector; | |
74d46992 | 396 | bio_set_dev(bio, lc->logdev->bdev); |
0e9cebe7 JB |
397 | bio->bi_end_io = log_end_io; |
398 | bio->bi_private = lc; | |
e6047149 | 399 | bio_set_op_attrs(bio, REQ_OP_WRITE, 0); |
0e9cebe7 JB |
400 | |
401 | ret = bio_add_page(bio, block->vecs[i].bv_page, | |
402 | block->vecs[i].bv_len, 0); | |
403 | if (ret != block->vecs[i].bv_len) { | |
404 | DMERR("Couldn't add page on new bio?"); | |
405 | bio_put(bio); | |
406 | goto error; | |
407 | } | |
408 | } | |
409 | sector += block->vecs[i].bv_len >> SECTOR_SHIFT; | |
410 | } | |
4e49ea4a | 411 | submit_bio(bio); |
0e9cebe7 JB |
412 | out: |
413 | kfree(block->data); | |
414 | kfree(block); | |
415 | put_pending_block(lc); | |
416 | return 0; | |
417 | error: | |
418 | free_pending_block(lc, block); | |
419 | put_io_block(lc); | |
420 | return -1; | |
421 | } | |
422 | ||
423 | static int log_super(struct log_writes_c *lc) | |
424 | { | |
425 | struct log_write_super super; | |
426 | ||
427 | super.magic = cpu_to_le64(WRITE_LOG_MAGIC); | |
428 | super.version = cpu_to_le64(WRITE_LOG_VERSION); | |
429 | super.nr_entries = cpu_to_le64(lc->logged_entries); | |
430 | super.sectorsize = cpu_to_le32(lc->sectorsize); | |
431 | ||
211ad4b7 | 432 | if (write_metadata(lc, &super, sizeof(super), NULL, 0, |
433 | WRITE_LOG_SUPER_SECTOR)) { | |
0e9cebe7 JB |
434 | DMERR("Couldn't write super"); |
435 | return -1; | |
436 | } | |
437 | ||
211ad4b7 | 438 | /* |
439 | * Super sector should be writen in-order, otherwise the | |
440 | * nr_entries could be rewritten incorrectly by an old bio. | |
441 | */ | |
442 | wait_for_completion_io(&lc->super_done); | |
443 | ||
0e9cebe7 JB |
444 | return 0; |
445 | } | |
446 | ||
447 | static inline sector_t logdev_last_sector(struct log_writes_c *lc) | |
448 | { | |
449 | return i_size_read(lc->logdev->bdev->bd_inode) >> SECTOR_SHIFT; | |
450 | } | |
451 | ||
452 | static int log_writes_kthread(void *arg) | |
453 | { | |
454 | struct log_writes_c *lc = (struct log_writes_c *)arg; | |
455 | sector_t sector = 0; | |
456 | ||
457 | while (!kthread_should_stop()) { | |
458 | bool super = false; | |
459 | bool logging_enabled; | |
460 | struct pending_block *block = NULL; | |
461 | int ret; | |
462 | ||
463 | spin_lock_irq(&lc->blocks_lock); | |
464 | if (!list_empty(&lc->logging_blocks)) { | |
465 | block = list_first_entry(&lc->logging_blocks, | |
466 | struct pending_block, list); | |
467 | list_del_init(&block->list); | |
468 | if (!lc->logging_enabled) | |
469 | goto next; | |
470 | ||
471 | sector = lc->next_sector; | |
228bb5b2 JB |
472 | if (!(block->flags & LOG_DISCARD_FLAG)) |
473 | lc->next_sector += dev_to_bio_sectors(lc, block->nr_sectors); | |
474 | lc->next_sector += dev_to_bio_sectors(lc, 1); | |
0e9cebe7 JB |
475 | |
476 | /* | |
477 | * Apparently the size of the device may not be known | |
478 | * right away, so handle this properly. | |
479 | */ | |
480 | if (!lc->end_sector) | |
481 | lc->end_sector = logdev_last_sector(lc); | |
482 | if (lc->end_sector && | |
483 | lc->next_sector >= lc->end_sector) { | |
484 | DMERR("Ran out of space on the logdev"); | |
485 | lc->logging_enabled = false; | |
486 | goto next; | |
487 | } | |
488 | lc->logged_entries++; | |
489 | atomic_inc(&lc->io_blocks); | |
490 | ||
491 | super = (block->flags & (LOG_FUA_FLAG | LOG_MARK_FLAG)); | |
492 | if (super) | |
493 | atomic_inc(&lc->io_blocks); | |
494 | } | |
495 | next: | |
496 | logging_enabled = lc->logging_enabled; | |
497 | spin_unlock_irq(&lc->blocks_lock); | |
498 | if (block) { | |
499 | if (logging_enabled) { | |
500 | ret = log_one_block(lc, block, sector); | |
501 | if (!ret && super) | |
502 | ret = log_super(lc); | |
503 | if (ret) { | |
504 | spin_lock_irq(&lc->blocks_lock); | |
505 | lc->logging_enabled = false; | |
506 | spin_unlock_irq(&lc->blocks_lock); | |
507 | } | |
508 | } else | |
509 | free_pending_block(lc, block); | |
510 | continue; | |
511 | } | |
512 | ||
513 | if (!try_to_freeze()) { | |
514 | set_current_state(TASK_INTERRUPTIBLE); | |
515 | if (!kthread_should_stop() && | |
0c79c620 | 516 | list_empty(&lc->logging_blocks)) |
0e9cebe7 JB |
517 | schedule(); |
518 | __set_current_state(TASK_RUNNING); | |
519 | } | |
520 | } | |
521 | return 0; | |
522 | } | |
523 | ||
524 | /* | |
525 | * Construct a log-writes mapping: | |
526 | * log-writes <dev_path> <log_dev_path> | |
527 | */ | |
528 | static int log_writes_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |
529 | { | |
530 | struct log_writes_c *lc; | |
531 | struct dm_arg_set as; | |
532 | const char *devname, *logdevname; | |
e80d1c80 | 533 | int ret; |
0e9cebe7 JB |
534 | |
535 | as.argc = argc; | |
536 | as.argv = argv; | |
537 | ||
538 | if (argc < 2) { | |
539 | ti->error = "Invalid argument count"; | |
540 | return -EINVAL; | |
541 | } | |
542 | ||
543 | lc = kzalloc(sizeof(struct log_writes_c), GFP_KERNEL); | |
544 | if (!lc) { | |
545 | ti->error = "Cannot allocate context"; | |
546 | return -ENOMEM; | |
547 | } | |
548 | spin_lock_init(&lc->blocks_lock); | |
549 | INIT_LIST_HEAD(&lc->unflushed_blocks); | |
550 | INIT_LIST_HEAD(&lc->logging_blocks); | |
551 | init_waitqueue_head(&lc->wait); | |
211ad4b7 | 552 | init_completion(&lc->super_done); |
0e9cebe7 JB |
553 | atomic_set(&lc->io_blocks, 0); |
554 | atomic_set(&lc->pending_blocks, 0); | |
555 | ||
556 | devname = dm_shift_arg(&as); | |
e80d1c80 VG |
557 | ret = dm_get_device(ti, devname, dm_table_get_mode(ti->table), &lc->dev); |
558 | if (ret) { | |
0e9cebe7 JB |
559 | ti->error = "Device lookup failed"; |
560 | goto bad; | |
561 | } | |
562 | ||
563 | logdevname = dm_shift_arg(&as); | |
e80d1c80 VG |
564 | ret = dm_get_device(ti, logdevname, dm_table_get_mode(ti->table), |
565 | &lc->logdev); | |
566 | if (ret) { | |
0e9cebe7 JB |
567 | ti->error = "Log device lookup failed"; |
568 | dm_put_device(ti, lc->dev); | |
569 | goto bad; | |
570 | } | |
571 | ||
228bb5b2 JB |
572 | lc->sectorsize = bdev_logical_block_size(lc->dev->bdev); |
573 | lc->sectorshift = ilog2(lc->sectorsize); | |
0e9cebe7 | 574 | lc->log_kthread = kthread_run(log_writes_kthread, lc, "log-write"); |
91e630d9 VZ |
575 | if (IS_ERR(lc->log_kthread)) { |
576 | ret = PTR_ERR(lc->log_kthread); | |
0e9cebe7 JB |
577 | ti->error = "Couldn't alloc kthread"; |
578 | dm_put_device(ti, lc->dev); | |
579 | dm_put_device(ti, lc->logdev); | |
580 | goto bad; | |
581 | } | |
582 | ||
228bb5b2 JB |
583 | /* |
584 | * next_sector is in 512b sectors to correspond to what bi_sector expects. | |
585 | * The super starts at sector 0, and the next_sector is the next logical | |
586 | * one based on the sectorsize of the device. | |
587 | */ | |
588 | lc->next_sector = lc->sectorsize >> SECTOR_SHIFT; | |
0e9cebe7 JB |
589 | lc->logging_enabled = true; |
590 | lc->end_sector = logdev_last_sector(lc); | |
591 | lc->device_supports_discard = true; | |
592 | ||
593 | ti->num_flush_bios = 1; | |
594 | ti->flush_supported = true; | |
595 | ti->num_discard_bios = 1; | |
596 | ti->discards_supported = true; | |
30187e1d | 597 | ti->per_io_data_size = sizeof(struct per_bio_data); |
0e9cebe7 JB |
598 | ti->private = lc; |
599 | return 0; | |
600 | ||
601 | bad: | |
602 | kfree(lc); | |
e80d1c80 | 603 | return ret; |
0e9cebe7 JB |
604 | } |
605 | ||
606 | static int log_mark(struct log_writes_c *lc, char *data) | |
607 | { | |
608 | struct pending_block *block; | |
609 | size_t maxsize = lc->sectorsize - sizeof(struct log_write_entry); | |
610 | ||
611 | block = kzalloc(sizeof(struct pending_block), GFP_KERNEL); | |
612 | if (!block) { | |
613 | DMERR("Error allocating pending block"); | |
614 | return -ENOMEM; | |
615 | } | |
616 | ||
4b259fc4 | 617 | block->data = kstrndup(data, maxsize - 1, GFP_KERNEL); |
0e9cebe7 JB |
618 | if (!block->data) { |
619 | DMERR("Error copying mark data"); | |
620 | kfree(block); | |
621 | return -ENOMEM; | |
622 | } | |
623 | atomic_inc(&lc->pending_blocks); | |
624 | block->datalen = strlen(block->data); | |
625 | block->flags |= LOG_MARK_FLAG; | |
626 | spin_lock_irq(&lc->blocks_lock); | |
627 | list_add_tail(&block->list, &lc->logging_blocks); | |
628 | spin_unlock_irq(&lc->blocks_lock); | |
629 | wake_up_process(lc->log_kthread); | |
630 | return 0; | |
631 | } | |
632 | ||
633 | static void log_writes_dtr(struct dm_target *ti) | |
634 | { | |
635 | struct log_writes_c *lc = ti->private; | |
636 | ||
637 | spin_lock_irq(&lc->blocks_lock); | |
638 | list_splice_init(&lc->unflushed_blocks, &lc->logging_blocks); | |
639 | spin_unlock_irq(&lc->blocks_lock); | |
640 | ||
641 | /* | |
642 | * This is just nice to have since it'll update the super to include the | |
643 | * unflushed blocks, if it fails we don't really care. | |
644 | */ | |
645 | log_mark(lc, "dm-log-writes-end"); | |
646 | wake_up_process(lc->log_kthread); | |
647 | wait_event(lc->wait, !atomic_read(&lc->io_blocks) && | |
648 | !atomic_read(&lc->pending_blocks)); | |
649 | kthread_stop(lc->log_kthread); | |
650 | ||
651 | WARN_ON(!list_empty(&lc->logging_blocks)); | |
652 | WARN_ON(!list_empty(&lc->unflushed_blocks)); | |
653 | dm_put_device(ti, lc->dev); | |
654 | dm_put_device(ti, lc->logdev); | |
655 | kfree(lc); | |
656 | } | |
657 | ||
658 | static void normal_map_bio(struct dm_target *ti, struct bio *bio) | |
659 | { | |
660 | struct log_writes_c *lc = ti->private; | |
661 | ||
74d46992 | 662 | bio_set_dev(bio, lc->dev->bdev); |
0e9cebe7 JB |
663 | } |
664 | ||
665 | static int log_writes_map(struct dm_target *ti, struct bio *bio) | |
666 | { | |
667 | struct log_writes_c *lc = ti->private; | |
668 | struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data)); | |
669 | struct pending_block *block; | |
670 | struct bvec_iter iter; | |
671 | struct bio_vec bv; | |
672 | size_t alloc_size; | |
673 | int i = 0; | |
1eff9d32 JA |
674 | bool flush_bio = (bio->bi_opf & REQ_PREFLUSH); |
675 | bool fua_bio = (bio->bi_opf & REQ_FUA); | |
e6047149 | 676 | bool discard_bio = (bio_op(bio) == REQ_OP_DISCARD); |
e5c4cb9b | 677 | bool meta_bio = (bio->bi_opf & REQ_META); |
0e9cebe7 JB |
678 | |
679 | pb->block = NULL; | |
680 | ||
681 | /* Don't bother doing anything if logging has been disabled */ | |
682 | if (!lc->logging_enabled) | |
683 | goto map_bio; | |
684 | ||
685 | /* | |
686 | * Map reads as normal. | |
687 | */ | |
688 | if (bio_data_dir(bio) == READ) | |
689 | goto map_bio; | |
690 | ||
691 | /* No sectors and not a flush? Don't care */ | |
692 | if (!bio_sectors(bio) && !flush_bio) | |
693 | goto map_bio; | |
694 | ||
695 | /* | |
696 | * Discards will have bi_size set but there's no actual data, so just | |
697 | * allocate the size of the pending block. | |
698 | */ | |
699 | if (discard_bio) | |
700 | alloc_size = sizeof(struct pending_block); | |
701 | else | |
d4e6e836 | 702 | alloc_size = struct_size(block, vecs, bio_segments(bio)); |
0e9cebe7 JB |
703 | |
704 | block = kzalloc(alloc_size, GFP_NOIO); | |
705 | if (!block) { | |
706 | DMERR("Error allocating pending block"); | |
707 | spin_lock_irq(&lc->blocks_lock); | |
708 | lc->logging_enabled = false; | |
709 | spin_unlock_irq(&lc->blocks_lock); | |
846785e6 | 710 | return DM_MAPIO_KILL; |
0e9cebe7 JB |
711 | } |
712 | INIT_LIST_HEAD(&block->list); | |
713 | pb->block = block; | |
714 | atomic_inc(&lc->pending_blocks); | |
715 | ||
716 | if (flush_bio) | |
717 | block->flags |= LOG_FLUSH_FLAG; | |
718 | if (fua_bio) | |
719 | block->flags |= LOG_FUA_FLAG; | |
720 | if (discard_bio) | |
721 | block->flags |= LOG_DISCARD_FLAG; | |
e5c4cb9b QW |
722 | if (meta_bio) |
723 | block->flags |= LOG_METADATA_FLAG; | |
0e9cebe7 | 724 | |
228bb5b2 JB |
725 | block->sector = bio_to_dev_sectors(lc, bio->bi_iter.bi_sector); |
726 | block->nr_sectors = bio_to_dev_sectors(lc, bio_sectors(bio)); | |
0e9cebe7 JB |
727 | |
728 | /* We don't need the data, just submit */ | |
729 | if (discard_bio) { | |
730 | WARN_ON(flush_bio || fua_bio); | |
731 | if (lc->device_supports_discard) | |
732 | goto map_bio; | |
4246a0b6 | 733 | bio_endio(bio); |
0e9cebe7 JB |
734 | return DM_MAPIO_SUBMITTED; |
735 | } | |
736 | ||
737 | /* Flush bio, splice the unflushed blocks onto this list and submit */ | |
738 | if (flush_bio && !bio_sectors(bio)) { | |
739 | spin_lock_irq(&lc->blocks_lock); | |
740 | list_splice_init(&lc->unflushed_blocks, &block->list); | |
741 | spin_unlock_irq(&lc->blocks_lock); | |
742 | goto map_bio; | |
743 | } | |
744 | ||
745 | /* | |
746 | * We will write this bio somewhere else way later so we need to copy | |
747 | * the actual contents into new pages so we know the data will always be | |
748 | * there. | |
749 | * | |
750 | * We do this because this could be a bio from O_DIRECT in which case we | |
751 | * can't just hold onto the page until some later point, we have to | |
752 | * manually copy the contents. | |
753 | */ | |
754 | bio_for_each_segment(bv, bio, iter) { | |
755 | struct page *page; | |
756 | void *src, *dst; | |
757 | ||
758 | page = alloc_page(GFP_NOIO); | |
759 | if (!page) { | |
760 | DMERR("Error allocing page"); | |
761 | free_pending_block(lc, block); | |
762 | spin_lock_irq(&lc->blocks_lock); | |
763 | lc->logging_enabled = false; | |
764 | spin_unlock_irq(&lc->blocks_lock); | |
846785e6 | 765 | return DM_MAPIO_KILL; |
0e9cebe7 JB |
766 | } |
767 | ||
768 | src = kmap_atomic(bv.bv_page); | |
769 | dst = kmap_atomic(page); | |
770 | memcpy(dst, src + bv.bv_offset, bv.bv_len); | |
771 | kunmap_atomic(dst); | |
772 | kunmap_atomic(src); | |
773 | block->vecs[i].bv_page = page; | |
774 | block->vecs[i].bv_len = bv.bv_len; | |
775 | block->vec_cnt++; | |
776 | i++; | |
777 | } | |
778 | ||
779 | /* Had a flush with data in it, weird */ | |
780 | if (flush_bio) { | |
781 | spin_lock_irq(&lc->blocks_lock); | |
782 | list_splice_init(&lc->unflushed_blocks, &block->list); | |
783 | spin_unlock_irq(&lc->blocks_lock); | |
784 | } | |
785 | map_bio: | |
786 | normal_map_bio(ti, bio); | |
787 | return DM_MAPIO_REMAPPED; | |
788 | } | |
789 | ||
4e4cbee9 CH |
790 | static int normal_end_io(struct dm_target *ti, struct bio *bio, |
791 | blk_status_t *error) | |
0e9cebe7 JB |
792 | { |
793 | struct log_writes_c *lc = ti->private; | |
794 | struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data)); | |
795 | ||
796 | if (bio_data_dir(bio) == WRITE && pb->block) { | |
797 | struct pending_block *block = pb->block; | |
798 | unsigned long flags; | |
799 | ||
800 | spin_lock_irqsave(&lc->blocks_lock, flags); | |
801 | if (block->flags & LOG_FLUSH_FLAG) { | |
802 | list_splice_tail_init(&block->list, &lc->logging_blocks); | |
803 | list_add_tail(&block->list, &lc->logging_blocks); | |
804 | wake_up_process(lc->log_kthread); | |
805 | } else if (block->flags & LOG_FUA_FLAG) { | |
806 | list_add_tail(&block->list, &lc->logging_blocks); | |
807 | wake_up_process(lc->log_kthread); | |
808 | } else | |
809 | list_add_tail(&block->list, &lc->unflushed_blocks); | |
810 | spin_unlock_irqrestore(&lc->blocks_lock, flags); | |
811 | } | |
812 | ||
1be56909 | 813 | return DM_ENDIO_DONE; |
0e9cebe7 JB |
814 | } |
815 | ||
816 | /* | |
817 | * INFO format: <logged entries> <highest allocated sector> | |
818 | */ | |
819 | static void log_writes_status(struct dm_target *ti, status_type_t type, | |
820 | unsigned status_flags, char *result, | |
821 | unsigned maxlen) | |
822 | { | |
823 | unsigned sz = 0; | |
824 | struct log_writes_c *lc = ti->private; | |
825 | ||
826 | switch (type) { | |
827 | case STATUSTYPE_INFO: | |
828 | DMEMIT("%llu %llu", lc->logged_entries, | |
829 | (unsigned long long)lc->next_sector - 1); | |
830 | if (!lc->logging_enabled) | |
831 | DMEMIT(" logging_disabled"); | |
832 | break; | |
833 | ||
834 | case STATUSTYPE_TABLE: | |
835 | DMEMIT("%s %s", lc->dev->name, lc->logdev->name); | |
836 | break; | |
837 | } | |
838 | } | |
839 | ||
e56f81e0 | 840 | static int log_writes_prepare_ioctl(struct dm_target *ti, |
5bd5e8d8 | 841 | struct block_device **bdev) |
0e9cebe7 JB |
842 | { |
843 | struct log_writes_c *lc = ti->private; | |
844 | struct dm_dev *dev = lc->dev; | |
0e9cebe7 | 845 | |
e56f81e0 | 846 | *bdev = dev->bdev; |
0e9cebe7 JB |
847 | /* |
848 | * Only pass ioctls through if the device sizes match exactly. | |
849 | */ | |
850 | if (ti->len != i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT) | |
e56f81e0 CH |
851 | return 1; |
852 | return 0; | |
0e9cebe7 JB |
853 | } |
854 | ||
0e9cebe7 JB |
855 | static int log_writes_iterate_devices(struct dm_target *ti, |
856 | iterate_devices_callout_fn fn, | |
857 | void *data) | |
858 | { | |
859 | struct log_writes_c *lc = ti->private; | |
860 | ||
861 | return fn(ti, lc->dev, 0, ti->len, data); | |
862 | } | |
863 | ||
864 | /* | |
865 | * Messages supported: | |
866 | * mark <mark data> - specify the marked data. | |
867 | */ | |
1eb5fa84 MS |
868 | static int log_writes_message(struct dm_target *ti, unsigned argc, char **argv, |
869 | char *result, unsigned maxlen) | |
0e9cebe7 JB |
870 | { |
871 | int r = -EINVAL; | |
872 | struct log_writes_c *lc = ti->private; | |
873 | ||
874 | if (argc != 2) { | |
875 | DMWARN("Invalid log-writes message arguments, expect 2 arguments, got %d", argc); | |
876 | return r; | |
877 | } | |
878 | ||
879 | if (!strcasecmp(argv[0], "mark")) | |
880 | r = log_mark(lc, argv[1]); | |
881 | else | |
882 | DMWARN("Unrecognised log writes target message received: %s", argv[0]); | |
883 | ||
884 | return r; | |
885 | } | |
886 | ||
887 | static void log_writes_io_hints(struct dm_target *ti, struct queue_limits *limits) | |
888 | { | |
889 | struct log_writes_c *lc = ti->private; | |
890 | struct request_queue *q = bdev_get_queue(lc->dev->bdev); | |
891 | ||
892 | if (!q || !blk_queue_discard(q)) { | |
893 | lc->device_supports_discard = false; | |
228bb5b2 | 894 | limits->discard_granularity = lc->sectorsize; |
0e9cebe7 JB |
895 | limits->max_discard_sectors = (UINT_MAX >> SECTOR_SHIFT); |
896 | } | |
228bb5b2 JB |
897 | limits->logical_block_size = bdev_logical_block_size(lc->dev->bdev); |
898 | limits->physical_block_size = bdev_physical_block_size(lc->dev->bdev); | |
899 | limits->io_min = limits->physical_block_size; | |
0e9cebe7 JB |
900 | } |
901 | ||
976431b0 DW |
902 | #if IS_ENABLED(CONFIG_DAX_DRIVER) |
903 | static int log_dax(struct log_writes_c *lc, sector_t sector, size_t bytes, | |
904 | struct iov_iter *i) | |
905 | { | |
906 | struct pending_block *block; | |
907 | ||
908 | if (!bytes) | |
909 | return 0; | |
910 | ||
911 | block = kzalloc(sizeof(struct pending_block), GFP_KERNEL); | |
912 | if (!block) { | |
913 | DMERR("Error allocating dax pending block"); | |
914 | return -ENOMEM; | |
915 | } | |
916 | ||
917 | block->data = kzalloc(bytes, GFP_KERNEL); | |
918 | if (!block->data) { | |
919 | DMERR("Error allocating dax data space"); | |
920 | kfree(block); | |
921 | return -ENOMEM; | |
922 | } | |
923 | ||
924 | /* write data provided via the iterator */ | |
925 | if (!copy_from_iter(block->data, bytes, i)) { | |
926 | DMERR("Error copying dax data"); | |
927 | kfree(block->data); | |
928 | kfree(block); | |
929 | return -EIO; | |
930 | } | |
931 | ||
932 | /* rewind the iterator so that the block driver can use it */ | |
933 | iov_iter_revert(i, bytes); | |
934 | ||
935 | block->datalen = bytes; | |
936 | block->sector = bio_to_dev_sectors(lc, sector); | |
937 | block->nr_sectors = ALIGN(bytes, lc->sectorsize) >> lc->sectorshift; | |
938 | ||
939 | atomic_inc(&lc->pending_blocks); | |
940 | spin_lock_irq(&lc->blocks_lock); | |
941 | list_add_tail(&block->list, &lc->unflushed_blocks); | |
942 | spin_unlock_irq(&lc->blocks_lock); | |
943 | wake_up_process(lc->log_kthread); | |
944 | ||
945 | return 0; | |
946 | } | |
947 | ||
98d82f48 RZ |
948 | static long log_writes_dax_direct_access(struct dm_target *ti, pgoff_t pgoff, |
949 | long nr_pages, void **kaddr, pfn_t *pfn) | |
950 | { | |
951 | struct log_writes_c *lc = ti->private; | |
952 | sector_t sector = pgoff * PAGE_SECTORS; | |
953 | int ret; | |
954 | ||
955 | ret = bdev_dax_pgoff(lc->dev->bdev, sector, nr_pages * PAGE_SIZE, &pgoff); | |
956 | if (ret) | |
957 | return ret; | |
958 | return dax_direct_access(lc->dev->dax_dev, pgoff, nr_pages, kaddr, pfn); | |
959 | } | |
960 | ||
961 | static size_t log_writes_dax_copy_from_iter(struct dm_target *ti, | |
962 | pgoff_t pgoff, void *addr, size_t bytes, | |
963 | struct iov_iter *i) | |
964 | { | |
965 | struct log_writes_c *lc = ti->private; | |
966 | sector_t sector = pgoff * PAGE_SECTORS; | |
967 | int err; | |
968 | ||
969 | if (bdev_dax_pgoff(lc->dev->bdev, sector, ALIGN(bytes, PAGE_SIZE), &pgoff)) | |
970 | return 0; | |
971 | ||
972 | /* Don't bother doing anything if logging has been disabled */ | |
973 | if (!lc->logging_enabled) | |
974 | goto dax_copy; | |
975 | ||
976 | err = log_dax(lc, sector, bytes, i); | |
977 | if (err) { | |
978 | DMWARN("Error %d logging DAX write", err); | |
979 | return 0; | |
980 | } | |
981 | dax_copy: | |
982 | return dax_copy_from_iter(lc->dev->dax_dev, pgoff, addr, bytes, i); | |
983 | } | |
b3a9a0c3 DW |
984 | |
985 | static size_t log_writes_dax_copy_to_iter(struct dm_target *ti, | |
986 | pgoff_t pgoff, void *addr, size_t bytes, | |
987 | struct iov_iter *i) | |
988 | { | |
989 | struct log_writes_c *lc = ti->private; | |
990 | sector_t sector = pgoff * PAGE_SECTORS; | |
991 | ||
992 | if (bdev_dax_pgoff(lc->dev->bdev, sector, ALIGN(bytes, PAGE_SIZE), &pgoff)) | |
993 | return 0; | |
994 | return dax_copy_to_iter(lc->dev->dax_dev, pgoff, addr, bytes, i); | |
995 | } | |
996 | ||
cdf6cdcd VG |
997 | static int log_writes_dax_zero_page_range(struct dm_target *ti, pgoff_t pgoff, |
998 | size_t nr_pages) | |
999 | { | |
1000 | int ret; | |
1001 | struct log_writes_c *lc = ti->private; | |
1002 | sector_t sector = pgoff * PAGE_SECTORS; | |
1003 | ||
1004 | ret = bdev_dax_pgoff(lc->dev->bdev, sector, nr_pages << PAGE_SHIFT, | |
1005 | &pgoff); | |
1006 | if (ret) | |
1007 | return ret; | |
1008 | return dax_zero_page_range(lc->dev->dax_dev, pgoff, | |
1009 | nr_pages << PAGE_SHIFT); | |
1010 | } | |
1011 | ||
976431b0 DW |
1012 | #else |
/* Without CONFIG_DAX_DRIVER every DAX hook in the target_type table is NULL. */
#define log_writes_dax_direct_access	NULL
#define log_writes_dax_copy_from_iter	NULL
#define log_writes_dax_copy_to_iter	NULL
#define log_writes_dax_zero_page_range	NULL
976431b0 | 1017 | #endif |
98d82f48 | 1018 | |
0e9cebe7 JB |
1019 | static struct target_type log_writes_target = { |
1020 | .name = "log-writes", | |
98d82f48 | 1021 | .version = {1, 1, 0}, |
0e9cebe7 JB |
1022 | .module = THIS_MODULE, |
1023 | .ctr = log_writes_ctr, | |
1024 | .dtr = log_writes_dtr, | |
1025 | .map = log_writes_map, | |
1026 | .end_io = normal_end_io, | |
1027 | .status = log_writes_status, | |
e56f81e0 | 1028 | .prepare_ioctl = log_writes_prepare_ioctl, |
0e9cebe7 JB |
1029 | .message = log_writes_message, |
1030 | .iterate_devices = log_writes_iterate_devices, | |
1031 | .io_hints = log_writes_io_hints, | |
98d82f48 RZ |
1032 | .direct_access = log_writes_dax_direct_access, |
1033 | .dax_copy_from_iter = log_writes_dax_copy_from_iter, | |
b3a9a0c3 | 1034 | .dax_copy_to_iter = log_writes_dax_copy_to_iter, |
cdf6cdcd | 1035 | .dax_zero_page_range = log_writes_dax_zero_page_range, |
0e9cebe7 JB |
1036 | }; |
1037 | ||
1038 | static int __init dm_log_writes_init(void) | |
1039 | { | |
1040 | int r = dm_register_target(&log_writes_target); | |
1041 | ||
1042 | if (r < 0) | |
1043 | DMERR("register failed %d", r); | |
1044 | ||
1045 | return r; | |
1046 | } | |
1047 | ||
1048 | static void __exit dm_log_writes_exit(void) | |
1049 | { | |
1050 | dm_unregister_target(&log_writes_target); | |
1051 | } | |
1052 | ||
1053 | module_init(dm_log_writes_init); | |
1054 | module_exit(dm_log_writes_exit); | |
1055 | ||
1056 | MODULE_DESCRIPTION(DM_NAME " log writes target"); | |
1057 | MODULE_AUTHOR("Josef Bacik <jbacik@fb.com>"); | |
1058 | MODULE_LICENSE("GPL"); |