Commit | Line | Data |
---|---|---|
3bd94003 | 1 | // SPDX-License-Identifier: GPL-2.0-only |
d3c7b35c HM |
2 | /* |
3 | * Copyright (C) 2020 Red Hat GmbH | |
4 | * | |
5 | * This file is released under the GPL. | |
6 | * | |
7 | * Device-mapper target to emulate smaller logical block | |
8 | * size on backing devices exposing (natively) larger ones. | |
9 | * | |
10 | * E.g. 512 byte sector emulation on 4K native disks. | |
11 | */ | |
12 | ||
13 | #include "dm.h" | |
14 | #include <linux/module.h> | |
15 | #include <linux/workqueue.h> | |
16 | #include <linux/dm-bufio.h> | |
17 | ||
18 | #define DM_MSG_PREFIX "ebs" | |
19 | ||
20 | static void ebs_dtr(struct dm_target *ti); | |
21 | ||
22 | /* Emulated block size context. */ | |
23 | struct ebs_c { | |
24 | struct dm_dev *dev; /* Underlying device to emulate block size on. */ | |
25 | struct dm_bufio_client *bufio; /* Use dm-bufio for read and read-modify-write processing. */ | |
26 | struct workqueue_struct *wq; /* Workqueue for ^ processing of bios. */ | |
27 | struct work_struct ws; /* Work item used for ^. */ | |
28 | struct bio_list bios_in; /* Worker bios input list. */ | |
29 | spinlock_t lock; /* Guard bios input list above. */ | |
30 | sector_t start; /* <start> table line argument, see ebs_ctr below. */ | |
31 | unsigned int e_bs; /* Emulated block size in sectors exposed to upper layer. */ | |
1c72e023 | 32 | unsigned int u_bs; /* Underlying block size in sectors retrieved from/set on lower layer device. */ |
d3c7b35c HM |
33 | unsigned char block_shift; /* bitshift sectors -> blocks used in dm-bufio API. */ |
34 | bool u_bs_set:1; /* Flag to indicate underlying block size is set on table line. */ | |
35 | }; | |
36 | ||
37 | static inline sector_t __sector_to_block(struct ebs_c *ec, sector_t sector) | |
38 | { | |
39 | return sector >> ec->block_shift; | |
40 | } | |
41 | ||
42 | static inline sector_t __block_mod(sector_t sector, unsigned int bs) | |
43 | { | |
44 | return sector & (bs - 1); | |
45 | } | |
46 | ||
1c72e023 | 47 | /* Return number of blocks for a bio, accounting for misalignment of start and end sectors. */ |
d3c7b35c HM |
48 | static inline unsigned int __nr_blocks(struct ebs_c *ec, struct bio *bio) |
49 | { | |
50 | sector_t end_sector = __block_mod(bio->bi_iter.bi_sector, ec->u_bs) + bio_sectors(bio); | |
51 | ||
52 | return __sector_to_block(ec, end_sector) + (__block_mod(end_sector, ec->u_bs) ? 1 : 0); | |
53 | } | |
54 | ||
55 | static inline bool __ebs_check_bs(unsigned int bs) | |
56 | { | |
57 | return bs && is_power_of_2(bs); | |
58 | } | |
59 | ||
60 | /* | |
61 | * READ/WRITE: | |
62 | * | |
63 | * copy blocks between bufio blocks and bio vector's (partial/overlapping) pages. | |
64 | */ | |
67a7b9a5 BVA |
65 | static int __ebs_rw_bvec(struct ebs_c *ec, enum req_op op, struct bio_vec *bv, |
66 | struct bvec_iter *iter) | |
d3c7b35c HM |
67 | { |
68 | int r = 0; | |
69 | unsigned char *ba, *pa; | |
70 | unsigned int cur_len; | |
71 | unsigned int bv_len = bv->bv_len; | |
72 | unsigned int buf_off = to_bytes(__block_mod(iter->bi_sector, ec->u_bs)); | |
73 | sector_t block = __sector_to_block(ec, iter->bi_sector); | |
74 | struct dm_buffer *b; | |
75 | ||
76 | if (unlikely(!bv->bv_page || !bv_len)) | |
77 | return -EIO; | |
78 | ||
3a8ba33b | 79 | pa = bvec_virt(bv); |
d3c7b35c HM |
80 | |
81 | /* Handle overlapping page <-> blocks */ | |
82 | while (bv_len) { | |
83 | cur_len = min(dm_bufio_get_block_size(ec->bufio) - buf_off, bv_len); | |
84 | ||
85 | /* Avoid reading for writes in case bio vector's page overwrites block completely. */ | |
67a7b9a5 | 86 | if (op == REQ_OP_READ || buf_off || bv_len < dm_bufio_get_block_size(ec->bufio)) |
d3c7b35c HM |
87 | ba = dm_bufio_read(ec->bufio, block, &b); |
88 | else | |
89 | ba = dm_bufio_new(ec->bufio, block, &b); | |
90 | ||
52252ade | 91 | if (IS_ERR(ba)) { |
d3c7b35c HM |
92 | /* |
93 | * Carry on with next buffer, if any, to issue all possible | |
94 | * data but return error. | |
95 | */ | |
96 | r = PTR_ERR(ba); | |
97 | } else { | |
98 | /* Copy data to/from bio to buffer if read/new was successful above. */ | |
99 | ba += buf_off; | |
67a7b9a5 | 100 | if (op == REQ_OP_READ) { |
d3c7b35c HM |
101 | memcpy(pa, ba, cur_len); |
102 | flush_dcache_page(bv->bv_page); | |
103 | } else { | |
104 | flush_dcache_page(bv->bv_page); | |
105 | memcpy(ba, pa, cur_len); | |
106 | dm_bufio_mark_partial_buffer_dirty(b, buf_off, buf_off + cur_len); | |
107 | } | |
108 | ||
109 | dm_bufio_release(b); | |
110 | } | |
111 | ||
112 | pa += cur_len; | |
113 | bv_len -= cur_len; | |
114 | buf_off = 0; | |
115 | block++; | |
116 | } | |
117 | ||
118 | return r; | |
119 | } | |
120 | ||
121 | /* READ/WRITE: iterate bio vector's copying between (partial) pages and bufio blocks. */ | |
67a7b9a5 | 122 | static int __ebs_rw_bio(struct ebs_c *ec, enum req_op op, struct bio *bio) |
d3c7b35c HM |
123 | { |
124 | int r = 0, rr; | |
125 | struct bio_vec bv; | |
126 | struct bvec_iter iter; | |
127 | ||
128 | bio_for_each_bvec(bv, bio, iter) { | |
67a7b9a5 | 129 | rr = __ebs_rw_bvec(ec, op, &bv, &iter); |
d3c7b35c HM |
130 | if (rr) |
131 | r = rr; | |
132 | } | |
133 | ||
134 | return r; | |
135 | } | |
136 | ||
a5089a95 HM |
137 | /* |
138 | * Discard bio's blocks, i.e. pass discards down. | |
139 | * | |
140 | * Avoid discarding partial blocks at beginning and end; | |
141 | * return 0 in case no blocks can be discarded as a result. | |
142 | */ | |
143 | static int __ebs_discard_bio(struct ebs_c *ec, struct bio *bio) | |
144 | { | |
145 | sector_t block, blocks, sector = bio->bi_iter.bi_sector; | |
146 | ||
147 | block = __sector_to_block(ec, sector); | |
148 | blocks = __nr_blocks(ec, bio); | |
149 | ||
150 | /* | |
151 | * Partial first underlying block (__nr_blocks() may have | |
152 | * resulted in one block). | |
153 | */ | |
154 | if (__block_mod(sector, ec->u_bs)) { | |
155 | block++; | |
156 | blocks--; | |
157 | } | |
158 | ||
159 | /* Partial last underlying block if any. */ | |
160 | if (blocks && __block_mod(bio_end_sector(bio), ec->u_bs)) | |
161 | blocks--; | |
162 | ||
163 | return blocks ? dm_bufio_issue_discard(ec->bufio, block, blocks) : 0; | |
164 | } | |
165 | ||
166 | /* Release blocks them from the bufio cache. */ | |
167 | static void __ebs_forget_bio(struct ebs_c *ec, struct bio *bio) | |
d3c7b35c HM |
168 | { |
169 | sector_t blocks, sector = bio->bi_iter.bi_sector; | |
170 | ||
171 | blocks = __nr_blocks(ec, bio); | |
334b4fc1 MP |
172 | |
173 | dm_bufio_forget_buffers(ec->bufio, __sector_to_block(ec, sector), blocks); | |
d3c7b35c HM |
174 | } |
175 | ||
1c72e023 | 176 | /* Worker function to process incoming bios. */ |
d3c7b35c HM |
177 | static void __ebs_process_bios(struct work_struct *ws) |
178 | { | |
179 | int r; | |
180 | bool write = false; | |
181 | sector_t block1, block2; | |
182 | struct ebs_c *ec = container_of(ws, struct ebs_c, ws); | |
183 | struct bio *bio; | |
184 | struct bio_list bios; | |
185 | ||
186 | bio_list_init(&bios); | |
187 | ||
188 | spin_lock_irq(&ec->lock); | |
189 | bios = ec->bios_in; | |
190 | bio_list_init(&ec->bios_in); | |
191 | spin_unlock_irq(&ec->lock); | |
192 | ||
193 | /* Prefetch all read and any mis-aligned write buffers */ | |
194 | bio_list_for_each(bio, &bios) { | |
195 | block1 = __sector_to_block(ec, bio->bi_iter.bi_sector); | |
196 | if (bio_op(bio) == REQ_OP_READ) | |
197 | dm_bufio_prefetch(ec->bufio, block1, __nr_blocks(ec, bio)); | |
198 | else if (bio_op(bio) == REQ_OP_WRITE && !(bio->bi_opf & REQ_PREFLUSH)) { | |
199 | block2 = __sector_to_block(ec, bio_end_sector(bio)); | |
200 | if (__block_mod(bio->bi_iter.bi_sector, ec->u_bs)) | |
201 | dm_bufio_prefetch(ec->bufio, block1, 1); | |
202 | if (__block_mod(bio_end_sector(bio), ec->u_bs) && block2 != block1) | |
203 | dm_bufio_prefetch(ec->bufio, block2, 1); | |
204 | } | |
205 | } | |
206 | ||
207 | bio_list_for_each(bio, &bios) { | |
208 | r = -EIO; | |
209 | if (bio_op(bio) == REQ_OP_READ) | |
67a7b9a5 | 210 | r = __ebs_rw_bio(ec, REQ_OP_READ, bio); |
d3c7b35c HM |
211 | else if (bio_op(bio) == REQ_OP_WRITE) { |
212 | write = true; | |
67a7b9a5 | 213 | r = __ebs_rw_bio(ec, REQ_OP_WRITE, bio); |
d3c7b35c | 214 | } else if (bio_op(bio) == REQ_OP_DISCARD) { |
a5089a95 HM |
215 | __ebs_forget_bio(ec, bio); |
216 | r = __ebs_discard_bio(ec, bio); | |
d3c7b35c HM |
217 | } |
218 | ||
219 | if (r < 0) | |
220 | bio->bi_status = errno_to_blk_status(r); | |
221 | } | |
222 | ||
223 | /* | |
224 | * We write dirty buffers after processing I/O on them | |
225 | * but before we endio thus addressing REQ_FUA/REQ_SYNC. | |
226 | */ | |
227 | r = write ? dm_bufio_write_dirty_buffers(ec->bufio) : 0; | |
228 | ||
229 | while ((bio = bio_list_pop(&bios))) { | |
230 | /* Any other request is endioed. */ | |
231 | if (unlikely(r && bio_op(bio) == REQ_OP_WRITE)) | |
232 | bio_io_error(bio); | |
233 | else | |
234 | bio_endio(bio); | |
235 | } | |
236 | } | |
237 | ||
238 | /* | |
239 | * Construct an emulated block size mapping: <dev_path> <offset> <ebs> [<ubs>] | |
240 | * | |
241 | * <dev_path>: path of the underlying device | |
242 | * <offset>: offset in 512 bytes sectors into <dev_path> | |
243 | * <ebs>: emulated block size in units of 512 bytes exposed to the upper layer | |
244 | * [<ubs>]: underlying block size in units of 512 bytes imposed on the lower layer; | |
8ca817c4 | 245 | * optional, if not supplied, retrieve logical block size from underlying device |
d3c7b35c HM |
246 | */ |
247 | static int ebs_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |
248 | { | |
249 | int r; | |
250 | unsigned short tmp1; | |
251 | unsigned long long tmp; | |
252 | char dummy; | |
253 | struct ebs_c *ec; | |
254 | ||
255 | if (argc < 3 || argc > 4) { | |
256 | ti->error = "Invalid argument count"; | |
257 | return -EINVAL; | |
258 | } | |
259 | ||
260 | ec = ti->private = kzalloc(sizeof(*ec), GFP_KERNEL); | |
261 | if (!ec) { | |
262 | ti->error = "Cannot allocate ebs context"; | |
263 | return -ENOMEM; | |
264 | } | |
265 | ||
266 | r = -EINVAL; | |
267 | if (sscanf(argv[1], "%llu%c", &tmp, &dummy) != 1 || | |
268 | tmp != (sector_t)tmp || | |
269 | (sector_t)tmp >= ti->len) { | |
270 | ti->error = "Invalid device offset sector"; | |
271 | goto bad; | |
272 | } | |
273 | ec->start = tmp; | |
274 | ||
275 | if (sscanf(argv[2], "%hu%c", &tmp1, &dummy) != 1 || | |
276 | !__ebs_check_bs(tmp1) || | |
277 | to_bytes(tmp1) > PAGE_SIZE) { | |
278 | ti->error = "Invalid emulated block size"; | |
279 | goto bad; | |
280 | } | |
281 | ec->e_bs = tmp1; | |
282 | ||
283 | if (argc > 3) { | |
284 | if (sscanf(argv[3], "%hu%c", &tmp1, &dummy) != 1 || !__ebs_check_bs(tmp1)) { | |
285 | ti->error = "Invalid underlying block size"; | |
286 | goto bad; | |
287 | } | |
288 | ec->u_bs = tmp1; | |
289 | ec->u_bs_set = true; | |
290 | } else | |
291 | ec->u_bs_set = false; | |
292 | ||
293 | r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &ec->dev); | |
294 | if (r) { | |
295 | ti->error = "Device lookup failed"; | |
296 | ec->dev = NULL; | |
297 | goto bad; | |
298 | } | |
299 | ||
300 | r = -EINVAL; | |
301 | if (!ec->u_bs_set) { | |
302 | ec->u_bs = to_sector(bdev_logical_block_size(ec->dev->bdev)); | |
303 | if (!__ebs_check_bs(ec->u_bs)) { | |
304 | ti->error = "Invalid retrieved underlying block size"; | |
305 | goto bad; | |
306 | } | |
307 | } | |
308 | ||
309 | if (!ec->u_bs_set && ec->e_bs == ec->u_bs) | |
310 | DMINFO("Emulation superfluous: emulated equal to underlying block size"); | |
311 | ||
312 | if (__block_mod(ec->start, ec->u_bs)) { | |
313 | ti->error = "Device offset must be multiple of underlying block size"; | |
314 | goto bad; | |
315 | } | |
316 | ||
0fcb100d NH |
317 | ec->bufio = dm_bufio_client_create(ec->dev->bdev, to_bytes(ec->u_bs), 1, |
318 | 0, NULL, NULL, 0); | |
d3c7b35c HM |
319 | if (IS_ERR(ec->bufio)) { |
320 | ti->error = "Cannot create dm bufio client"; | |
321 | r = PTR_ERR(ec->bufio); | |
322 | ec->bufio = NULL; | |
323 | goto bad; | |
324 | } | |
325 | ||
326 | ec->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM); | |
327 | if (!ec->wq) { | |
328 | ti->error = "Cannot create dm-" DM_MSG_PREFIX " workqueue"; | |
329 | r = -ENOMEM; | |
330 | goto bad; | |
331 | } | |
332 | ||
333 | ec->block_shift = __ffs(ec->u_bs); | |
334 | INIT_WORK(&ec->ws, &__ebs_process_bios); | |
335 | bio_list_init(&ec->bios_in); | |
336 | spin_lock_init(&ec->lock); | |
337 | ||
338 | ti->num_flush_bios = 1; | |
339 | ti->num_discard_bios = 1; | |
340 | ti->num_secure_erase_bios = 0; | |
d3c7b35c HM |
341 | ti->num_write_zeroes_bios = 0; |
342 | return 0; | |
343 | bad: | |
344 | ebs_dtr(ti); | |
345 | return r; | |
346 | } | |
347 | ||
348 | static void ebs_dtr(struct dm_target *ti) | |
349 | { | |
350 | struct ebs_c *ec = ti->private; | |
351 | ||
352 | if (ec->wq) | |
353 | destroy_workqueue(ec->wq); | |
354 | if (ec->bufio) | |
355 | dm_bufio_client_destroy(ec->bufio); | |
356 | if (ec->dev) | |
357 | dm_put_device(ti, ec->dev); | |
358 | kfree(ec); | |
359 | } | |
360 | ||
361 | static int ebs_map(struct dm_target *ti, struct bio *bio) | |
362 | { | |
363 | struct ebs_c *ec = ti->private; | |
364 | ||
365 | bio_set_dev(bio, ec->dev->bdev); | |
366 | bio->bi_iter.bi_sector = ec->start + dm_target_offset(ti, bio->bi_iter.bi_sector); | |
367 | ||
4cb6f226 | 368 | if (unlikely(bio_op(bio) == REQ_OP_FLUSH)) |
d3c7b35c HM |
369 | return DM_MAPIO_REMAPPED; |
370 | /* | |
371 | * Only queue for bufio processing in case of partial or overlapping buffers | |
372 | * -or- | |
373 | * emulation with ebs == ubs aiming for tests of dm-bufio overhead. | |
374 | */ | |
375 | if (likely(__block_mod(bio->bi_iter.bi_sector, ec->u_bs) || | |
376 | __block_mod(bio_end_sector(bio), ec->u_bs) || | |
377 | ec->e_bs == ec->u_bs)) { | |
378 | spin_lock_irq(&ec->lock); | |
379 | bio_list_add(&ec->bios_in, bio); | |
380 | spin_unlock_irq(&ec->lock); | |
381 | ||
382 | queue_work(ec->wq, &ec->ws); | |
383 | ||
384 | return DM_MAPIO_SUBMITTED; | |
385 | } | |
386 | ||
387 | /* Forget any buffer content relative to this direct backing device I/O. */ | |
388 | __ebs_forget_bio(ec, bio); | |
389 | ||
390 | return DM_MAPIO_REMAPPED; | |
391 | } | |
392 | ||
393 | static void ebs_status(struct dm_target *ti, status_type_t type, | |
86a3238c | 394 | unsigned int status_flags, char *result, unsigned int maxlen) |
d3c7b35c HM |
395 | { |
396 | struct ebs_c *ec = ti->private; | |
397 | ||
398 | switch (type) { | |
399 | case STATUSTYPE_INFO: | |
400 | *result = '\0'; | |
401 | break; | |
402 | case STATUSTYPE_TABLE: | |
403 | snprintf(result, maxlen, ec->u_bs_set ? "%s %llu %u %u" : "%s %llu %u", | |
404 | ec->dev->name, (unsigned long long) ec->start, ec->e_bs, ec->u_bs); | |
405 | break; | |
8ec45662 TS |
406 | case STATUSTYPE_IMA: |
407 | *result = '\0'; | |
408 | break; | |
d3c7b35c HM |
409 | } |
410 | } | |
411 | ||
412 | static int ebs_prepare_ioctl(struct dm_target *ti, struct block_device **bdev) | |
413 | { | |
414 | struct ebs_c *ec = ti->private; | |
415 | struct dm_dev *dev = ec->dev; | |
416 | ||
417 | /* | |
418 | * Only pass ioctls through if the device sizes match exactly. | |
419 | */ | |
420 | *bdev = dev->bdev; | |
6dcbb52c | 421 | return !!(ec->start || ti->len != bdev_nr_sectors(dev->bdev)); |
d3c7b35c HM |
422 | } |
423 | ||
424 | static void ebs_io_hints(struct dm_target *ti, struct queue_limits *limits) | |
425 | { | |
426 | struct ebs_c *ec = ti->private; | |
427 | ||
428 | limits->logical_block_size = to_bytes(ec->e_bs); | |
429 | limits->physical_block_size = to_bytes(ec->u_bs); | |
430 | limits->alignment_offset = limits->physical_block_size; | |
431 | blk_limits_io_min(limits, limits->logical_block_size); | |
432 | } | |
433 | ||
434 | static int ebs_iterate_devices(struct dm_target *ti, | |
435 | iterate_devices_callout_fn fn, void *data) | |
436 | { | |
437 | struct ebs_c *ec = ti->private; | |
438 | ||
439 | return fn(ti, ec->dev, ec->start, ti->len, data); | |
440 | } | |
441 | ||
442 | static struct target_type ebs_target = { | |
443 | .name = "ebs", | |
a5089a95 | 444 | .version = {1, 0, 1}, |
d3c7b35c HM |
445 | .features = DM_TARGET_PASSES_INTEGRITY, |
446 | .module = THIS_MODULE, | |
447 | .ctr = ebs_ctr, | |
448 | .dtr = ebs_dtr, | |
449 | .map = ebs_map, | |
450 | .status = ebs_status, | |
451 | .io_hints = ebs_io_hints, | |
452 | .prepare_ioctl = ebs_prepare_ioctl, | |
453 | .iterate_devices = ebs_iterate_devices, | |
454 | }; | |
3664ff82 | 455 | module_dm(ebs); |
d3c7b35c | 456 | |
fa34e589 | 457 | MODULE_AUTHOR("Heinz Mauelshagen <dm-devel@lists.linux.dev>"); |
d3c7b35c HM |
458 | MODULE_DESCRIPTION(DM_NAME " emulated block size target"); |
459 | MODULE_LICENSE("GPL"); |