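/*
 * binject: submit bios to a block device directly from user space.
 *
 * Rough flow, as implemented below: an ioctl on the binject-ctl misc
 * device attaches an already-open block device fd and creates a
 * per-device char node (/dev/binject<minor>). Writing struct
 * b_user_cmd records to that node maps and submits bios; reading it
 * returns completed commands with their error and latency filled in.
 */
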
/*
 * TODO
 *
 * - Proper ioctls
 * - Get rid of device list?
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/idr.h>
#include <linux/file.h>
#include <linux/miscdevice.h>
#include <linux/cdev.h>
#include <linux/bio.h>
#include <linux/blkdev.h>

#include "kcompat.h"
#include "binject.h"

static LIST_HEAD(b_dev_list);
static DEFINE_SPINLOCK(b_dev_lock);
static DEFINE_IDR(b_minor_idr);
static struct kmem_cache *b_slab;
static struct class *b_class;
static int b_major;

#define B_MAX_DEVS 64

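/*
 * One b_dev per attached block device. Lifetime is reference counted:
 * lookups take a reference with atomic_inc_not_zero() under RCU, and
 * the final b_dev_put() unhooks the device, drains commands, and frees
 * the structure after an RCU grace period (see bd_rcu_free()).
 */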
struct b_dev {
	struct list_head device_list;
	struct list_head done_list;
	atomic_t in_flight;
	unsigned int done_cmds;
	wait_queue_head_t wq_done;
	struct block_device *bdev;
	spinlock_t lock;
	atomic_t ref;
	struct file *file;
	struct device *dev;
	int minor;
	struct rcu_head rcu_free;
};

struct b_cmd {
	struct list_head list;
	struct b_dev *bd;
	struct bio *bio;
	struct b_user_cmd cmd;
	u64 issue_time;
};

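/*
 * Translation of user-visible flag bits to kernel bio flags: bit i of
 * b_user_cmd.flags selects uc_flag_map[i], so the order here must
 * match the flag enum (__B_FLAG_NR entries) in binject.h.
 */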
static const unsigned long uc_flag_map[__B_FLAG_NR] = {
	B_REQ_SYNC,
	B_REQ_UNPLUG,
	B_REQ_NOIDLE,
	B_REQ_HARDBARRIER,
	B_REQ_META,
	B_REQ_RAHEAD,
	B_REQ_FAILFAST_DEV,
	B_REQ_FAILFAST_TRANSPORT,
	B_REQ_FAILFAST_DRIVER
};

struct uc_map {
	int type;
	unsigned int data_transfer : 1;
	unsigned int todevice : 1;
	unsigned int map_zero : 1;
	unsigned long rw_flags;
};

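/*
 * Per-command-type dispatch table: whether the type transfers data,
 * in which direction, whether it is backed by zero/throwaway pages
 * rather than mapped user memory, and which bio rw flags the type
 * itself implies.
 */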
static const struct uc_map uc_map[B_TYPE_NR] = {
	{
		.type = B_TYPE_READ,
		.data_transfer = 1,
		.todevice = 0,
		.map_zero = 0,
	},
	{
		.type = B_TYPE_WRITE,
		.data_transfer = 1,
		.todevice = 1,
		.map_zero = 0,
		.rw_flags = B_REQ_WRITE,
	},
	{
		.type = B_TYPE_DISCARD,
		.data_transfer = 0,
		.todevice = 0,
		.map_zero = 0,
		.rw_flags = B_REQ_DISCARD | B_REQ_WRITE,
	},
	{
		.type = B_TYPE_READVOID,
		.data_transfer = 1,
		.todevice = 0,
		.map_zero = 1,
	},
	{
		.type = B_TYPE_WRITEZERO,
		.data_transfer = 1,
		.todevice = 1,
		.map_zero = 1,
		.rw_flags = B_REQ_WRITE,
	},
	{
		.type = B_TYPE_READBARRIER,
		.data_transfer = 1,
		.todevice = 0,
		.map_zero = 0,
		.rw_flags = B_REQ_HARDBARRIER,
	},
	{
		.type = B_TYPE_WRITEBARRIER,
		.data_transfer = 1,
		.todevice = 1,
		.map_zero = 0,
		.rw_flags = B_REQ_HARDBARRIER | B_REQ_FLUSH | B_REQ_WRITE,
	}
};

static void b_dev_complete_commands(struct b_dev *bd);

static void b_dev_remove_lookup(struct b_dev *bd)
{
	if (!list_empty(&bd->device_list)) {
		list_del_init(&bd->device_list);
		idr_remove(&b_minor_idr, bd->minor);
	}
}

static void bd_rcu_free(struct rcu_head *head)
{
	kfree(container_of(head, struct b_dev, rcu_free));
}

static void b_dev_put(struct b_dev *bd)
{
	if (!atomic_dec_and_test(&bd->ref))
		return;

	spin_lock(&b_dev_lock);
	b_dev_remove_lookup(bd);
	spin_unlock(&b_dev_lock);

	b_dev_complete_commands(bd);

	device_destroy(b_class, MKDEV(b_major, bd->minor));
	fput(bd->file);
	module_put(THIS_MODULE);

	call_rcu(&bd->rcu_free, bd_rcu_free);
}

static struct b_cmd *get_free_command(struct b_dev *bd)
{
	struct b_cmd *bc;

	bc = kmem_cache_alloc(b_slab, GFP_KERNEL);
	if (bc) {
		bc->bd = bd;
		return bc;
	}

	return ERR_PTR(-ENOMEM);
}

static struct b_cmd *get_completed_command(struct b_dev *bd)
{
	struct b_cmd *bc = NULL;

	spin_lock_irq(&bd->lock);
	if (!list_empty(&bd->done_list)) {
		bc = list_entry(bd->done_list.next, struct b_cmd, list);
		bd->done_cmds--;
		list_del_init(&bc->list);
	}
	spin_unlock_irq(&bd->lock);
	return bc;
}

static struct b_cmd *get_done_command(struct b_dev *bd, int block)
{
	struct b_cmd *bc;
	int ret;

	do {
		bc = get_completed_command(bd);
		if (bc)
			break;

		if (!block)
			break;

		ret = wait_event_interruptible(bd->wq_done, bd->done_cmds);
		if (ret) {
			bc = ERR_PTR(-ERESTARTSYS);
			break;
		}
	} while (1);

	return bc;
}

/*
 * Free pages attached by zero_map_bio(); the shared ZERO_PAGE must
 * never be freed.
 */
static void bc_put_bio_pages(struct bio *bio)
{
	struct bio_vec *bv;
	unsigned int i;

	__bio_for_each_segment(bv, bio, i, 0) {
		if (bv->bv_page != ZERO_PAGE(0))
			__free_page(bv->bv_page);
	}
}

static void complete_and_free_bio(struct b_cmd *bc)
{
	if (bc->bio) {
		const struct uc_map *ucm = &uc_map[bc->cmd.type];

		if (ucm->data_transfer) {
			if (!ucm->map_zero)
				bio_unmap_user(bc->bio);
			else
				bc_put_bio_pages(bc->bio);
		}
		bio_put(bc->bio);
		bc->bio = NULL;
	}
}

static void b_dev_complete_commands(struct b_dev *bd)
{
	struct b_cmd *bc;

	wait_event(bd->wq_done, !atomic_read(&bd->in_flight));

	while ((bc = get_completed_command(bd)) != NULL) {
		complete_and_free_bio(bc);
		/* nobody will read this command back, free it too */
		kmem_cache_free(b_slab, bc);
	}
}

static int b_dev_validate_command(struct b_user_cmd *buc)
{
	if (!binject_buc_check_magic(buc))
		return -EINVAL;

	switch (buc->type) {
	case B_TYPE_WRITE:
	case B_TYPE_READ:
	case B_TYPE_DISCARD:
	case B_TYPE_READVOID:
	case B_TYPE_WRITEZERO:
	case B_TYPE_READBARRIER:
	case B_TYPE_WRITEBARRIER:
		if (buc->len)
			return 0;
		return -EINVAL;
	default:
		return -EINVAL;
	}
}

static void b_cmd_endio(struct bio *bio, int error)
{
	struct b_cmd *bc = bio->bi_private;
	struct b_dev *bd = bc->bd;
	unsigned long flags;
	u64 now;

	now = ktime_to_ns(ktime_get());
	bc->cmd.nsec = now - bc->issue_time;
	bc->cmd.error = error;

	spin_lock_irqsave(&bd->lock, flags);
	list_add_tail(&bc->list, &bd->done_list);
	bd->done_cmds++;
	spin_unlock_irqrestore(&bd->lock, flags);

	atomic_dec(&bd->in_flight);

	wake_up(&bd->wq_done);
}

#if LINUX_VERSION_CODE <= KERNEL_VERSION(2, 6, 18)
static int bio_cmd_endio(struct bio *bio, unsigned int bytes, int err)
{
	if (bio->bi_size)
		return 1;

	b_cmd_endio(bio, err);
	return 0;
}
#else
static void bio_cmd_endio(struct bio *bio, int err)
{
	b_cmd_endio(bio, err);
}
#endif

#define len_to_pages(len)	(((len) + PAGE_SIZE - 1) / PAGE_SIZE)

static int zero_map_bio(struct request_queue *q, struct bio *bio,
			const struct uc_map *ucm, unsigned int len)
{
	unsigned int i, nr_pages, this_len, ret;
	struct page *page;
	int err;

	nr_pages = len_to_pages(len);
	for (i = 0; i < nr_pages; i++) {
		if (ucm->todevice)
			page = ZERO_PAGE(0);
		else {
			page = alloc_page(GFP_KERNEL);
			if (!page) {
				err = -ENOMEM;
				goto oom;
			}
		}

		this_len = PAGE_SIZE;
		if (this_len > len)
			this_len = len;

		ret = bio_add_pc_page(q, bio, page, this_len, 0);
		if (ret < this_len) {
			/* not yet in the bio, cleanup below won't see it */
			if (page != ZERO_PAGE(0))
				__free_page(page);
			err = -E2BIG;
			goto oom;
		}
		len -= this_len;
	}
	return 0;
oom:
	bc_put_bio_pages(bio);
	return err;
}

static void map_uc_to_bio_flags(struct bio *bio, struct b_user_cmd *uc)
{
	unsigned int i;

	/* only bits that have a mapping are honoured */
	for (i = 0; i < __B_FLAG_NR; i++) {
		unsigned long mask;

		if (uc->flags & (1UL << i))
			bio->bi_rw |= uc_flag_map[i];

		mask = ~((1UL << i) - 1);
		if (!(mask & uc->flags))
			break;
	}
}

static struct bio *map_uc_to_bio(struct b_dev *bd, struct b_user_cmd *uc)
{
	struct request_queue *q = bdev_get_queue(bd->bdev);
	const struct uc_map *ucm = &uc_map[uc->type];
	struct bio *bio;

	if (ucm->data_transfer && !ucm->map_zero) {
		bio = binject_map_bio(q, bd->bdev, uc->buf, uc->len,
					!ucm->todevice, GFP_KERNEL);
	} else {
		bio = bio_alloc(GFP_KERNEL, len_to_pages(uc->len));
		if (bio) {
			bio->bi_bdev = bd->bdev;
			if (ucm->map_zero && uc->len) {
				int err;

				err = zero_map_bio(q, bio, ucm, uc->len);
				if (err) {
					bio_put(bio);
					bio = ERR_PTR(err);
				}
			} else
				bio->bi_size = uc->len;
		}
	}

	if (!bio)
		bio = ERR_PTR(-ENOMEM);
	else if (!IS_ERR(bio)) {
		map_uc_to_bio_flags(bio, uc);
		bio->bi_sector = uc->offset / binject_get_bs(q);
		bio->bi_rw |= ucm->rw_flags;
	}

	return bio;
}

static int b_dev_add_command(struct b_dev *bd, struct b_cmd *bc)
{
	struct b_user_cmd *uc = &bc->cmd;
	struct bio *bio;

	bio = map_uc_to_bio(bd, uc);
	if (IS_ERR(bio))
		return PTR_ERR(bio);

	/*
	 * Hold an extra reference so bc->bio stays valid until the
	 * completion side is done with it (dropped again in
	 * complete_and_free_bio()).
	 */
	bio_get(bio);
	bc->bio = bio;

	bio->bi_end_io = bio_cmd_endio;
	bio->bi_private = bc;

	bc->issue_time = ktime_to_ns(ktime_get());

	atomic_inc(&bd->in_flight);
	submit_bio(bio->bi_rw, bio);
	return 0;
}

static void b_dev_free_command(struct b_dev *bd, struct b_cmd *bc)
{
	BUG_ON(!list_empty(&bc->list));
	kmem_cache_free(b_slab, bc);
}

/*
 * We are always writable, as we have an infinite queue depth
 */
static unsigned int b_dev_poll(struct file *file, poll_table *wait)
{
	struct b_dev *bd = file->private_data;
	unsigned int mask = POLLOUT;

	poll_wait(file, &bd->wq_done, wait);

	spin_lock_irq(&bd->lock);
	if (!list_empty(&bd->done_list))
		mask |= POLLIN | POLLRDNORM;
	spin_unlock_irq(&bd->lock);

	return mask;
}

static int b_dev_release(struct inode *inode, struct file *file)
{
	struct b_dev *bd = file->private_data;

	b_dev_put(bd);
	return 0;
}

static struct b_dev *b_dev_lookup(int minor)
{
	struct b_dev *bd;

	rcu_read_lock();
	bd = idr_find(&b_minor_idr, minor);
	if (bd && !atomic_inc_not_zero(&bd->ref))
		bd = NULL;
	rcu_read_unlock();

	return bd;
}

static int b_dev_open(struct inode *inode, struct file *file)
{
	struct b_dev *bd;

	bd = b_dev_lookup(iminor(inode));
	if (!bd)
		return -ENODEV;

	file->private_data = bd;
	return 0;
}

static ssize_t b_dev_write(struct file *file, const char __user *buf,
			   size_t count, loff_t *ppos)
{
	struct b_dev *bd = file->private_data;
	struct b_cmd *bc = NULL;
	unsigned int total;
	ssize_t done = 0;
	int err = 0;

	if (count % sizeof(struct b_user_cmd))
		return -EINVAL;

	total = count / sizeof(struct b_user_cmd);
	while (total) {
		bc = get_free_command(bd);
		if (IS_ERR(bc)) {
			err = PTR_ERR(bc);
			bc = NULL;
			break;
		}

		if (copy_from_user(&bc->cmd, buf, sizeof(struct b_user_cmd))) {
			err = -EFAULT;
			break;
		}

		err = b_dev_validate_command(&bc->cmd);
		if (err)
			break;

		err = b_dev_add_command(bd, bc);
		if (err)
			break;

		done += sizeof(struct b_user_cmd);
		buf += sizeof(struct b_user_cmd);
		total--;
		bc = NULL;	/* now owned by the in-flight bio */
	}

	if (bc)
		b_dev_free_command(bd, bc);

	*ppos = done;
	if (!done)
		done = err;

	return done;
}

static ssize_t b_dev_read(struct file *file, char __user *buf, size_t count,
			  loff_t *ppos)
{
	struct b_dev *bd = file->private_data;
	unsigned int total;
	ssize_t done = 0;
	int err = 0;

	if (count % sizeof(struct b_user_cmd))
		return -EINVAL;

	total = count / sizeof(struct b_user_cmd);
	while (total) {
		struct b_cmd *bc;

		bc = get_done_command(bd, !(file->f_flags & O_NONBLOCK));
		if (!bc) {
			/* O_NONBLOCK and nothing completed yet */
			if (!done)
				err = -EAGAIN;
			break;
		}
		if (IS_ERR(bc)) {
			err = PTR_ERR(bc);
			break;
		}

		complete_and_free_bio(bc);

		if (copy_to_user(buf, &bc->cmd, sizeof(bc->cmd)))
			err = -EFAULT;

		b_dev_free_command(bd, bc);

		if (err)
			break;

		done += sizeof(struct b_user_cmd);
		buf += sizeof(struct b_user_cmd);
		total--;
	}

	*ppos = done;
	if (!done)
		done = err;

	return done;
}

static const struct file_operations b_dev_fops = {
	.open = b_dev_open,
	.release = b_dev_release,
	.read = b_dev_read,
	.write = b_dev_write,
	.poll = b_dev_poll,
	.owner = THIS_MODULE,
};

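/*
 * Hypothetical userspace sketch (not part of this module) of driving a
 * per-device node. It assumes the struct b_user_cmd layout from
 * binject.h (type/offset/len/buf/flags, with error/nsec filled in on
 * completion) and that a helper exists to set the magic field that
 * b_dev_validate_command() checks:
 *
 *	struct b_user_cmd uc = { 0 };
 *	int fd = open("/dev/binject0", O_RDWR);
 *
 *	binject_buc_set_magic(&uc);	// or fill the magic field by hand
 *	uc.type = B_TYPE_READ;
 *	uc.offset = 0;			// bytes; converted to sectors above
 *	uc.len = 4096;
 *	uc.buf = (unsigned long) buffer;
 *
 *	write(fd, &uc, sizeof(uc));	// queues one bio
 *	read(fd, &uc, sizeof(uc));	// blocks until a command completes
 *	printf("error %d, %llu nsec\n", uc.error,
 *		(unsigned long long) uc.nsec);
 */
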
static int b_del_dev(struct b_ioctl_cmd *bic)
{
	struct b_dev *bd;

	bd = b_dev_lookup(bic->minor);
	if (bd) {
		spin_lock(&b_dev_lock);
		b_dev_remove_lookup(bd);
		spin_unlock(&b_dev_lock);

		/*
		 * Our lookup grabbed a reference, drop two
		 */
		b_dev_put(bd);
		b_dev_put(bd);
		return 0;
	}

	return -ENODEV;
}

static int b_add_dev(struct b_ioctl_cmd *bic)
{
	struct inode *inode;
	struct file *file;
	struct b_dev *bd;
	int ret;

	file = fget(bic->fd);
	if (!file)
		return -EBADF;

	__module_get(THIS_MODULE);

	inode = file->f_mapping->host;
	if (!S_ISBLK(inode->i_mode)) {
		ret = -EINVAL;
		goto out_put;
	}

	ret = idr_pre_get(&b_minor_idr, GFP_KERNEL);
	if (!ret) {
		ret = -ENOMEM;
		goto out_put;
	}

	bd = kzalloc(sizeof(*bd), GFP_KERNEL);
	if (!bd) {
		ret = -ENOMEM;
		goto out_put;
	}

	atomic_set(&bd->ref, 1);
	spin_lock_init(&bd->lock);
	INIT_LIST_HEAD(&bd->device_list);	/* before it is visible in the idr */
	INIT_LIST_HEAD(&bd->done_list);
	init_waitqueue_head(&bd->wq_done);
	bd->file = file;
	bd->bdev = inode->i_bdev;

	spin_lock(&b_dev_lock);

	ret = idr_get_new(&b_minor_idr, bd, &bd->minor);
	if (ret < 0)
		goto out_unlock;

	if (bd->minor >= B_MAX_DEVS) {
		ret = -ENOSPC;
		goto out_idr;
	}

	spin_unlock(&b_dev_lock);

	bd->dev = binject_device_create(b_class, NULL,
			MKDEV(b_major, bd->minor), bd, "binject%d", bd->minor);

	spin_lock(&b_dev_lock);

	if (IS_ERR(bd->dev)) {
		ret = PTR_ERR(bd->dev);
		goto out_idr;
	}

	list_add_tail(&bd->device_list, &b_dev_list);
	bic->minor = bd->minor;
	spin_unlock(&b_dev_lock);
	return 0;
out_idr:
	idr_remove(&b_minor_idr, bd->minor);
out_unlock:
	spin_unlock(&b_dev_lock);
	kfree(bd);
out_put:
	fput(file);
	module_put(THIS_MODULE);
	return ret;
}

static long b_misc_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
	void __user *uarg = (void __user *) arg;
	struct b_ioctl_cmd bic;
	int ret = -ENOTTY;

	if (copy_from_user(&bic, uarg, sizeof(bic)))
		return -EFAULT;

	switch (cmd) {
	case 0:
		ret = b_add_dev(&bic);
		if (!ret && copy_to_user(uarg, &bic, sizeof(bic))) {
			b_del_dev(&bic);
			ret = -EFAULT;
		}
		break;
	case 1:
		ret = b_del_dev(&bic);
		break;
	default:
		break;
	}

	return ret;
}

static const struct file_operations b_misc_fops = {
	.unlocked_ioctl = b_misc_ioctl,
	.owner = THIS_MODULE,
};

static struct miscdevice b_misc_dev = {
	.minor = MISC_DYNAMIC_MINOR,
	.name = "binject-ctl",
	.fops = &b_misc_fops,
};

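/*
 * Hypothetical userspace sketch (not part of this module) of the
 * control path. The raw command numbers 0 (add) and 1 (del) mirror the
 * switch in b_misc_ioctl() (see the TODO about proper ioctls), and
 * struct b_ioctl_cmd is assumed to carry at least the fd and minor
 * fields used above:
 *
 *	struct b_ioctl_cmd bic = { 0 };
 *	int ctl = open("/dev/binject-ctl", O_RDWR);
 *
 *	bic.fd = open("/dev/sdb", O_RDWR);	// block device to attach
 *	ioctl(ctl, 0, &bic);			// add; fills in bic.minor
 *	// ... issue commands via /dev/binject<minor> ...
 *	ioctl(ctl, 1, &bic);			// del; detach by minor
 */
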
static void __exit b_exit(void)
{
	class_destroy(b_class);
	unregister_chrdev(b_major, "binject");
	misc_deregister(&b_misc_dev);
	synchronize_rcu();
	kmem_cache_destroy(b_slab);
}

static void b_cmd_init_once(void *data)
{
	struct b_cmd *bc = data;

	INIT_LIST_HEAD(&bc->list);
}

static int __init b_init(void)
{
	int ret;

	b_slab = binject_create_slab("binject", sizeof(struct b_cmd),
					SLAB_HWCACHE_ALIGN, b_cmd_init_once);
	if (!b_slab) {
		printk(KERN_ERR "binject: failed to create cmd slab\n");
		return -ENOMEM;
	}

	ret = misc_register(&b_misc_dev);
	if (ret < 0)
		goto fail_misc;

	b_major = register_chrdev(0, "binject", &b_dev_fops);
	if (b_major < 0) {
		ret = b_major;
		goto fail_chr;
	}

	b_class = class_create(THIS_MODULE, "binject");
	if (IS_ERR(b_class)) {
		ret = PTR_ERR(b_class);
		goto fail_class;
	}

	return 0;
fail_class:
	unregister_chrdev(b_major, "binject");
fail_chr:
	misc_deregister(&b_misc_dev);
fail_misc:
	kmem_cache_destroy(b_slab);
	return ret;
}

module_init(b_init);
module_exit(b_exit);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Jens Axboe <jaxboe@fusionio.com>");