binject: standalone module
[binject.git] / main.c
/*
 * TODO
 *
 * - Proper ioctls
 * - Get rid of device list?
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/idr.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/device.h>
#include <linux/uaccess.h>
#include <linux/miscdevice.h>
#include <linux/cdev.h>
#include <linux/bio.h>
#include <linux/blkdev.h>

#include "kcompat.h"
#include "binject.h"

static LIST_HEAD(b_dev_list);
static DEFINE_SPINLOCK(b_dev_lock);
static DEFINE_IDR(b_minor_idr);
static struct kmem_cache *b_slab;
static struct class *b_class;
static int b_major;

#define B_MAX_DEVS	64

struct b_dev {
	struct list_head device_list;
	struct list_head done_list;
	atomic_t in_flight;
	unsigned int done_cmds;
	wait_queue_head_t wq_done;
	struct block_device *bdev;
	spinlock_t lock;
	atomic_t ref;
	struct file *file;
	struct device *dev;
	int minor;
	struct rcu_head rcu_free;
};

struct b_cmd {
	struct list_head list;
	struct b_dev *bd;
	struct bio *bio;
	struct b_user_cmd cmd;
	u64 issue_time;
};

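/*
 * Bit i set in b_user_cmd.flags selects uc_flag_map[i]; map_uc_to_bio_flags()
 * ORs the selected B_REQ_* values into bio->bi_rw before submission.
 */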
static const unsigned long uc_flag_map[__B_FLAG_NR] = {
	B_REQ_SYNC,
	B_REQ_UNPLUG,
	B_REQ_NOIDLE,
	B_REQ_HARDBARRIER,
	B_REQ_META,
	B_REQ_RAHEAD,
	B_REQ_FAILFAST_DEV,
	B_REQ_FAILFAST_TRANSPORT,
	B_REQ_FAILFAST_DRIVER
};

struct uc_map {
	int type;
	unsigned int data_transfer : 1;
	unsigned int todevice : 1;
	unsigned int map_zero : 1;
};

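/*
 * Per command type: whether any data is transferred, in which direction, and
 * whether the data pages are synthesized in the kernel (ZERO_PAGE for writes,
 * throwaway pages for reads) instead of being mapped from the user buffer.
 */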
static const struct uc_map uc_map[B_TYPE_NR] = {
	{
		.type = B_TYPE_READ,
		.data_transfer = 1,
		.todevice = 0,
		.map_zero = 0,
	},
	{
		.type = B_TYPE_WRITE,
		.data_transfer = 1,
		.todevice = 1,
		.map_zero = 0,
	},
	{
		.type = B_TYPE_DISCARD,
		.data_transfer = 0,
		.todevice = 0,
		.map_zero = 0,
	},
	{
		.type = B_TYPE_READVOID,
		.data_transfer = 1,
		.todevice = 0,
		.map_zero = 1,
	},
	{
		.type = B_TYPE_WRITEZERO,
		.data_transfer = 1,
		.todevice = 1,
		.map_zero = 1,
	}
};

static void b_dev_complete_commands(struct b_dev *bd);

static void b_dev_remove_lookup(struct b_dev *bd)
{
	if (!list_empty(&bd->device_list)) {
		list_del_init(&bd->device_list);
		idr_remove(&b_minor_idr, bd->minor);
	}
}

static void bd_rcu_free(struct rcu_head *head)
{
	kfree(container_of(head, struct b_dev, rcu_free));
}

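/*
 * Drop a device reference. The final put removes the device from the lookup
 * structures, waits for and reaps any remaining commands, tears down the
 * device node, releases the backing file, and frees the structure after an
 * RCU grace period (b_dev_lookup() walks the idr under rcu_read_lock()).
 */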
static void b_dev_put(struct b_dev *bd)
{
	if (!atomic_dec_and_test(&bd->ref))
		return;

	spin_lock(&b_dev_lock);
	b_dev_remove_lookup(bd);
	spin_unlock(&b_dev_lock);

	b_dev_complete_commands(bd);

	device_destroy(b_class, MKDEV(b_major, bd->minor));
	fput(bd->file);
	module_put(THIS_MODULE);

	call_rcu(&bd->rcu_free, bd_rcu_free);
}

static struct b_cmd *get_free_command(struct b_dev *bd)
{
	struct b_cmd *bc;

	bc = kmem_cache_alloc(b_slab, GFP_KERNEL);
	if (bc) {
		memset(bc, 0, sizeof(*bc));
		INIT_LIST_HEAD(&bc->list);
		bc->bd = bd;
		return bc;
	}

	return ERR_PTR(-ENOMEM);
}

static struct b_cmd *get_completed_command(struct b_dev *bd)
{
	struct b_cmd *bc = NULL;

	spin_lock_irq(&bd->lock);
	if (!list_empty(&bd->done_list)) {
		bc = list_entry(bd->done_list.next, struct b_cmd, list);
		bd->done_cmds--;
		list_del(&bc->list);
	}
	spin_unlock_irq(&bd->lock);
	return bc;
}

static struct b_cmd *get_done_command(struct b_dev *bd, int block)
{
	struct b_cmd *bc;
	int ret;

	do {
		bc = get_completed_command(bd);
		if (bc)
			break;

		if (!block)
			break;

		ret = wait_event_interruptible(bd->wq_done, bd->done_cmds);
		if (ret) {
			bc = ERR_PTR(-ERESTARTSYS);
			break;
		}
	} while (1);

	return bc;
}

static void bc_put_bio_pages(struct bio *bio)
{
	struct bio_vec *bv;
	unsigned int i;

	__bio_for_each_segment(bv, bio, i, 0) {
		if (bv->bv_page != ZERO_PAGE(0))
			__free_page(bv->bv_page);
	}
}

static void complete_and_free_bio(struct b_cmd *bc)
{
	if (bc->bio) {
		const struct uc_map *ucm = &uc_map[bc->cmd.type];

		if (ucm->data_transfer) {
			if (!ucm->map_zero)
				bio_unmap_user(bc->bio);
			else
				bc_put_bio_pages(bc->bio);
		}
		bio_put(bc->bio);
		bc->bio = NULL;
	}
}

static void b_dev_complete_commands(struct b_dev *bd)
{
	struct b_cmd *bc;

	wait_event(bd->wq_done, !atomic_read(&bd->in_flight));

	while ((bc = get_completed_command(bd)) != NULL) {
		complete_and_free_bio(bc);
		kmem_cache_free(b_slab, bc);
	}
}

static int b_dev_validate_command(struct b_user_cmd *buc)
{
	if (!binject_buc_check_magic(buc))
		return -EINVAL;

	switch (buc->type) {
	case B_TYPE_WRITE:
	case B_TYPE_READ:
	case B_TYPE_DISCARD:
	case B_TYPE_READVOID:
	case B_TYPE_WRITEZERO:
		if (buc->len)
			return 0;
		return -EINVAL;
	default:
		return -EINVAL;
	}
}

static void b_cmd_endio(struct bio *bio, int error)
{
	struct b_cmd *bc = bio->bi_private;
	struct b_dev *bd = bc->bd;
	unsigned long flags;
	u64 now;

	now = ktime_to_ns(ktime_get());
	bc->cmd.nsec = now - bc->issue_time;
	bc->cmd.error = error;

	spin_lock_irqsave(&bd->lock, flags);
	list_add_tail(&bc->list, &bd->done_list);
	bd->done_cmds++;
	spin_unlock_irqrestore(&bd->lock, flags);

	atomic_dec(&bd->in_flight);

	wake_up(&bd->wq_done);
}

#if LINUX_VERSION_CODE <= KERNEL_VERSION(2, 6, 18)
static int bio_cmd_endio(struct bio *bio, unsigned int bytes, int err)
{
	if (bio->bi_size)
		return 1;

	b_cmd_endio(bio, err);
	return 0;
}
#else
static void bio_cmd_endio(struct bio *bio, int err)
{
	b_cmd_endio(bio, err);
}
#endif

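/*
 * Build the data pages for the "void" transfer types: writes (B_TYPE_WRITEZERO)
 * point every segment at ZERO_PAGE(0), reads (B_TYPE_READVOID) get throwaway
 * pages that bc_put_bio_pages() releases once the command is reaped.
 */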
static int zero_map_bio(struct request_queue *q, struct bio *bio,
			const struct uc_map *ucm, unsigned int len)
{
	unsigned int i, nr_pages, this_len, ret;
	struct page *page;
	int err;

	nr_pages = (len + PAGE_SIZE - 1) / PAGE_SIZE;
	for (i = 0; i < nr_pages; i++) {
		if (ucm->todevice)
			page = ZERO_PAGE(0);
		else {
			page = alloc_page(GFP_KERNEL);
			if (!page) {
				err = -ENOMEM;
				goto oom;
			}
		}

		this_len = PAGE_SIZE;
		if (this_len > len)
			this_len = len;

		ret = bio_add_pc_page(q, bio, page, this_len, 0);
		if (ret < this_len) {
			if (page != ZERO_PAGE(0))
				__free_page(page);
			err = -E2BIG;
			goto oom;
		}

		len -= this_len;
	}
	return 0;
oom:
	bc_put_bio_pages(bio);
	return err;
}

static void map_uc_to_bio_flags(struct bio *bio, struct b_user_cmd *uc)
{
	unsigned int i;

	for (i = 0; i < __B_FLAG_NR; i++) {
		unsigned long mask;

		if (uc->flags & (1UL << i))
			bio->bi_rw |= uc_flag_map[i];

		mask = ~((1UL << i) - 1);
		if (!(mask & uc->flags))
			break;
	}
}

static struct bio *map_uc_to_bio(struct b_dev *bd, struct b_user_cmd *uc)
{
	struct request_queue *q = bdev_get_queue(bd->bdev);
	const struct uc_map *ucm = &uc_map[uc->type];
	struct bio *bio;

	if (ucm->data_transfer && !ucm->map_zero) {
		bio = binject_map_bio(q, bd->bdev, uc->buf, uc->len,
					!ucm->todevice, GFP_KERNEL);
	} else {
		bio = bio_alloc(GFP_KERNEL, (uc->len + PAGE_SIZE - 1) / PAGE_SIZE);
		if (bio) {
			bio->bi_bdev = bd->bdev;
			if (ucm->todevice)
				binject_mark_bio_write(bio);
			if (ucm->map_zero && uc->len) {
				int err;

				err = zero_map_bio(q, bio, ucm, uc->len);
				if (err) {
					bio_put(bio);
					bio = ERR_PTR(err);
				}
			} else
				bio->bi_size = uc->len;
		}
	}

	if (!bio)
		bio = ERR_PTR(-ENOMEM);
	else if (!IS_ERR(bio)) {
		map_uc_to_bio_flags(bio, uc);
		bio->bi_sector = uc->offset / binject_get_bs(q);
	}

	return bio;
}

static int b_dev_add_command(struct b_dev *bd, struct b_cmd *bc)
{
	struct b_user_cmd *uc = &bc->cmd;
	struct bio *bio;

	bio = map_uc_to_bio(bd, uc);
	if (IS_ERR(bio))
		return PTR_ERR(bio);

	bio_get(bio);
	bc->bio = bio;

	bio->bi_end_io = bio_cmd_endio;
	bio->bi_private = bc;

	bc->issue_time = ktime_to_ns(ktime_get());

	atomic_inc(&bd->in_flight);
	submit_bio(bio->bi_rw, bio);
	return 0;
}

static void b_dev_free_command(struct b_dev *bd, struct b_cmd *bc)
{
	kmem_cache_free(b_slab, bc);
}

/*
 * We are always writable, as we have an infinite queue depth
 */
static unsigned int b_dev_poll(struct file *file, poll_table *wait)
{
	struct b_dev *bd = file->private_data;
	unsigned int mask = POLLOUT;

	poll_wait(file, &bd->wq_done, wait);

	spin_lock_irq(&bd->lock);
	if (!list_empty(&bd->done_list))
		mask |= POLLIN | POLLRDNORM;
	spin_unlock_irq(&bd->lock);

	return mask;
}

static int b_dev_release(struct inode *inode, struct file *file)
{
	struct b_dev *bd = file->private_data;

	b_dev_put(bd);
	return 0;
}

static struct b_dev *b_dev_lookup(int minor)
{
	struct b_dev *bd;

	rcu_read_lock();
	bd = idr_find(&b_minor_idr, minor);
	if (bd) {
		if (!atomic_inc_not_zero(&bd->ref))
			bd = NULL;
	}
	rcu_read_unlock();

	return bd;
}

static int b_dev_open(struct inode *inode, struct file *file)
{
	struct b_dev *bd;

	bd = b_dev_lookup(iminor(inode));
	if (!bd)
		return -ENODEV;

	file->private_data = bd;
	return 0;
}

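/*
 * Submission path: userspace writes an array of struct b_user_cmd. Each
 * element is validated, mapped to a bio and submitted asynchronously. The
 * return value is the number of bytes consumed, or an error if no command
 * could be queued at all.
 */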
static ssize_t b_dev_write(struct file *file, const char __user *buf,
			   size_t count, loff_t *ppos)
{
	struct b_dev *bd = file->private_data;
	unsigned int total;
	ssize_t done = 0;
	int err = 0;

	if (count % sizeof(struct b_user_cmd))
		return -EINVAL;

	total = count / sizeof(struct b_user_cmd);
	while (total) {
		struct b_cmd *bc;

		bc = get_free_command(bd);
		if (IS_ERR(bc)) {
			err = PTR_ERR(bc);
			break;
		}

		if (copy_from_user(&bc->cmd, buf, sizeof(struct b_user_cmd))) {
			b_dev_free_command(bd, bc);
			err = -EFAULT;
			break;
		}

		err = b_dev_validate_command(&bc->cmd);
		if (err) {
			b_dev_free_command(bd, bc);
			break;
		}

		err = b_dev_add_command(bd, bc);
		if (err) {
			b_dev_free_command(bd, bc);
			break;
		}

		done += sizeof(struct b_user_cmd);
		buf += sizeof(struct b_user_cmd);
		total--;
	}

	*ppos = done;
	if (!done)
		done = err;

	return done;
}

static ssize_t b_dev_read(struct file *file, char __user *buf, size_t count,
			  loff_t *ppos)
{
	struct b_dev *bd = file->private_data;
	unsigned int total;
	ssize_t done = 0;
	int err = 0;

	if (count % sizeof(struct b_user_cmd))
		return -EINVAL;

	total = count / sizeof(struct b_user_cmd);
	while (total) {
		struct b_cmd *bc;

		bc = get_done_command(bd, !(file->f_flags & O_NONBLOCK));
		if (IS_ERR(bc)) {
			err = PTR_ERR(bc);
			break;
		}
		if (!bc) {
			err = -EAGAIN;
			break;
		}

		complete_and_free_bio(bc);

		if (copy_to_user(buf, &bc->cmd, sizeof(bc->cmd)))
			err = -EFAULT;

		b_dev_free_command(bd, bc);

		if (err)
			break;

		done += sizeof(struct b_user_cmd);
		buf += sizeof(struct b_user_cmd);
		total--;
	}

	*ppos = done;
	if (!done)
		done = err;

	return done;
}

static const struct file_operations b_dev_fops = {
	.open		= b_dev_open,
	.release	= b_dev_release,
	.read		= b_dev_read,
	.write		= b_dev_write,
	.poll		= b_dev_poll,
	.owner		= THIS_MODULE,
};

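/*
 * Example: issuing a read through a binject device from user space. This is
 * a minimal sketch, not part of the module: the field names used below
 * (type, buf, len, offset, error, nsec) follow their use in this file, but
 * the exact layout of struct b_user_cmd and the magic value checked by
 * binject_buc_check_magic() are defined in binject.h, which the program is
 * assumed to include.
 *
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *	#include <string.h>
 *	#include <unistd.h>
 *	#include "binject.h"
 *
 *	static int binject_read(const char *dev, void *buf, unsigned int len,
 *				unsigned long long offset)
 *	{
 *		struct b_user_cmd uc;
 *		int fd;
 *
 *		fd = open(dev, O_RDWR);			// e.g. /dev/binject0
 *		if (fd < 0)
 *			return -1;
 *
 *		memset(&uc, 0, sizeof(uc));
 *		// set the magic member as binject.h requires (not shown here)
 *		uc.type = B_TYPE_READ;
 *		uc.buf = (unsigned long) buf;		// assumption: buf is carried as an integer
 *		uc.len = len;
 *		uc.offset = offset;			// byte offset, converted to sectors in map_uc_to_bio()
 *
 *		// submit: writes must be a multiple of sizeof(struct b_user_cmd)
 *		if (write(fd, &uc, sizeof(uc)) != sizeof(uc)) {
 *			close(fd);
 *			return -1;
 *		}
 *
 *		// reap: read() blocks until a command completes (no O_NONBLOCK)
 *		if (read(fd, &uc, sizeof(uc)) != sizeof(uc)) {
 *			close(fd);
 *			return -1;
 *		}
 *
 *		printf("error=%d nsec=%llu\n", uc.error,
 *			(unsigned long long) uc.nsec);
 *		close(fd);
 *		return uc.error;
 *	}
 */
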
static int b_del_dev(struct b_ioctl_cmd *bic)
{
	struct b_dev *bd;

	bd = b_dev_lookup(bic->minor);
	if (bd) {
		spin_lock(&b_dev_lock);
		b_dev_remove_lookup(bd);
		spin_unlock(&b_dev_lock);

		/*
		 * Our lookup grabbed a reference, drop two
		 */
		b_dev_put(bd);
		b_dev_put(bd);
		return 0;
	}

	return -ENODEV;
}

static int b_add_dev(struct b_ioctl_cmd *bic)
{
	struct inode *inode;
	struct file *file;
	struct b_dev *bd;
	int ret;

	file = fget(bic->fd);
	if (!file)
		return -EBADF;

	__module_get(THIS_MODULE);

	inode = file->f_mapping->host;
	if (!S_ISBLK(inode->i_mode)) {
		ret = -EINVAL;
		goto out_put;
	}

	ret = idr_pre_get(&b_minor_idr, GFP_KERNEL);
	if (!ret) {
		ret = -ENOMEM;
		goto out_put;
	}

	bd = kzalloc(sizeof(*bd), GFP_KERNEL);
	if (!bd) {
		ret = -ENOMEM;
		goto out_put;
	}

	atomic_set(&bd->ref, 1);
	spin_lock_init(&bd->lock);
	INIT_LIST_HEAD(&bd->done_list);
	init_waitqueue_head(&bd->wq_done);
	bd->file = file;
	bd->bdev = inode->i_bdev;

	spin_lock(&b_dev_lock);

	ret = idr_get_new(&b_minor_idr, bd, &bd->minor);
	if (ret < 0)
		goto out_unlock;

	if (bd->minor >= B_MAX_DEVS) {
		ret = -ENOSPC;
		goto out_idr;
	}

	spin_unlock(&b_dev_lock);

	INIT_LIST_HEAD(&bd->device_list);
	bd->dev = binject_device_create(b_class, NULL,
			MKDEV(b_major, bd->minor), bd, "binject%d", bd->minor);

	spin_lock(&b_dev_lock);

	if (IS_ERR(bd->dev)) {
		ret = PTR_ERR(bd->dev);
		goto out_idr;
	}

	list_add_tail(&bd->device_list, &b_dev_list);
	spin_unlock(&b_dev_lock);
	return 0;
out_idr:
	idr_remove(&b_minor_idr, bd->minor);
out_unlock:
	spin_unlock(&b_dev_lock);
	kfree(bd);
out_put:
	fput(file);
	module_put(THIS_MODULE);
	return ret;
}

static long b_misc_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
	void __user *uarg = (void __user *) arg;
	struct b_ioctl_cmd bic;

	if (copy_from_user(&bic, uarg, sizeof(bic)))
		return -EFAULT;

	switch (cmd) {
	case 0:
		return b_add_dev(&bic);
	case 1:
		return b_del_dev(&bic);
	default:
		break;
	}

	return -ENOTTY;
}

static const struct file_operations b_misc_fops = {
	.unlocked_ioctl	= b_misc_ioctl,
	.owner		= THIS_MODULE,
};

static struct miscdevice b_misc_dev = {
	.minor	= MISC_DYNAMIC_MINOR,
	.name	= "binject-ctl",
	.fops	= &b_misc_fops,
};

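/*
 * Example: attaching a block device from user space through /dev/binject-ctl.
 * This is a minimal sketch, not part of the module: struct b_ioctl_cmd comes
 * from binject.h (only its .fd member is filled in here, matching its use in
 * b_add_dev() above), and the raw command values 0 (add) and 1 (remove)
 * mirror the current placeholder switch in b_misc_ioctl() -- see the
 * "Proper ioctls" item in the TODO at the top of this file.
 *
 *	#include <fcntl.h>
 *	#include <string.h>
 *	#include <sys/ioctl.h>
 *	#include <unistd.h>
 *	#include "binject.h"
 *
 *	static int binject_attach(const char *blkdev)
 *	{
 *		struct b_ioctl_cmd bic;
 *		int cfd, bfd, ret;
 *
 *		bfd = open(blkdev, O_RDWR);		// e.g. /dev/sdb
 *		if (bfd < 0)
 *			return -1;
 *
 *		cfd = open("/dev/binject-ctl", O_RDWR);
 *		if (cfd < 0) {
 *			close(bfd);
 *			return -1;
 *		}
 *
 *		memset(&bic, 0, sizeof(bic));
 *		bic.fd = bfd;
 *		ret = ioctl(cfd, 0, &bic);		// 0 == add device (placeholder cmd)
 *
 *		close(cfd);
 *		return ret;				// on success a /dev/binject<minor> node appears
 *	}
 */
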
static void __exit b_exit(void)
{
	synchronize_rcu();
	kmem_cache_destroy(b_slab);
	class_destroy(b_class);
	unregister_chrdev(b_major, "binject");
	misc_deregister(&b_misc_dev);
}

static int __init b_init(void)
{
	int ret;

	b_slab = binject_create_slab("binject", sizeof(struct b_cmd));
	if (!b_slab) {
		printk(KERN_ERR "binject: failed to create cmd slab\n");
		return -ENOMEM;
	}

	ret = misc_register(&b_misc_dev);
	if (ret < 0)
		goto fail_misc;

	b_major = register_chrdev(0, "binject", &b_dev_fops);
	if (b_major < 0) {
		ret = b_major;
		goto fail_chr;
	}

	b_class = class_create(THIS_MODULE, "binject");
	if (IS_ERR(b_class)) {
		ret = PTR_ERR(b_class);
		goto fail_class;
	}

	return 0;
fail_class:
	unregister_chrdev(b_major, "binject");
fail_chr:
	misc_deregister(&b_misc_dev);
fail_misc:
	kmem_cache_destroy(b_slab);
	return ret;
}

module_init(b_init);
module_exit(b_exit);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Jens Axboe <jaxboe@fusionio.com>");