Fix buggy early free of bc
[binject.git] / main.c
/*
 * TODO
 *
 * - Proper ioctls
 * - Get rid of device list?
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/idr.h>
#include <linux/file.h>
#include <linux/miscdevice.h>
#include <linux/cdev.h>
#include <linux/bio.h>
#include <linux/blkdev.h>

#include "kcompat.h"
#include "binject.h"

static LIST_HEAD(b_dev_list);
static DEFINE_SPINLOCK(b_dev_lock);
static DEFINE_IDR(b_minor_idr);
static struct kmem_cache *b_slab;
static struct class *b_class;
static int b_major;

#define B_MAX_DEVS      64

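/*
 * A b_dev pins one block device for injection. It holds a reference on
 * the opened file and on this module, starts life with a single ref
 * owned by the minor lookup table, and gains one ref per open of its
 * char device node. Lookup runs under rcu_read_lock() with
 * atomic_inc_not_zero(), so the final put frees through call_rcu().
 */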
struct b_dev {
        struct list_head device_list;
        struct list_head done_list;
        atomic_t in_flight;
        unsigned int done_cmds;
        wait_queue_head_t wq_done;
        struct block_device *bdev;
        spinlock_t lock;
        atomic_t ref;
        struct file *file;
        struct device *dev;
        int minor;
        struct rcu_head rcu_free;
};

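/*
 * One queued command: allocated in the write() path, attached to a bio,
 * moved to the owning device's done_list by the bio end_io handler, and
 * finally copied back to userspace and freed in the read() path.
 */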
struct b_cmd {
        struct list_head list;
        struct b_dev *bd;
        struct bio *bio;
        struct b_user_cmd cmd;
        u64 issue_time;
};

static const unsigned long uc_flag_map[__B_FLAG_NR] = {
        B_REQ_SYNC,
        B_REQ_UNPLUG,
        B_REQ_NOIDLE,
        B_REQ_HARDBARRIER,
        B_REQ_META,
        B_REQ_RAHEAD,
        B_REQ_FAILFAST_DEV,
        B_REQ_FAILFAST_TRANSPORT,
        B_REQ_FAILFAST_DRIVER
};

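/*
 * Per-command-type dispatch: data_transfer says whether the command
 * carries a payload, todevice gives the direction, map_zero backs the
 * payload with zero/throwaway pages instead of mapped user memory, and
 * rw_flags are OR'ed into the resulting bio.
 */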
struct uc_map {
        int type;
        unsigned int data_transfer : 1;
        unsigned int todevice : 1;
        unsigned int map_zero : 1;
        unsigned long rw_flags;
};

static const struct uc_map uc_map[B_TYPE_NR] = {
        {
                .type           = B_TYPE_READ,
                .data_transfer  = 1,
                .todevice       = 0,
                .map_zero       = 0,
        },
        {
                .type           = B_TYPE_WRITE,
                .data_transfer  = 1,
                .todevice       = 1,
                .map_zero       = 0,
                .rw_flags       = B_REQ_WRITE,
        },
        {
                .type           = B_TYPE_DISCARD,
                .data_transfer  = 0,
                .todevice       = 0,
                .map_zero       = 0,
                .rw_flags       = B_REQ_DISCARD | B_REQ_WRITE,
        },
        {
                .type           = B_TYPE_READVOID,
                .data_transfer  = 1,
                .todevice       = 0,
                .map_zero       = 1,
        },
        {
                .type           = B_TYPE_WRITEZERO,
                .data_transfer  = 1,
                .todevice       = 1,
                .map_zero       = 1,
                .rw_flags       = B_REQ_WRITE,
        },
        {
                .type           = B_TYPE_READBARRIER,
                .data_transfer  = 1,
                .todevice       = 0,
                .map_zero       = 0,
                .rw_flags       = B_REQ_HARDBARRIER,
        },
        {
                .type           = B_TYPE_WRITEBARRIER,
                .data_transfer  = 1,
                .todevice       = 1,
                .map_zero       = 0,
                .rw_flags       = B_REQ_HARDBARRIER | B_REQ_FLUSH | B_REQ_WRITE,
        }
};

static void b_dev_complete_commands(struct b_dev *bd);

static void b_dev_remove_lookup(struct b_dev *bd)
{
        if (!list_empty(&bd->device_list)) {
                list_del_init(&bd->device_list);
                idr_remove(&b_minor_idr, bd->minor);
        }
}

static void bd_rcu_free(struct rcu_head *head)
{
        kfree(container_of(head, struct b_dev, rcu_free));
}

static void b_dev_put(struct b_dev *bd)
{
        if (!atomic_dec_and_test(&bd->ref))
                return;

        spin_lock(&b_dev_lock);
        b_dev_remove_lookup(bd);
        spin_unlock(&b_dev_lock);

        b_dev_complete_commands(bd);

        device_destroy(b_class, MKDEV(b_major, bd->minor));
        fput(bd->file);
        module_put(THIS_MODULE);

        call_rcu(&bd->rcu_free, bd_rcu_free);
}

static struct b_cmd *get_free_command(struct b_dev *bd)
{
        struct b_cmd *bc;

        bc = kmem_cache_alloc(b_slab, GFP_KERNEL);
        if (bc) {
                memset(bc, 0, sizeof(*bc));
                INIT_LIST_HEAD(&bc->list);
                bc->bd = bd;
                return bc;
        }

        return ERR_PTR(-ENOMEM);
}

static struct b_cmd *get_completed_command(struct b_dev *bd)
{
        struct b_cmd *bc = NULL;

        spin_lock_irq(&bd->lock);
        if (!list_empty(&bd->done_list)) {
                bc = list_entry(bd->done_list.next, struct b_cmd, list);
                bd->done_cmds--;
                list_del(&bc->list);
        }
        spin_unlock_irq(&bd->lock);
        return bc;
}

static struct b_cmd *get_done_command(struct b_dev *bd, int block)
{
        struct b_cmd *bc;
        int ret;

        do {
                bc = get_completed_command(bd);
                if (bc)
                        break;

                if (!block)
                        break;

                ret = wait_event_interruptible(bd->wq_done, bd->done_cmds);
                if (ret) {
                        bc = ERR_PTR(-ERESTARTSYS);
                        break;
                }
        } while (1);

        return bc;
}

static void bc_put_bio_pages(struct bio *bio)
{
        struct bio_vec *bv;
        unsigned int i;

        __bio_for_each_segment(bv, bio, i, 0) {
                if (bv->bv_page != ZERO_PAGE(0))
                        __free_page(bv->bv_page);
        }
}

static void complete_and_free_bio(struct b_cmd *bc)
{
        if (bc->bio) {
                const struct uc_map *ucm = &uc_map[bc->cmd.type];

                if (ucm->data_transfer) {
                        if (!ucm->map_zero)
                                bio_unmap_user(bc->bio);
                        else
                                bc_put_bio_pages(bc->bio);
                }
                bio_put(bc->bio);
                bc->bio = NULL;
        }
}

static void b_dev_complete_commands(struct b_dev *bd)
{
        struct b_cmd *bc;

        wait_event(bd->wq_done, !atomic_read(&bd->in_flight));

        while ((bc = get_completed_command(bd)) != NULL) {
                complete_and_free_bio(bc);
                /* no read() will ever reap this command, so free it here */
                kmem_cache_free(b_slab, bc);
        }
}

static int b_dev_validate_command(struct b_user_cmd *buc)
{
        if (!binject_buc_check_magic(buc))
                return -EINVAL;

        switch (buc->type) {
        case B_TYPE_WRITE:
        case B_TYPE_READ:
        case B_TYPE_DISCARD:
        case B_TYPE_READVOID:
        case B_TYPE_WRITEZERO:
                if (buc->len)
                        return 0;
                return -EINVAL;
        default:
                return -EINVAL;
        }
}

static void b_cmd_endio(struct bio *bio, int error)
{
        struct b_cmd *bc = bio->bi_private;
        struct b_dev *bd = bc->bd;
        unsigned long flags;
        u64 now;

        now = ktime_to_ns(ktime_get());
        bc->cmd.nsec = now - bc->issue_time;
        bc->cmd.error = error;

        spin_lock_irqsave(&bd->lock, flags);
        list_add_tail(&bc->list, &bd->done_list);
        bd->done_cmds++;
        spin_unlock_irqrestore(&bd->lock, flags);

        atomic_dec(&bd->in_flight);

        wake_up(&bd->wq_done);
}

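/*
 * Kernels up to 2.6.18 invoke ->bi_end_io once per completed chunk;
 * only fire the completion once the whole bio has finished, i.e. when
 * bi_size has drained to zero.
 */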
#if LINUX_VERSION_CODE <= KERNEL_VERSION(2, 6, 18)
static int bio_cmd_endio(struct bio *bio, unsigned int bytes, int err)
{
        if (bio->bi_size)
                return 1;

        b_cmd_endio(bio, err);
        return 0;
}
#else
static void bio_cmd_endio(struct bio *bio, int err)
{
        b_cmd_endio(bio, err);
}
#endif

#define len_to_pages(len)      (((len) + PAGE_SIZE - 1) / PAGE_SIZE)

static int zero_map_bio(struct request_queue *q, struct bio *bio,
                        const struct uc_map *ucm, unsigned int len)
{
        unsigned int i, nr_pages, this_len, ret;
        struct page *page;
        int err;

        nr_pages = len_to_pages(len);
        for (i = 0; i < nr_pages; i++) {
                if (ucm->todevice)
                        page = ZERO_PAGE(0);
                else {
                        page = alloc_page(GFP_KERNEL);
                        if (!page) {
                                err = -ENOMEM;
                                goto err_put;
                        }
                }

                this_len = PAGE_SIZE;
                if (this_len > len)
                        this_len = len;

                ret = bio_add_pc_page(q, bio, page, this_len, 0);
                if (ret < this_len) {
                        /* the page never made it into the bio */
                        if (!ucm->todevice)
                                __free_page(page);
                        err = -E2BIG;
                        goto err_put;
                }
                /* track the remainder so the last page isn't over-mapped */
                len -= this_len;
        }
        return 0;
err_put:
        bc_put_bio_pages(bio);
        return err;
}

static void map_uc_to_bio_flags(struct bio *bio, struct b_user_cmd *uc)
{
        unsigned int i;

        /*
         * Walk the set flag bits, but stop at the table size so a bogus
         * userspace value cannot index past uc_flag_map[].
         */
        for (i = 0; i < __B_FLAG_NR; i++) {
                unsigned long mask;

                if (uc->flags & (1UL << i))
                        bio->bi_rw |= uc_flag_map[i];

                mask = ~((1UL << i) - 1);
                if (!(mask & uc->flags))
                        break;
        }
}

static struct bio *map_uc_to_bio(struct b_dev *bd, struct b_user_cmd *uc)
{
        struct request_queue *q = bdev_get_queue(bd->bdev);
        const struct uc_map *ucm = &uc_map[uc->type];
        struct bio *bio;

        if (ucm->data_transfer && !ucm->map_zero) {
                bio = binject_map_bio(q, bd->bdev, uc->buf, uc->len,
                                        !ucm->todevice, GFP_KERNEL);
        } else {
                bio = bio_alloc(GFP_KERNEL, len_to_pages(uc->len));
                if (bio) {
                        bio->bi_bdev = bd->bdev;
                        if (ucm->map_zero && uc->len) {
                                int err;

                                err = zero_map_bio(q, bio, ucm, uc->len);
                                if (err) {
                                        bio_put(bio);
                                        bio = ERR_PTR(err);
                                }
                        } else
                                bio->bi_size = uc->len;
                }
        }

        if (!bio)
                bio = ERR_PTR(-ENOMEM);
        else if (!IS_ERR(bio)) {
                map_uc_to_bio_flags(bio, uc);
                bio->bi_sector = uc->offset / binject_get_bs(q);
                bio->bi_rw |= ucm->rw_flags;
        }

        return bio;
}

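/*
 * Submit one command. The extra bio_get() keeps the bio alive past its
 * completion so complete_and_free_bio() can still unmap user pages or
 * release zero-map pages when the command is finally reaped.
 */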
static int b_dev_add_command(struct b_dev *bd, struct b_cmd *bc)
{
        struct b_user_cmd *uc = &bc->cmd;
        struct bio *bio;

        bio = map_uc_to_bio(bd, uc);
        if (IS_ERR(bio))
                return PTR_ERR(bio);

        bio_get(bio);
        bc->bio = bio;

        bio->bi_end_io = bio_cmd_endio;
        bio->bi_private = bc;

        bc->issue_time = ktime_to_ns(ktime_get());

        atomic_inc(&bd->in_flight);
        submit_bio(bio->bi_rw, bio);
        return 0;
}

static void b_dev_free_command(struct b_dev *bd, struct b_cmd *bc)
{
        kmem_cache_free(b_slab, bc);
}

/*
 * We are always writable, as we have an infinite queue depth
 */
static unsigned int b_dev_poll(struct file *file, poll_table *wait)
{
        struct b_dev *bd = file->private_data;
        unsigned int mask = POLLOUT;

        poll_wait(file, &bd->wq_done, wait);

        spin_lock_irq(&bd->lock);
        if (!list_empty(&bd->done_list))
                mask |= POLLIN | POLLRDNORM;
        spin_unlock_irq(&bd->lock);

        return mask;
}

static int b_dev_release(struct inode *inode, struct file *file)
{
        struct b_dev *bd = file->private_data;

        b_dev_put(bd);
        return 0;
}

static struct b_dev *b_dev_lookup(int minor)
{
        struct b_dev *bd;

        rcu_read_lock();

        bd = idr_find(&b_minor_idr, minor);
        if (bd && !atomic_inc_not_zero(&bd->ref))
                bd = NULL;

        rcu_read_unlock();
        return bd;
}

static int b_dev_open(struct inode *inode, struct file *file)
{
        struct b_dev *bd;

        bd = b_dev_lookup(iminor(inode));
        if (!bd)
                return -ENODEV;

        file->private_data = bd;
        return 0;
}

static ssize_t b_dev_write(struct file *file, const char __user *buf,
                           size_t count, loff_t *ppos)
{
        struct b_dev *bd = file->private_data;
        struct b_cmd *bc = NULL;
        unsigned int total;
        ssize_t done = 0;
        int err = 0;

        if (count % sizeof(struct b_user_cmd))
                return -EINVAL;

        total = count / sizeof(struct b_user_cmd);
        while (total) {
                bc = get_free_command(bd);
                if (IS_ERR(bc)) {
                        err = PTR_ERR(bc);
                        bc = NULL;
                        break;
                }

                if (copy_from_user(&bc->cmd, buf, sizeof(struct b_user_cmd))) {
                        err = -EFAULT;
                        break;
                }

                err = b_dev_validate_command(&bc->cmd);
                if (err)
                        break;

                err = b_dev_add_command(bd, bc);
                if (err)
                        break;

                /*
                 * The command is in flight now and will be freed by the
                 * read() path; only a command that never got submitted is
                 * freed below.
                 */
                done += sizeof(struct b_user_cmd);
                buf += sizeof(struct b_user_cmd);
                total--;
                bc = NULL;
        }

        if (bc)
                b_dev_free_command(bd, bc);

        *ppos = done;
        if (!done)
                done = err;

        return done;
}

static ssize_t b_dev_read(struct file *file, char __user *buf, size_t count,
                          loff_t *ppos)
{
        struct b_dev *bd = file->private_data;
        unsigned int total;
        ssize_t done = 0;
        int err = 0;

        if (count % sizeof(struct b_user_cmd))
                return -EINVAL;

        total = count / sizeof(struct b_user_cmd);
        while (total) {
                struct b_cmd *bc;

                bc = get_done_command(bd, !(file->f_flags & O_NONBLOCK));
                if (IS_ERR(bc)) {
                        err = PTR_ERR(bc);
                        break;
                }

                complete_and_free_bio(bc);

                /* copy the result out before bc itself is freed */
                if (copy_to_user(buf, &bc->cmd, sizeof(bc->cmd)))
                        err = -EFAULT;

                b_dev_free_command(bd, bc);

                if (err)
                        break;

                done += sizeof(struct b_user_cmd);
                buf += sizeof(struct b_user_cmd);
                total--;
        }

        *ppos = done;
        if (!done)
                done = err;

        return done;
}

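/*
 * The per-device node speaks fixed-size records: write() submits one
 * struct b_user_cmd per record, read() returns completed commands with
 * ->error and ->nsec filled in, and poll() signals POLLIN when
 * completions are pending; POLLOUT is always set.
 */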
static const struct file_operations b_dev_fops = {
        .open           = b_dev_open,
        .release        = b_dev_release,
        .read           = b_dev_read,
        .write          = b_dev_write,
        .poll           = b_dev_poll,
        .owner          = THIS_MODULE,
};

static int b_del_dev(struct b_ioctl_cmd *bic)
{
        struct b_dev *bd;

        bd = b_dev_lookup(bic->minor);
        if (bd) {
                spin_lock(&b_dev_lock);
                b_dev_remove_lookup(bd);
                spin_unlock(&b_dev_lock);

                /*
                 * Our lookup grabbed a reference; drop that one and the
                 * original creation reference.
                 */
                b_dev_put(bd);
                b_dev_put(bd);
                return 0;
        }

        return -ENODEV;
}

static int b_add_dev(struct b_ioctl_cmd *bic)
{
        struct inode *inode;
        struct file *file;
        struct b_dev *bd;
        int ret;

        file = fget(bic->fd);
        if (!file)
                return -EBADF;

        __module_get(THIS_MODULE);

        inode = file->f_mapping->host;
        if (!S_ISBLK(inode->i_mode)) {
                ret = -EINVAL;
                goto out_put;
        }

        ret = idr_pre_get(&b_minor_idr, GFP_KERNEL);
        if (!ret) {
                ret = -ENOMEM;
                goto out_put;
        }

        bd = kzalloc(sizeof(*bd), GFP_KERNEL);
        if (!bd) {
                ret = -ENOMEM;
                goto out_put;
        }

        atomic_set(&bd->ref, 1);
        spin_lock_init(&bd->lock);
        INIT_LIST_HEAD(&bd->done_list);
        init_waitqueue_head(&bd->wq_done);
        bd->file = file;
        bd->bdev = inode->i_bdev;

        spin_lock(&b_dev_lock);

        ret = idr_get_new(&b_minor_idr, bd, &bd->minor);
        if (ret < 0)
                goto out_unlock;

        if (bd->minor >= B_MAX_DEVS) {
                ret = -ENOSPC;
                goto out_idr;
        }

        spin_unlock(&b_dev_lock);

        INIT_LIST_HEAD(&bd->device_list);
        bd->dev = binject_device_create(b_class, NULL,
                        MKDEV(b_major, bd->minor), bd, "binject%d", bd->minor);

        spin_lock(&b_dev_lock);

        if (IS_ERR(bd->dev)) {
                ret = PTR_ERR(bd->dev);
                goto out_idr;
        }

        list_add_tail(&bd->device_list, &b_dev_list);
        spin_unlock(&b_dev_lock);
        return 0;
out_idr:
        idr_remove(&b_minor_idr, bd->minor);
out_unlock:
        spin_unlock(&b_dev_lock);
        kfree(bd);
out_put:
        fput(file);
        module_put(THIS_MODULE);
        return ret;
}

static long b_misc_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
        void __user *uarg = (void __user *) arg;
        struct b_ioctl_cmd bic;

        if (copy_from_user(&bic, uarg, sizeof(bic)))
                return -EFAULT;

        /* TODO: replace these raw command numbers with proper ioctls */
        switch (cmd) {
        case 0:
                return b_add_dev(&bic);
        case 1:
                return b_del_dev(&bic);
        default:
                break;
        }

        return -ENOTTY;
}

static const struct file_operations b_misc_fops = {
        .unlocked_ioctl = b_misc_ioctl,
        .owner          = THIS_MODULE,
};

static struct miscdevice b_misc_dev = {
        .minor          = MISC_DYNAMIC_MINOR,
        .name           = "binject-ctl",
        .fops           = &b_misc_fops,
};
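
/*
 * Example userspace flow (a sketch: field names follow their use in this
 * file; see binject.h for the full b_user_cmd/b_ioctl_cmd layout and the
 * magic value that b_dev_validate_command() checks):
 *
 *      int cfd = open("/dev/binject-ctl", O_RDWR);
 *      struct b_ioctl_cmd bic = { .fd = open("/dev/sdX", O_RDWR) };
 *      ioctl(cfd, 0, &bic);                     // add device, case 0 above
 *
 *      int dfd = open("/dev/binject0", O_RDWR); // node for assigned minor
 *      struct b_user_cmd uc = { ... };          // type/buf/len/offset/flags
 *      write(dfd, &uc, sizeof(uc));             // submit
 *      read(dfd, &uc, sizeof(uc));              // reap: error/nsec filled in
 */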

static void __exit b_exit(void)
{
        class_destroy(b_class);
        unregister_chrdev(b_major, "binject");
        misc_deregister(&b_misc_dev);
        /* ensure pending bd_rcu_free() callbacks finish before unload */
        rcu_barrier();
        kmem_cache_destroy(b_slab);
}

static int __init b_init(void)
{
        int ret;

        b_slab = binject_create_slab("binject", sizeof(struct b_cmd));
        if (!b_slab) {
                printk(KERN_ERR "binject: failed to create cmd slab\n");
                return -ENOMEM;
        }

        ret = misc_register(&b_misc_dev);
        if (ret < 0)
                goto fail_misc;

        b_major = register_chrdev(0, "binject", &b_dev_fops);
        if (b_major < 0) {
                ret = b_major;
                goto fail_chr;
        }

        b_class = class_create(THIS_MODULE, "binject");
        if (IS_ERR(b_class)) {
                ret = PTR_ERR(b_class);
                goto fail_class;
        }

        return 0;
fail_class:
        unregister_chrdev(b_major, "binject");
fail_chr:
        misc_deregister(&b_misc_dev);
fail_misc:
        kmem_cache_destroy(b_slab);
        return ret;
}

module_init(b_init);
module_exit(b_exit);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Jens Axboe <jaxboe@fusionio.com>");