1 // SPDX-License-Identifier: GPL-2.0-only
3 * VDUSE: vDPA Device in Userspace
5 * Copyright (C) 2020-2021 Bytedance Inc. and/or its affiliates. All rights reserved.
7 * Author: Xie Yongji <xieyongji@bytedance.com>
11 #include <linux/init.h>
12 #include <linux/module.h>
13 #include <linux/cdev.h>
14 #include <linux/device.h>
15 #include <linux/eventfd.h>
16 #include <linux/slab.h>
17 #include <linux/wait.h>
18 #include <linux/dma-map-ops.h>
19 #include <linux/poll.h>
20 #include <linux/file.h>
21 #include <linux/uio.h>
22 #include <linux/vdpa.h>
23 #include <linux/nospec.h>
24 #include <uapi/linux/vduse.h>
25 #include <uapi/linux/vdpa.h>
26 #include <uapi/linux/virtio_config.h>
27 #include <uapi/linux/virtio_ids.h>
28 #include <uapi/linux/virtio_blk.h>
29 #include <linux/mod_devicetable.h>
31 #include "iova_domain.h"
/* Module metadata strings, consumed by the MODULE_* macros at the end of the file. */
33 #define DRV_AUTHOR "Yongji Xie <xieyongji@bytedance.com>"
34 #define DRV_DESC "vDPA Device in Userspace"
35 #define DRV_LICENSE "GPL v2"
/* Size of the chrdev region and exclusive upper bound for idr_alloc() minors. */
37 #define VDUSE_DEV_MAX (1U << MINORBITS)
/* 64 MiB. NOTE(review): its use is not visible in this excerpt. */
38 #define VDUSE_BOUNCE_SIZE (64 * 1024 * 1024)
/* 128 MiB; vduse_create_dev() passes VDUSE_IOVA_SIZE - 1 to vduse_domain_create(). */
39 #define VDUSE_IOVA_SIZE (128 * 1024 * 1024)
/* Initial dev->msg_timeout; vduse_dev_msg_sync() multiplies it by HZ, so the unit is seconds. */
40 #define VDUSE_MSG_DEFAULT_TIMEOUT 30
/*
 * Per-virtqueue state. Only some members are visible in this excerpt; the
 * notes below describe how the visible ones are used elsewhere in the file.
 */
42 struct vduse_virtqueue {
/* Ring state written by vduse_vdpa_set_vq_state() and zeroed by vduse_dev_reset(). */
49 struct vdpa_vq_state state;
/* eventfd signalled by vduse_vq_kick(); installed by vduse_kickfd_setup(). */
54 struct eventfd_ctx *kickfd;
/* Driver callback invoked from vduse_vq_irq_inject(). */
55 struct vdpa_callback cb;
/* Work item running vduse_vq_irq_inject(). */
56 struct work_struct inject;
/* Work item running vduse_vq_kick_work(). */
57 struct work_struct kick;
/*
 * NOTE(review): the enclosing struct header is not visible in this excerpt.
 * vdpa_to_vduse() uses container_of(..., struct vduse_vdpa, vdpa), so these
 * look like the members of struct vduse_vdpa.
 */
/* Embedded vDPA device. */
63 struct vdpa_device vdpa;
/* Pointer to the VDUSE device. NOTE(review): confirm this is the back-pointer returned by vdpa_to_vduse(). */
64 struct vduse_dev *dev;
/*
 * NOTE(review): the enclosing struct header is not visible in this excerpt.
 * These members are accessed through struct vduse_dev pointers elsewhere in
 * the file (dev->vdev, dev->vqs, dev->domain, ...), and more members exist
 * than are shown here.
 */
/* Tested by vduse_destroy_dev(), which bails out when it is non-NULL. */
68 struct vduse_vdpa *vdev;
/* Array of vq_num virtqueues allocated in vduse_create_dev(). */
70 struct vduse_virtqueue *vqs;
/* IOVA domain created by vduse_domain_create() in vduse_create_dev(). */
71 struct vduse_iova_domain *domain;
/* Woken when a message is queued on send_list; vduse_dev_read_iter() and poll wait on it. */
77 wait_queue_head_t waitq;
/* Messages queued by vduse_dev_msg_sync() and not yet read by userspace. */
78 struct list_head send_list;
/* Messages already read by userspace and awaiting a response. */
79 struct list_head recv_list;
/* Config-change callback invoked from vduse_dev_irq_inject(). */
80 struct vdpa_callback config_cb;
/* Work item running vduse_dev_irq_inject(). */
81 struct work_struct inject;
/* Held for write in vduse_dev_reset() and for read in vduse_dev_queue_irq_work(). */
83 struct rw_semaphore rwsem;
/*
 * One request/response exchange with userspace. Callers allocate it on the
 * stack and hand it to vduse_dev_msg_sync().
 * NOTE(review): a "completed" member is used elsewhere in the file but is
 * not visible in this excerpt.
 */
100 struct vduse_dev_msg {
/* Request copied to userspace by vduse_dev_read_iter(). */
101 struct vduse_dev_request req;
/* Response filled in by vduse_dev_write_iter(). */
102 struct vduse_dev_response resp;
/* Link on dev->send_list or dev->recv_list. */
103 struct list_head list;
/* The sender sleeps here until the message is completed. */
104 wait_queue_head_t waitq;
/*
 * Per-open state of the control device, stored in file->private_data by
 * vduse_open(). NOTE(review): the member list is not visible here; the file
 * reads and writes an api_version field through this type.
 */
108 struct vduse_control {
/* Taken by vduse_ioctl(), vduse_dev_get_from_minor() and vdpa_dev_add(). */
112 static DEFINE_MUTEX(vduse_lock);
/* Maps a minor number to its vduse_dev. */
113 static DEFINE_IDR(vduse_idr);
/* Device number filled in by alloc_chrdev_region() in vduse_init(). */
115 static dev_t vduse_major;
116 static struct class *vduse_class;
/* Character device for the control node. */
117 static struct cdev vduse_ctrl_cdev;
/* Character device for the per-device nodes. */
118 static struct cdev vduse_cdev;
/* Workqueue used by vduse_dev_queue_irq_work(). */
119 static struct workqueue_struct *vduse_irq_wq;
/* Device IDs accepted by device_is_allowed(). NOTE(review): the initializer is not visible in this excerpt. */
121 static u32 allowed_device_id[] = {
/*
 * Convert a vdpa_device into a vduse_dev. The vdpa_device is embedded in
 * struct vduse_vdpa, so container_of() recovers the wrapper.
 * NOTE(review): the return statement is not visible in this excerpt.
 */
125 static inline struct vduse_dev *vdpa_to_vduse(struct vdpa_device *vdpa)
127 struct vduse_vdpa *vdev = container_of(vdpa, struct vduse_vdpa, vdpa);
/* Resolve a generic struct device to its vduse_dev via dev_to_vdpa() and vdpa_to_vduse(). */
132 static inline struct vduse_dev *dev_to_vduse(struct device *dev)
134 struct vdpa_device *vdpa = dev_to_vdpa(dev);
136 return vdpa_to_vduse(vdpa);
/*
 * Search @head for the message whose req.request_id matches the requested id
 * and unlink it from the list when found.
 * NOTE(review): the request_id parameter line and the return statements are
 * not visible in this excerpt.
 */
139 static struct vduse_dev_msg *vduse_find_msg(struct list_head *head,
142 struct vduse_dev_msg *msg;
144 list_for_each_entry(msg, head, list) {
145 if (msg->req.request_id == request_id) {
146 list_del(&msg->list);
/*
 * Detach the first message from @head, if there is one. msg starts out NULL
 * and is only assigned when the list is non-empty.
 * NOTE(review): the return statement is not visible here; presumably msg.
 */
154 static struct vduse_dev_msg *vduse_dequeue_msg(struct list_head *head)
156 struct vduse_dev_msg *msg = NULL;
158 if (!list_empty(head)) {
159 msg = list_first_entry(head, struct vduse_dev_msg, list);
160 list_del(&msg->list);
/*
 * Append @msg to the tail of @head. vduse_dequeue_msg() removes from the
 * front, so the two together give FIFO order.
 */
166 static void vduse_enqueue_msg(struct list_head *head,
167 struct vduse_dev_msg *msg)
169 list_add_tail(&msg->list, head);
/*
 * Fail every outstanding message of @dev. vduse_dev_msg_sync() calls this
 * while holding dev->msg_lock.
 * NOTE(review): the lines that set dev->broken and mark each message as
 * completed are not visible in this excerpt.
 */
172 static void vduse_dev_broken(struct vduse_dev *dev)
174 struct vduse_dev_msg *msg, *tmp;
/* Already broken. NOTE(review): presumably returns early; the body of this branch is not visible. */
176 if (unlikely(dev->broken))
/* Move messages awaiting a response onto send_list so one loop handles both lists. */
179 list_splice_init(&dev->recv_list, &dev->send_list);
180 list_for_each_entry_safe(msg, tmp, &dev->send_list, list) {
181 list_del(&msg->list);
183 msg->resp.result = VDUSE_REQ_RESULT_FAILED;
/* Wake the sender sleeping in vduse_dev_msg_sync(). */
184 wake_up(&msg->waitq);
/* Wake anything waiting on the device wait queue as well. */
187 wake_up(&dev->waitq);
/*
 * Queue @msg for userspace and sleep until it has been answered.
 *
 * The message gets the next request_id, is appended to dev->send_list, and
 * waiters on dev->waitq are woken. The caller then sleeps (killable) on
 * msg->waitq until msg->completed is set, bounded by dev->msg_timeout
 * seconds when that value is non-zero. ret ends up 0 when the response
 * result is VDUSE_REQ_RESULT_OK and -EIO otherwise.
 * NOTE(review): the early exits for a broken device and the final return are
 * not visible in this excerpt.
 */
190 static int vduse_dev_msg_sync(struct vduse_dev *dev,
191 struct vduse_dev_msg *msg)
/* Unlocked check first; it is repeated under msg_lock below. */
195 if (unlikely(dev->broken))
198 init_waitqueue_head(&msg->waitq);
199 spin_lock(&dev->msg_lock);
200 if (unlikely(dev->broken)) {
201 spin_unlock(&dev->msg_lock);
204 msg->req.request_id = dev->msg_unique++;
205 vduse_enqueue_msg(&dev->send_list, msg);
206 wake_up(&dev->waitq);
207 spin_unlock(&dev->msg_lock);
/* A zero msg_timeout selects the unbounded wait further down. */
208 if (dev->msg_timeout)
209 ret = wait_event_killable_timeout(msg->waitq, msg->completed,
210 (long)dev->msg_timeout * HZ);
212 ret = wait_event_killable(msg->waitq, msg->completed);
214 spin_lock(&dev->msg_lock);
/* The wait ended without completion: unlink the message and fail it. */
215 if (!msg->completed) {
216 list_del(&msg->list);
217 msg->resp.result = VDUSE_REQ_RESULT_FAILED;
218 /* Mark the device as malfunction when there is a timeout */
220 vduse_dev_broken(dev);
222 ret = (msg->resp.result == VDUSE_REQ_RESULT_OK) ? 0 : -EIO;
223 spin_unlock(&dev->msg_lock);
/*
 * Ask userspace for the packed-ring state of @vq with a VDUSE_GET_VQ_STATE
 * message and copy the reply into @packed. The two wrap counters are masked
 * to one bit and the two indices to 15 bits.
 * NOTE(review): the error check after vduse_dev_msg_sync() and the return
 * are not visible in this excerpt.
 */
228 static int vduse_dev_get_vq_state_packed(struct vduse_dev *dev,
229 struct vduse_virtqueue *vq,
230 struct vdpa_vq_state_packed *packed)
232 struct vduse_dev_msg msg = { 0 };
235 msg.req.type = VDUSE_GET_VQ_STATE;
236 msg.req.vq_state.index = vq->index;
238 ret = vduse_dev_msg_sync(dev, &msg);
242 packed->last_avail_counter =
243 msg.resp.vq_state.packed.last_avail_counter & 0x0001;
244 packed->last_avail_idx =
245 msg.resp.vq_state.packed.last_avail_idx & 0x7FFF;
246 packed->last_used_counter =
247 msg.resp.vq_state.packed.last_used_counter & 0x0001;
248 packed->last_used_idx =
249 msg.resp.vq_state.packed.last_used_idx & 0x7FFF;
/*
 * Ask userspace for the split-ring state of @vq with a VDUSE_GET_VQ_STATE
 * message and copy the reported avail_index into @split.
 * NOTE(review): the error check and the return are not visible in this
 * excerpt.
 */
254 static int vduse_dev_get_vq_state_split(struct vduse_dev *dev,
255 struct vduse_virtqueue *vq,
256 struct vdpa_vq_state_split *split)
258 struct vduse_dev_msg msg = { 0 };
261 msg.req.type = VDUSE_GET_VQ_STATE;
262 msg.req.vq_state.index = vq->index;
264 ret = vduse_dev_msg_sync(dev, &msg);
268 split->avail_index = msg.resp.vq_state.split.avail_index;
/*
 * Send a VDUSE_SET_STATUS message carrying @status to userspace and wait for
 * the reply. Returns the result of vduse_dev_msg_sync().
 */
273 static int vduse_dev_set_status(struct vduse_dev *dev, u8 status)
275 struct vduse_dev_msg msg = { 0 };
277 msg.req.type = VDUSE_SET_STATUS;
278 msg.req.s.status = status;
280 return vduse_dev_msg_sync(dev, &msg);
/*
 * Tell userspace that the IOTLB range [start, last] changed by sending a
 * VDUSE_UPDATE_IOTLB message. Returns the result of vduse_dev_msg_sync().
 * NOTE(review): the parameter lines for start/last and any checks before the
 * message is filled in are not visible in this excerpt.
 */
283 static int vduse_dev_update_iotlb(struct vduse_dev *dev,
286 struct vduse_dev_msg msg = { 0 };
291 msg.req.type = VDUSE_UPDATE_IOTLB;
292 msg.req.iova.start = start;
293 msg.req.iova.last = last;
295 return vduse_dev_msg_sync(dev, &msg);
/*
 * read_iter handler: deliver one pending request to userspace.
 *
 * A message is taken from dev->send_list, its request is copied to @to, and
 * it is then queued on dev->recv_list to await a response. When no message
 * is pending the caller sleeps on dev->waitq.
 * NOTE(review): the loop structure, the error codes and the conditions that
 * choose between the two enqueue calls are not visible in this excerpt.
 */
298 static ssize_t vduse_dev_read_iter(struct kiocb *iocb, struct iov_iter *to)
300 struct file *file = iocb->ki_filp;
301 struct vduse_dev *dev = file->private_data;
302 struct vduse_dev_msg *msg;
303 int size = sizeof(struct vduse_dev_request);
/* The destination must have room for a whole request. */
306 if (iov_iter_count(to) < size)
309 spin_lock(&dev->msg_lock);
311 msg = vduse_dequeue_msg(&dev->send_list);
/* NOTE(review): presumably non-blocking readers return instead of sleeping; the branch body is not visible. */
316 if (file->f_flags & O_NONBLOCK)
/* Drop the spinlock before sleeping. */
319 spin_unlock(&dev->msg_lock);
320 ret = wait_event_interruptible_exclusive(dev->waitq,
321 !list_empty(&dev->send_list));
325 spin_lock(&dev->msg_lock);
/* The copy to userspace is done without msg_lock held. */
327 spin_unlock(&dev->msg_lock);
328 ret = copy_to_iter(&msg->req, size, to);
329 spin_lock(&dev->msg_lock);
/* NOTE(review): presumably the short-copy path, putting the message back for a later read. */
332 vduse_enqueue_msg(&dev->send_list, msg);
/* The request was delivered; wait for the response on recv_list. */
335 vduse_enqueue_msg(&dev->recv_list, msg);
337 spin_unlock(&dev->msg_lock);
/*
 * Walk the @size bytes at @ptr. Callers use it to reject non-zero reserved
 * fields.
 * NOTE(review): the loop body and return statements are not visible;
 * presumably it returns true only when every byte is zero.
 */
342 static bool is_mem_zero(const char *ptr, int size)
346 for (i = 0; i < size; i++) {
/*
 * write_iter handler: accept one response from userspace.
 *
 * A complete vduse_dev_response is read from @from, its reserved bytes are
 * checked, and the matching message is looked up on dev->recv_list by
 * request_id. The response is copied into that message and its sender is
 * woken.
 * NOTE(review): the error codes, the not-found path and the line that marks
 * the message as completed are not visible in this excerpt.
 */
353 static ssize_t vduse_dev_write_iter(struct kiocb *iocb, struct iov_iter *from)
355 struct file *file = iocb->ki_filp;
356 struct vduse_dev *dev = file->private_data;
357 struct vduse_dev_response resp;
358 struct vduse_dev_msg *msg;
361 ret = copy_from_iter(&resp, sizeof(resp), from);
/* A partial response is not accepted. */
362 if (ret != sizeof(resp))
365 if (!is_mem_zero((const char *)resp.reserved, sizeof(resp.reserved)))
368 spin_lock(&dev->msg_lock);
369 msg = vduse_find_msg(&dev->recv_list, resp.request_id);
375 memcpy(&msg->resp, &resp, sizeof(resp));
/* Wake the sender sleeping in vduse_dev_msg_sync(). */
377 wake_up(&msg->waitq);
379 spin_unlock(&dev->msg_lock);
/*
 * poll handler: the file is readable while requests are pending on send_list
 * and writable while messages await a response on recv_list.
 * NOTE(review): the mask set for a broken device and the return are not
 * visible in this excerpt.
 */
384 static __poll_t vduse_dev_poll(struct file *file, poll_table *wait)
386 struct vduse_dev *dev = file->private_data;
389 poll_wait(file, &dev->waitq, wait);
391 spin_lock(&dev->msg_lock);
393 if (unlikely(dev->broken))
395 if (!list_empty(&dev->send_list))
396 mask |= EPOLLIN | EPOLLRDNORM;
397 if (!list_empty(&dev->recv_list))
398 mask |= EPOLLOUT | EPOLLWRNORM;
400 spin_unlock(&dev->msg_lock);
/*
 * Put @dev and all of its virtqueues back into an initial state.
 *
 * The bounce map is reset first when one exists. Then, with dev->rwsem held
 * for write, the negotiated features and the config callback are cleared and
 * each virtqueue has its state zeroed, its kick eventfd released and its
 * callback cleared. Pending work items are flushed after their callbacks are
 * cleared.
 * NOTE(review): several assignments (status and per-vq fields) are not
 * visible in this excerpt.
 */
405 static void vduse_dev_reset(struct vduse_dev *dev)
408 struct vduse_iova_domain *domain = dev->domain;
410 /* The coherent mappings are handled in vduse_dev_free_coherent() */
411 if (domain->bounce_map)
412 vduse_domain_reset_bounce_map(domain);
414 down_write(&dev->rwsem);
417 dev->driver_features = 0;
/* Clear the callback under irq_lock, then wait for any queued injection to finish. */
419 spin_lock(&dev->irq_lock);
420 dev->config_cb.callback = NULL;
421 dev->config_cb.private = NULL;
422 spin_unlock(&dev->irq_lock);
423 flush_work(&dev->inject);
425 for (i = 0; i < dev->vq_num; i++) {
426 struct vduse_virtqueue *vq = &dev->vqs[i];
433 memset(&vq->state, 0, sizeof(vq->state));
435 spin_lock(&vq->kick_lock);
/* Drop the reference on the kick eventfd. NOTE(review): any guard or NULL assignment is not visible here. */
438 eventfd_ctx_put(vq->kickfd);
440 spin_unlock(&vq->kick_lock);
442 spin_lock(&vq->irq_lock);
443 vq->cb.callback = NULL;
444 vq->cb.private = NULL;
445 spin_unlock(&vq->irq_lock);
/* Wait for in-flight interrupt-injection and kick work on this queue. */
446 flush_work(&vq->inject);
447 flush_work(&vq->kick);
450 up_write(&dev->rwsem);
/*
 * vDPA op: record the descriptor, driver and device area addresses of
 * virtqueue @idx. VDUSE_VQ_GET_INFO later reports them to userspace.
 * NOTE(review): the device_area parameter line and the return are not
 * visible in this excerpt.
 */
453 static int vduse_vdpa_set_vq_address(struct vdpa_device *vdpa, u16 idx,
454 u64 desc_area, u64 driver_area,
457 struct vduse_dev *dev = vdpa_to_vduse(vdpa);
458 struct vduse_virtqueue *vq = &dev->vqs[idx];
460 vq->desc_addr = desc_area;
461 vq->driver_addr = driver_area;
462 vq->device_addr = device_area;
/*
 * Signal the kick eventfd of @vq while holding vq->kick_lock.
 * NOTE(review): the conditions guarding the signal are not visible in this
 * excerpt.
 */
467 static void vduse_vq_kick(struct vduse_virtqueue *vq)
469 spin_lock(&vq->kick_lock);
474 eventfd_signal(vq->kickfd, 1);
478 spin_unlock(&vq->kick_lock);
/*
 * Work handler for vq->kick: recovers the virtqueue from the work item.
 * NOTE(review): the call it makes is not visible here; presumably
 * vduse_vq_kick().
 */
481 static void vduse_vq_kick_work(struct work_struct *work)
483 struct vduse_virtqueue *vq = container_of(work,
484 struct vduse_virtqueue, kick);
/*
 * vDPA op: kick virtqueue @idx. When eventfd_signal_allowed() reports that
 * signalling is not permitted in the current context, the kick is deferred
 * to the vq->kick work item.
 * NOTE(review): the direct-kick path is not visible in this excerpt.
 */
489 static void vduse_vdpa_kick_vq(struct vdpa_device *vdpa, u16 idx)
491 struct vduse_dev *dev = vdpa_to_vduse(vdpa);
492 struct vduse_virtqueue *vq = &dev->vqs[idx];
494 if (!eventfd_signal_allowed()) {
495 schedule_work(&vq->kick);
/*
 * vDPA op: install the driver's interrupt callback for virtqueue @idx. The
 * update is done under vq->irq_lock, the lock vduse_vq_irq_inject() holds
 * while invoking the callback.
 */
501 static void vduse_vdpa_set_vq_cb(struct vdpa_device *vdpa, u16 idx,
502 struct vdpa_callback *cb)
504 struct vduse_dev *dev = vdpa_to_vduse(vdpa);
505 struct vduse_virtqueue *vq = &dev->vqs[idx];
507 spin_lock(&vq->irq_lock);
508 vq->cb.callback = cb->callback;
509 vq->cb.private = cb->private;
510 spin_unlock(&vq->irq_lock);
/*
 * vDPA op: set the size of virtqueue @idx.
 * NOTE(review): the statement that stores @num is not visible in this
 * excerpt.
 */
513 static void vduse_vdpa_set_vq_num(struct vdpa_device *vdpa, u16 idx, u32 num)
515 struct vduse_dev *dev = vdpa_to_vduse(vdpa);
516 struct vduse_virtqueue *vq = &dev->vqs[idx];
/*
 * vDPA op: change the ready state of a virtqueue.
 * NOTE(review): the remaining parameters and the body are not visible in
 * this excerpt.
 */
521 static void vduse_vdpa_set_vq_ready(struct vdpa_device *vdpa,
524 struct vduse_dev *dev = vdpa_to_vduse(vdpa);
525 struct vduse_virtqueue *vq = &dev->vqs[idx];
/*
 * vDPA op: report whether virtqueue @idx is ready.
 * NOTE(review): the return statement is not visible; presumably vq->ready.
 */
530 static bool vduse_vdpa_get_vq_ready(struct vdpa_device *vdpa, u16 idx)
532 struct vduse_dev *dev = vdpa_to_vduse(vdpa);
533 struct vduse_virtqueue *vq = &dev->vqs[idx];
/*
 * vDPA op: store the ring state of virtqueue @idx. When the driver has
 * negotiated VIRTIO_F_RING_PACKED the four packed-ring fields are copied;
 * the split-ring avail_index assignment covers the other layout.
 * NOTE(review): the else keyword and the return are not visible in this
 * excerpt.
 */
538 static int vduse_vdpa_set_vq_state(struct vdpa_device *vdpa, u16 idx,
539 const struct vdpa_vq_state *state)
541 struct vduse_dev *dev = vdpa_to_vduse(vdpa);
542 struct vduse_virtqueue *vq = &dev->vqs[idx];
544 if (dev->driver_features & BIT_ULL(VIRTIO_F_RING_PACKED)) {
545 vq->state.packed.last_avail_counter =
546 state->packed.last_avail_counter;
547 vq->state.packed.last_avail_idx = state->packed.last_avail_idx;
548 vq->state.packed.last_used_counter =
549 state->packed.last_used_counter;
550 vq->state.packed.last_used_idx = state->packed.last_used_idx;
552 vq->state.split.avail_index = state->split.avail_index;
/*
 * vDPA op: fetch the ring state of virtqueue @idx from userspace, using the
 * packed or split helper depending on whether VIRTIO_F_RING_PACKED was
 * negotiated.
 */
557 static int vduse_vdpa_get_vq_state(struct vdpa_device *vdpa, u16 idx,
558 struct vdpa_vq_state *state)
560 struct vduse_dev *dev = vdpa_to_vduse(vdpa);
561 struct vduse_virtqueue *vq = &dev->vqs[idx];
563 if (dev->driver_features & BIT_ULL(VIRTIO_F_RING_PACKED))
564 return vduse_dev_get_vq_state_packed(dev, vq, &state->packed);
566 return vduse_dev_get_vq_state_split(dev, vq, &state->split);
/* vDPA op: return the virtqueue alignment supplied when the device was created. */
569 static u32 vduse_vdpa_get_vq_align(struct vdpa_device *vdpa)
571 struct vduse_dev *dev = vdpa_to_vduse(vdpa);
573 return dev->vq_align;
/* vDPA op: return the feature bits supplied when the device was created. */
576 static u64 vduse_vdpa_get_device_features(struct vdpa_device *vdpa)
578 struct vduse_dev *dev = vdpa_to_vduse(vdpa);
580 return dev->device_features;
/*
 * vDPA op: record the feature bits chosen by the driver. The value is stored
 * as given, with no visible check against device_features.
 * NOTE(review): the return statement is not visible in this excerpt.
 */
583 static int vduse_vdpa_set_driver_features(struct vdpa_device *vdpa, u64 features)
585 struct vduse_dev *dev = vdpa_to_vduse(vdpa);
587 dev->driver_features = features;
/* vDPA op: return the feature bits last stored by vduse_vdpa_set_driver_features(). */
591 static u64 vduse_vdpa_get_driver_features(struct vdpa_device *vdpa)
593 struct vduse_dev *dev = vdpa_to_vduse(vdpa);
595 return dev->driver_features;
/*
 * vDPA op: install the config-change callback. The update is done under
 * dev->irq_lock, the lock vduse_dev_irq_inject() holds while invoking it.
 */
598 static void vduse_vdpa_set_config_cb(struct vdpa_device *vdpa,
599 struct vdpa_callback *cb)
601 struct vduse_dev *dev = vdpa_to_vduse(vdpa);
603 spin_lock(&dev->irq_lock);
604 dev->config_cb.callback = cb->callback;
605 dev->config_cb.private = cb->private;
606 spin_unlock(&dev->irq_lock);
/*
 * vDPA op: compute the largest num_max across all virtqueues of the device.
 * NOTE(review): the declaration of num_max and the return are not visible in
 * this excerpt.
 */
609 static u16 vduse_vdpa_get_vq_num_max(struct vdpa_device *vdpa)
611 struct vduse_dev *dev = vdpa_to_vduse(vdpa);
615 for (i = 0; i < dev->vq_num; i++)
616 if (num_max < dev->vqs[i].num_max)
617 num_max = dev->vqs[i].num_max;
/* vDPA op: return the device ID supplied when the device was created. */
622 static u32 vduse_vdpa_get_device_id(struct vdpa_device *vdpa)
624 struct vduse_dev *dev = vdpa_to_vduse(vdpa);
626 return dev->device_id;
/* vDPA op: return the vendor ID supplied when the device was created. */
629 static u32 vduse_vdpa_get_vendor_id(struct vdpa_device *vdpa)
631 struct vduse_dev *dev = vdpa_to_vduse(vdpa);
633 return dev->vendor_id;
/*
 * vDPA op: report the device status.
 * NOTE(review): the return statement is not visible; presumably dev->status.
 */
636 static u8 vduse_vdpa_get_status(struct vdpa_device *vdpa)
638 struct vduse_dev *dev = vdpa_to_vduse(vdpa);
/*
 * vDPA op: forward the new status to userspace, then cache it in
 * dev->status.
 * NOTE(review): the failure branch body is not visible; presumably it
 * returns without updating dev->status.
 */
643 static void vduse_vdpa_set_status(struct vdpa_device *vdpa, u8 status)
645 struct vduse_dev *dev = vdpa_to_vduse(vdpa);
647 if (vduse_dev_set_status(dev, status))
650 dev->status = status;
/* vDPA op: return the size of the device config space. */
653 static size_t vduse_vdpa_get_config_size(struct vdpa_device *vdpa)
655 struct vduse_dev *dev = vdpa_to_vduse(vdpa);
657 return dev->config_size;
/*
 * vDPA op: copy @len bytes of config space starting at @offset into @buf.
 * The range is checked by comparing @len with config_size - offset, after
 * @offset itself has been checked, so the subtraction cannot underflow.
 * NOTE(review): the body of the out-of-range branch is not visible;
 * presumably it returns without copying.
 */
660 static void vduse_vdpa_get_config(struct vdpa_device *vdpa, unsigned int offset,
661 void *buf, unsigned int len)
663 struct vduse_dev *dev = vdpa_to_vduse(vdpa);
665 if (offset > dev->config_size ||
666 len > dev->config_size - offset)
669 memcpy(buf, dev->config + offset, len);
/* vDPA op: driver write to the config space. No statement other than the comment below is visible in this excerpt. */
672 static void vduse_vdpa_set_config(struct vdpa_device *vdpa, unsigned int offset,
673 const void *buf, unsigned int len)
675 /* Now we only support read-only configuration space */
/*
 * vDPA op: reset the device. A status of 0 is sent to userspace first, then
 * the kernel-side state is reset with vduse_dev_reset().
 * NOTE(review): the return statement is not visible; presumably ret.
 */
678 static int vduse_vdpa_reset(struct vdpa_device *vdpa)
680 struct vduse_dev *dev = vdpa_to_vduse(vdpa);
681 int ret = vduse_dev_set_status(dev, 0);
683 vduse_dev_reset(dev);
/* vDPA op: return the current dev->generation value. */
688 static u32 vduse_vdpa_get_generation(struct vdpa_device *vdpa)
690 struct vduse_dev *dev = vdpa_to_vduse(vdpa);
692 return dev->generation;
/*
 * vDPA op: install @iotlb as the mapping of the device's IOVA domain and
 * notify userspace that the whole range [0, ULLONG_MAX] changed.
 * NOTE(review): one parameter line, the error checks and the return are not
 * visible in this excerpt.
 */
695 static int vduse_vdpa_set_map(struct vdpa_device *vdpa,
697 struct vhost_iotlb *iotlb)
699 struct vduse_dev *dev = vdpa_to_vduse(vdpa);
702 ret = vduse_domain_set_map(dev->domain, iotlb);
706 ret = vduse_dev_update_iotlb(dev, 0ULL, ULLONG_MAX);
/* NOTE(review): presumably runs only when the notification failed, to undo the set_map above. */
708 vduse_domain_clear_map(dev->domain, iotlb);
/*
 * vDPA op: installed as .free in vduse_vdpa_config_ops.
 * NOTE(review): the statement that acts on dev is not visible in this
 * excerpt.
 */
715 static void vduse_vdpa_free(struct vdpa_device *vdpa)
717 struct vduse_dev *dev = vdpa_to_vduse(vdpa);
/* vDPA operation table; passed to vdpa_alloc_device() in vduse_dev_init_vdpa(). */
722 static const struct vdpa_config_ops vduse_vdpa_config_ops = {
723 .set_vq_address = vduse_vdpa_set_vq_address,
724 .kick_vq = vduse_vdpa_kick_vq,
725 .set_vq_cb = vduse_vdpa_set_vq_cb,
726 .set_vq_num = vduse_vdpa_set_vq_num,
727 .set_vq_ready = vduse_vdpa_set_vq_ready,
728 .get_vq_ready = vduse_vdpa_get_vq_ready,
729 .set_vq_state = vduse_vdpa_set_vq_state,
730 .get_vq_state = vduse_vdpa_get_vq_state,
731 .get_vq_align = vduse_vdpa_get_vq_align,
732 .get_device_features = vduse_vdpa_get_device_features,
733 .set_driver_features = vduse_vdpa_set_driver_features,
734 .get_driver_features = vduse_vdpa_get_driver_features,
735 .set_config_cb = vduse_vdpa_set_config_cb,
736 .get_vq_num_max = vduse_vdpa_get_vq_num_max,
737 .get_device_id = vduse_vdpa_get_device_id,
738 .get_vendor_id = vduse_vdpa_get_vendor_id,
739 .get_status = vduse_vdpa_get_status,
740 .set_status = vduse_vdpa_set_status,
741 .get_config_size = vduse_vdpa_get_config_size,
742 .get_config = vduse_vdpa_get_config,
743 .set_config = vduse_vdpa_set_config,
744 .get_generation = vduse_vdpa_get_generation,
745 .reset = vduse_vdpa_reset,
746 .set_map = vduse_vdpa_set_map,
747 .free = vduse_vdpa_free,
/*
 * DMA op .map_page: delegate the mapping to the device's IOVA domain.
 * NOTE(review): the attrs parameter line is not visible in this excerpt.
 */
750 static dma_addr_t vduse_dev_map_page(struct device *dev, struct page *page,
751 unsigned long offset, size_t size,
752 enum dma_data_direction dir,
755 struct vduse_dev *vdev = dev_to_vduse(dev);
756 struct vduse_iova_domain *domain = vdev->domain;
758 return vduse_domain_map_page(domain, page, offset, size, dir, attrs);
/*
 * DMA op .unmap_page: delegate the unmapping to the device's IOVA domain.
 * NOTE(review): the attrs parameter line is not visible in this excerpt.
 */
761 static void vduse_dev_unmap_page(struct device *dev, dma_addr_t dma_addr,
762 size_t size, enum dma_data_direction dir,
765 struct vduse_dev *vdev = dev_to_vduse(dev);
766 struct vduse_iova_domain *domain = vdev->domain;
768 return vduse_domain_unmap_page(domain, dma_addr, size, dir, attrs);
/*
 * DMA op .alloc: allocate coherent memory from the device's IOVA domain.
 * *dma_addr is set to DMA_MAPPING_ERROR up front and overwritten with the
 * allocated IOVA later.
 * NOTE(review): the declarations of iova/addr, the failure check and the
 * return are not visible in this excerpt.
 */
771 static void *vduse_dev_alloc_coherent(struct device *dev, size_t size,
772 dma_addr_t *dma_addr, gfp_t flag,
775 struct vduse_dev *vdev = dev_to_vduse(dev);
776 struct vduse_iova_domain *domain = vdev->domain;
780 *dma_addr = DMA_MAPPING_ERROR;
781 addr = vduse_domain_alloc_coherent(domain, size,
782 (dma_addr_t *)&iova, flag, attrs);
786 *dma_addr = (dma_addr_t)iova;
/*
 * DMA op .free: return coherent memory to the device's IOVA domain.
 * NOTE(review): the attrs parameter line is not visible in this excerpt.
 */
791 static void vduse_dev_free_coherent(struct device *dev, size_t size,
792 void *vaddr, dma_addr_t dma_addr,
795 struct vduse_dev *vdev = dev_to_vduse(dev);
796 struct vduse_iova_domain *domain = vdev->domain;
798 vduse_domain_free_coherent(domain, size, vaddr, dma_addr, attrs);
/* DMA op .max_mapping_size: return the bounce_size of the device's IOVA domain. */
801 static size_t vduse_dev_max_mapping_size(struct device *dev)
803 struct vduse_dev *vdev = dev_to_vduse(dev);
804 struct vduse_iova_domain *domain = vdev->domain;
806 return domain->bounce_size;
/* DMA operations installed on the vDPA device by set_dma_ops() in vduse_dev_init_vdpa(). */
809 static const struct dma_map_ops vduse_dev_dma_ops = {
810 .map_page = vduse_dev_map_page,
811 .unmap_page = vduse_dev_unmap_page,
812 .alloc = vduse_dev_alloc_coherent,
813 .free = vduse_dev_free_coherent,
814 .max_mapping_size = vduse_dev_max_mapping_size,
/*
 * Translate a VDUSE_ACCESS_* permission into file flags for receive_fd().
 * An unexpected permission value triggers a WARN.
 * NOTE(review): the switch header, the flag assigned in each case and the
 * return are not visible in this excerpt.
 */
817 static unsigned int perm_to_file_flags(u8 perm)
819 unsigned int flags = 0;
822 case VDUSE_ACCESS_WO:
825 case VDUSE_ACCESS_RO:
828 case VDUSE_ACCESS_RW:
832 WARN(1, "invalidate vhost IOTLB permission\n");
/*
 * Install or remove the kick eventfd of the virtqueue named by
 * @eventfd->index. A non-negative fd is resolved to an eventfd context;
 * otherwise the fd is compared with VDUSE_EVENTFD_DEASSIGN.
 * NOTE(review): the error returns and the assignment of the new context to
 * vq->kickfd are not visible in this excerpt.
 */
839 static int vduse_kickfd_setup(struct vduse_dev *dev,
840 struct vduse_vq_eventfd *eventfd)
842 struct eventfd_ctx *ctx = NULL;
843 struct vduse_virtqueue *vq;
846 if (eventfd->index >= dev->vq_num)
/* Clamp the user-controlled index against speculative out-of-bounds access. */
849 index = array_index_nospec(eventfd->index, dev->vq_num);
850 vq = &dev->vqs[index];
851 if (eventfd->fd >= 0) {
852 ctx = eventfd_ctx_fdget(eventfd->fd);
855 } else if (eventfd->fd != VDUSE_EVENTFD_DEASSIGN)
858 spin_lock(&vq->kick_lock);
/* Release the previously installed eventfd. */
860 eventfd_ctx_put(vq->kickfd);
/* If a kick was recorded while the queue was ready, signal the eventfd right away. */
862 if (vq->ready && vq->kicked && vq->kickfd) {
863 eventfd_signal(vq->kickfd, 1);
866 spin_unlock(&vq->kick_lock);
/*
 * Check every virtqueue's num_max (set by VDUSE_VQ_SETUP); vdpa_dev_add()
 * refuses devices for which this returns false.
 * NOTE(review): the return statements are not visible; presumably false as
 * soon as a queue with num_max == 0 is found and true otherwise.
 */
871 static bool vduse_dev_is_ready(struct vduse_dev *dev)
875 for (i = 0; i < dev->vq_num; i++)
876 if (!dev->vqs[i].num_max)
/*
 * Work handler for dev->inject: invoke the config-change callback, if one is
 * installed, with dev->irq_lock held and interrupts disabled.
 */
882 static void vduse_dev_irq_inject(struct work_struct *work)
884 struct vduse_dev *dev = container_of(work, struct vduse_dev, inject);
886 spin_lock_irq(&dev->irq_lock);
887 if (dev->config_cb.callback)
888 dev->config_cb.callback(dev->config_cb.private);
889 spin_unlock_irq(&dev->irq_lock);
/*
 * Work handler for vq->inject: invoke the virtqueue callback when the queue
 * is ready and a callback is installed, with vq->irq_lock held and
 * interrupts disabled.
 */
892 static void vduse_vq_irq_inject(struct work_struct *work)
894 struct vduse_virtqueue *vq = container_of(work,
895 struct vduse_virtqueue, inject);
897 spin_lock_irq(&vq->irq_lock);
898 if (vq->ready && vq->cb.callback)
899 vq->cb.callback(vq->cb.private);
900 spin_unlock_irq(&vq->irq_lock);
/*
 * Queue @irq_work on vduse_irq_wq. dev->rwsem is held for read, which
 * excludes vduse_dev_reset() (it takes the semaphore for write).
 * NOTE(review): the error value used when DRIVER_OK is not set and the
 * return are not visible in this excerpt.
 */
903 static int vduse_dev_queue_irq_work(struct vduse_dev *dev,
904 struct work_struct *irq_work)
908 down_read(&dev->rwsem);
/* Tests whether the driver has set DRIVER_OK. */
909 if (!(dev->status & VIRTIO_CONFIG_S_DRIVER_OK))
913 queue_work(vduse_irq_wq, irq_work);
915 up_read(&dev->rwsem);
/*
 * ioctl handler for a VDUSE device file (.unlocked_ioctl in vduse_dev_fops).
 * file->private_data is the vduse_dev.
 * NOTE(review): the switch header, error codes, break statements and the
 * final return are not visible in this excerpt, so the notes below only
 * describe the statements that are shown.
 */
920 static long vduse_dev_ioctl(struct file *file, unsigned int cmd,
923 struct vduse_dev *dev = file->private_data;
924 void __user *argp = (void __user *)arg;
/* Checked before the command is dispatched. */
927 if (unlikely(dev->broken))
/* Find the first IOTLB mapping overlapping [start, last] and hand its backing file to the caller. */
931 case VDUSE_IOTLB_GET_FD: {
932 struct vduse_iotlb_entry entry;
933 struct vhost_iotlb_map *map;
934 struct vdpa_map_file *map_file;
935 struct vduse_iova_domain *domain = dev->domain;
936 struct file *f = NULL;
939 if (copy_from_user(&entry, argp, sizeof(entry)))
/* Tests for a reversed range. */
943 if (entry.start > entry.last)
946 spin_lock(&domain->iotlb_lock);
947 map = vhost_iotlb_itree_first(domain->iotlb,
948 entry.start, entry.last);
/* Take a file reference under iotlb_lock and report the mapping's own bounds and permission. */
950 map_file = (struct vdpa_map_file *)map->opaque;
951 f = get_file(map_file->file);
952 entry.offset = map_file->offset;
953 entry.start = map->start;
954 entry.last = map->last;
955 entry.perm = map->perm;
957 spin_unlock(&domain->iotlb_lock);
963 if (copy_to_user(argp, &entry, sizeof(entry))) {
/* Pass the file to receive_fd() with flags derived from the mapping permission. */
967 ret = receive_fd(f, perm_to_file_flags(entry.perm));
971 case VDUSE_DEV_GET_FEATURES:
973 * Just mirror what driver wrote here.
974 * The driver is expected to check FEATURE_OK later.
976 ret = put_user(dev->driver_features, (u64 __user *)argp);
/* Overwrite part of the device config space with bytes supplied by the caller. */
978 case VDUSE_DEV_SET_CONFIG: {
979 struct vduse_config_data config;
980 unsigned long size = offsetof(struct vduse_config_data,
984 if (copy_from_user(&config, argp, size))
/* The range must be non-empty and lie inside the config space. */
988 if (config.offset > dev->config_size ||
989 config.length == 0 ||
990 config.length > dev->config_size - config.offset)
/* The payload follows the fixed-size header in the user buffer. */
994 if (copy_from_user(dev->config + config.offset, argp + size,
1001 case VDUSE_DEV_INJECT_CONFIG_IRQ:
1002 ret = vduse_dev_queue_irq_work(dev, &dev->inject);
/* Record the maximum size of one virtqueue. */
1004 case VDUSE_VQ_SETUP: {
1005 struct vduse_vq_config config;
1009 if (copy_from_user(&config, argp, sizeof(config)))
1013 if (config.index >= dev->vq_num)
1016 if (!is_mem_zero((const char *)config.reserved,
1017 sizeof(config.reserved)))
/* Clamp the user-controlled index against speculative out-of-bounds access. */
1020 index = array_index_nospec(config.index, dev->vq_num);
1021 dev->vqs[index].num_max = config.max_size;
/* Copy the addresses, size, ring state and ready flag of one virtqueue back to the caller. */
1025 case VDUSE_VQ_GET_INFO: {
1026 struct vduse_vq_info vq_info;
1027 struct vduse_virtqueue *vq;
1031 if (copy_from_user(&vq_info, argp, sizeof(vq_info)))
1035 if (vq_info.index >= dev->vq_num)
1038 index = array_index_nospec(vq_info.index, dev->vq_num);
1039 vq = &dev->vqs[index];
1040 vq_info.desc_addr = vq->desc_addr;
1041 vq_info.driver_addr = vq->driver_addr;
1042 vq_info.device_addr = vq->device_addr;
1043 vq_info.num = vq->num;
/* The ring-state layout depends on whether VIRTIO_F_RING_PACKED was negotiated. */
1045 if (dev->driver_features & BIT_ULL(VIRTIO_F_RING_PACKED)) {
1046 vq_info.packed.last_avail_counter =
1047 vq->state.packed.last_avail_counter;
1048 vq_info.packed.last_avail_idx =
1049 vq->state.packed.last_avail_idx;
1050 vq_info.packed.last_used_counter =
1051 vq->state.packed.last_used_counter;
1052 vq_info.packed.last_used_idx =
1053 vq->state.packed.last_used_idx;
1055 vq_info.split.avail_index =
1056 vq->state.split.avail_index;
1058 vq_info.ready = vq->ready;
1061 if (copy_to_user(argp, &vq_info, sizeof(vq_info)))
1067 case VDUSE_VQ_SETUP_KICKFD: {
1068 struct vduse_vq_eventfd eventfd;
1071 if (copy_from_user(&eventfd, argp, sizeof(eventfd)))
1074 ret = vduse_kickfd_setup(dev, &eventfd);
/* Queue interrupt-injection work for the virtqueue whose index the caller passes. */
1077 case VDUSE_VQ_INJECT_IRQ: {
1081 if (get_user(index, (u32 __user *)argp))
1085 if (index >= dev->vq_num)
1088 index = array_index_nospec(index, dev->vq_num);
1089 ret = vduse_dev_queue_irq_work(dev, &dev->vqs[index].inject);
/*
 * release handler for a VDUSE device file: messages that were read but not
 * yet answered are moved back to send_list, and the device is marked as
 * disconnected.
 * NOTE(review): the return statement is not visible in this excerpt.
 */
1100 static int vduse_dev_release(struct inode *inode, struct file *file)
1102 struct vduse_dev *dev = file->private_data;
1104 spin_lock(&dev->msg_lock);
1105 /* Make sure the inflight messages can be processed after reconnection */
1106 list_splice_init(&dev->recv_list, &dev->send_list);
1107 spin_unlock(&dev->msg_lock);
1108 dev->connected = false;
/*
 * Look up the vduse_dev registered under @minor in vduse_idr, holding
 * vduse_lock for the lookup.
 * NOTE(review): the return statement is not visible; presumably dev, which
 * idr_find() leaves NULL when nothing is registered.
 */
1113 static struct vduse_dev *vduse_dev_get_from_minor(int minor)
1115 struct vduse_dev *dev;
1117 mutex_lock(&vduse_lock);
1118 dev = idr_find(&vduse_idr, minor);
1119 mutex_unlock(&vduse_lock);
/*
 * open handler for a VDUSE device file: resolve the device from the inode's
 * minor number and, under dev->lock, mark it connected and attach it to the
 * file.
 * NOTE(review): the NULL check, the already-connected check and the return
 * are not visible in this excerpt.
 */
1124 static int vduse_dev_open(struct inode *inode, struct file *file)
1127 struct vduse_dev *dev = vduse_dev_get_from_minor(iminor(inode));
1133 mutex_lock(&dev->lock);
1138 dev->connected = true;
1139 file->private_data = dev;
1141 mutex_unlock(&dev->lock);
/* File operations for the per-device nodes; registered via cdev_init() on vduse_cdev in vduse_init(). */
1146 static const struct file_operations vduse_dev_fops = {
1147 .owner = THIS_MODULE,
1148 .open = vduse_dev_open,
1149 .release = vduse_dev_release,
1150 .read_iter = vduse_dev_read_iter,
1151 .write_iter = vduse_dev_write_iter,
1152 .poll = vduse_dev_poll,
1153 .unlocked_ioctl = vduse_dev_ioctl,
1154 .compat_ioctl = compat_ptr_ioctl,
1155 .llseek = noop_llseek,
/*
 * Allocate a zeroed vduse_dev and initialise its locks, message lists, wait
 * queue and config-interrupt work item.
 * NOTE(review): the allocation-failure check and the return are not visible
 * in this excerpt.
 */
1158 static struct vduse_dev *vduse_dev_create(void)
1160 struct vduse_dev *dev = kzalloc(sizeof(*dev), GFP_KERNEL);
1165 mutex_init(&dev->lock);
1166 spin_lock_init(&dev->msg_lock);
1167 INIT_LIST_HEAD(&dev->send_list);
1168 INIT_LIST_HEAD(&dev->recv_list);
1169 spin_lock_init(&dev->irq_lock);
1170 init_rwsem(&dev->rwsem);
1172 INIT_WORK(&dev->inject, vduse_dev_irq_inject);
1173 init_waitqueue_head(&dev->waitq);
/*
 * Counterpart of vduse_dev_create(); called at the end of
 * vduse_destroy_dev() and in vduse_create_dev().
 * NOTE(review): the body is not visible in this excerpt.
 */
1178 static void vduse_dev_destroy(struct vduse_dev *dev)
/*
 * Scan vduse_idr for a device whose name equals @name. No locking is done
 * here; the visible callers hold vduse_lock around the call.
 * NOTE(review): the return statements are not visible in this excerpt.
 */
1183 static struct vduse_dev *vduse_find_dev(const char *name)
1185 struct vduse_dev *dev;
1188 idr_for_each_entry(&vduse_idr, dev, id)
1189 if (!strcmp(dev->name, name))
/*
 * Destroy the device called @name. The device must have neither a vDPA
 * device attached nor an open file; otherwise the function bails out after
 * releasing dev->lock.
 * NOTE(review): the not-found check, the error codes and the final return
 * are not visible in this excerpt.
 */
1195 static int vduse_destroy_dev(char *name)
1197 struct vduse_dev *dev = vduse_find_dev(name);
1202 mutex_lock(&dev->lock);
1203 if (dev->vdev || dev->connected) {
1204 mutex_unlock(&dev->lock);
/* NOTE(review): presumably set so that a concurrent open is refused while the device is torn down. */
1207 dev->connected = true;
1208 mutex_unlock(&dev->lock);
1210 vduse_dev_reset(dev);
1211 device_destroy(vduse_class, MKDEV(MAJOR(vduse_major), dev->minor));
1212 idr_remove(&vduse_idr, dev->minor);
1213 kvfree(dev->config);
1215 vduse_domain_destroy(dev->domain);
1217 vduse_dev_destroy(dev);
/* Balances the __module_get() in vduse_create_dev(). */
1218 module_put(THIS_MODULE);
/*
 * Check whether @device_id appears in allowed_device_id[].
 * NOTE(review): the return statements are not visible in this excerpt.
 */
1223 static bool device_is_allowed(u32 device_id)
1227 for (i = 0; i < ARRAY_SIZE(allowed_device_id); i++)
1228 if (allowed_device_id[i] == device_id)
/*
 * Validate the feature bits offered by userspace: VIRTIO_F_ACCESS_PLATFORM
 * is tested for presence and VIRTIO_BLK_F_CONFIG_WCE for absence.
 * NOTE(review): the return statements are not visible; presumably false when
 * either test fails.
 */
1234 static bool features_is_valid(u64 features)
1236 if (!(features & (1ULL << VIRTIO_F_ACCESS_PLATFORM)))
1239 /* Now we only support read-only configuration space */
1240 if (features & (1ULL << VIRTIO_BLK_F_CONFIG_WCE))
/*
 * Sanity-check a device configuration received from userspace: reserved
 * bytes, vq_align and config_size against PAGE_SIZE, the device ID and the
 * feature bits. vduse_ioctl() compares the result with false.
 * NOTE(review): the return statements are not visible in this excerpt.
 */
1246 static bool vduse_validate_config(struct vduse_dev_config *config)
1248 if (!is_mem_zero((const char *)config->reserved,
1249 sizeof(config->reserved)))
1252 if (config->vq_align > PAGE_SIZE)
1255 if (config->config_size > PAGE_SIZE)
1258 if (!device_is_allowed(config->device_id))
1261 if (!features_is_valid(config->features))
/* sysfs show handler: print dev->msg_timeout as an unsigned decimal followed by a newline. */
1267 static ssize_t msg_timeout_show(struct device *device,
1268 struct device_attribute *attr, char *buf)
1270 struct vduse_dev *dev = dev_get_drvdata(device);
1272 return sysfs_emit(buf, "%u\n", dev->msg_timeout);
/*
 * sysfs store handler: parse @buf as a base-10 unsigned integer directly
 * into dev->msg_timeout.
 * NOTE(review): the error check and the return are not visible in this
 * excerpt.
 */
1275 static ssize_t msg_timeout_store(struct device *device,
1276 struct device_attribute *attr,
1277 const char *buf, size_t count)
1279 struct vduse_dev *dev = dev_get_drvdata(device);
1282 ret = kstrtouint(buf, 10, &dev->msg_timeout);
/* Read/write attribute backed by msg_timeout_show() and msg_timeout_store(). */
1289 static DEVICE_ATTR_RW(msg_timeout);
/* Attribute list behind vduse_dev_groups, which vduse_init() assigns to vduse_class->dev_groups. */
1291 static struct attribute *vduse_dev_attrs[] = {
1292 &dev_attr_msg_timeout.attr,
1296 ATTRIBUTE_GROUPS(vduse_dev);
/*
 * Create a VDUSE device from a validated @config. @config_buf is stored as
 * dev->config (vduse_destroy_dev() later releases it with kvfree()).
 *
 * Visible steps: reject a duplicate name, allocate the device, copy the
 * identity fields, create the IOVA domain, allocate and initialise the
 * virtqueues, allocate a minor (starting at 1), create the device node and
 * take a module reference.
 * NOTE(review): the failure checks, goto labels and returns are not visible
 * in this excerpt; the last three calls belong to the error unwind.
 */
1298 static int vduse_create_dev(struct vduse_dev_config *config,
1299 void *config_buf, u64 api_version)
1302 struct vduse_dev *dev;
1305 if (vduse_find_dev(config->name))
1309 dev = vduse_dev_create();
1313 dev->api_version = api_version;
1314 dev->device_features = config->features;
1315 dev->device_id = config->device_id;
1316 dev->vendor_id = config->vendor_id;
1317 dev->name = kstrdup(config->name, GFP_KERNEL);
1321 dev->domain = vduse_domain_create(VDUSE_IOVA_SIZE - 1,
1326 dev->config = config_buf;
1327 dev->config_size = config->config_size;
1328 dev->vq_align = config->vq_align;
1329 dev->vq_num = config->vq_num;
1330 dev->vqs = kcalloc(dev->vq_num, sizeof(*dev->vqs), GFP_KERNEL);
1334 for (i = 0; i < dev->vq_num; i++) {
1335 dev->vqs[i].index = i;
1336 INIT_WORK(&dev->vqs[i].inject, vduse_vq_irq_inject);
1337 INIT_WORK(&dev->vqs[i].kick, vduse_vq_kick_work);
1338 spin_lock_init(&dev->vqs[i].kick_lock);
1339 spin_lock_init(&dev->vqs[i].irq_lock);
/* IDs are allocated from 1; minor 0 of the region is used for the control device in vduse_init(). */
1342 ret = idr_alloc(&vduse_idr, dev, 1, VDUSE_DEV_MAX, GFP_KERNEL);
1347 dev->msg_timeout = VDUSE_MSG_DEFAULT_TIMEOUT;
1348 dev->dev = device_create(vduse_class, NULL,
1349 MKDEV(MAJOR(vduse_major), dev->minor),
1350 dev, "%s", config->name);
1351 if (IS_ERR(dev->dev)) {
1352 ret = PTR_ERR(dev->dev);
/* Module reference held for the lifetime of the device; dropped in vduse_destroy_dev(). */
1355 __module_get(THIS_MODULE);
/* Error unwind (labels not visible): undo the steps above in reverse order. */
1359 idr_remove(&vduse_idr, dev->minor);
1363 vduse_domain_destroy(dev->domain);
1367 vduse_dev_destroy(dev);
/*
 * ioctl handler for the control device. vduse_lock is held across the whole
 * command; file->private_data carries the per-open API version.
 * NOTE(review): the switch header, error codes, break statements and the
 * return are not visible in this excerpt.
 */
1372 static long vduse_ioctl(struct file *file, unsigned int cmd,
1376 void __user *argp = (void __user *)arg;
1377 struct vduse_control *control = file->private_data;
1379 mutex_lock(&vduse_lock);
1381 case VDUSE_GET_API_VERSION:
1382 ret = put_user(control->api_version, (u64 __user *)argp);
1384 case VDUSE_SET_API_VERSION: {
1388 if (get_user(api_version, (u64 __user *)argp))
/* Tests for a version newer than this driver implements. */
1392 if (api_version > VDUSE_API_VERSION)
1396 control->api_version = api_version;
1399 case VDUSE_CREATE_DEV: {
1400 struct vduse_dev_config config;
/* Only the fixed-size header is copied first; the variable-length config payload follows it. */
1401 unsigned long size = offsetof(struct vduse_dev_config, config);
1405 if (copy_from_user(&config, argp, size))
1409 if (vduse_validate_config(&config) == false)
/* Duplicate the payload; config_size was checked against PAGE_SIZE by vduse_validate_config(). */
1412 buf = vmemdup_user(argp + size, config.config_size);
/* Force NUL termination of the user-supplied name. */
1417 config.name[VDUSE_NAME_MAX - 1] = '\0';
1418 ret = vduse_create_dev(&config, buf, control->api_version);
1423 case VDUSE_DESTROY_DEV: {
1424 char name[VDUSE_NAME_MAX];
1427 if (copy_from_user(name, argp, VDUSE_NAME_MAX))
/* Force NUL termination of the user-supplied name. */
1430 name[VDUSE_NAME_MAX - 1] = '\0';
1431 ret = vduse_destroy_dev(name);
1438 mutex_unlock(&vduse_lock);
/*
 * release handler for the control device.
 * NOTE(review): the body is not visible; presumably it frees the
 * vduse_control allocated by vduse_open().
 */
1443 static int vduse_release(struct inode *inode, struct file *file)
1445 struct vduse_control *control = file->private_data;
/*
 * open handler for the control device: allocate the per-open vduse_control,
 * default its API version to VDUSE_API_VERSION and attach it to the file.
 * NOTE(review): the allocation-failure check and the return are not visible
 * in this excerpt.
 */
1451 static int vduse_open(struct inode *inode, struct file *file)
1453 struct vduse_control *control;
1455 control = kmalloc(sizeof(struct vduse_control), GFP_KERNEL);
1459 control->api_version = VDUSE_API_VERSION;
1460 file->private_data = control;
/*
 * File operations for the control device; registered via cdev_init() on
 * vduse_ctrl_cdev in vduse_init().
 * NOTE(review): the .open entry is not visible in this excerpt.
 */
1465 static const struct file_operations vduse_ctrl_fops = {
1466 .owner = THIS_MODULE,
1468 .release = vduse_release,
1469 .unlocked_ioctl = vduse_ioctl,
1470 .compat_ioctl = compat_ptr_ioctl,
1471 .llseek = noop_llseek,
/*
 * Class devnode callback: return the allocated string "vduse/<device name>".
 * @mode is not touched.
 */
1474 static char *vduse_devnode(struct device *dev, umode_t *mode)
1476 return kasprintf(GFP_KERNEL, "vduse/%s", dev_name(dev));
/* Release callback of vduse_mgmtdev. NOTE(review): the body is not visible in this excerpt. */
1479 static void vduse_mgmtdev_release(struct device *dev)
/* Parent device of the vDPA management device; registered in vduse_mgmtdev_init(). */
1483 static struct device vduse_mgmtdev = {
1484 .init_name = "vduse",
1485 .release = vduse_mgmtdev_release,
/* Forward declaration: vduse_dev_init_vdpa() references mgmt_dev before its initialised definition further down. */
1488 static struct vdpa_mgmt_dev mgmt_dev;
/*
 * Allocate the vDPA device for @dev and prepare it for DMA: the DMA mask
 * pointer is aimed at the coherent mask, both masks are set to 64 bits, and
 * vduse_dev_dma_ops is installed.
 * NOTE(review): the checks before the allocation, the IS_ERR test, the
 * linking of dev and vdev, and the return are not visible in this excerpt.
 */
1490 static int vduse_dev_init_vdpa(struct vduse_dev *dev, const char *name)
1492 struct vduse_vdpa *vdev;
1498 vdev = vdpa_alloc_device(struct vduse_vdpa, vdpa, dev->dev,
1499 &vduse_vdpa_config_ops, 1, 1, name, true);
1501 return PTR_ERR(vdev);
1505 vdev->vdpa.dev.dma_mask = &vdev->vdpa.dev.coherent_dma_mask;
1506 ret = dma_set_mask_and_coherent(&vdev->vdpa.dev, DMA_BIT_MASK(64));
/* Drop the device reference. NOTE(review): presumably only when setting the mask failed. */
1508 put_device(&vdev->vdpa.dev);
1511 set_dma_ops(&vdev->vdpa.dev, &vduse_dev_dma_ops);
/* The vDPA device is its own DMA device. */
1512 vdev->vdpa.dma_dev = &vdev->vdpa.dev;
1513 vdev->vdpa.mdev = &mgmt_dev;
/*
 * Management op .dev_add: attach a vDPA device to the existing VDUSE device
 * called @name. The device is looked up and initialised under vduse_lock;
 * registration with the vDPA bus is done after the lock is released.
 * NOTE(review): the error codes, failure checks and returns are not visible
 * in this excerpt.
 */
1518 static int vdpa_dev_add(struct vdpa_mgmt_dev *mdev, const char *name,
1519 const struct vdpa_dev_set_config *config)
1521 struct vduse_dev *dev;
1524 mutex_lock(&vduse_lock);
1525 dev = vduse_find_dev(name);
/* The device must exist and pass vduse_dev_is_ready(). */
1526 if (!dev || !vduse_dev_is_ready(dev)) {
1527 mutex_unlock(&vduse_lock);
1530 ret = vduse_dev_init_vdpa(dev, name);
1531 mutex_unlock(&vduse_lock);
1535 ret = _vdpa_register_device(&dev->vdev->vdpa, dev->vq_num);
/* NOTE(review): presumably only on registration failure, releasing the vDPA device. */
1537 put_device(&dev->vdev->vdpa.dev);
/* Management op .dev_del: unregister the vDPA device. */
1544 static void vdpa_dev_del(struct vdpa_mgmt_dev *mdev, struct vdpa_device *dev)
1546 _vdpa_unregister_device(dev);
/* Add/delete callbacks referenced by mgmt_dev.ops. */
1549 static const struct vdpa_mgmtdev_ops vdpa_dev_mgmtdev_ops = {
1550 .dev_add = vdpa_dev_add,
1551 .dev_del = vdpa_dev_del,
/*
 * Device ID table referenced by mgmt_dev.id_table; the visible entry is
 * VIRTIO_ID_BLOCK with any vendor.
 * NOTE(review): the terminating entry is not visible in this excerpt.
 */
1554 static struct virtio_device_id id_table[] = {
1555 { VIRTIO_ID_BLOCK, VIRTIO_DEV_ANY_ID },
/* vDPA management device; registered by vduse_mgmtdev_init(). */
1559 static struct vdpa_mgmt_dev mgmt_dev = {
1560 .device = &vduse_mgmtdev,
1561 .id_table = id_table,
1562 .ops = &vdpa_dev_mgmtdev_ops,
/*
 * Register the parent device and then the vDPA management device.
 * NOTE(review): the failure checks and returns are not visible; the final
 * device_unregister() presumably undoes the first step when the second
 * fails.
 */
1565 static int vduse_mgmtdev_init(void)
1569 ret = device_register(&vduse_mgmtdev);
1573 ret = vdpa_mgmtdev_register(&mgmt_dev);
1579 device_unregister(&vduse_mgmtdev);
/* Undo vduse_mgmtdev_init() in reverse order. */
1583 static void vduse_mgmtdev_exit(void)
1585 vdpa_mgmtdev_unregister(&mgmt_dev);
1586 device_unregister(&vduse_mgmtdev);
/*
 * Module init: create the device class, reserve the chrdev region, register
 * the control cdev and its device node, register the per-device cdev, create
 * the IRQ workqueue, and initialise the IOVA domain code and the management
 * device. The trailing calls are the error unwind in reverse order of setup.
 * NOTE(review): most failure checks, goto labels and returns are not visible
 * in this excerpt.
 */
1589 static int vduse_init(void)
1594 vduse_class = class_create(THIS_MODULE, "vduse");
1595 if (IS_ERR(vduse_class))
1596 return PTR_ERR(vduse_class);
1598 vduse_class->devnode = vduse_devnode;
1599 vduse_class->dev_groups = vduse_dev_groups;
1601 ret = alloc_chrdev_region(&vduse_major, 0, VDUSE_DEV_MAX, "vduse");
1603 goto err_chardev_region;
1605 /* /dev/vduse/control */
1606 cdev_init(&vduse_ctrl_cdev, &vduse_ctrl_fops);
1607 vduse_ctrl_cdev.owner = THIS_MODULE;
/* The control device takes the first device number of the region. */
1608 ret = cdev_add(&vduse_ctrl_cdev, vduse_major, 1);
1612 dev = device_create(vduse_class, NULL, vduse_major, NULL, "control");
1618 /* /dev/vduse/$DEVICE */
1619 cdev_init(&vduse_cdev, &vduse_dev_fops);
1620 vduse_cdev.owner = THIS_MODULE;
/* Per-device nodes start at minor 1. */
1621 ret = cdev_add(&vduse_cdev, MKDEV(MAJOR(vduse_major), 1),
1626 vduse_irq_wq = alloc_workqueue("vduse-irq",
1627 WQ_HIGHPRI | WQ_SYSFS | WQ_UNBOUND, 0);
1628 if (!vduse_irq_wq) {
1633 ret = vduse_domain_init();
1637 ret = vduse_mgmtdev_init();
/* Error unwind (labels not visible). */
1643 vduse_domain_exit();
1645 destroy_workqueue(vduse_irq_wq);
1647 cdev_del(&vduse_cdev);
1649 device_destroy(vduse_class, vduse_major);
1651 cdev_del(&vduse_ctrl_cdev);
1653 unregister_chrdev_region(vduse_major, VDUSE_DEV_MAX);
1655 class_destroy(vduse_class);
/* Register vduse_init() as the module entry point. */
1658 module_init(vduse_init);
/* Module exit: tear down everything vduse_init() set up, in reverse order. */
1660 static void vduse_exit(void)
1662 vduse_mgmtdev_exit();
1663 vduse_domain_exit();
1664 destroy_workqueue(vduse_irq_wq);
1665 cdev_del(&vduse_cdev);
1666 device_destroy(vduse_class, vduse_major);
1667 cdev_del(&vduse_ctrl_cdev);
1668 unregister_chrdev_region(vduse_major, VDUSE_DEV_MAX);
1669 class_destroy(vduse_class);
/* Register vduse_exit() as the module exit point. */
1671 module_exit(vduse_exit);
/* Module metadata, taken from the DRV_* macros near the top of the file. */
1673 MODULE_LICENSE(DRV_LICENSE);
1674 MODULE_AUTHOR(DRV_AUTHOR);
1675 MODULE_DESCRIPTION(DRV_DESC);