// SPDX-License-Identifier: GPL-2.0-only
/*
 * kvm eventfd support - use eventfd objects to signal various KVM events
 *
 * Copyright 2009 Novell.  All Rights Reserved.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Author:
 *      Gregory Haskins <ghaskins@novell.com>
 */

#include <linux/kvm_host.h>
#include <linux/kvm.h>
#include <linux/kvm_irqfd.h>
#include <linux/workqueue.h>
#include <linux/syscalls.h>
#include <linux/wait.h>
#include <linux/poll.h>
#include <linux/file.h>
#include <linux/list.h>
#include <linux/eventfd.h>
#include <linux/kernel.h>
#include <linux/srcu.h>
#include <linux/slab.h>
#include <linux/seqlock.h>
#include <linux/irqbypass.h>
#include <trace/events/kvm.h>

#include <kvm/iodev.h>

#ifdef CONFIG_HAVE_KVM_IRQFD

static struct workqueue_struct *irqfd_cleanup_wq;

bool __attribute__((weak))
kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args)
{
        return true;
}

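/*
 * Work item: inject the interrupt for a signaled irqfd.  Without a
 * resampler the GSI is pulsed (assert then de-assert); with one it is
 * only asserted here and later de-asserted from irqfd_resampler_ack().
 */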
static void
irqfd_inject(struct work_struct *work)
{
        struct kvm_kernel_irqfd *irqfd =
                container_of(work, struct kvm_kernel_irqfd, inject);
        struct kvm *kvm = irqfd->kvm;

        if (!irqfd->resampler) {
                kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1,
                                false);
                kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0,
                                false);
        } else
                kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
                            irqfd->gsi, 1, false);
}

/*
 * Since resampler irqfds share an IRQ source ID, we de-assert once
 * then notify all of the resampler irqfds using this GSI.  We can't
 * do multiple de-asserts or we risk racing with incoming re-asserts.
 */
static void
irqfd_resampler_ack(struct kvm_irq_ack_notifier *kian)
{
        struct kvm_kernel_irqfd_resampler *resampler;
        struct kvm *kvm;
        struct kvm_kernel_irqfd *irqfd;
        int idx;

        resampler = container_of(kian,
                        struct kvm_kernel_irqfd_resampler, notifier);
        kvm = resampler->kvm;

        kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
                    resampler->notifier.gsi, 0, false);

        idx = srcu_read_lock(&kvm->irq_srcu);

        list_for_each_entry_srcu(irqfd, &resampler->list, resampler_link,
            srcu_read_lock_held(&kvm->irq_srcu))
                eventfd_signal(irqfd->resamplefd, 1);

        srcu_read_unlock(&kvm->irq_srcu, idx);
}

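/*
 * Detach an irqfd from its resampler; the last irqfd leaving the list
 * also unregisters the ack notifier, de-asserts the line and frees the
 * resampler itself.
 */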
static void
irqfd_resampler_shutdown(struct kvm_kernel_irqfd *irqfd)
{
        struct kvm_kernel_irqfd_resampler *resampler = irqfd->resampler;
        struct kvm *kvm = resampler->kvm;

        mutex_lock(&kvm->irqfds.resampler_lock);

        list_del_rcu(&irqfd->resampler_link);
        synchronize_srcu(&kvm->irq_srcu);

        if (list_empty(&resampler->list)) {
                list_del(&resampler->link);
                kvm_unregister_irq_ack_notifier(kvm, &resampler->notifier);
                kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
                            resampler->notifier.gsi, 0, false);
                kfree(resampler);
        }

        mutex_unlock(&kvm->irqfds.resampler_lock);
}

/*
 * Race-free decouple logic (ordering is critical)
 */
static void
irqfd_shutdown(struct work_struct *work)
{
        struct kvm_kernel_irqfd *irqfd =
                container_of(work, struct kvm_kernel_irqfd, shutdown);
        struct kvm *kvm = irqfd->kvm;
        u64 cnt;

        /* Make sure irqfd has been initialized in assign path. */
        synchronize_srcu(&kvm->irq_srcu);

        /*
         * Synchronize with the wait-queue and unhook ourselves to prevent
         * further events.
         */
        eventfd_ctx_remove_wait_queue(irqfd->eventfd, &irqfd->wait, &cnt);

        /*
         * We know no new events will be scheduled at this point, so block
         * until all previously outstanding events have completed
         */
        flush_work(&irqfd->inject);

        if (irqfd->resampler) {
                irqfd_resampler_shutdown(irqfd);
                eventfd_ctx_put(irqfd->resamplefd);
        }

        /*
         * It is now safe to release the object's resources
         */
#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS
        irq_bypass_unregister_consumer(&irqfd->consumer);
#endif
        eventfd_ctx_put(irqfd->eventfd);
        kfree(irqfd);
}


/* assumes kvm->irqfds.lock is held */
static bool
irqfd_is_active(struct kvm_kernel_irqfd *irqfd)
{
        return !list_empty(&irqfd->list);
}

/*
 * Mark the irqfd as inactive and schedule it for removal
 *
 * assumes kvm->irqfds.lock is held
 */
static void
irqfd_deactivate(struct kvm_kernel_irqfd *irqfd)
{
        BUG_ON(!irqfd_is_active(irqfd));

        list_del_init(&irqfd->list);

        queue_work(irqfd_cleanup_wq, &irqfd->shutdown);
}

int __attribute__((weak)) kvm_arch_set_irq_inatomic(
                                struct kvm_kernel_irq_routing_entry *irq,
                                struct kvm *kvm, int irq_source_id,
                                int level,
                                bool line_status)
{
        return -EWOULDBLOCK;
}

/*
 * Wake-up callback attached to the eventfd's wait queue.  Called with
 * wqh->lock held and interrupts disabled.
 */
static int
irqfd_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
        struct kvm_kernel_irqfd *irqfd =
                container_of(wait, struct kvm_kernel_irqfd, wait);
        __poll_t flags = key_to_poll(key);
        struct kvm_kernel_irq_routing_entry irq;
        struct kvm *kvm = irqfd->kvm;
        unsigned seq;
        int idx;
        int ret = 0;

        if (flags & EPOLLIN) {
                u64 cnt;
                eventfd_ctx_do_read(irqfd->eventfd, &cnt);

                idx = srcu_read_lock(&kvm->irq_srcu);
                do {
                        seq = read_seqcount_begin(&irqfd->irq_entry_sc);
                        irq = irqfd->irq_entry;
                } while (read_seqcount_retry(&irqfd->irq_entry_sc, seq));
                /* An event has been signaled, inject an interrupt */
                if (kvm_arch_set_irq_inatomic(&irq, kvm,
                                              KVM_USERSPACE_IRQ_SOURCE_ID, 1,
                                              false) == -EWOULDBLOCK)
                        schedule_work(&irqfd->inject);
                srcu_read_unlock(&kvm->irq_srcu, idx);
                ret = 1;
        }

        if (flags & EPOLLHUP) {
                /* The eventfd is closing, detach from KVM */
                unsigned long iflags;

                spin_lock_irqsave(&kvm->irqfds.lock, iflags);

                /*
                 * We must check if someone deactivated the irqfd before
                 * we could acquire the irqfds.lock since the item is
                 * deactivated from the KVM side before it is unhooked from
                 * the wait-queue.  If it is already deactivated, we can
                 * simply return knowing the other side will clean up for us.
                 * We cannot race against the irqfd going away since the
                 * other side is required to acquire wqh->lock, which we hold.
                 */
                if (irqfd_is_active(irqfd))
                        irqfd_deactivate(irqfd);

                spin_unlock_irqrestore(&kvm->irqfds.lock, iflags);
        }

        return ret;
}

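/*
 * poll_table callback: hook our wait-queue entry (irqfd_wakeup) into the
 * eventfd's wait queue when vfs_poll() is called from kvm_irqfd_assign().
 */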
static void
irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh,
                        poll_table *pt)
{
        struct kvm_kernel_irqfd *irqfd =
                container_of(pt, struct kvm_kernel_irqfd, pt);
        add_wait_queue_priority(wqh, &irqfd->wait);
}

/*
 * Must be called under irqfds.lock.  Re-reads the routing for the irqfd's
 * GSI and caches it under the seqcount; anything other than a single-entry
 * mapping is stored as type 0 so that injection falls back to the slow path.
 */
static void irqfd_update(struct kvm *kvm, struct kvm_kernel_irqfd *irqfd)
{
        struct kvm_kernel_irq_routing_entry *e;
        struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS];
        int n_entries;

        n_entries = kvm_irq_map_gsi(kvm, entries, irqfd->gsi);

        write_seqcount_begin(&irqfd->irq_entry_sc);

        e = entries;
        if (n_entries == 1)
                irqfd->irq_entry = *e;
        else
                irqfd->irq_entry.type = 0;

        write_seqcount_end(&irqfd->irq_entry_sc);
}

#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS
void __attribute__((weak)) kvm_arch_irq_bypass_stop(
                                struct irq_bypass_consumer *cons)
{
}

void __attribute__((weak)) kvm_arch_irq_bypass_start(
                                struct irq_bypass_consumer *cons)
{
}

int __attribute__((weak)) kvm_arch_update_irqfd_routing(
                                struct kvm *kvm, unsigned int host_irq,
                                uint32_t guest_irq, bool set)
{
        return 0;
}

bool __attribute__((weak)) kvm_arch_irqfd_route_changed(
                                struct kvm_kernel_irq_routing_entry *old,
                                struct kvm_kernel_irq_routing_entry *new)
{
        return true;
}
#endif

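/*
 * Bind an eventfd to a GSI for KVM_IRQFD: allocate the irqfd, optionally
 * attach it to a resampler, cache the routing entry, hook the eventfd's
 * wait queue and, where supported, register an IRQ bypass consumer.
 */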
static int
kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
{
        struct kvm_kernel_irqfd *irqfd, *tmp;
        struct fd f;
        struct eventfd_ctx *eventfd = NULL, *resamplefd = NULL;
        int ret;
        __poll_t events;
        int idx;

        if (!kvm_arch_intc_initialized(kvm))
                return -EAGAIN;

        if (!kvm_arch_irqfd_allowed(kvm, args))
                return -EINVAL;

        irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL_ACCOUNT);
        if (!irqfd)
                return -ENOMEM;

        irqfd->kvm = kvm;
        irqfd->gsi = args->gsi;
        INIT_LIST_HEAD(&irqfd->list);
        INIT_WORK(&irqfd->inject, irqfd_inject);
        INIT_WORK(&irqfd->shutdown, irqfd_shutdown);
        seqcount_spinlock_init(&irqfd->irq_entry_sc, &kvm->irqfds.lock);

        f = fdget(args->fd);
        if (!f.file) {
                ret = -EBADF;
                goto out;
        }

        eventfd = eventfd_ctx_fileget(f.file);
        if (IS_ERR(eventfd)) {
                ret = PTR_ERR(eventfd);
                goto fail;
        }

        irqfd->eventfd = eventfd;

        if (args->flags & KVM_IRQFD_FLAG_RESAMPLE) {
                struct kvm_kernel_irqfd_resampler *resampler;

                resamplefd = eventfd_ctx_fdget(args->resamplefd);
                if (IS_ERR(resamplefd)) {
                        ret = PTR_ERR(resamplefd);
                        goto fail;
                }

                irqfd->resamplefd = resamplefd;
                INIT_LIST_HEAD(&irqfd->resampler_link);

                mutex_lock(&kvm->irqfds.resampler_lock);

                list_for_each_entry(resampler,
                                    &kvm->irqfds.resampler_list, link) {
                        if (resampler->notifier.gsi == irqfd->gsi) {
                                irqfd->resampler = resampler;
                                break;
                        }
                }

                if (!irqfd->resampler) {
                        resampler = kzalloc(sizeof(*resampler),
                                            GFP_KERNEL_ACCOUNT);
                        if (!resampler) {
                                ret = -ENOMEM;
                                mutex_unlock(&kvm->irqfds.resampler_lock);
                                goto fail;
                        }

                        resampler->kvm = kvm;
                        INIT_LIST_HEAD(&resampler->list);
                        resampler->notifier.gsi = irqfd->gsi;
                        resampler->notifier.irq_acked = irqfd_resampler_ack;
                        INIT_LIST_HEAD(&resampler->link);

                        list_add(&resampler->link, &kvm->irqfds.resampler_list);
                        kvm_register_irq_ack_notifier(kvm,
                                                      &resampler->notifier);
                        irqfd->resampler = resampler;
                }

                list_add_rcu(&irqfd->resampler_link, &irqfd->resampler->list);
                synchronize_srcu(&kvm->irq_srcu);

                mutex_unlock(&kvm->irqfds.resampler_lock);
        }

        /*
         * Install our own custom wake-up handling so we are notified via
         * a callback whenever someone signals the underlying eventfd
         */
        init_waitqueue_func_entry(&irqfd->wait, irqfd_wakeup);
        init_poll_funcptr(&irqfd->pt, irqfd_ptable_queue_proc);

        spin_lock_irq(&kvm->irqfds.lock);

        ret = 0;
        list_for_each_entry(tmp, &kvm->irqfds.items, list) {
                if (irqfd->eventfd != tmp->eventfd)
                        continue;
                /* This fd is used for another irq already. */
                ret = -EBUSY;
                spin_unlock_irq(&kvm->irqfds.lock);
                goto fail;
        }

        idx = srcu_read_lock(&kvm->irq_srcu);
        irqfd_update(kvm, irqfd);

        list_add_tail(&irqfd->list, &kvm->irqfds.items);

        spin_unlock_irq(&kvm->irqfds.lock);

        /*
         * Check if there was an event already pending on the eventfd
         * before we registered, and trigger it as if we didn't miss it.
         */
        events = vfs_poll(f.file, &irqfd->pt);

        if (events & EPOLLIN)
                schedule_work(&irqfd->inject);

#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS
        if (kvm_arch_has_irq_bypass()) {
                irqfd->consumer.token = (void *)irqfd->eventfd;
                irqfd->consumer.add_producer = kvm_arch_irq_bypass_add_producer;
                irqfd->consumer.del_producer = kvm_arch_irq_bypass_del_producer;
                irqfd->consumer.stop = kvm_arch_irq_bypass_stop;
                irqfd->consumer.start = kvm_arch_irq_bypass_start;
                ret = irq_bypass_register_consumer(&irqfd->consumer);
                if (ret)
                        pr_info("irq bypass consumer (token %p) registration failed: %d\n",
                                irqfd->consumer.token, ret);
        }
#endif

        srcu_read_unlock(&kvm->irq_srcu, idx);

        /*
         * Do not drop the file until the irqfd is fully initialized;
         * otherwise we might race against the EPOLLHUP.
         */
        fdput(f);
        return 0;

fail:
        if (irqfd->resampler)
                irqfd_resampler_shutdown(irqfd);

        if (resamplefd && !IS_ERR(resamplefd))
                eventfd_ctx_put(resamplefd);

        if (eventfd && !IS_ERR(eventfd))
                eventfd_ctx_put(eventfd);

        fdput(f);

out:
        kfree(irqfd);
        return ret;
}

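/*
 * Return true if an ack notifier is registered for the GSI behind the
 * given irqchip pin.
 */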
bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin)
{
        struct kvm_irq_ack_notifier *kian;
        int gsi, idx;

        idx = srcu_read_lock(&kvm->irq_srcu);
        gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin);
        if (gsi != -1)
                hlist_for_each_entry_srcu(kian, &kvm->irq_ack_notifier_list,
                                          link, srcu_read_lock_held(&kvm->irq_srcu))
                        if (kian->gsi == gsi) {
                                srcu_read_unlock(&kvm->irq_srcu, idx);
                                return true;
                        }

        srcu_read_unlock(&kvm->irq_srcu, idx);

        return false;
}
EXPORT_SYMBOL_GPL(kvm_irq_has_notifier);

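/*
 * Run all ack notifiers registered for the given GSI; callers must hold
 * kvm->irq_srcu (see the srcu_read_lock_held() check below).
 */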
void kvm_notify_acked_gsi(struct kvm *kvm, int gsi)
{
        struct kvm_irq_ack_notifier *kian;

        hlist_for_each_entry_srcu(kian, &kvm->irq_ack_notifier_list,
                                  link, srcu_read_lock_held(&kvm->irq_srcu))
                if (kian->gsi == gsi)
                        kian->irq_acked(kian);
}

void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin)
{
        int gsi, idx;

        trace_kvm_ack_irq(irqchip, pin);

        idx = srcu_read_lock(&kvm->irq_srcu);
        gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin);
        if (gsi != -1)
                kvm_notify_acked_gsi(kvm, gsi);
        srcu_read_unlock(&kvm->irq_srcu, idx);
}

void kvm_register_irq_ack_notifier(struct kvm *kvm,
                                   struct kvm_irq_ack_notifier *kian)
{
        mutex_lock(&kvm->irq_lock);
        hlist_add_head_rcu(&kian->link, &kvm->irq_ack_notifier_list);
        mutex_unlock(&kvm->irq_lock);
        kvm_arch_post_irq_ack_notifier_list_update(kvm);
}

void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
                                     struct kvm_irq_ack_notifier *kian)
{
        mutex_lock(&kvm->irq_lock);
        hlist_del_init_rcu(&kian->link);
        mutex_unlock(&kvm->irq_lock);
        synchronize_srcu(&kvm->irq_srcu);
        kvm_arch_post_irq_ack_notifier_list_update(kvm);
}
#endif

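/* Initialize the per-VM irqfd and ioeventfd state. */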
void
kvm_eventfd_init(struct kvm *kvm)
{
#ifdef CONFIG_HAVE_KVM_IRQFD
        spin_lock_init(&kvm->irqfds.lock);
        INIT_LIST_HEAD(&kvm->irqfds.items);
        INIT_LIST_HEAD(&kvm->irqfds.resampler_list);
        mutex_init(&kvm->irqfds.resampler_lock);
#endif
        INIT_LIST_HEAD(&kvm->ioeventfds);
}

#ifdef CONFIG_HAVE_KVM_IRQFD
/*
 * Shut down any irqfds that match fd+gsi
 */
static int
kvm_irqfd_deassign(struct kvm *kvm, struct kvm_irqfd *args)
{
        struct kvm_kernel_irqfd *irqfd, *tmp;
        struct eventfd_ctx *eventfd;

        eventfd = eventfd_ctx_fdget(args->fd);
        if (IS_ERR(eventfd))
                return PTR_ERR(eventfd);

        spin_lock_irq(&kvm->irqfds.lock);

        list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list) {
                if (irqfd->eventfd == eventfd && irqfd->gsi == args->gsi) {
                        /*
                         * This clearing of irq_entry.type is needed for when
                         * another thread calls kvm_irq_routing_update before
                         * we flush the workqueue below (we synchronize with
                         * kvm_irq_routing_update using irqfds.lock).
                         */
                        write_seqcount_begin(&irqfd->irq_entry_sc);
                        irqfd->irq_entry.type = 0;
                        write_seqcount_end(&irqfd->irq_entry_sc);
                        irqfd_deactivate(irqfd);
                }
        }

        spin_unlock_irq(&kvm->irqfds.lock);
        eventfd_ctx_put(eventfd);

        /*
         * Block until we know all outstanding shutdown jobs have completed
         * so that we guarantee there will not be any more interrupts on this
         * gsi once this deassign function returns.
         */
        flush_workqueue(irqfd_cleanup_wq);

        return 0;
}

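/*
 * KVM_IRQFD ioctl handler: bind (or, with KVM_IRQFD_FLAG_DEASSIGN, unbind)
 * an eventfd to a GSI.  Illustrative userspace usage, with hypothetical
 * vm_fd/efd/gsi variables (not part of this file):
 *
 *   struct kvm_irqfd irqfd = { .fd = efd, .gsi = gsi };
 *   ioctl(vm_fd, KVM_IRQFD, &irqfd);
 */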
int
kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args)
{
        if (args->flags & ~(KVM_IRQFD_FLAG_DEASSIGN | KVM_IRQFD_FLAG_RESAMPLE))
                return -EINVAL;

        if (args->flags & KVM_IRQFD_FLAG_DEASSIGN)
                return kvm_irqfd_deassign(kvm, args);

        return kvm_irqfd_assign(kvm, args);
}

/*
 * This function is called as the kvm VM fd is being released.  Shut down
 * all irqfds that still remain open.
 */
void
kvm_irqfd_release(struct kvm *kvm)
{
        struct kvm_kernel_irqfd *irqfd, *tmp;

        spin_lock_irq(&kvm->irqfds.lock);

        list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list)
                irqfd_deactivate(irqfd);

        spin_unlock_irq(&kvm->irqfds.lock);

        /*
         * Block until we know all outstanding shutdown jobs have completed
         * since we do not take a kvm* reference.
         */
        flush_workqueue(irqfd_cleanup_wq);
}

/*
 * Take note of a change in irq routing.
 * Caller must invoke synchronize_srcu(&kvm->irq_srcu) afterwards.
 */
void kvm_irq_routing_update(struct kvm *kvm)
{
        struct kvm_kernel_irqfd *irqfd;

        spin_lock_irq(&kvm->irqfds.lock);

        list_for_each_entry(irqfd, &kvm->irqfds.items, list) {
#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS
                /* Under irqfds.lock, so can read irq_entry safely */
                struct kvm_kernel_irq_routing_entry old = irqfd->irq_entry;
#endif

                irqfd_update(kvm, irqfd);

#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS
                if (irqfd->producer &&
                    kvm_arch_irqfd_route_changed(&old, &irqfd->irq_entry)) {
                        int ret = kvm_arch_update_irqfd_routing(
                                        irqfd->kvm, irqfd->producer->irq,
                                        irqfd->gsi, 1);
                        WARN_ON(ret);
                }
#endif
        }

        spin_unlock_irq(&kvm->irqfds.lock);
}

/*
 * create a host-wide workqueue for issuing deferred shutdown requests
 * aggregated from all vm* instances. We need our own isolated
 * queue to ease flushing work items when a VM exits.
 */
int kvm_irqfd_init(void)
{
        irqfd_cleanup_wq = alloc_workqueue("kvm-irqfd-cleanup", 0, 0);
        if (!irqfd_cleanup_wq)
                return -ENOMEM;

        return 0;
}

void kvm_irqfd_exit(void)
{
        destroy_workqueue(irqfd_cleanup_wq);
}
#endif

/*
 * --------------------------------------------------------------------
 * ioeventfd: translate a PIO/MMIO memory write to an eventfd signal.
 *
 * userspace can register a PIO/MMIO address with an eventfd for receiving
 * notification when the memory has been touched.
 * --------------------------------------------------------------------
 */

struct _ioeventfd {
        struct list_head     list;
        u64                  addr;
        int                  length;
        struct eventfd_ctx  *eventfd;
        u64                  datamatch;
        struct kvm_io_device dev;
        u8                   bus_idx;
        bool                 wildcard;
};

static inline struct _ioeventfd *
to_ioeventfd(struct kvm_io_device *dev)
{
        return container_of(dev, struct _ioeventfd, dev);
}

static void
ioeventfd_release(struct _ioeventfd *p)
{
        eventfd_ctx_put(p->eventfd);
        list_del(&p->list);
        kfree(p);
}

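/*
 * Return true if a guest write of len bytes at addr (with payload val)
 * matches this ioeventfd: the address must match exactly, a zero length
 * matches any access, and unless the entry is a wildcard the written
 * value must equal datamatch.
 */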
static bool
ioeventfd_in_range(struct _ioeventfd *p, gpa_t addr, int len, const void *val)
{
        u64 _val;

        if (addr != p->addr)
                /* address must be precise for a hit */
                return false;

        if (!p->length)
                /* length = 0 means only look at the address, so always a hit */
                return true;

        if (len != p->length)
                /* address-range must be precise for a hit */
                return false;

        if (p->wildcard)
                /* all else equal, wildcard is always a hit */
                return true;

        /* otherwise, we have to actually compare the data */

        BUG_ON(!IS_ALIGNED((unsigned long)val, len));

        switch (len) {
        case 1:
                _val = *(u8 *)val;
                break;
        case 2:
                _val = *(u16 *)val;
                break;
        case 4:
                _val = *(u32 *)val;
                break;
        case 8:
                _val = *(u64 *)val;
                break;
        default:
                return false;
        }

        return _val == p->datamatch;
}

/* MMIO/PIO writes trigger an event if the addr/val match */
static int
ioeventfd_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t addr,
                int len, const void *val)
{
        struct _ioeventfd *p = to_ioeventfd(this);

        if (!ioeventfd_in_range(p, addr, len, val))
                return -EOPNOTSUPP;

        eventfd_signal(p->eventfd, 1);
        return 0;
}

/*
 * This function is called as KVM is completely shutting down.  We do not
 * need to worry about locking; just nuke anything we have as quickly as
 * possible.
 */
static void
ioeventfd_destructor(struct kvm_io_device *this)
{
        struct _ioeventfd *p = to_ioeventfd(this);

        ioeventfd_release(p);
}

static const struct kvm_io_device_ops ioeventfd_ops = {
        .write      = ioeventfd_write,
        .destructor = ioeventfd_destructor,
};

/* assumes kvm->slots_lock held */
static bool
ioeventfd_check_collision(struct kvm *kvm, struct _ioeventfd *p)
{
        struct _ioeventfd *_p;

        list_for_each_entry(_p, &kvm->ioeventfds, list)
                if (_p->bus_idx == p->bus_idx &&
                    _p->addr == p->addr &&
                    (!_p->length || !p->length ||
                     (_p->length == p->length &&
                      (_p->wildcard || p->wildcard ||
                       _p->datamatch == p->datamatch))))
                        return true;

        return false;
}

static enum kvm_bus ioeventfd_bus_from_flags(__u32 flags)
{
        if (flags & KVM_IOEVENTFD_FLAG_PIO)
                return KVM_PIO_BUS;
        if (flags & KVM_IOEVENTFD_FLAG_VIRTIO_CCW_NOTIFY)
                return KVM_VIRTIO_CCW_NOTIFY_BUS;
        return KVM_MMIO_BUS;
}

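/*
 * Register one ioeventfd on the given bus: reject duplicates, then add
 * the device to the bus and to the per-VM list under slots_lock.
 */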
static int kvm_assign_ioeventfd_idx(struct kvm *kvm,
                                enum kvm_bus bus_idx,
                                struct kvm_ioeventfd *args)
{
        struct eventfd_ctx *eventfd;
        struct _ioeventfd *p;
        int ret;

        eventfd = eventfd_ctx_fdget(args->fd);
        if (IS_ERR(eventfd))
                return PTR_ERR(eventfd);

        p = kzalloc(sizeof(*p), GFP_KERNEL_ACCOUNT);
        if (!p) {
                ret = -ENOMEM;
                goto fail;
        }

        INIT_LIST_HEAD(&p->list);
        p->addr    = args->addr;
        p->bus_idx = bus_idx;
        p->length  = args->len;
        p->eventfd = eventfd;

        /* The datamatch feature is optional, otherwise this is a wildcard */
        if (args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH)
                p->datamatch = args->datamatch;
        else
                p->wildcard = true;

        mutex_lock(&kvm->slots_lock);

        /* Verify that there isn't a match already */
        if (ioeventfd_check_collision(kvm, p)) {
                ret = -EEXIST;
                goto unlock_fail;
        }

        kvm_iodevice_init(&p->dev, &ioeventfd_ops);

        ret = kvm_io_bus_register_dev(kvm, bus_idx, p->addr, p->length,
                                      &p->dev);
        if (ret < 0)
                goto unlock_fail;

        kvm_get_bus(kvm, bus_idx)->ioeventfd_count++;
        list_add_tail(&p->list, &kvm->ioeventfds);

        mutex_unlock(&kvm->slots_lock);

        return 0;

unlock_fail:
        mutex_unlock(&kvm->slots_lock);

fail:
        kfree(p);
        eventfd_ctx_put(eventfd);

        return ret;
}

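/*
 * Remove the ioeventfd on bus_idx that matches the eventfd, address,
 * length and (for non-wildcard entries) datamatch; returns -ENOENT if
 * nothing matches.
 */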
static int
kvm_deassign_ioeventfd_idx(struct kvm *kvm, enum kvm_bus bus_idx,
                           struct kvm_ioeventfd *args)
{
        struct _ioeventfd        *p, *tmp;
        struct eventfd_ctx       *eventfd;
        struct kvm_io_bus        *bus;
        int                       ret = -ENOENT;
        bool                      wildcard;

        eventfd = eventfd_ctx_fdget(args->fd);
        if (IS_ERR(eventfd))
                return PTR_ERR(eventfd);

        wildcard = !(args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH);

        mutex_lock(&kvm->slots_lock);

        list_for_each_entry_safe(p, tmp, &kvm->ioeventfds, list) {

                if (p->bus_idx != bus_idx ||
                    p->eventfd != eventfd  ||
                    p->addr != args->addr  ||
                    p->length != args->len ||
                    p->wildcard != wildcard)
                        continue;

                if (!p->wildcard && p->datamatch != args->datamatch)
                        continue;

                kvm_io_bus_unregister_dev(kvm, bus_idx, &p->dev);
                bus = kvm_get_bus(kvm, bus_idx);
                if (bus)
                        bus->ioeventfd_count--;
                ioeventfd_release(p);
                ret = 0;
                break;
        }

        mutex_unlock(&kvm->slots_lock);

        eventfd_ctx_put(eventfd);

        return ret;
}

static int kvm_deassign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
{
        enum kvm_bus bus_idx = ioeventfd_bus_from_flags(args->flags);
        int ret = kvm_deassign_ioeventfd_idx(kvm, bus_idx, args);

        if (!args->len && bus_idx == KVM_MMIO_BUS)
                kvm_deassign_ioeventfd_idx(kvm, KVM_FAST_MMIO_BUS, args);

        return ret;
}

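/*
 * Validate the KVM_IOEVENTFD arguments and register the ioeventfd; a
 * zero-length MMIO entry is additionally registered on KVM_FAST_MMIO_BUS.
 */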
static int
kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
{
        enum kvm_bus              bus_idx;
        int ret;

        bus_idx = ioeventfd_bus_from_flags(args->flags);
        /* must be natural-word sized, or 0 to ignore length */
        switch (args->len) {
        case 0:
        case 1:
        case 2:
        case 4:
        case 8:
                break;
        default:
                return -EINVAL;
        }

        /* check for range overflow */
        if (args->addr + args->len < args->addr)
                return -EINVAL;

        /* check for extra flags that we don't understand */
        if (args->flags & ~KVM_IOEVENTFD_VALID_FLAG_MASK)
                return -EINVAL;

        /* ioeventfd with no length can't be combined with DATAMATCH */
        if (!args->len && (args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH))
                return -EINVAL;

        ret = kvm_assign_ioeventfd_idx(kvm, bus_idx, args);
        if (ret)
                goto fail;

        /*
         * When length is ignored, MMIO is also put on a separate bus, for
         * faster lookups.
         */
        if (!args->len && bus_idx == KVM_MMIO_BUS) {
                ret = kvm_assign_ioeventfd_idx(kvm, KVM_FAST_MMIO_BUS, args);
                if (ret < 0)
                        goto fast_fail;
        }

        return 0;

fast_fail:
        kvm_deassign_ioeventfd_idx(kvm, bus_idx, args);
fail:
        return ret;
}

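/*
 * KVM_IOEVENTFD ioctl handler: register (or, with
 * KVM_IOEVENTFD_FLAG_DEASSIGN, unregister) an eventfd that is signaled
 * when the guest writes the registered PIO/MMIO address.  Illustrative
 * userspace usage, with hypothetical vm_fd/efd/gpa variables (not part
 * of this file):
 *
 *   struct kvm_ioeventfd io = { .addr = gpa, .len = 4, .fd = efd };
 *   ioctl(vm_fd, KVM_IOEVENTFD, &io);
 */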
int
kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
{
        if (args->flags & KVM_IOEVENTFD_FLAG_DEASSIGN)
                return kvm_deassign_ioeventfd(kvm, args);

        return kvm_assign_ioeventfd(kvm, args);
}