1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (c) 2017-2019, IBM Corporation.
4  */
5
6 #define pr_fmt(fmt) "xive-kvm: " fmt
7
8 #include <linux/kernel.h>
9 #include <linux/kvm_host.h>
10 #include <linux/err.h>
11 #include <linux/gfp.h>
12 #include <linux/spinlock.h>
13 #include <linux/delay.h>
14 #include <linux/file.h>
15 #include <linux/irqdomain.h>
16 #include <asm/uaccess.h>
17 #include <asm/kvm_book3s.h>
18 #include <asm/kvm_ppc.h>
19 #include <asm/hvcall.h>
20 #include <asm/xive.h>
21 #include <asm/xive-regs.h>
22 #include <asm/debug.h>
23 #include <asm/opal.h>
24
25 #include <linux/debugfs.h>
26 #include <linux/seq_file.h>
27
28 #include "book3s_xive.h"
29
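/*
 * ESB MMIO load from the management (EOI) page of an interrupt source
 * at the given offset (e.g. XIVE_ESB_SET_PQ_01). The low byte of the
 * returned value carries the PQ state of the source.
 */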
30 static u8 xive_vm_esb_load(struct xive_irq_data *xd, u32 offset)
31 {
32         u64 val;
33
34         /*
35          * The KVM XIVE native device does not use the XIVE_ESB_SET_PQ_10
36          * load operation, so there is no need to enforce load-after-store
37          * ordering.
38          */
39
40         val = in_be64(xd->eoi_mmio + offset);
41         return (u8)val;
42 }
43
44 static void kvmppc_xive_native_cleanup_queue(struct kvm_vcpu *vcpu, int prio)
45 {
46         struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
47         struct xive_q *q = &xc->queues[prio];
48
49         xive_native_disable_queue(xc->vp_id, q, prio);
50         if (q->qpage) {
51                 put_page(virt_to_page(q->qpage));
52                 q->qpage = NULL;
53         }
54 }
55
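/*
 * Thin wrapper around xive_native_configure_queue() which also drops
 * the reference on the previous queue page, but only once the new
 * configuration has been accepted, so that the XIVE hardware is never
 * left pointing at a freed page.
 */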
56 static int kvmppc_xive_native_configure_queue(u32 vp_id, struct xive_q *q,
57                                               u8 prio, __be32 *qpage,
58                                               u32 order, bool can_escalate)
59 {
60         int rc;
61         __be32 *qpage_prev = q->qpage;
62
63         rc = xive_native_configure_queue(vp_id, q, prio, qpage, order,
64                                          can_escalate);
65         if (rc)
66                 return rc;
67
68         if (qpage_prev)
69                 put_page(virt_to_page(qpage_prev));
70
71         return rc;
72 }
73
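/*
 * Undo kvmppc_xive_native_connect_vcpu(): mark the presenter invalid,
 * disable the interrupts routed to it, free its escalation interrupts,
 * disable the VP in OPAL, release the queue pages and free the
 * kvmppc_xive_vcpu structure. Called from the device release path
 * (under vcpu->mutex) and from the connect_vcpu() error path.
 */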
74 void kvmppc_xive_native_cleanup_vcpu(struct kvm_vcpu *vcpu)
75 {
76         struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
77         int i;
78
79         if (!kvmppc_xive_enabled(vcpu))
80                 return;
81
82         if (!xc)
83                 return;
84
85         pr_devel("native_cleanup_vcpu(cpu=%d)\n", xc->server_num);
86
87         /* Ensure no interrupt is still routed to that VP */
88         xc->valid = false;
89         kvmppc_xive_disable_vcpu_interrupts(vcpu);
90
91         /* Free escalations */
92         for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
93                 /* Free the escalation irq */
94                 if (xc->esc_virq[i]) {
95                         if (kvmppc_xive_has_single_escalation(xc->xive))
96                                 xive_cleanup_single_escalation(vcpu, xc->esc_virq[i]);
97                         free_irq(xc->esc_virq[i], vcpu);
98                         irq_dispose_mapping(xc->esc_virq[i]);
99                         kfree(xc->esc_virq_names[i]);
100                         xc->esc_virq[i] = 0;
101                 }
102         }
103
104         /* Disable the VP */
105         xive_native_disable_vp(xc->vp_id);
106
107         /* Clear the cam word so guest entry won't try to push context */
108         vcpu->arch.xive_cam_word = 0;
109
110         /* Free the queues */
111         for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
112                 kvmppc_xive_native_cleanup_queue(vcpu, i);
113         }
114
115         /* Free the VP */
116         kfree(xc);
117
118         /* Cleanup the vcpu */
119         vcpu->arch.irq_type = KVMPPC_IRQ_DEFAULT;
120         vcpu->arch.xive_vcpu = NULL;
121 }
122
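/*
 * Connect a vcpu to the XIVE native device: compute its VP id, enable
 * the VP in OPAL and set up the fields used by the assembly push/pull
 * code. This is typically reached when userspace enables the
 * KVM_CAP_PPC_IRQ_XIVE capability on the vcpu, passing the device fd
 * and the server number (see Documentation/virt/kvm/devices/xive.rst).
 */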
123 int kvmppc_xive_native_connect_vcpu(struct kvm_device *dev,
124                                     struct kvm_vcpu *vcpu, u32 server_num)
125 {
126         struct kvmppc_xive *xive = dev->private;
127         struct kvmppc_xive_vcpu *xc = NULL;
128         int rc;
129         u32 vp_id;
130
131         pr_devel("native_connect_vcpu(server=%d)\n", server_num);
132
133         if (dev->ops != &kvm_xive_native_ops) {
134                 pr_devel("Wrong ops !\n");
135                 return -EPERM;
136         }
137         if (xive->kvm != vcpu->kvm)
138                 return -EPERM;
139         if (vcpu->arch.irq_type != KVMPPC_IRQ_DEFAULT)
140                 return -EBUSY;
141
142         mutex_lock(&xive->lock);
143
144         rc = kvmppc_xive_compute_vp_id(xive, server_num, &vp_id);
145         if (rc)
146                 goto bail;
147
148         xc = kzalloc(sizeof(*xc), GFP_KERNEL);
149         if (!xc) {
150                 rc = -ENOMEM;
151                 goto bail;
152         }
153
154         vcpu->arch.xive_vcpu = xc;
155         xc->xive = xive;
156         xc->vcpu = vcpu;
157         xc->server_num = server_num;
158
159         xc->vp_id = vp_id;
160         xc->valid = true;
161         vcpu->arch.irq_type = KVMPPC_IRQ_XIVE;
162
163         rc = xive_native_get_vp_info(xc->vp_id, &xc->vp_cam, &xc->vp_chip_id);
164         if (rc) {
165                 pr_err("Failed to get VP info from OPAL: %d\n", rc);
166                 goto bail;
167         }
168
169         if (!kvmppc_xive_check_save_restore(vcpu)) {
170                 pr_err("inconsistent save-restore setup for VCPU %d\n", server_num);
171                 rc = -EIO;
172                 goto bail;
173         }
174
175         /*
176          * Enable the VP first as the single escalation mode will
177          * affect the escalation interrupt numbering
178          */
179         rc = xive_native_enable_vp(xc->vp_id, kvmppc_xive_has_single_escalation(xive));
180         if (rc) {
181                 pr_err("Failed to enable VP in OPAL: %d\n", rc);
182                 goto bail;
183         }
184
185         /* Configure VCPU fields for use by assembly push/pull */
186         vcpu->arch.xive_saved_state.w01 = cpu_to_be64(0xff000000);
187         vcpu->arch.xive_cam_word = cpu_to_be32(xc->vp_cam | TM_QW1W2_VO);
188
189         /* TODO: reset all queues to a clean state ? */
190 bail:
191         mutex_unlock(&xive->lock);
192         if (rc)
193                 kvmppc_xive_native_cleanup_vcpu(vcpu);
194
195         return rc;
196 }
197
198 /*
199  * Device passthrough support
200  */
201 static int kvmppc_xive_native_reset_mapped(struct kvm *kvm, unsigned long irq)
202 {
203         struct kvmppc_xive *xive = kvm->arch.xive;
204         pgoff_t esb_pgoff = KVM_XIVE_ESB_PAGE_OFFSET + irq * 2;
205
206         if (irq >= KVMPPC_XIVE_NR_IRQS)
207                 return -EINVAL;
208
209         /*
210          * Clear the ESB pages of the IRQ number being mapped (or
211          * unmapped) into the guest and let the VM fault handler
212          * repopulate with the appropriate ESB pages (device or IC)
213          */
214         pr_debug("clearing esb pages for girq 0x%lx\n", irq);
215         mutex_lock(&xive->mapping_lock);
216         if (xive->mapping)
217                 unmap_mapping_range(xive->mapping,
218                                     esb_pgoff << PAGE_SHIFT,
219                                     2ull << PAGE_SHIFT, 1);
220         mutex_unlock(&xive->mapping_lock);
221         return 0;
222 }
223
224 static struct kvmppc_xive_ops kvmppc_xive_native_ops =  {
225         .reset_mapped = kvmppc_xive_native_reset_mapped,
226 };
227
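/*
 * Fault handler for the ESB region of the device mapping. Each guest
 * IRQ number owns two pages: the even page is the trigger page and the
 * odd page is the EOI/management page, so a fault at page index 'n'
 * (relative to the start of the VMA) belongs to IRQ n / 2. The
 * matching hardware ESB page is inserted as a PFN mapping.
 */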
228 static vm_fault_t xive_native_esb_fault(struct vm_fault *vmf)
229 {
230         struct vm_area_struct *vma = vmf->vma;
231         struct kvm_device *dev = vma->vm_file->private_data;
232         struct kvmppc_xive *xive = dev->private;
233         struct kvmppc_xive_src_block *sb;
234         struct kvmppc_xive_irq_state *state;
235         struct xive_irq_data *xd;
236         u32 hw_num;
237         u16 src;
238         u64 page;
239         unsigned long irq;
240         u64 page_offset;
241
242         /*
243          * Linux/KVM uses a two-page ESB setting: one page for trigger
244          * and one for EOI.
245          */
246         page_offset = vmf->pgoff - vma->vm_pgoff;
247         irq = page_offset / 2;
248
249         sb = kvmppc_xive_find_source(xive, irq, &src);
250         if (!sb) {
251                 pr_devel("%s: source %lx not found !\n", __func__, irq);
252                 return VM_FAULT_SIGBUS;
253         }
254
255         state = &sb->irq_state[src];
256
257         /* Some sanity checking */
258         if (!state->valid) {
259                 pr_devel("%s: source %lx invalid !\n", __func__, irq);
260                 return VM_FAULT_SIGBUS;
261         }
262
263         kvmppc_xive_select_irq(state, &hw_num, &xd);
264
265         arch_spin_lock(&sb->lock);
266
267         /*
268          * first/even page is for trigger
269          * second/odd page is for EOI and management.
270          */
271         page = page_offset % 2 ? xd->eoi_page : xd->trig_page;
272         arch_spin_unlock(&sb->lock);
273
274         if (WARN_ON(!page)) {
275                 pr_err("%s: accessing invalid ESB page for source %lx !\n",
276                        __func__, irq);
277                 return VM_FAULT_SIGBUS;
278         }
279
280         vmf_insert_pfn(vma, vmf->address, page >> PAGE_SHIFT);
281         return VM_FAULT_NOPAGE;
282 }
283
284 static const struct vm_operations_struct xive_native_esb_vmops = {
285         .fault = xive_native_esb_fault,
286 };
287
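/*
 * Fault handler for the TIMA region. The TIMA exposes four views (HW,
 * HV, OS and USER) but only the OS view, at page index 2, may be
 * mapped; accesses to the other views fault with SIGBUS.
 */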
288 static vm_fault_t xive_native_tima_fault(struct vm_fault *vmf)
289 {
290         struct vm_area_struct *vma = vmf->vma;
291
292         switch (vmf->pgoff - vma->vm_pgoff) {
293         case 0: /* HW - forbid access */
294         case 1: /* HV - forbid access */
295                 return VM_FAULT_SIGBUS;
296         case 2: /* OS */
297                 vmf_insert_pfn(vma, vmf->address, xive_tima_os >> PAGE_SHIFT);
298                 return VM_FAULT_NOPAGE;
299         case 3: /* USER - TODO */
300         default:
301                 return VM_FAULT_SIGBUS;
302         }
303 }
304
305 static const struct vm_operations_struct xive_native_tima_vmops = {
306         .fault = xive_native_tima_fault,
307 };
308
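/*
 * mmap handler for the XIVE native device fd. Two fixed regions are
 * supported: the TIMA at page offset KVM_XIVE_TIMA_PAGE_OFFSET (up to
 * 4 pages) and the ESB pages at KVM_XIVE_ESB_PAGE_OFFSET (up to two
 * pages per interrupt). A rough userspace sketch, assuming 64k pages
 * and an open XIVE device fd, could be:
 *
 *	tima = mmap(NULL, 4 * 0x10000, PROT_READ | PROT_WRITE,
 *		    MAP_SHARED, xive_fd, KVM_XIVE_TIMA_PAGE_OFFSET * 0x10000);
 *
 * with only the OS page (tima + 2 * 0x10000) being usable, as enforced
 * by xive_native_tima_fault() above.
 */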
309 static int kvmppc_xive_native_mmap(struct kvm_device *dev,
310                                    struct vm_area_struct *vma)
311 {
312         struct kvmppc_xive *xive = dev->private;
313
314         /* We only allow mappings at fixed offset for now */
315         if (vma->vm_pgoff == KVM_XIVE_TIMA_PAGE_OFFSET) {
316                 if (vma_pages(vma) > 4)
317                         return -EINVAL;
318                 vma->vm_ops = &xive_native_tima_vmops;
319         } else if (vma->vm_pgoff == KVM_XIVE_ESB_PAGE_OFFSET) {
320                 if (vma_pages(vma) > KVMPPC_XIVE_NR_IRQS * 2)
321                         return -EINVAL;
322                 vma->vm_ops = &xive_native_esb_vmops;
323         } else {
324                 return -EINVAL;
325         }
326
327         vm_flags_set(vma, VM_IO | VM_PFNMAP);
328         vma->vm_page_prot = pgprot_noncached_wc(vma->vm_page_prot);
329
330         /*
331          * Grab the KVM device file address_space to be able to clear
332          * the ESB pages mapping when a device is passed-through into
333          * the guest.
334          */
335         xive->mapping = vma->vm_file->f_mapping;
336         return 0;
337 }
338
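/*
 * KVM_DEV_XIVE_GRP_SOURCE: create or update a guest interrupt source.
 * 'irq' is the guest IRQ number (attr->attr) and 'addr' points to a
 * 64-bit word in userspace carrying the KVM_XIVE_LEVEL_* flags. The
 * backing IPI is allocated lazily and the source starts masked; it is
 * only routed once its configuration is set.
 */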
339 static int kvmppc_xive_native_set_source(struct kvmppc_xive *xive, long irq,
340                                          u64 addr)
341 {
342         struct kvmppc_xive_src_block *sb;
343         struct kvmppc_xive_irq_state *state;
344         u64 __user *ubufp = (u64 __user *) addr;
345         u64 val;
346         u16 idx;
347         int rc;
348
349         pr_devel("%s irq=0x%lx\n", __func__, irq);
350
351         if (irq < KVMPPC_XIVE_FIRST_IRQ || irq >= KVMPPC_XIVE_NR_IRQS)
352                 return -E2BIG;
353
354         sb = kvmppc_xive_find_source(xive, irq, &idx);
355         if (!sb) {
356                 pr_debug("No source, creating source block...\n");
357                 sb = kvmppc_xive_create_src_block(xive, irq);
358                 if (!sb) {
359                         pr_err("Failed to create block...\n");
360                         return -ENOMEM;
361                 }
362         }
363         state = &sb->irq_state[idx];
364
365         if (get_user(val, ubufp)) {
366                 pr_err("fault getting user info !\n");
367                 return -EFAULT;
368         }
369
370         arch_spin_lock(&sb->lock);
371
372         /*
373          * If the source doesn't already have an IPI, allocate
374          * one and get the corresponding data
375          */
376         if (!state->ipi_number) {
377                 state->ipi_number = xive_native_alloc_irq();
378                 if (state->ipi_number == 0) {
379                         pr_err("Failed to allocate IRQ !\n");
380                         rc = -ENXIO;
381                         goto unlock;
382                 }
383                 xive_native_populate_irq_data(state->ipi_number,
384                                               &state->ipi_data);
385                 pr_debug("%s allocated hw_irq=0x%x for irq=0x%lx\n", __func__,
386                          state->ipi_number, irq);
387         }
388
389         /* Restore LSI state */
390         if (val & KVM_XIVE_LEVEL_SENSITIVE) {
391                 state->lsi = true;
392                 if (val & KVM_XIVE_LEVEL_ASSERTED)
393                         state->asserted = true;
394                 pr_devel("  LSI ! Asserted=%d\n", state->asserted);
395         }
396
397         /* Mask IRQ to start with */
398         state->act_server = 0;
399         state->act_priority = MASKED;
400         xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_01);
401         xive_native_configure_irq(state->ipi_number, 0, MASKED, 0);
402
403         /* Increment the number of valid sources and mark this one valid */
404         if (!state->valid)
405                 xive->src_count++;
406         state->valid = true;
407
408         rc = 0;
409
410 unlock:
411         arch_spin_unlock(&sb->lock);
412
413         return rc;
414 }
415
416 static int kvmppc_xive_native_update_source_config(struct kvmppc_xive *xive,
417                                         struct kvmppc_xive_src_block *sb,
418                                         struct kvmppc_xive_irq_state *state,
419                                         u32 server, u8 priority, bool masked,
420                                         u32 eisn)
421 {
422         struct kvm *kvm = xive->kvm;
423         u32 hw_num;
424         int rc = 0;
425
426         arch_spin_lock(&sb->lock);
427
428         if (state->act_server == server && state->act_priority == priority &&
429             state->eisn == eisn)
430                 goto unlock;
431
432         pr_devel("new_act_prio=%d new_act_server=%d mask=%d act_server=%d act_prio=%d\n",
433                  priority, server, masked, state->act_server,
434                  state->act_priority);
435
436         kvmppc_xive_select_irq(state, &hw_num, NULL);
437
438         if (priority != MASKED && !masked) {
439                 rc = kvmppc_xive_select_target(kvm, &server, priority);
440                 if (rc)
441                         goto unlock;
442
443                 state->act_priority = priority;
444                 state->act_server = server;
445                 state->eisn = eisn;
446
447                 rc = xive_native_configure_irq(hw_num,
448                                                kvmppc_xive_vp(xive, server),
449                                                priority, eisn);
450         } else {
451                 state->act_priority = MASKED;
452                 state->act_server = 0;
453                 state->eisn = 0;
454
455                 rc = xive_native_configure_irq(hw_num, 0, MASKED, 0);
456         }
457
458 unlock:
459         arch_spin_unlock(&sb->lock);
460         return rc;
461 }
462
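/*
 * KVM_DEV_XIVE_GRP_SOURCE_CONFIG: route a source to a server/priority.
 * 'addr' points to a 64-bit configuration word packing the priority,
 * server (VCPU id), masked bit and EISN, as described by the
 * KVM_XIVE_SOURCE_*_MASK/SHIFT definitions.
 */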
463 static int kvmppc_xive_native_set_source_config(struct kvmppc_xive *xive,
464                                                 long irq, u64 addr)
465 {
466         struct kvmppc_xive_src_block *sb;
467         struct kvmppc_xive_irq_state *state;
468         u64 __user *ubufp = (u64 __user *) addr;
469         u16 src;
470         u64 kvm_cfg;
471         u32 server;
472         u8 priority;
473         bool masked;
474         u32 eisn;
475
476         sb = kvmppc_xive_find_source(xive, irq, &src);
477         if (!sb)
478                 return -ENOENT;
479
480         state = &sb->irq_state[src];
481
482         if (!state->valid)
483                 return -EINVAL;
484
485         if (get_user(kvm_cfg, ubufp))
486                 return -EFAULT;
487
488         pr_devel("%s irq=0x%lx cfg=%016llx\n", __func__, irq, kvm_cfg);
489
490         priority = (kvm_cfg & KVM_XIVE_SOURCE_PRIORITY_MASK) >>
491                 KVM_XIVE_SOURCE_PRIORITY_SHIFT;
492         server = (kvm_cfg & KVM_XIVE_SOURCE_SERVER_MASK) >>
493                 KVM_XIVE_SOURCE_SERVER_SHIFT;
494         masked = (kvm_cfg & KVM_XIVE_SOURCE_MASKED_MASK) >>
495                 KVM_XIVE_SOURCE_MASKED_SHIFT;
496         eisn = (kvm_cfg & KVM_XIVE_SOURCE_EISN_MASK) >>
497                 KVM_XIVE_SOURCE_EISN_SHIFT;
498
499         if (priority != xive_prio_from_guest(priority)) {
500                 pr_err("invalid priority for queue %d for VCPU %d\n",
501                        priority, server);
502                 return -EINVAL;
503         }
504
505         return kvmppc_xive_native_update_source_config(xive, sb, state, server,
506                                                        priority, masked, eisn);
507 }
508
509 static int kvmppc_xive_native_sync_source(struct kvmppc_xive *xive,
510                                           long irq, u64 addr)
511 {
512         struct kvmppc_xive_src_block *sb;
513         struct kvmppc_xive_irq_state *state;
514         struct xive_irq_data *xd;
515         u32 hw_num;
516         u16 src;
517         int rc = 0;
518
519         pr_devel("%s irq=0x%lx", __func__, irq);
520
521         sb = kvmppc_xive_find_source(xive, irq, &src);
522         if (!sb)
523                 return -ENOENT;
524
525         state = &sb->irq_state[src];
526
527         rc = -EINVAL;
528
529         arch_spin_lock(&sb->lock);
530
531         if (state->valid) {
532                 kvmppc_xive_select_irq(state, &hw_num, &xd);
533                 xive_native_sync_source(hw_num);
534                 rc = 0;
535         }
536
537         arch_spin_unlock(&sb->lock);
538         return rc;
539 }
540
541 static int xive_native_validate_queue_size(u32 qshift)
542 {
543         /*
544          * We only support 64K pages for the moment. This is also
545          * advertised in the DT property "ibm,xive-eq-sizes"
546          */
547         switch (qshift) {
548         case 0: /* EQ reset */
549         case 16:
550                 return 0;
551         case 12:
552         case 21:
553         case 24:
554         default:
555                 return -EINVAL;
556         }
557 }
558
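/*
 * KVM_DEV_XIVE_GRP_EQ_CONFIG (set): configure one event queue of a
 * vcpu. The server and priority are packed in 'eq_idx' (attr->attr)
 * and 'addr' points to a struct kvm_ppc_xive_eq. A zero qshift resets
 * the queue; otherwise the guest page at qaddr becomes the EQ, and
 * only 64K queues are accepted (see xive_native_validate_queue_size()).
 */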
559 static int kvmppc_xive_native_set_queue_config(struct kvmppc_xive *xive,
560                                                long eq_idx, u64 addr)
561 {
562         struct kvm *kvm = xive->kvm;
563         struct kvm_vcpu *vcpu;
564         struct kvmppc_xive_vcpu *xc;
565         void __user *ubufp = (void __user *) addr;
566         u32 server;
567         u8 priority;
568         struct kvm_ppc_xive_eq kvm_eq;
569         int rc;
570         __be32 *qaddr = 0;
571         struct page *page;
572         struct xive_q *q;
573         gfn_t gfn;
574         unsigned long page_size;
575         int srcu_idx;
576
577         /*
578          * Demangle priority/server tuple from the EQ identifier
579          */
580         priority = (eq_idx & KVM_XIVE_EQ_PRIORITY_MASK) >>
581                 KVM_XIVE_EQ_PRIORITY_SHIFT;
582         server = (eq_idx & KVM_XIVE_EQ_SERVER_MASK) >>
583                 KVM_XIVE_EQ_SERVER_SHIFT;
584
585         if (copy_from_user(&kvm_eq, ubufp, sizeof(kvm_eq)))
586                 return -EFAULT;
587
588         vcpu = kvmppc_xive_find_server(kvm, server);
589         if (!vcpu) {
590                 pr_err("Can't find server %d\n", server);
591                 return -ENOENT;
592         }
593         xc = vcpu->arch.xive_vcpu;
594
595         if (priority != xive_prio_from_guest(priority)) {
596                 pr_err("Trying to restore invalid queue %d for VCPU %d\n",
597                        priority, server);
598                 return -EINVAL;
599         }
600         q = &xc->queues[priority];
601
602         pr_devel("%s VCPU %d priority %d fl:%x shift:%d addr:%llx g:%d idx:%d\n",
603                  __func__, server, priority, kvm_eq.flags,
604                  kvm_eq.qshift, kvm_eq.qaddr, kvm_eq.qtoggle, kvm_eq.qindex);
605
606         /* reset queue and disable queueing */
607         if (!kvm_eq.qshift) {
608                 q->guest_qaddr  = 0;
609                 q->guest_qshift = 0;
610
611                 rc = kvmppc_xive_native_configure_queue(xc->vp_id, q, priority,
612                                                         NULL, 0, true);
613                 if (rc) {
614                         pr_err("Failed to reset queue %d for VCPU %d: %d\n",
615                                priority, xc->server_num, rc);
616                         return rc;
617                 }
618
619                 return 0;
620         }
621
622         /*
623          * sPAPR specifies an "Unconditional Notify (n)" flag for the
624          * H_INT_SET_QUEUE_CONFIG hcall which forces notification
625          * without using the coalescing mechanisms provided by the
626          * XIVE END ESBs. This is required on KVM as notification
627          * using the END ESBs is not supported.
628          */
629         if (kvm_eq.flags != KVM_XIVE_EQ_ALWAYS_NOTIFY) {
630                 pr_err("invalid flags %d\n", kvm_eq.flags);
631                 return -EINVAL;
632         }
633
634         rc = xive_native_validate_queue_size(kvm_eq.qshift);
635         if (rc) {
636                 pr_err("invalid queue size %d\n", kvm_eq.qshift);
637                 return rc;
638         }
639
640         if (kvm_eq.qaddr & ((1ull << kvm_eq.qshift) - 1)) {
641                 pr_err("queue page is not aligned %llx/%llx\n", kvm_eq.qaddr,
642                        1ull << kvm_eq.qshift);
643                 return -EINVAL;
644         }
645
646         srcu_idx = srcu_read_lock(&kvm->srcu);
647         gfn = gpa_to_gfn(kvm_eq.qaddr);
648
649         page_size = kvm_host_page_size(vcpu, gfn);
650         if (1ull << kvm_eq.qshift > page_size) {
651                 srcu_read_unlock(&kvm->srcu, srcu_idx);
652                 pr_warn("Incompatible host page size %lx!\n", page_size);
653                 return -EINVAL;
654         }
655
656         page = gfn_to_page(kvm, gfn);
657         if (is_error_page(page)) {
658                 srcu_read_unlock(&kvm->srcu, srcu_idx);
659                 pr_err("Couldn't get queue page %llx!\n", kvm_eq.qaddr);
660                 return -EINVAL;
661         }
662
663         qaddr = page_to_virt(page) + (kvm_eq.qaddr & ~PAGE_MASK);
664         srcu_read_unlock(&kvm->srcu, srcu_idx);
665
666         /*
667          * Back up the queue page guest address so that the EQ page
668          * can be marked dirty for migration.
669          */
670         q->guest_qaddr  = kvm_eq.qaddr;
671         q->guest_qshift = kvm_eq.qshift;
672
673         /*
674          * Unconditional Notification is forced by default at the
675          * OPAL level because the use of END ESBs is not supported by
676          * Linux.
677          */
678         rc = kvmppc_xive_native_configure_queue(xc->vp_id, q, priority,
679                                         (__be32 *) qaddr, kvm_eq.qshift, true);
680         if (rc) {
681                 pr_err("Failed to configure queue %d for VCPU %d: %d\n",
682                        priority, xc->server_num, rc);
683                 put_page(page);
684                 return rc;
685         }
686
687         /*
688          * Only restore the queue state when needed. When handling the
689          * H_INT_SET_QUEUE_CONFIG hcall from the guest, it should not be.
690          */
691         if (kvm_eq.qtoggle != 1 || kvm_eq.qindex != 0) {
692                 rc = xive_native_set_queue_state(xc->vp_id, priority,
693                                                  kvm_eq.qtoggle,
694                                                  kvm_eq.qindex);
695                 if (rc)
696                         goto error;
697         }
698
699         rc = kvmppc_xive_attach_escalation(vcpu, priority,
700                                            kvmppc_xive_has_single_escalation(xive));
701 error:
702         if (rc)
703                 kvmppc_xive_native_cleanup_queue(vcpu, priority);
704         return rc;
705 }
706
707 static int kvmppc_xive_native_get_queue_config(struct kvmppc_xive *xive,
708                                                long eq_idx, u64 addr)
709 {
710         struct kvm *kvm = xive->kvm;
711         struct kvm_vcpu *vcpu;
712         struct kvmppc_xive_vcpu *xc;
713         struct xive_q *q;
714         void __user *ubufp = (u64 __user *) addr;
715         u32 server;
716         u8 priority;
717         struct kvm_ppc_xive_eq kvm_eq;
718         u64 qaddr;
719         u64 qshift;
720         u64 qeoi_page;
721         u32 escalate_irq;
722         u64 qflags;
723         int rc;
724
725         /*
726          * Demangle priority/server tuple from the EQ identifier
727          */
728         priority = (eq_idx & KVM_XIVE_EQ_PRIORITY_MASK) >>
729                 KVM_XIVE_EQ_PRIORITY_SHIFT;
730         server = (eq_idx & KVM_XIVE_EQ_SERVER_MASK) >>
731                 KVM_XIVE_EQ_SERVER_SHIFT;
732
733         vcpu = kvmppc_xive_find_server(kvm, server);
734         if (!vcpu) {
735                 pr_err("Can't find server %d\n", server);
736                 return -ENOENT;
737         }
738         xc = vcpu->arch.xive_vcpu;
739
740         if (priority != xive_prio_from_guest(priority)) {
741                 pr_err("invalid priority for queue %d for VCPU %d\n",
742                        priority, server);
743                 return -EINVAL;
744         }
745         q = &xc->queues[priority];
746
747         memset(&kvm_eq, 0, sizeof(kvm_eq));
748
749         if (!q->qpage)
750                 return 0;
751
752         rc = xive_native_get_queue_info(xc->vp_id, priority, &qaddr, &qshift,
753                                         &qeoi_page, &escalate_irq, &qflags);
754         if (rc)
755                 return rc;
756
757         kvm_eq.flags = 0;
758         if (qflags & OPAL_XIVE_EQ_ALWAYS_NOTIFY)
759                 kvm_eq.flags |= KVM_XIVE_EQ_ALWAYS_NOTIFY;
760
761         kvm_eq.qshift = q->guest_qshift;
762         kvm_eq.qaddr  = q->guest_qaddr;
763
764         rc = xive_native_get_queue_state(xc->vp_id, priority, &kvm_eq.qtoggle,
765                                          &kvm_eq.qindex);
766         if (rc)
767                 return rc;
768
769         pr_devel("%s VCPU %d priority %d fl:%x shift:%d addr:%llx g:%d idx:%d\n",
770                  __func__, server, priority, kvm_eq.flags,
771                  kvm_eq.qshift, kvm_eq.qaddr, kvm_eq.qtoggle, kvm_eq.qindex);
772
773         if (copy_to_user(ubufp, &kvm_eq, sizeof(kvm_eq)))
774                 return -EFAULT;
775
776         return 0;
777 }
778
779 static void kvmppc_xive_reset_sources(struct kvmppc_xive_src_block *sb)
780 {
781         int i;
782
783         for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
784                 struct kvmppc_xive_irq_state *state = &sb->irq_state[i];
785
786                 if (!state->valid)
787                         continue;
788
789                 if (state->act_priority == MASKED)
790                         continue;
791
792                 state->eisn = 0;
793                 state->act_server = 0;
794                 state->act_priority = MASKED;
795                 xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_01);
796                 xive_native_configure_irq(state->ipi_number, 0, MASKED, 0);
797                 if (state->pt_number) {
798                         xive_vm_esb_load(state->pt_data, XIVE_ESB_SET_PQ_01);
799                         xive_native_configure_irq(state->pt_number,
800                                                   0, MASKED, 0);
801                 }
802         }
803 }
804
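/*
 * KVM_DEV_XIVE_RESET control: bring the device back to a clean state
 * by masking all valid sources, freeing the escalation interrupts and
 * releasing the queue pages of every connected vcpu. The vcpus remain
 * connected and can be reconfigured afterwards.
 */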
805 static int kvmppc_xive_reset(struct kvmppc_xive *xive)
806 {
807         struct kvm *kvm = xive->kvm;
808         struct kvm_vcpu *vcpu;
809         unsigned long i;
810
811         pr_devel("%s\n", __func__);
812
813         mutex_lock(&xive->lock);
814
815         kvm_for_each_vcpu(i, vcpu, kvm) {
816                 struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
817                 unsigned int prio;
818
819                 if (!xc)
820                         continue;
821
822                 kvmppc_xive_disable_vcpu_interrupts(vcpu);
823
824                 for (prio = 0; prio < KVMPPC_XIVE_Q_COUNT; prio++) {
825
826                         /* Single escalation, no queue 7 */
827                         if (prio == 7 && kvmppc_xive_has_single_escalation(xive))
828                                 break;
829
830                         if (xc->esc_virq[prio]) {
831                                 free_irq(xc->esc_virq[prio], vcpu);
832                                 irq_dispose_mapping(xc->esc_virq[prio]);
833                                 kfree(xc->esc_virq_names[prio]);
834                                 xc->esc_virq[prio] = 0;
835                         }
836
837                         kvmppc_xive_native_cleanup_queue(vcpu, prio);
838                 }
839         }
840
841         for (i = 0; i <= xive->max_sbid; i++) {
842                 struct kvmppc_xive_src_block *sb = xive->src_blocks[i];
843
844                 if (sb) {
845                         arch_spin_lock(&sb->lock);
846                         kvmppc_xive_reset_sources(sb);
847                         arch_spin_unlock(&sb->lock);
848                 }
849         }
850
851         mutex_unlock(&xive->lock);
852
853         return 0;
854 }
855
856 static void kvmppc_xive_native_sync_sources(struct kvmppc_xive_src_block *sb)
857 {
858         int j;
859
860         for (j = 0; j < KVMPPC_XICS_IRQ_PER_ICS; j++) {
861                 struct kvmppc_xive_irq_state *state = &sb->irq_state[j];
862                 struct xive_irq_data *xd;
863                 u32 hw_num;
864
865                 if (!state->valid)
866                         continue;
867
868                 /*
869                  * The struct kvmppc_xive_irq_state reflects the state
870                  * of the EAS configuration and not the state of the
871                  * source. The source is masked setting the PQ bits to
872                  * source. The source is masked by setting the PQ bits to
873                  * the KVM_DEV_XIVE_EQ_SYNC control.
874                  *
875                  * If a source EAS is configured, OPAL syncs the XIVE
876                  * IC of the source and the XIVE IC of the previous
877                  * target if any.
878                  *
879                  * So it should be fine ignoring MASKED sources as
880                  * they have been synced already.
881                  */
882                 if (state->act_priority == MASKED)
883                         continue;
884
885                 kvmppc_xive_select_irq(state, &hw_num, &xd);
886                 xive_native_sync_source(hw_num);
887                 xive_native_sync_queue(hw_num);
888         }
889 }
890
891 static int kvmppc_xive_native_vcpu_eq_sync(struct kvm_vcpu *vcpu)
892 {
893         struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
894         unsigned int prio;
895         int srcu_idx;
896
897         if (!xc)
898                 return -ENOENT;
899
900         for (prio = 0; prio < KVMPPC_XIVE_Q_COUNT; prio++) {
901                 struct xive_q *q = &xc->queues[prio];
902
903                 if (!q->qpage)
904                         continue;
905
906                 /* Mark EQ page dirty for migration */
907                 srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
908                 mark_page_dirty(vcpu->kvm, gpa_to_gfn(q->guest_qaddr));
909                 srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
910         }
911         return 0;
912 }
913
914 static int kvmppc_xive_native_eq_sync(struct kvmppc_xive *xive)
915 {
916         struct kvm *kvm = xive->kvm;
917         struct kvm_vcpu *vcpu;
918         unsigned long i;
919
920         pr_devel("%s\n", __func__);
921
922         mutex_lock(&xive->lock);
923         for (i = 0; i <= xive->max_sbid; i++) {
924                 struct kvmppc_xive_src_block *sb = xive->src_blocks[i];
925
926                 if (sb) {
927                         arch_spin_lock(&sb->lock);
928                         kvmppc_xive_native_sync_sources(sb);
929                         arch_spin_unlock(&sb->lock);
930                 }
931         }
932
933         kvm_for_each_vcpu(i, vcpu, kvm) {
934                 kvmppc_xive_native_vcpu_eq_sync(vcpu);
935         }
936         mutex_unlock(&xive->lock);
937
938         return 0;
939 }
940
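/*
 * Device attribute dispatch for the XIVE native device. Userspace
 * drives the device through the generic KVM device attribute ioctls,
 * roughly along these lines (illustrative sketch only):
 *
 *	__u64 src_state = KVM_XIVE_LEVEL_SENSITIVE;
 *	struct kvm_device_attr attr = {
 *		.group = KVM_DEV_XIVE_GRP_SOURCE,
 *		.attr  = girq,
 *		.addr  = (__u64)(unsigned long)&src_state,
 *	};
 *	ioctl(xive_fd, KVM_SET_DEVICE_ATTR, &attr);
 */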
941 static int kvmppc_xive_native_set_attr(struct kvm_device *dev,
942                                        struct kvm_device_attr *attr)
943 {
944         struct kvmppc_xive *xive = dev->private;
945
946         switch (attr->group) {
947         case KVM_DEV_XIVE_GRP_CTRL:
948                 switch (attr->attr) {
949                 case KVM_DEV_XIVE_RESET:
950                         return kvmppc_xive_reset(xive);
951                 case KVM_DEV_XIVE_EQ_SYNC:
952                         return kvmppc_xive_native_eq_sync(xive);
953                 case KVM_DEV_XIVE_NR_SERVERS:
954                         return kvmppc_xive_set_nr_servers(xive, attr->addr);
955                 }
956                 break;
957         case KVM_DEV_XIVE_GRP_SOURCE:
958                 return kvmppc_xive_native_set_source(xive, attr->attr,
959                                                      attr->addr);
960         case KVM_DEV_XIVE_GRP_SOURCE_CONFIG:
961                 return kvmppc_xive_native_set_source_config(xive, attr->attr,
962                                                             attr->addr);
963         case KVM_DEV_XIVE_GRP_EQ_CONFIG:
964                 return kvmppc_xive_native_set_queue_config(xive, attr->attr,
965                                                            attr->addr);
966         case KVM_DEV_XIVE_GRP_SOURCE_SYNC:
967                 return kvmppc_xive_native_sync_source(xive, attr->attr,
968                                                       attr->addr);
969         }
970         return -ENXIO;
971 }
972
973 static int kvmppc_xive_native_get_attr(struct kvm_device *dev,
974                                        struct kvm_device_attr *attr)
975 {
976         struct kvmppc_xive *xive = dev->private;
977
978         switch (attr->group) {
979         case KVM_DEV_XIVE_GRP_EQ_CONFIG:
980                 return kvmppc_xive_native_get_queue_config(xive, attr->attr,
981                                                            attr->addr);
982         }
983         return -ENXIO;
984 }
985
986 static int kvmppc_xive_native_has_attr(struct kvm_device *dev,
987                                        struct kvm_device_attr *attr)
988 {
989         switch (attr->group) {
990         case KVM_DEV_XIVE_GRP_CTRL:
991                 switch (attr->attr) {
992                 case KVM_DEV_XIVE_RESET:
993                 case KVM_DEV_XIVE_EQ_SYNC:
994                 case KVM_DEV_XIVE_NR_SERVERS:
995                         return 0;
996                 }
997                 break;
998         case KVM_DEV_XIVE_GRP_SOURCE:
999         case KVM_DEV_XIVE_GRP_SOURCE_CONFIG:
1000         case KVM_DEV_XIVE_GRP_SOURCE_SYNC:
1001                 if (attr->attr >= KVMPPC_XIVE_FIRST_IRQ &&
1002                     attr->attr < KVMPPC_XIVE_NR_IRQS)
1003                         return 0;
1004                 break;
1005         case KVM_DEV_XIVE_GRP_EQ_CONFIG:
1006                 return 0;
1007         }
1008         return -ENXIO;
1009 }
1010
1011 /*
1012  * Called when device fd is closed.  kvm->lock is held.
1013  */
1014 static void kvmppc_xive_native_release(struct kvm_device *dev)
1015 {
1016         struct kvmppc_xive *xive = dev->private;
1017         struct kvm *kvm = xive->kvm;
1018         struct kvm_vcpu *vcpu;
1019         unsigned long i;
1020
1021         pr_devel("Releasing xive native device\n");
1022
1023         /*
1024          * Clear the KVM device file address_space which is used to
1025          * unmap the ESB pages when a device is passed-through.
1026          */
1027         mutex_lock(&xive->mapping_lock);
1028         xive->mapping = NULL;
1029         mutex_unlock(&xive->mapping_lock);
1030
1031         /*
1032          * Since this is the device release function, we know that
1033          * userspace does not have any open fd or mmap referring to
1034          * the device.  Therefore none of the device attribute
1035          * set/get, mmap, or page fault functions can be running
1036          * concurrently, and similarly, neither the connect_vcpu
1037          * function nor the set/clr_mapped functions can be
1038          * running.
1039          */
1040
1041         debugfs_remove(xive->dentry);
1042
1043         /*
1044          * We should clean up the vCPU interrupt presenters first.
1045          */
1046         kvm_for_each_vcpu(i, vcpu, kvm) {
1047                 /*
1048                  * Take vcpu->mutex to ensure that no one_reg get/set ioctl
1049                  * (i.e. kvmppc_xive_native_[gs]et_vp) can be being done.
1050                  * Holding the vcpu->mutex also means that the vcpu cannot
1051                  * be executing the KVM_RUN ioctl, and therefore it cannot
1052                  * be executing the XIVE push or pull code or accessing
1053                  * the XIVE MMIO regions.
1054                  */
1055                 mutex_lock(&vcpu->mutex);
1056                 kvmppc_xive_native_cleanup_vcpu(vcpu);
1057                 mutex_unlock(&vcpu->mutex);
1058         }
1059
1060         /*
1061          * Now that we have cleared vcpu->arch.xive_vcpu, vcpu->arch.irq_type
1062          * and vcpu->arch.xive_esc_[vr]addr on each vcpu, we are safe
1063          * against xive code getting called during vcpu execution or
1064          * set/get one_reg operations.
1065          */
1066         kvm->arch.xive = NULL;
1067
1068         for (i = 0; i <= xive->max_sbid; i++) {
1069                 if (xive->src_blocks[i])
1070                         kvmppc_xive_free_sources(xive->src_blocks[i]);
1071                 kfree(xive->src_blocks[i]);
1072                 xive->src_blocks[i] = NULL;
1073         }
1074
1075         if (xive->vp_base != XIVE_INVALID_VP)
1076                 xive_native_free_vp_block(xive->vp_base);
1077
1078         /*
1079          * A reference to the kvmppc_xive pointer is now kept under
1080          * the xive_devices struct of the machine for reuse. For now
1081          * it is only freed when the VM is destroyed, until we fix
1082          * all the execution paths.
1083          */
1084
1085         kfree(dev);
1086 }
1087
1088 /*
1089  * Create a XIVE device.  kvm->lock is held.
1090  */
1091 static int kvmppc_xive_native_create(struct kvm_device *dev, u32 type)
1092 {
1093         struct kvmppc_xive *xive;
1094         struct kvm *kvm = dev->kvm;
1095
1096         pr_devel("Creating xive native device\n");
1097
1098         if (kvm->arch.xive)
1099                 return -EEXIST;
1100
1101         xive = kvmppc_xive_get_device(kvm, type);
1102         if (!xive)
1103                 return -ENOMEM;
1104
1105         dev->private = xive;
1106         xive->dev = dev;
1107         xive->kvm = kvm;
1108         mutex_init(&xive->mapping_lock);
1109         mutex_init(&xive->lock);
1110
1111         /* VP allocation is delayed to the first call to connect_vcpu */
1112         xive->vp_base = XIVE_INVALID_VP;
1113         /* KVM_MAX_VCPUS limits the number of VMs to roughly 64 per socket
1114          * on a POWER9 system.
1115          */
1116         xive->nr_servers = KVM_MAX_VCPUS;
1117
1118         if (xive_native_has_single_escalation())
1119                 xive->flags |= KVMPPC_XIVE_FLAG_SINGLE_ESCALATION;
1120
1121         if (xive_native_has_save_restore())
1122                 xive->flags |= KVMPPC_XIVE_FLAG_SAVE_RESTORE;
1123
1124         xive->ops = &kvmppc_xive_native_ops;
1125
1126         kvm->arch.xive = xive;
1127         return 0;
1128 }
1129
1130 /*
1131  * Interrupt Pending Buffer (IPB) offset
1132  */
1133 #define TM_IPB_SHIFT 40
1134 #define TM_IPB_MASK  (((u64) 0xFF) << TM_IPB_SHIFT)
1135
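/*
 * Get/set the vcpu interrupt presenter state, used by migration.
 * These are reached through a vcpu one_reg register
 * (KVM_REG_PPC_VP_STATE in the uapi, a pair of 64-bit words); only the
 * first word, the w01 thread context with the OPAL-saved IPB merged
 * in on the get side, is used here.
 */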
1136 int kvmppc_xive_native_get_vp(struct kvm_vcpu *vcpu, union kvmppc_one_reg *val)
1137 {
1138         struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
1139         u64 opal_state;
1140         int rc;
1141
1142         if (!kvmppc_xive_enabled(vcpu))
1143                 return -EPERM;
1144
1145         if (!xc)
1146                 return -ENOENT;
1147
1148         /* Thread context registers. We only care about IPB and CPPR */
1149         val->xive_timaval[0] = vcpu->arch.xive_saved_state.w01;
1150
1151         /* Get the VP state from OPAL */
1152         rc = xive_native_get_vp_state(xc->vp_id, &opal_state);
1153         if (rc)
1154                 return rc;
1155
1156         /*
1157          * Capture the backup of IPB register in the NVT structure and
1158          * merge it in our KVM VP state.
1159          */
1160         val->xive_timaval[0] |= cpu_to_be64(opal_state & TM_IPB_MASK);
1161
1162         pr_devel("%s NSR=%02x CPPR=%02x IPB=%02x PIPR=%02x w01=%016llx w2=%08x opal=%016llx\n",
1163                  __func__,
1164                  vcpu->arch.xive_saved_state.nsr,
1165                  vcpu->arch.xive_saved_state.cppr,
1166                  vcpu->arch.xive_saved_state.ipb,
1167                  vcpu->arch.xive_saved_state.pipr,
1168                  vcpu->arch.xive_saved_state.w01,
1169                  (u32) vcpu->arch.xive_cam_word, opal_state);
1170
1171         return 0;
1172 }
1173
1174 int kvmppc_xive_native_set_vp(struct kvm_vcpu *vcpu, union kvmppc_one_reg *val)
1175 {
1176         struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
1177         struct kvmppc_xive *xive = vcpu->kvm->arch.xive;
1178
1179         pr_devel("%s w01=%016llx vp=%016llx\n", __func__,
1180                  val->xive_timaval[0], val->xive_timaval[1]);
1181
1182         if (!kvmppc_xive_enabled(vcpu))
1183                 return -EPERM;
1184
1185         if (!xc || !xive)
1186                 return -ENOENT;
1187
1188         /* We can't update the state of a "pushed" VCPU  */
1189         if (WARN_ON(vcpu->arch.xive_pushed))
1190                 return -EBUSY;
1191
1192         /*
1193          * Restore the thread context registers. IPB and CPPR should
1194          * be the only ones that matter.
1195          */
1196         vcpu->arch.xive_saved_state.w01 = val->xive_timaval[0];
1197
1198         /*
1199          * There is no need to restore the XIVE internal state (IPB
1200          * stored in the NVT) as the IPB register was merged in KVM VP
1201          * state when captured.
1202          */
1203         return 0;
1204 }
1205
1206 bool kvmppc_xive_native_supported(void)
1207 {
1208         return xive_native_has_queue_state_support();
1209 }
1210
1211 static int xive_native_debug_show(struct seq_file *m, void *private)
1212 {
1213         struct kvmppc_xive *xive = m->private;
1214         struct kvm *kvm = xive->kvm;
1215         struct kvm_vcpu *vcpu;
1216         unsigned long i;
1217
1218         if (!kvm)
1219                 return 0;
1220
1221         seq_puts(m, "=========\nVCPU state\n=========\n");
1222
1223         kvm_for_each_vcpu(i, vcpu, kvm) {
1224                 struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
1225
1226                 if (!xc)
1227                         continue;
1228
1229                 seq_printf(m, "VCPU %d: VP=%#x/%02x\n"
1230                            "    NSR=%02x CPPR=%02x IPB=%02x PIPR=%02x w01=%016llx w2=%08x\n",
1231                            xc->server_num, xc->vp_id, xc->vp_chip_id,
1232                            vcpu->arch.xive_saved_state.nsr,
1233                            vcpu->arch.xive_saved_state.cppr,
1234                            vcpu->arch.xive_saved_state.ipb,
1235                            vcpu->arch.xive_saved_state.pipr,
1236                            be64_to_cpu(vcpu->arch.xive_saved_state.w01),
1237                            be32_to_cpu(vcpu->arch.xive_cam_word));
1238
1239                 kvmppc_xive_debug_show_queues(m, vcpu);
1240         }
1241
1242         seq_puts(m, "=========\nSources\n=========\n");
1243
1244         for (i = 0; i <= xive->max_sbid; i++) {
1245                 struct kvmppc_xive_src_block *sb = xive->src_blocks[i];
1246
1247                 if (sb) {
1248                         arch_spin_lock(&sb->lock);
1249                         kvmppc_xive_debug_show_sources(m, sb);
1250                         arch_spin_unlock(&sb->lock);
1251                 }
1252         }
1253
1254         return 0;
1255 }
1256
1257 DEFINE_SHOW_ATTRIBUTE(xive_native_debug);
1258
1259 static void xive_native_debugfs_init(struct kvmppc_xive *xive)
1260 {
1261         xive->dentry = debugfs_create_file("xive", 0444, xive->kvm->debugfs_dentry,
1262                                            xive, &xive_native_debug_fops);
1263
1264         pr_debug("%s: created\n", __func__);
1265 }
1266
1267 static void kvmppc_xive_native_init(struct kvm_device *dev)
1268 {
1269         struct kvmppc_xive *xive = dev->private;
1270
1271         /* Register some debug interfaces */
1272         xive_native_debugfs_init(xive);
1273 }
1274
1275 struct kvm_device_ops kvm_xive_native_ops = {
1276         .name = "kvm-xive-native",
1277         .create = kvmppc_xive_native_create,
1278         .init = kvmppc_xive_native_init,
1279         .release = kvmppc_xive_native_release,
1280         .set_attr = kvmppc_xive_native_set_attr,
1281         .get_attr = kvmppc_xive_native_get_attr,
1282         .has_attr = kvmppc_xive_native_has_attr,
1283         .mmap = kvmppc_xive_native_mmap,
1284 };