arm64: mte: Lock a page for MTE tag initialisation
arch/arm64/kvm/mmu.c
d94d71cb 1// SPDX-License-Identifier: GPL-2.0-only
749cf76c
CD
2/*
3 * Copyright (C) 2012 - Virtual Open Systems and Columbia University
4 * Author: Christoffer Dall <c.dall@virtualopensystems.com>
749cf76c 5 */
342cd0ab
CD
6
7#include <linux/mman.h>
8#include <linux/kvm_host.h>
9#include <linux/io.h>
ad361f09 10#include <linux/hugetlb.h>
196f878a 11#include <linux/sched/signal.h>
45e96ea6 12#include <trace/events/kvm.h>
342cd0ab 13#include <asm/pgalloc.h>
94f8e641 14#include <asm/cacheflush.h>
342cd0ab
CD
15#include <asm/kvm_arm.h>
16#include <asm/kvm_mmu.h>
0f9d09b8 17#include <asm/kvm_pgtable.h>
0db5e022 18#include <asm/kvm_ras.h>
d5d8184d 19#include <asm/kvm_asm.h>
94f8e641 20#include <asm/kvm_emulate.h>
1e947bad 21#include <asm/virt.h>
d5d8184d
CD
22
23#include "trace.h"
342cd0ab 24
0f9d09b8 25static struct kvm_pgtable *hyp_pgtable;
342cd0ab
CD
26static DEFINE_MUTEX(kvm_hyp_pgd_mutex);
27
5a677ce0
MZ
28static unsigned long hyp_idmap_start;
29static unsigned long hyp_idmap_end;
30static phys_addr_t hyp_idmap_vector;
31
e3f019b3
MZ
32static unsigned long io_map_base;
33
5994bc9e
OU
34static phys_addr_t stage2_range_addr_end(phys_addr_t addr, phys_addr_t end)
35{
36 phys_addr_t size = kvm_granule_size(KVM_PGTABLE_MIN_BLOCK_LEVEL);
37 phys_addr_t boundary = ALIGN_DOWN(addr + size, size);
38
39 return (boundary - 1 < end - 1) ? boundary : end;
40}
6d674e28 41
52bae936
WD
42/*
43 * Release kvm_mmu_lock periodically if the memory region is large. Otherwise,
44 * we may see kernel panics with CONFIG_DETECT_HUNG_TASK,
45 * CONFIG_LOCKUP_DETECTOR, CONFIG_LOCKDEP. Additionally, holding the lock too
 46 * long will also starve other vCPUs. We also have to make sure that the page
 47 * tables are not freed while the lock is released.
48 */
49static int stage2_apply_range(struct kvm *kvm, phys_addr_t addr,
50 phys_addr_t end,
51 int (*fn)(struct kvm_pgtable *, u64, u64),
52 bool resched)
53{
54 int ret;
55 u64 next;
56
57 do {
58 struct kvm_pgtable *pgt = kvm->arch.mmu.pgt;
59 if (!pgt)
60 return -EINVAL;
61
5994bc9e 62 next = stage2_range_addr_end(addr, end);
52bae936
WD
63 ret = fn(pgt, addr, next - addr);
64 if (ret)
65 break;
66
67 if (resched && next != end)
fcc5bf89 68 cond_resched_rwlock_write(&kvm->mmu_lock);
52bae936
WD
69 } while (addr = next, addr != end);
70
71 return ret;
72}
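/*
 * A minimal, standalone sketch of the chunked walk performed by
 * stage2_range_addr_end()/stage2_apply_range() above. It assumes 4KiB pages
 * and a 2MiB minimum block granule purely for illustration; the real granule
 * comes from kvm_granule_size(), and the callback would be one of the
 * kvm_pgtable_stage2_*() walkers with mmu_lock possibly dropped between
 * chunks.
 */
#include <stdint.h>
#include <stdio.h>

#define GRANULE_SIZE	(2ULL << 20)			/* stand-in granule */
#define ALIGN_DOWN(x, a)	((x) & ~((a) - 1))

static uint64_t range_addr_end(uint64_t addr, uint64_t end)
{
	/* Next granule boundary after addr, clamped to end (overflow safe). */
	uint64_t boundary = ALIGN_DOWN(addr + GRANULE_SIZE, GRANULE_SIZE);

	return (boundary - 1 < end - 1) ? boundary : end;
}

int main(void)
{
	uint64_t addr = 0x40123000, end = 0x40600000, next;

	do {
		next = range_addr_end(addr, end);
		/* The real code calls fn(pgt, addr, next - addr) here and may
		 * cond_resched between chunks. */
		printf("chunk [%#llx, %#llx)\n",
		       (unsigned long long)addr, (unsigned long long)next);
	} while (addr = next, addr != end);

	return 0;
}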
73
cc38d61c
QP
74#define stage2_apply_range_resched(kvm, addr, end, fn) \
75 stage2_apply_range(kvm, addr, end, fn, true)
76
15a49a44
MS
77static bool memslot_is_logging(struct kvm_memory_slot *memslot)
78{
15a49a44 79 return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY);
7276030a
MS
80}
81
82/**
83 * kvm_flush_remote_tlbs() - flush all VM TLB entries for v7/8
84 * @kvm: pointer to kvm structure.
85 *
86 * Interface to HYP function to flush all VM TLB entries
87 */
88void kvm_flush_remote_tlbs(struct kvm *kvm)
89{
3cc4e148 90 ++kvm->stat.generic.remote_tlb_flush_requests;
a0e50aa3 91 kvm_call_hyp(__kvm_tlb_flush_vmid, &kvm->arch.mmu);
15a49a44 92}
ad361f09 93
e6fab544
AB
94static bool kvm_is_device_pfn(unsigned long pfn)
95{
873ba463 96 return !pfn_is_map_memory(pfn);
e6fab544
AB
97}
98
7aef0cbc
QP
99static void *stage2_memcache_zalloc_page(void *arg)
100{
101 struct kvm_mmu_memory_cache *mc = arg;
d38ba8cc 102 void *virt;
7aef0cbc
QP
103
104 /* Allocated with __GFP_ZERO, so no need to zero */
d38ba8cc
YA
105 virt = kvm_mmu_memory_cache_alloc(mc);
106 if (virt)
107 kvm_account_pgtable_pages(virt, 1);
108 return virt;
7aef0cbc
QP
109}
110
111static void *kvm_host_zalloc_pages_exact(size_t size)
112{
113 return alloc_pages_exact(size, GFP_KERNEL_ACCOUNT | __GFP_ZERO);
114}
115
d38ba8cc
YA
116static void *kvm_s2_zalloc_pages_exact(size_t size)
117{
118 void *virt = kvm_host_zalloc_pages_exact(size);
119
120 if (virt)
121 kvm_account_pgtable_pages(virt, (size >> PAGE_SHIFT));
122 return virt;
123}
124
125static void kvm_s2_free_pages_exact(void *virt, size_t size)
126{
127 kvm_account_pgtable_pages(virt, -(size >> PAGE_SHIFT));
128 free_pages_exact(virt, size);
129}
130
7aef0cbc
QP
131static void kvm_host_get_page(void *addr)
132{
133 get_page(virt_to_page(addr));
134}
135
136static void kvm_host_put_page(void *addr)
137{
138 put_page(virt_to_page(addr));
139}
140
d38ba8cc
YA
141static void kvm_s2_put_page(void *addr)
142{
143 struct page *p = virt_to_page(addr);
144 /* Dropping last refcount, the page will be freed */
145 if (page_count(p) == 1)
146 kvm_account_pgtable_pages(addr, -1);
147 put_page(p);
148}
149
7aef0cbc
QP
150static int kvm_host_page_count(void *addr)
151{
152 return page_count(virt_to_page(addr));
153}
154
155static phys_addr_t kvm_host_pa(void *addr)
156{
157 return __pa(addr);
158}
159
160static void *kvm_host_va(phys_addr_t phys)
161{
162 return __va(phys);
163}
164
378e6a9c
YW
165static void clean_dcache_guest_page(void *va, size_t size)
166{
167 __clean_dcache_guest_page(va, size);
168}
169
170static void invalidate_icache_guest_page(void *va, size_t size)
171{
172 __invalidate_icache_guest_page(va, size);
173}
174
363ef89f
MZ
175/*
176 * Unmapping vs dcache management:
177 *
178 * If a guest maps certain memory pages as uncached, all writes will
179 * bypass the data cache and go directly to RAM. However, the CPUs
180 * can still speculate reads (not writes) and fill cache lines with
181 * data.
182 *
183 * Those cache lines will be *clean* cache lines though, so a
184 * clean+invalidate operation is equivalent to an invalidate
185 * operation, because no cache lines are marked dirty.
186 *
187 * Those clean cache lines could be filled prior to an uncached write
188 * by the guest, and the cache coherent IO subsystem would therefore
189 * end up writing old data to disk.
190 *
191 * This is why right after unmapping a page/section and invalidating
52bae936
WD
192 * the corresponding TLBs, we flush to make sure the IO subsystem will
193 * never hit in the cache.
e48d53a9
MZ
194 *
195 * This is all avoided on systems that have ARM64_HAS_STAGE2_FWB, as
196 * we then fully enforce cacheability of RAM, no matter what the guest
197 * does.
363ef89f 198 */
7a1c831e
SP
199/**
200 * unmap_stage2_range -- Clear stage2 page table entries to unmap a range
c9c0279c 201 * @mmu: The KVM stage-2 MMU pointer
7a1c831e
SP
202 * @start: The intermediate physical base address of the range to unmap
203 * @size: The size of the area to unmap
c9c0279c 204 * @may_block: Whether or not we are permitted to block
7a1c831e
SP
205 *
206 * Clear a range of stage-2 mappings, lowering the various ref-counts. Must
207 * be called while holding mmu_lock (unless for freeing the stage2 pgd before
208 * destroying the VM), otherwise another faulting VCPU may come in and mess
209 * with things behind our backs.
210 */
b5331379
WD
211static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size,
212 bool may_block)
4f853a71 213{
cfb1a98d 214 struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
52bae936 215 phys_addr_t end = start + size;
4f853a71 216
fcc5bf89 217 lockdep_assert_held_write(&kvm->mmu_lock);
47a91b72 218 WARN_ON(size & ~PAGE_MASK);
52bae936
WD
219 WARN_ON(stage2_apply_range(kvm, start, end, kvm_pgtable_stage2_unmap,
220 may_block));
000d3996
MZ
221}
222
b5331379
WD
223static void unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size)
224{
225 __unmap_stage2_range(mmu, start, size, true);
226}
227
9d218a1f
MZ
228static void stage2_flush_memslot(struct kvm *kvm,
229 struct kvm_memory_slot *memslot)
230{
231 phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
232 phys_addr_t end = addr + PAGE_SIZE * memslot->npages;
9d218a1f 233
8d5207be 234 stage2_apply_range_resched(kvm, addr, end, kvm_pgtable_stage2_flush);
9d218a1f
MZ
235}
236
237/**
238 * stage2_flush_vm - Invalidate cache for pages mapped in stage 2
239 * @kvm: The struct kvm pointer
240 *
241 * Go through the stage 2 page tables and invalidate any cache lines
242 * backing memory already mapped to the VM.
243 */
3c1e7165 244static void stage2_flush_vm(struct kvm *kvm)
9d218a1f
MZ
245{
246 struct kvm_memslots *slots;
247 struct kvm_memory_slot *memslot;
a54d8066 248 int idx, bkt;
9d218a1f
MZ
249
250 idx = srcu_read_lock(&kvm->srcu);
fcc5bf89 251 write_lock(&kvm->mmu_lock);
9d218a1f
MZ
252
253 slots = kvm_memslots(kvm);
a54d8066 254 kvm_for_each_memslot(memslot, bkt, slots)
9d218a1f
MZ
255 stage2_flush_memslot(kvm, memslot);
256
fcc5bf89 257 write_unlock(&kvm->mmu_lock);
9d218a1f
MZ
258 srcu_read_unlock(&kvm->srcu, idx);
259}
260
342cd0ab 261/**
4f728276 262 * free_hyp_pgds - free Hyp-mode page tables
342cd0ab 263 */
4f728276 264void free_hyp_pgds(void)
342cd0ab 265{
d157f4a5 266 mutex_lock(&kvm_hyp_pgd_mutex);
0f9d09b8
WD
267 if (hyp_pgtable) {
268 kvm_pgtable_hyp_destroy(hyp_pgtable);
269 kfree(hyp_pgtable);
bfa79a80 270 hyp_pgtable = NULL;
26781f9c 271 }
342cd0ab
CD
272 mutex_unlock(&kvm_hyp_pgd_mutex);
273}
274
bfa79a80
QP
275static bool kvm_host_owns_hyp_mappings(void)
276{
64a1fbda
QP
277 if (is_kernel_in_hyp_mode())
278 return false;
279
bfa79a80
QP
280 if (static_branch_likely(&kvm_protected_mode_initialized))
281 return false;
282
283 /*
284 * This can happen at boot time when __create_hyp_mappings() is called
285 * after the hyp protection has been enabled, but the static key has
286 * not been flipped yet.
287 */
288 if (!hyp_pgtable && is_protected_kvm_enabled())
289 return false;
290
291 WARN_ON(!hyp_pgtable);
292
293 return true;
294}
295
ce335431
KS
296int __create_hyp_mappings(unsigned long start, unsigned long size,
297 unsigned long phys, enum kvm_pgtable_prot prot)
342cd0ab 298{
0f9d09b8 299 int err;
342cd0ab 300
66c57edd
QP
301 if (WARN_ON(!kvm_host_owns_hyp_mappings()))
302 return -EINVAL;
bfa79a80 303
342cd0ab 304 mutex_lock(&kvm_hyp_pgd_mutex);
0f9d09b8 305 err = kvm_pgtable_hyp_map(hyp_pgtable, start, size, phys, prot);
342cd0ab 306 mutex_unlock(&kvm_hyp_pgd_mutex);
0f9d09b8 307
342cd0ab
CD
308 return err;
309}
310
40c2729b
CD
311static phys_addr_t kvm_kaddr_to_phys(void *kaddr)
312{
313 if (!is_vmalloc_addr(kaddr)) {
314 BUG_ON(!virt_addr_valid(kaddr));
315 return __pa(kaddr);
316 } else {
317 return page_to_phys(vmalloc_to_page(kaddr)) +
318 offset_in_page(kaddr);
319 }
320}
321
a83e2191
QP
322struct hyp_shared_pfn {
323 u64 pfn;
324 int count;
325 struct rb_node node;
326};
327
328static DEFINE_MUTEX(hyp_shared_pfns_lock);
329static struct rb_root hyp_shared_pfns = RB_ROOT;
330
331static struct hyp_shared_pfn *find_shared_pfn(u64 pfn, struct rb_node ***node,
332 struct rb_node **parent)
66c57edd 333{
a83e2191
QP
334 struct hyp_shared_pfn *this;
335
336 *node = &hyp_shared_pfns.rb_node;
337 *parent = NULL;
338 while (**node) {
339 this = container_of(**node, struct hyp_shared_pfn, node);
340 *parent = **node;
341 if (this->pfn < pfn)
342 *node = &((**node)->rb_left);
343 else if (this->pfn > pfn)
344 *node = &((**node)->rb_right);
345 else
346 return this;
347 }
66c57edd 348
a83e2191
QP
349 return NULL;
350}
351
352static int share_pfn_hyp(u64 pfn)
66c57edd 353{
a83e2191
QP
354 struct rb_node **node, *parent;
355 struct hyp_shared_pfn *this;
356 int ret = 0;
357
358 mutex_lock(&hyp_shared_pfns_lock);
359 this = find_shared_pfn(pfn, &node, &parent);
360 if (this) {
361 this->count++;
362 goto unlock;
66c57edd
QP
363 }
364
a83e2191
QP
365 this = kzalloc(sizeof(*this), GFP_KERNEL);
366 if (!this) {
367 ret = -ENOMEM;
368 goto unlock;
369 }
370
371 this->pfn = pfn;
372 this->count = 1;
373 rb_link_node(&this->node, parent, node);
374 rb_insert_color(&this->node, &hyp_shared_pfns);
375 ret = kvm_call_hyp_nvhe(__pkvm_host_share_hyp, pfn, 1);
376unlock:
377 mutex_unlock(&hyp_shared_pfns_lock);
378
379 return ret;
66c57edd
QP
380}
381
52b28657 382static int unshare_pfn_hyp(u64 pfn)
66c57edd 383{
52b28657
QP
384 struct rb_node **node, *parent;
385 struct hyp_shared_pfn *this;
386 int ret = 0;
387
388 mutex_lock(&hyp_shared_pfns_lock);
389 this = find_shared_pfn(pfn, &node, &parent);
390 if (WARN_ON(!this)) {
391 ret = -ENOENT;
392 goto unlock;
393 }
394
395 this->count--;
396 if (this->count)
397 goto unlock;
398
399 rb_erase(&this->node, &hyp_shared_pfns);
400 kfree(this);
401 ret = kvm_call_hyp_nvhe(__pkvm_host_unshare_hyp, pfn, 1);
402unlock:
403 mutex_unlock(&hyp_shared_pfns_lock);
404
405 return ret;
406}
407
3f868e14
QP
408int kvm_share_hyp(void *from, void *to)
409{
a83e2191
QP
410 phys_addr_t start, end, cur;
411 u64 pfn;
66c57edd
QP
412 int ret;
413
3f868e14
QP
414 if (is_kernel_in_hyp_mode())
415 return 0;
416
417 /*
418 * The share hcall maps things in the 'fixed-offset' region of the hyp
419 * VA space, so we can only share physically contiguous data-structures
420 * for now.
421 */
422 if (is_vmalloc_or_module_addr(from) || is_vmalloc_or_module_addr(to))
423 return -EINVAL;
424
425 if (kvm_host_owns_hyp_mappings())
426 return create_hyp_mappings(from, to, PAGE_HYP);
427
a83e2191
QP
428 start = ALIGN_DOWN(__pa(from), PAGE_SIZE);
429 end = PAGE_ALIGN(__pa(to));
430 for (cur = start; cur < end; cur += PAGE_SIZE) {
431 pfn = __phys_to_pfn(cur);
432 ret = share_pfn_hyp(pfn);
66c57edd
QP
433 if (ret)
434 return ret;
435 }
436
437 return 0;
438}
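/*
 * A standalone sketch of how kvm_share_hyp() above turns a [from, to) object
 * into page frames: the start is rounded down and the (exclusive) end rounded
 * up to page granularity, and every page frame in between is handed to
 * share_pfn_hyp(), which refcounts frames shared more than once. The
 * addresses and the 4KiB page size below are purely illustrative; __pa() and
 * the hypercall are not modelled.
 */
#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1ULL << PAGE_SHIFT)
#define ALIGN_DOWN(x, a)	((x) & ~((a) - 1))
#define PAGE_ALIGN(x)		(((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

int main(void)
{
	/* Pretend the object's physical footprint is [0x80001f80, 0x80003010). */
	uint64_t start = ALIGN_DOWN(0x80001f80ULL, PAGE_SIZE);
	uint64_t end = PAGE_ALIGN(0x80003010ULL);

	for (uint64_t cur = start; cur < end; cur += PAGE_SIZE)
		printf("share pfn %#llx\n",
		       (unsigned long long)(cur >> PAGE_SHIFT));

	return 0;
}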
439
52b28657
QP
440void kvm_unshare_hyp(void *from, void *to)
441{
442 phys_addr_t start, end, cur;
443 u64 pfn;
444
445 if (is_kernel_in_hyp_mode() || kvm_host_owns_hyp_mappings() || !from)
446 return;
447
448 start = ALIGN_DOWN(__pa(from), PAGE_SIZE);
449 end = PAGE_ALIGN(__pa(to));
450 for (cur = start; cur < end; cur += PAGE_SIZE) {
451 pfn = __phys_to_pfn(cur);
452 WARN_ON(unshare_pfn_hyp(pfn));
453 }
454}
455
342cd0ab 456/**
06e8c3b0 457 * create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode
342cd0ab
CD
458 * @from: The virtual kernel start address of the range
459 * @to: The virtual kernel end address of the range (exclusive)
c8dddecd 460 * @prot: The protection to be applied to this range
342cd0ab 461 *
06e8c3b0
MZ
 462 * The kernel virtual address is reused (modulo HYP_PAGE_OFFSET) as the
 463 * Hyp-mode virtual address, so both map the same underlying
 464 * physical pages.
342cd0ab 465 */
0f9d09b8 466int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot)
342cd0ab 467{
40c2729b
CD
468 phys_addr_t phys_addr;
469 unsigned long virt_addr;
6c41a413
MZ
470 unsigned long start = kern_hyp_va((unsigned long)from);
471 unsigned long end = kern_hyp_va((unsigned long)to);
6060df84 472
1e947bad
MZ
473 if (is_kernel_in_hyp_mode())
474 return 0;
475
3f868e14
QP
476 if (!kvm_host_owns_hyp_mappings())
477 return -EPERM;
66c57edd 478
40c2729b
CD
479 start = start & PAGE_MASK;
480 end = PAGE_ALIGN(end);
6060df84 481
40c2729b
CD
482 for (virt_addr = start; virt_addr < end; virt_addr += PAGE_SIZE) {
483 int err;
6060df84 484
40c2729b 485 phys_addr = kvm_kaddr_to_phys(from + virt_addr - start);
0f9d09b8 486 err = __create_hyp_mappings(virt_addr, PAGE_SIZE, phys_addr,
c8dddecd 487 prot);
40c2729b
CD
488 if (err)
489 return err;
490 }
491
492 return 0;
342cd0ab
CD
493}
494
92abe0f8
KS
495
496/**
497 * hyp_alloc_private_va_range - Allocates a private VA range.
498 * @size: The size of the VA range to reserve.
499 * @haddr: The hypervisor virtual start address of the allocation.
500 *
501 * The private virtual address (VA) range is allocated below io_map_base
502 * and aligned based on the order of @size.
503 *
504 * Return: 0 on success or negative error code on failure.
505 */
506int hyp_alloc_private_va_range(size_t size, unsigned long *haddr)
342cd0ab 507{
e3f019b3
MZ
508 unsigned long base;
509 int ret = 0;
6060df84 510
e3f019b3 511 mutex_lock(&kvm_hyp_pgd_mutex);
6060df84 512
e3f019b3 513 /*
656012c7 514 * This assumes that we have enough space below the idmap
e3f019b3
MZ
515 * page to allocate our VAs. If not, the check below will
 516 * kick in. A potential alternative would be to detect that
517 * overflow and switch to an allocation above the idmap.
518 *
519 * The allocated size is always a multiple of PAGE_SIZE.
520 */
92abe0f8
KS
521 base = io_map_base - PAGE_ALIGN(size);
522
523 /* Align the allocation based on the order of its size */
524 base = ALIGN_DOWN(base, PAGE_SIZE << get_order(size));
1bb32a44 525
e3f019b3
MZ
526 /*
527 * Verify that BIT(VA_BITS - 1) hasn't been flipped by
528 * allocating the new area, as it would indicate we've
529 * overflowed the idmap/IO address range.
530 */
531 if ((base ^ io_map_base) & BIT(VA_BITS - 1))
532 ret = -ENOMEM;
533 else
92abe0f8 534 *haddr = io_map_base = base;
e3f019b3
MZ
535
536 mutex_unlock(&kvm_hyp_pgd_mutex);
537
92abe0f8
KS
538 return ret;
539}
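/*
 * A userspace model of the downward-growing private VA allocator above,
 * assuming 4KiB pages, 48-bit hyp VAs and an invented starting io_map_base.
 * get_order() is re-implemented locally; the real allocator also serialises
 * on kvm_hyp_pgd_mutex.
 */
#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1ULL << PAGE_SHIFT)
#define VA_BITS		48
#define BIT(n)		(1ULL << (n))
#define PAGE_ALIGN(x)	(((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))
#define ALIGN_DOWN(x, a) ((x) & ~((a) - 1))

static uint64_t io_map_base = 0x0000a00000000000ULL;	/* illustrative */

static int get_order(uint64_t size)
{
	int order = 0;

	size = (size - 1) >> PAGE_SHIFT;
	while (size) {
		order++;
		size >>= 1;
	}
	return order;
}

static int alloc_private_va_range(uint64_t size, uint64_t *haddr)
{
	uint64_t base = io_map_base - PAGE_ALIGN(size);

	/* Align the allocation on the order of its size, as the kernel does. */
	base = ALIGN_DOWN(base, PAGE_SIZE << get_order(size));

	/* Bail out if we flipped BIT(VA_BITS - 1), i.e. overflowed the range. */
	if ((base ^ io_map_base) & BIT(VA_BITS - 1))
		return -1;

	*haddr = io_map_base = base;
	return 0;
}

int main(void)
{
	uint64_t va;

	if (!alloc_private_va_range(3 * PAGE_SIZE, &va))	/* 4-page aligned */
		printf("allocated %#llx\n", (unsigned long long)va);
	return 0;
}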
540
541static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size,
542 unsigned long *haddr,
543 enum kvm_pgtable_prot prot)
544{
545 unsigned long addr;
546 int ret = 0;
547
548 if (!kvm_host_owns_hyp_mappings()) {
549 addr = kvm_call_hyp_nvhe(__pkvm_create_private_mapping,
550 phys_addr, size, prot);
551 if (IS_ERR_VALUE(addr))
552 return addr;
553 *haddr = addr;
554
555 return 0;
556 }
557
558 size = PAGE_ALIGN(size + offset_in_page(phys_addr));
559 ret = hyp_alloc_private_va_range(size, &addr);
e3f019b3 560 if (ret)
92abe0f8 561 return ret;
e3f019b3 562
92abe0f8 563 ret = __create_hyp_mappings(addr, size, phys_addr, prot);
e3f019b3 564 if (ret)
92abe0f8 565 return ret;
e3f019b3 566
92abe0f8 567 *haddr = addr + offset_in_page(phys_addr);
dc2e4633
MZ
568 return ret;
569}
570
571/**
572 * create_hyp_io_mappings - Map IO into both kernel and HYP
573 * @phys_addr: The physical start address which gets mapped
574 * @size: Size of the region being mapped
575 * @kaddr: Kernel VA for this mapping
576 * @haddr: HYP VA for this mapping
577 */
578int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size,
579 void __iomem **kaddr,
580 void __iomem **haddr)
581{
582 unsigned long addr;
583 int ret;
584
bff01cb6
QP
585 if (is_protected_kvm_enabled())
586 return -EPERM;
587
dc2e4633
MZ
588 *kaddr = ioremap(phys_addr, size);
589 if (!*kaddr)
590 return -ENOMEM;
591
592 if (is_kernel_in_hyp_mode()) {
593 *haddr = *kaddr;
594 return 0;
595 }
596
597 ret = __create_hyp_private_mapping(phys_addr, size,
598 &addr, PAGE_HYP_DEVICE);
1bb32a44
MZ
599 if (ret) {
600 iounmap(*kaddr);
601 *kaddr = NULL;
dc2e4633
MZ
602 *haddr = NULL;
603 return ret;
604 }
605
606 *haddr = (void __iomem *)addr;
607 return 0;
608}
609
610/**
611 * create_hyp_exec_mappings - Map an executable range into HYP
612 * @phys_addr: The physical start address which gets mapped
613 * @size: Size of the region being mapped
614 * @haddr: HYP VA for this mapping
615 */
616int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size,
617 void **haddr)
618{
619 unsigned long addr;
620 int ret;
621
622 BUG_ON(is_kernel_in_hyp_mode());
623
624 ret = __create_hyp_private_mapping(phys_addr, size,
625 &addr, PAGE_HYP_EXEC);
626 if (ret) {
627 *haddr = NULL;
1bb32a44
MZ
628 return ret;
629 }
630
dc2e4633 631 *haddr = (void *)addr;
1bb32a44 632 return 0;
342cd0ab
CD
633}
634
6011cf68
MZ
635static struct kvm_pgtable_mm_ops kvm_user_mm_ops = {
636 /* We shouldn't need any other callback to walk the PT */
637 .phys_to_virt = kvm_host_va,
638};
639
640static int get_user_mapping_size(struct kvm *kvm, u64 addr)
641{
642 struct kvm_pgtable pgt = {
643 .pgd = (kvm_pte_t *)kvm->mm->pgd,
644 .ia_bits = VA_BITS,
645 .start_level = (KVM_PGTABLE_MAX_LEVELS -
646 CONFIG_PGTABLE_LEVELS),
647 .mm_ops = &kvm_user_mm_ops,
648 };
649 kvm_pte_t pte = 0; /* Keep GCC quiet... */
650 u32 level = ~0;
651 int ret;
652
653 ret = kvm_pgtable_get_leaf(&pgt, addr, &pte, &level);
654 VM_BUG_ON(ret);
655 VM_BUG_ON(level >= KVM_PGTABLE_MAX_LEVELS);
656 VM_BUG_ON(!(pte & PTE_VALID));
657
658 return BIT(ARM64_HW_PGTABLE_LEVEL_SHIFT(level));
659}
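/*
 * The mapping size returned above is BIT(ARM64_HW_PGTABLE_LEVEL_SHIFT(level)).
 * A small sketch of that formula, assuming 4KiB pages (PAGE_SHIFT == 12) and
 * the usual definition ((PAGE_SHIFT - 3) * (4 - (n)) + 3): level-3 leaves map
 * 4KiB, level 2 maps 2MiB and level 1 maps 1GiB.
 */
#include <stdio.h>

#define PAGE_SHIFT	12
#define HW_PGTABLE_LEVEL_SHIFT(n)	((PAGE_SHIFT - 3) * (4 - (n)) + 3)

int main(void)
{
	for (int level = 1; level <= 3; level++)
		printf("level %d -> %llu bytes\n", level,
		       1ULL << HW_PGTABLE_LEVEL_SHIFT(level));
	return 0;
}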
660
7aef0cbc
QP
661static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = {
662 .zalloc_page = stage2_memcache_zalloc_page,
d38ba8cc
YA
663 .zalloc_pages_exact = kvm_s2_zalloc_pages_exact,
664 .free_pages_exact = kvm_s2_free_pages_exact,
7aef0cbc 665 .get_page = kvm_host_get_page,
d38ba8cc 666 .put_page = kvm_s2_put_page,
7aef0cbc
QP
667 .page_count = kvm_host_page_count,
668 .phys_to_virt = kvm_host_va,
669 .virt_to_phys = kvm_host_pa,
25aa2869
YW
670 .dcache_clean_inval_poc = clean_dcache_guest_page,
671 .icache_inval_pou = invalidate_icache_guest_page,
7aef0cbc
QP
672};
673
d5d8184d 674/**
21ea4578 675 * kvm_init_stage2_mmu - Initialise an S2 MMU structure
a0e50aa3
CD
676 * @kvm: The pointer to the KVM structure
677 * @mmu: The pointer to the s2 MMU structure
d5d8184d 678 *
71233d05 679 * Allocates only the stage-2 HW PGD level table(s).
d5d8184d
CD
680 * Note we don't need locking here as this is only called when the VM is
681 * created, which can only be done once.
682 */
a0e50aa3 683int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu)
d5d8184d 684{
71233d05
WD
685 int cpu, err;
686 struct kvm_pgtable *pgt;
d5d8184d 687
71233d05 688 if (mmu->pgt != NULL) {
d5d8184d
CD
689 kvm_err("kvm_arch already initialized?\n");
690 return -EINVAL;
691 }
692
115bae92 693 pgt = kzalloc(sizeof(*pgt), GFP_KERNEL_ACCOUNT);
71233d05 694 if (!pgt)
a987370f
MZ
695 return -ENOMEM;
696
9d8604b2
MZ
697 mmu->arch = &kvm->arch;
698 err = kvm_pgtable_stage2_init(pgt, mmu, &kvm_s2_mm_ops);
71233d05
WD
699 if (err)
700 goto out_free_pgtable;
e329fb75 701
a0e50aa3
CD
702 mmu->last_vcpu_ran = alloc_percpu(typeof(*mmu->last_vcpu_ran));
703 if (!mmu->last_vcpu_ran) {
71233d05
WD
704 err = -ENOMEM;
705 goto out_destroy_pgtable;
a0e50aa3
CD
706 }
707
708 for_each_possible_cpu(cpu)
709 *per_cpu_ptr(mmu->last_vcpu_ran, cpu) = -1;
710
71233d05
WD
711 mmu->pgt = pgt;
712 mmu->pgd_phys = __pa(pgt->pgd);
d5d8184d 713 return 0;
71233d05
WD
714
715out_destroy_pgtable:
716 kvm_pgtable_stage2_destroy(pgt);
717out_free_pgtable:
718 kfree(pgt);
719 return err;
d5d8184d
CD
720}
721
957db105
CD
722static void stage2_unmap_memslot(struct kvm *kvm,
723 struct kvm_memory_slot *memslot)
724{
725 hva_t hva = memslot->userspace_addr;
726 phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
727 phys_addr_t size = PAGE_SIZE * memslot->npages;
728 hva_t reg_end = hva + size;
729
730 /*
731 * A memory region could potentially cover multiple VMAs, and any holes
732 * between them, so iterate over all of them to find out if we should
733 * unmap any of them.
734 *
735 * +--------------------------------------------+
736 * +---------------+----------------+ +----------------+
737 * | : VMA 1 | VMA 2 | | VMA 3 : |
738 * +---------------+----------------+ +----------------+
739 * | memory region |
740 * +--------------------------------------------+
741 */
742 do {
c728fd4c 743 struct vm_area_struct *vma;
957db105
CD
744 hva_t vm_start, vm_end;
745
c728fd4c
GS
746 vma = find_vma_intersection(current->mm, hva, reg_end);
747 if (!vma)
957db105
CD
748 break;
749
750 /*
751 * Take the intersection of this VMA with the memory region
752 */
753 vm_start = max(hva, vma->vm_start);
754 vm_end = min(reg_end, vma->vm_end);
755
756 if (!(vma->vm_flags & VM_PFNMAP)) {
757 gpa_t gpa = addr + (vm_start - memslot->userspace_addr);
a0e50aa3 758 unmap_stage2_range(&kvm->arch.mmu, gpa, vm_end - vm_start);
957db105
CD
759 }
760 hva = vm_end;
761 } while (hva < reg_end);
762}
763
764/**
765 * stage2_unmap_vm - Unmap Stage-2 RAM mappings
766 * @kvm: The struct kvm pointer
767 *
656012c7 768 * Go through the memregions and unmap any regular RAM
957db105
CD
769 * backing memory already mapped to the VM.
770 */
771void stage2_unmap_vm(struct kvm *kvm)
772{
773 struct kvm_memslots *slots;
774 struct kvm_memory_slot *memslot;
a54d8066 775 int idx, bkt;
957db105
CD
776
777 idx = srcu_read_lock(&kvm->srcu);
89154dd5 778 mmap_read_lock(current->mm);
fcc5bf89 779 write_lock(&kvm->mmu_lock);
957db105
CD
780
781 slots = kvm_memslots(kvm);
a54d8066 782 kvm_for_each_memslot(memslot, bkt, slots)
957db105
CD
783 stage2_unmap_memslot(kvm, memslot);
784
fcc5bf89 785 write_unlock(&kvm->mmu_lock);
89154dd5 786 mmap_read_unlock(current->mm);
957db105
CD
787 srcu_read_unlock(&kvm->srcu, idx);
788}
789
a0e50aa3 790void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
d5d8184d 791{
cfb1a98d 792 struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
71233d05 793 struct kvm_pgtable *pgt = NULL;
d5d8184d 794
fcc5bf89 795 write_lock(&kvm->mmu_lock);
71233d05
WD
796 pgt = mmu->pgt;
797 if (pgt) {
71233d05
WD
798 mmu->pgd_phys = 0;
799 mmu->pgt = NULL;
800 free_percpu(mmu->last_vcpu_ran);
6c0d706b 801 }
fcc5bf89 802 write_unlock(&kvm->mmu_lock);
8b3405e3 803
71233d05
WD
804 if (pgt) {
805 kvm_pgtable_stage2_destroy(pgt);
806 kfree(pgt);
a0e50aa3 807 }
d5d8184d
CD
808}
809
d5d8184d
CD
810/**
811 * kvm_phys_addr_ioremap - map a device range to guest IPA
812 *
813 * @kvm: The KVM pointer
814 * @guest_ipa: The IPA at which to insert the mapping
815 * @pa: The physical address of the device
816 * @size: The size of the mapping
c9c0279c 817 * @writable: Whether or not to create a writable mapping
d5d8184d
CD
818 */
819int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
c40f2f8f 820 phys_addr_t pa, unsigned long size, bool writable)
d5d8184d 821{
02bbd374 822 phys_addr_t addr;
d5d8184d 823 int ret = 0;
837f66c7 824 struct kvm_mmu_memory_cache cache = { .gfp_zero = __GFP_ZERO };
02bbd374
WD
825 struct kvm_pgtable *pgt = kvm->arch.mmu.pgt;
826 enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_DEVICE |
827 KVM_PGTABLE_PROT_R |
828 (writable ? KVM_PGTABLE_PROT_W : 0);
d5d8184d 829
bff01cb6
QP
830 if (is_protected_kvm_enabled())
831 return -EPERM;
832
02bbd374
WD
833 size += offset_in_page(guest_ipa);
834 guest_ipa &= PAGE_MASK;
c40f2f8f 835
02bbd374 836 for (addr = guest_ipa; addr < guest_ipa + size; addr += PAGE_SIZE) {
c1a33aeb
SC
837 ret = kvm_mmu_topup_memory_cache(&cache,
838 kvm_mmu_cache_min_pages(kvm));
d5d8184d 839 if (ret)
02bbd374
WD
840 break;
841
fcc5bf89 842 write_lock(&kvm->mmu_lock);
02bbd374
WD
843 ret = kvm_pgtable_stage2_map(pgt, addr, PAGE_SIZE, pa, prot,
844 &cache);
fcc5bf89 845 write_unlock(&kvm->mmu_lock);
d5d8184d 846 if (ret)
02bbd374 847 break;
d5d8184d 848
02bbd374 849 pa += PAGE_SIZE;
d5d8184d
CD
850 }
851
c1a33aeb 852 kvm_mmu_free_memory_cache(&cache);
d5d8184d
CD
853 return ret;
854}
855
c6473555
MS
856/**
857 * stage2_wp_range() - write protect stage2 memory region range
c9c0279c 858 * @mmu: The KVM stage-2 MMU pointer
c6473555
MS
859 * @addr: Start address of range
860 * @end: End address of range
861 */
a0e50aa3 862static void stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end)
c6473555 863{
cfb1a98d 864 struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
cc38d61c 865 stage2_apply_range_resched(kvm, addr, end, kvm_pgtable_stage2_wrprotect);
c6473555
MS
866}
867
868/**
869 * kvm_mmu_wp_memory_region() - write protect stage 2 entries for memory slot
870 * @kvm: The KVM pointer
871 * @slot: The memory slot to write protect
872 *
873 * Called to start logging dirty pages after memory region
 874 * KVM_MEM_LOG_DIRTY_PAGES operation is called. After this function returns,
4ea5af53 875 * all present PUDs, PMDs and PTEs are write protected in the memory region.
c6473555
MS
876 * Afterwards read of dirty page log can be called.
877 *
878 * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired,
879 * serializing operations for VM memory regions.
880 */
eab62148 881static void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
c6473555 882{
9f6b8029
PB
883 struct kvm_memslots *slots = kvm_memslots(kvm);
884 struct kvm_memory_slot *memslot = id_to_memslot(slots, slot);
0577d1ab
SC
885 phys_addr_t start, end;
886
887 if (WARN_ON_ONCE(!memslot))
888 return;
889
890 start = memslot->base_gfn << PAGE_SHIFT;
891 end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
c6473555 892
fcc5bf89 893 write_lock(&kvm->mmu_lock);
a0e50aa3 894 stage2_wp_range(&kvm->arch.mmu, start, end);
fcc5bf89 895 write_unlock(&kvm->mmu_lock);
c6473555
MS
896 kvm_flush_remote_tlbs(kvm);
897}
53c810c3
MS
898
899/**
3b0f1d01 900 * kvm_mmu_write_protect_pt_masked() - write protect dirty pages
53c810c3
MS
901 * @kvm: The KVM pointer
902 * @slot: The memory slot associated with mask
903 * @gfn_offset: The gfn offset in memory slot
904 * @mask: The mask of dirty pages at offset 'gfn_offset' in this memory
905 * slot to be write protected
906 *
 907 * Walks the bits set in @mask and write protects the associated PTEs. The
 908 * caller must acquire kvm_mmu_lock.
909 */
3b0f1d01 910static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
53c810c3
MS
911 struct kvm_memory_slot *slot,
912 gfn_t gfn_offset, unsigned long mask)
913{
914 phys_addr_t base_gfn = slot->base_gfn + gfn_offset;
915 phys_addr_t start = (base_gfn + __ffs(mask)) << PAGE_SHIFT;
916 phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT;
917
a0e50aa3 918 stage2_wp_range(&kvm->arch.mmu, start, end);
53c810c3 919}
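/*
 * A standalone illustration of how the dirty-log mask above is folded into a
 * single [start, end) IPA range: __ffs() picks the first set bit and __fls()
 * the last one. The helpers below are local stand-ins built on GCC builtins,
 * not the kernel's implementations, and the gfn/mask values are invented.
 */
#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT	12

static unsigned int local_ffs(unsigned long long x)	/* lowest set bit index */
{
	return __builtin_ctzll(x);
}

static unsigned int local_fls(unsigned long long x)	/* highest set bit index */
{
	return 63 - __builtin_clzll(x);
}

int main(void)
{
	uint64_t base_gfn = 0x10000;		/* slot base + gfn_offset */
	unsigned long long mask = 0x00f0;	/* dirty pages 4..7 of this word */

	uint64_t start = (base_gfn + local_ffs(mask)) << PAGE_SHIFT;
	uint64_t end = (base_gfn + local_fls(mask) + 1) << PAGE_SHIFT;

	printf("write-protect [%#llx, %#llx)\n",
	       (unsigned long long)start, (unsigned long long)end);
	return 0;
}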
c6473555 920
3b0f1d01
KH
921/*
922 * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
923 * dirty pages.
924 *
925 * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
926 * enable dirty logging for them.
927 */
928void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
929 struct kvm_memory_slot *slot,
930 gfn_t gfn_offset, unsigned long mask)
931{
932 kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
933}
934
1559b758 935static void kvm_send_hwpoison_signal(unsigned long address, short lsb)
196f878a 936{
795a8371 937 send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current);
196f878a
JM
938}
939
a80868f3
SP
940static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot,
941 unsigned long hva,
942 unsigned long map_size)
6794ad54 943{
c2be79a0 944 gpa_t gpa_start;
6794ad54
CD
945 hva_t uaddr_start, uaddr_end;
946 size_t size;
947
9f283614
SP
948 /* The memslot and the VMA are guaranteed to be aligned to PAGE_SIZE */
949 if (map_size == PAGE_SIZE)
950 return true;
951
6794ad54
CD
952 size = memslot->npages * PAGE_SIZE;
953
954 gpa_start = memslot->base_gfn << PAGE_SHIFT;
6794ad54
CD
955
956 uaddr_start = memslot->userspace_addr;
957 uaddr_end = uaddr_start + size;
958
959 /*
960 * Pages belonging to memslots that don't have the same alignment
a80868f3
SP
961 * within a PMD/PUD for userspace and IPA cannot be mapped with stage-2
962 * PMD/PUD entries, because we'll end up mapping the wrong pages.
6794ad54
CD
963 *
964 * Consider a layout like the following:
965 *
966 * memslot->userspace_addr:
967 * +-----+--------------------+--------------------+---+
a80868f3 968 * |abcde|fgh Stage-1 block | Stage-1 block tv|xyz|
6794ad54
CD
969 * +-----+--------------------+--------------------+---+
970 *
9f283614 971 * memslot->base_gfn << PAGE_SHIFT:
6794ad54 972 * +---+--------------------+--------------------+-----+
a80868f3 973 * |abc|def Stage-2 block | Stage-2 block |tvxyz|
6794ad54
CD
974 * +---+--------------------+--------------------+-----+
975 *
a80868f3 976 * If we create those stage-2 blocks, we'll end up with this incorrect
6794ad54
CD
977 * mapping:
978 * d -> f
979 * e -> g
980 * f -> h
981 */
a80868f3 982 if ((gpa_start & (map_size - 1)) != (uaddr_start & (map_size - 1)))
6794ad54
CD
983 return false;
984
985 /*
986 * Next, let's make sure we're not trying to map anything not covered
a80868f3
SP
987 * by the memslot. This means we have to prohibit block size mappings
988 * for the beginning and end of a non-block aligned and non-block sized
6794ad54
CD
989 * memory slot (illustrated by the head and tail parts of the
990 * userspace view above containing pages 'abcde' and 'xyz',
991 * respectively).
992 *
993 * Note that it doesn't matter if we do the check using the
994 * userspace_addr or the base_gfn, as both are equally aligned (per
995 * the check above) and equally sized.
996 */
a80868f3
SP
997 return (hva & ~(map_size - 1)) >= uaddr_start &&
998 (hva & ~(map_size - 1)) + map_size <= uaddr_end;
6794ad54
CD
999}
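/*
 * A worked, standalone version of the two checks above, assuming a 2MiB
 * map_size: the IPA and the userspace address must share the same offset
 * within a block, and the block around the faulting address must lie entirely
 * inside the memslot. All addresses below are made up for illustration.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PMD_SIZE	(2ULL << 20)

static bool supports_huge_mapping(uint64_t gpa_start, uint64_t uaddr_start,
				  uint64_t uaddr_end, uint64_t hva,
				  uint64_t map_size)
{
	/* Misaligned memslot: stage-1 and stage-2 blocks would not line up. */
	if ((gpa_start & (map_size - 1)) != (uaddr_start & (map_size - 1)))
		return false;

	/* The candidate block must not spill past either end of the memslot. */
	return (hva & ~(map_size - 1)) >= uaddr_start &&
	       (hva & ~(map_size - 1)) + map_size <= uaddr_end;
}

int main(void)
{
	uint64_t gpa_start = 0x80000000, uaddr_start = 0x7f0000200000;
	uint64_t uaddr_end = uaddr_start + 0x10000000;	/* 256MiB slot */

	printf("aligned slot:    %d\n",
	       supports_huge_mapping(gpa_start, uaddr_start, uaddr_end,
				     uaddr_start + 0x400000, PMD_SIZE));
	printf("misaligned slot: %d\n",
	       supports_huge_mapping(gpa_start, uaddr_start + 0x1000, uaddr_end,
				     uaddr_start + 0x401000, PMD_SIZE));
	return 0;
}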
1000
0529c902
SP
1001/*
1002 * Check if the given hva is backed by a transparent huge page (THP) and
1003 * whether it can be mapped using block mapping in stage2. If so, adjust
1004 * the stage2 PFN and IPA accordingly. Only PMD_SIZE THPs are currently
1005 * supported. This will need to be updated to support other THP sizes.
1006 *
1007 * Returns the size of the mapping.
1008 */
1009static unsigned long
6011cf68 1010transparent_hugepage_adjust(struct kvm *kvm, struct kvm_memory_slot *memslot,
0529c902
SP
1011 unsigned long hva, kvm_pfn_t *pfnp,
1012 phys_addr_t *ipap)
1013{
1014 kvm_pfn_t pfn = *pfnp;
1015
1016 /*
1017 * Make sure the adjustment is done only for THP pages. Also make
1018 * sure that the HVA and IPA are sufficiently aligned and that the
1019 * block map is contained within the memslot.
1020 */
6011cf68
MZ
1021 if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE) &&
1022 get_user_mapping_size(kvm, hva) >= PMD_SIZE) {
0529c902
SP
1023 /*
1024 * The address we faulted on is backed by a transparent huge
1025 * page. However, because we map the compound huge page and
1026 * not the individual tail page, we need to transfer the
1027 * refcount to the head page. We have to be careful that the
1028 * THP doesn't start to split while we are adjusting the
1029 * refcounts.
1030 *
20ec3ebd 1031 * We are sure this doesn't happen, because mmu_invalidate_retry
0529c902
SP
1032 * was successful and we are holding the mmu_lock, so if this
1033 * THP is trying to split, it will be blocked in the mmu
1034 * notifier before touching any of the pages, specifically
1035 * before being able to call __split_huge_page_refcount().
1036 *
1037 * We can therefore safely transfer the refcount from PG_tail
1038 * to PG_head and switch the pfn from a tail page to the head
1039 * page accordingly.
1040 */
1041 *ipap &= PMD_MASK;
1042 kvm_release_pfn_clean(pfn);
1043 pfn &= ~(PTRS_PER_PMD - 1);
0fe49630 1044 get_page(pfn_to_page(pfn));
0529c902
SP
1045 *pfnp = pfn;
1046
1047 return PMD_SIZE;
1048 }
1049
1050 /* Use page mapping if we cannot use block mapping. */
1051 return PAGE_SIZE;
1052}
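/*
 * The PMD-level adjustment above boils down to two roundings: the faulting
 * IPA is truncated to its 2MiB block and the pfn is moved back to the head
 * page of the THP. A small sketch of that arithmetic, assuming 4KiB base
 * pages so PTRS_PER_PMD == 512; the refcount handling is not modelled and
 * the example values are invented.
 */
#include <stdint.h>
#include <stdio.h>

#define PMD_SHIFT	21
#define PMD_MASK	(~((1ULL << PMD_SHIFT) - 1))
#define PTRS_PER_PMD	512

int main(void)
{
	uint64_t fault_ipa = 0x40563000;	/* somewhere inside a 2MiB THP   */
	uint64_t pfn = 0x89b63;			/* tail page backing that IPA    */

	fault_ipa &= PMD_MASK;			/* map the whole 2MiB block      */
	pfn &= ~(uint64_t)(PTRS_PER_PMD - 1);	/* switch to the head page       */

	printf("map IPA %#llx -> pfn %#llx (2MiB)\n",
	       (unsigned long long)fault_ipa, (unsigned long long)pfn);
	return 0;
}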
1053
2aa53d68
KZ
1054static int get_vma_page_shift(struct vm_area_struct *vma, unsigned long hva)
1055{
1056 unsigned long pa;
1057
1058 if (is_vm_hugetlb_page(vma) && !(vma->vm_flags & VM_PFNMAP))
1059 return huge_page_shift(hstate_vma(vma));
1060
1061 if (!(vma->vm_flags & VM_PFNMAP))
1062 return PAGE_SHIFT;
1063
1064 VM_BUG_ON(is_vm_hugetlb_page(vma));
1065
1066 pa = (vma->vm_pgoff << PAGE_SHIFT) + (hva - vma->vm_start);
1067
1068#ifndef __PAGETABLE_PMD_FOLDED
1069 if ((hva & (PUD_SIZE - 1)) == (pa & (PUD_SIZE - 1)) &&
1070 ALIGN_DOWN(hva, PUD_SIZE) >= vma->vm_start &&
1071 ALIGN(hva, PUD_SIZE) <= vma->vm_end)
1072 return PUD_SHIFT;
1073#endif
1074
1075 if ((hva & (PMD_SIZE - 1)) == (pa & (PMD_SIZE - 1)) &&
1076 ALIGN_DOWN(hva, PMD_SIZE) >= vma->vm_start &&
1077 ALIGN(hva, PMD_SIZE) <= vma->vm_end)
1078 return PMD_SHIFT;
1079
1080 return PAGE_SHIFT;
1081}
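/*
 * For VM_PFNMAP regions, get_vma_page_shift() above only upgrades to a PUD or
 * PMD mapping when the faulting VA and the backing PA are congruent modulo
 * the block size and the whole block sits inside the VMA. A standalone sketch
 * of the PMD-level test, with invented addresses:
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PMD_SIZE	(2ULL << 20)
#define ALIGN(x, a)	(((x) + (a) - 1) & ~((a) - 1))
#define ALIGN_DOWN(x, a) ((x) & ~((a) - 1))

static bool can_use_pmd(uint64_t hva, uint64_t pa,
			uint64_t vm_start, uint64_t vm_end)
{
	return (hva & (PMD_SIZE - 1)) == (pa & (PMD_SIZE - 1)) &&
	       ALIGN_DOWN(hva, PMD_SIZE) >= vm_start &&
	       ALIGN(hva, PMD_SIZE) <= vm_end;
}

int main(void)
{
	uint64_t vm_start = 0x7f2340000000, vm_end = vm_start + 0x40000000;
	uint64_t hva = vm_start + 0x12345000;
	uint64_t pa = 0x2000000000 + 0x12345000;	/* same 2MiB offset */

	printf("PMD block usable: %d\n", can_use_pmd(hva, pa, vm_start, vm_end));
	return 0;
}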
1082
ea7fc1bb
SP
1083/*
1084 * The page will be mapped in stage 2 as Normal Cacheable, so the VM will be
1085 * able to see the page's tags and therefore they must be initialised first. If
1086 * PG_mte_tagged is set, tags have already been initialised.
1087 *
1088 * The race in the test/set of the PG_mte_tagged flag is handled by:
 1089 * - preventing VM_SHARED mappings in a memslot with MTE, which prevents two
 1090 * VMs racing to sanitise the same page
1091 * - mmap_lock protects between a VM faulting a page in and the VMM performing
1092 * an mprotect() to add VM_MTE
1093 */
2dbf12ae
CM
1094static void sanitise_mte_tags(struct kvm *kvm, kvm_pfn_t pfn,
1095 unsigned long size)
ea7fc1bb
SP
1096{
1097 unsigned long i, nr_pages = size >> PAGE_SHIFT;
2dbf12ae 1098 struct page *page = pfn_to_page(pfn);
ea7fc1bb
SP
1099
1100 if (!kvm_has_mte(kvm))
2dbf12ae 1101 return;
ea7fc1bb
SP
1102
1103 for (i = 0; i < nr_pages; i++, page++) {
d77e59a8 1104 if (try_page_mte_tagging(page)) {
ea7fc1bb 1105 mte_clear_page_tags(page_address(page));
e059853d 1106 set_page_mte_tagged(page);
ea7fc1bb
SP
1107 }
1108 }
ea7fc1bb
SP
1109}
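/*
 * A conceptual, userspace model of the "lock a page for tag initialisation"
 * idea used above via try_page_mte_tagging()/set_page_mte_tagged(): one bit
 * is atomically claimed so that exactly one thread zeroes the tags, and a
 * second bit publishes completion. This only illustrates the claim/publish
 * pattern; it is not the kernel's page-flag implementation.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define TAG_LOCK	(1u << 0)	/* somebody is initialising the tags */
#define TAG_DONE	(1u << 1)	/* tags are initialised               */

struct fake_page {
	atomic_uint flags;
};

static bool try_claim_tagging(struct fake_page *p)
{
	/* Returns true only for the caller that sets TAG_LOCK first. */
	return !(atomic_fetch_or(&p->flags, TAG_LOCK) & TAG_LOCK);
}

static void sanitise_page(struct fake_page *p)
{
	if (try_claim_tagging(p)) {
		/* ... clear the page's allocation tags here ... */
		atomic_fetch_or(&p->flags, TAG_DONE);
	} else {
		/* Lost the race: wait until the winner publishes TAG_DONE. */
		while (!(atomic_load(&p->flags) & TAG_DONE))
			;
	}
}

int main(void)
{
	struct fake_page page = { .flags = 0 };

	sanitise_page(&page);
	printf("flags: %#x\n", atomic_load(&page.flags));
	return 0;
}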
1110
94f8e641 1111static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
98047888 1112 struct kvm_memory_slot *memslot, unsigned long hva,
94f8e641
CD
1113 unsigned long fault_status)
1114{
ffd1b63a 1115 int ret = 0;
6396b852 1116 bool write_fault, writable, force_pte = false;
6f745f1b
WD
1117 bool exec_fault;
1118 bool device = false;
94f8e641 1119 unsigned long mmu_seq;
ad361f09 1120 struct kvm *kvm = vcpu->kvm;
94f8e641 1121 struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
ad361f09 1122 struct vm_area_struct *vma;
1559b758 1123 short vma_shift;
6f745f1b 1124 gfn_t gfn;
ba049e93 1125 kvm_pfn_t pfn;
15a49a44 1126 bool logging_active = memslot_is_logging(memslot);
f587661f 1127 bool use_read_lock = false;
7d894834
YW
1128 unsigned long fault_level = kvm_vcpu_trap_get_fault_level(vcpu);
1129 unsigned long vma_pagesize, fault_granule;
6f745f1b
WD
1130 enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
1131 struct kvm_pgtable *pgt;
94f8e641 1132
7d894834 1133 fault_granule = 1UL << ARM64_HW_PGTABLE_LEVEL_SHIFT(fault_level);
a7d079ce 1134 write_fault = kvm_is_write_fault(vcpu);
c4ad98e4 1135 exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu);
d0e22b4a
MZ
1136 VM_BUG_ON(write_fault && exec_fault);
1137
1138 if (fault_status == FSC_PERM && !write_fault && !exec_fault) {
94f8e641
CD
1139 kvm_err("Unexpected L2 read permission error\n");
1140 return -EFAULT;
1141 }
1142
2aa53d68
KZ
1143 /*
1144 * Let's check if we will get back a huge page backed by hugetlbfs, or
1145 * get block mapping for device MMIO region.
1146 */
89154dd5 1147 mmap_read_lock(current->mm);
09eef83a 1148 vma = vma_lookup(current->mm, hva);
37b54408
AB
1149 if (unlikely(!vma)) {
1150 kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
89154dd5 1151 mmap_read_unlock(current->mm);
37b54408
AB
1152 return -EFAULT;
1153 }
1154
2aa53d68
KZ
1155 /*
1156 * logging_active is guaranteed to never be true for VM_PFNMAP
1157 * memslots.
1158 */
1159 if (logging_active) {
a80868f3 1160 force_pte = true;
523b3999 1161 vma_shift = PAGE_SHIFT;
f587661f
OU
1162 use_read_lock = (fault_status == FSC_PERM && write_fault &&
1163 fault_granule == PAGE_SIZE);
2aa53d68
KZ
1164 } else {
1165 vma_shift = get_vma_page_shift(vma, hva);
523b3999
AE
1166 }
1167
2f40c460 1168 switch (vma_shift) {
faf00039 1169#ifndef __PAGETABLE_PMD_FOLDED
2f40c460
GS
1170 case PUD_SHIFT:
1171 if (fault_supports_stage2_huge_mapping(memslot, hva, PUD_SIZE))
1172 break;
1173 fallthrough;
faf00039 1174#endif
2f40c460
GS
1175 case CONT_PMD_SHIFT:
1176 vma_shift = PMD_SHIFT;
1177 fallthrough;
1178 case PMD_SHIFT:
1179 if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE))
1180 break;
1181 fallthrough;
1182 case CONT_PTE_SHIFT:
523b3999 1183 vma_shift = PAGE_SHIFT;
2f40c460
GS
1184 force_pte = true;
1185 fallthrough;
1186 case PAGE_SHIFT:
1187 break;
1188 default:
1189 WARN_ONCE(1, "Unknown vma_shift %d", vma_shift);
a80868f3
SP
1190 }
1191
523b3999 1192 vma_pagesize = 1UL << vma_shift;
6f745f1b 1193 if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE)
523b3999 1194 fault_ipa &= ~(vma_pagesize - 1);
6f745f1b
WD
1195
1196 gfn = fault_ipa >> PAGE_SHIFT;
89154dd5 1197 mmap_read_unlock(current->mm);
ad361f09 1198
6f745f1b
WD
1199 /*
1200 * Permission faults just need to update the existing leaf entry,
1201 * and so normally don't require allocations from the memcache. The
1202 * only exception to this is when dirty logging is enabled at runtime
1203 * and a write fault needs to collapse a block entry into a table.
1204 */
1205 if (fault_status != FSC_PERM || (logging_active && write_fault)) {
1206 ret = kvm_mmu_topup_memory_cache(memcache,
1207 kvm_mmu_cache_min_pages(kvm));
1208 if (ret)
1209 return ret;
1210 }
94f8e641 1211
20ec3ebd 1212 mmu_seq = vcpu->kvm->mmu_invalidate_seq;
94f8e641 1213 /*
20ec3ebd 1214 * Ensure the read of mmu_invalidate_seq happens before we call
94f8e641
CD
1215 * gfn_to_pfn_prot (which calls get_user_pages), so that we don't risk
1216 * the page we just got a reference to gets unmapped before we have a
1217 * chance to grab the mmu_lock, which ensure that if the page gets
cd4c7183 1218 * unmapped afterwards, the call to kvm_unmap_gfn will take it away
94f8e641
CD
1219 * from us again properly. This smp_rmb() interacts with the smp_wmb()
1220 * in kvm_mmu_notifier_invalidate_<page|range_end>.
10ba2d17
GS
1221 *
1222 * Besides, __gfn_to_pfn_memslot() instead of gfn_to_pfn_prot() is
1223 * used to avoid unnecessary overhead introduced to locate the memory
 1224 * slot because it's always fixed even when @gfn is adjusted for huge pages.
94f8e641
CD
1225 */
1226 smp_rmb();
1227
10ba2d17
GS
1228 pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL,
1229 write_fault, &writable, NULL);
196f878a 1230 if (pfn == KVM_PFN_ERR_HWPOISON) {
1559b758 1231 kvm_send_hwpoison_signal(hva, vma_shift);
196f878a
JM
1232 return 0;
1233 }
9ac71595 1234 if (is_error_noslot_pfn(pfn))
94f8e641
CD
1235 return -EFAULT;
1236
15a49a44 1237 if (kvm_is_device_pfn(pfn)) {
2aa53d68
KZ
1238 /*
1239 * If the page was identified as device early by looking at
1240 * the VMA flags, vma_pagesize is already representing the
1241 * largest quantity we can map. If instead it was mapped
1242 * via gfn_to_pfn_prot(), vma_pagesize is set to PAGE_SIZE
1243 * and must not be upgraded.
1244 *
1245 * In both cases, we don't let transparent_hugepage_adjust()
1246 * change things at the last minute.
1247 */
6f745f1b
WD
1248 device = true;
1249 } else if (logging_active && !write_fault) {
15a49a44
MS
1250 /*
1251 * Only actually map the page as writable if this was a write
1252 * fault.
1253 */
6f745f1b 1254 writable = false;
15a49a44 1255 }
b8865767 1256
6f745f1b 1257 if (exec_fault && device)
6d674e28
MZ
1258 return -ENOEXEC;
1259
f783ef1c
JZ
1260 /*
 1261 * To reduce MMU contention and enhance concurrency during dirty
 1262 * logging, only acquire the read lock for permission
 1263 * relaxation.
1264 */
f587661f 1265 if (use_read_lock)
f783ef1c
JZ
1266 read_lock(&kvm->mmu_lock);
1267 else
1268 write_lock(&kvm->mmu_lock);
6f745f1b 1269 pgt = vcpu->arch.hw_mmu->pgt;
20ec3ebd 1270 if (mmu_invalidate_retry(kvm, mmu_seq))
94f8e641 1271 goto out_unlock;
15a49a44 1272
0529c902
SP
1273 /*
1274 * If we are not forced to use page mapping, check if we are
1275 * backed by a THP and thus use block mapping if possible.
1276 */
f2cc3273
MZ
1277 if (vma_pagesize == PAGE_SIZE && !(force_pte || device)) {
1278 if (fault_status == FSC_PERM && fault_granule > PAGE_SIZE)
1279 vma_pagesize = fault_granule;
1280 else
1281 vma_pagesize = transparent_hugepage_adjust(kvm, memslot,
1282 hva, &pfn,
1283 &fault_ipa);
1284 }
ad361f09 1285
9f03db66 1286 if (fault_status != FSC_PERM && !device && kvm_has_mte(kvm)) {
ea7fc1bb 1287 /* Check the VMM hasn't introduced a new VM_SHARED VMA */
2dbf12ae
CM
1288 if ((vma->vm_flags & VM_MTE_ALLOWED) &&
1289 !(vma->vm_flags & VM_SHARED)) {
1290 sanitise_mte_tags(kvm, pfn, vma_pagesize);
1291 } else {
ea7fc1bb 1292 ret = -EFAULT;
ea7fc1bb 1293 goto out_unlock;
2dbf12ae 1294 }
ea7fc1bb 1295 }
3f58bf63 1296
509552e6 1297 if (writable)
6f745f1b 1298 prot |= KVM_PGTABLE_PROT_W;
ad361f09 1299
25aa2869 1300 if (exec_fault)
6f745f1b 1301 prot |= KVM_PGTABLE_PROT_X;
3f58bf63 1302
6f745f1b
WD
1303 if (device)
1304 prot |= KVM_PGTABLE_PROT_DEVICE;
1305 else if (cpus_have_const_cap(ARM64_HAS_CACHE_DIC))
1306 prot |= KVM_PGTABLE_PROT_X;
a15f6939 1307
7d894834
YW
1308 /*
 1309 * For a FSC_PERM fault, we only need to relax permissions if
 1310 * vma_pagesize equals fault_granule. Otherwise,
1311 * kvm_pgtable_stage2_map() should be called to change block size.
1312 */
1313 if (fault_status == FSC_PERM && vma_pagesize == fault_granule) {
6f745f1b 1314 ret = kvm_pgtable_stage2_relax_perms(pgt, fault_ipa, prot);
ad361f09 1315 } else {
f587661f
OU
1316 WARN_ONCE(use_read_lock, "Attempted stage-2 map outside of write lock\n");
1317
6f745f1b
WD
1318 ret = kvm_pgtable_stage2_map(pgt, fault_ipa, vma_pagesize,
1319 __pfn_to_phys(pfn), prot,
1320 memcache);
94f8e641 1321 }
ad361f09 1322
509552e6
YW
1323 /* Mark the page dirty only if the fault is handled successfully */
1324 if (writable && !ret) {
1325 kvm_set_pfn_dirty(pfn);
10ba2d17 1326 mark_page_dirty_in_slot(kvm, memslot, gfn);
509552e6
YW
1327 }
1328
94f8e641 1329out_unlock:
f587661f 1330 if (use_read_lock)
f783ef1c
JZ
1331 read_unlock(&kvm->mmu_lock);
1332 else
1333 write_unlock(&kvm->mmu_lock);
35307b9a 1334 kvm_set_pfn_accessed(pfn);
94f8e641 1335 kvm_release_pfn_clean(pfn);
509552e6 1336 return ret != -EAGAIN ? ret : 0;
94f8e641
CD
1337}
1338
ee8efad7 1339/* Resolve the access fault by making the page young again. */
aeda9130
MZ
1340static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
1341{
ee8efad7
WD
1342 pte_t pte;
1343 kvm_pte_t kpte;
1344 struct kvm_s2_mmu *mmu;
aeda9130
MZ
1345
1346 trace_kvm_access_fault(fault_ipa);
1347
fcc5bf89 1348 write_lock(&vcpu->kvm->mmu_lock);
ee8efad7
WD
1349 mmu = vcpu->arch.hw_mmu;
1350 kpte = kvm_pgtable_stage2_mkyoung(mmu->pgt, fault_ipa);
fcc5bf89 1351 write_unlock(&vcpu->kvm->mmu_lock);
ee8efad7
WD
1352
1353 pte = __pte(kpte);
1354 if (pte_valid(pte))
1355 kvm_set_pfn_accessed(pte_pfn(pte));
aeda9130
MZ
1356}
1357
94f8e641
CD
1358/**
1359 * kvm_handle_guest_abort - handles all 2nd stage aborts
1360 * @vcpu: the VCPU pointer
94f8e641
CD
1361 *
1362 * Any abort that gets to the host is almost guaranteed to be caused by a
1363 * missing second stage translation table entry, which can mean that either the
1364 * guest simply needs more memory and we must allocate an appropriate page or it
1365 * can mean that the guest tried to access I/O memory, which is emulated by user
1366 * space. The distinction is based on the IPA causing the fault and whether this
1367 * memory region has been registered as standard RAM by user space.
1368 */
74cc7e0c 1369int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
342cd0ab 1370{
94f8e641
CD
1371 unsigned long fault_status;
1372 phys_addr_t fault_ipa;
1373 struct kvm_memory_slot *memslot;
98047888
CD
1374 unsigned long hva;
1375 bool is_iabt, write_fault, writable;
94f8e641
CD
1376 gfn_t gfn;
1377 int ret, idx;
1378
621f48e4
TB
1379 fault_status = kvm_vcpu_trap_get_fault_type(vcpu);
1380
1381 fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);
bb428921 1382 is_iabt = kvm_vcpu_trap_is_iabt(vcpu);
621f48e4 1383
85ea6b1e
MZ
1384 if (fault_status == FSC_FAULT) {
1385 /* Beyond sanitised PARange (which is the IPA limit) */
1386 if (fault_ipa >= BIT_ULL(get_kvm_ipa_limit())) {
1387 kvm_inject_size_fault(vcpu);
1388 return 1;
1389 }
1390
1391 /* Falls between the IPA range and the PARange? */
1392 if (fault_ipa >= BIT_ULL(vcpu->arch.hw_mmu->pgt->ia_bits)) {
1393 fault_ipa |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0);
1394
1395 if (is_iabt)
1396 kvm_inject_pabt(vcpu, fault_ipa);
1397 else
1398 kvm_inject_dabt(vcpu, fault_ipa);
1399 return 1;
1400 }
1401 }
1402
bb428921 1403 /* Synchronous External Abort? */
c9a636f2 1404 if (kvm_vcpu_abt_issea(vcpu)) {
bb428921
JM
1405 /*
1406 * For RAS the host kernel may handle this abort.
1407 * There is no need to pass the error into the guest.
1408 */
84b951a8 1409 if (kvm_handle_guest_sea(fault_ipa, kvm_vcpu_get_esr(vcpu)))
bb428921 1410 kvm_inject_vabt(vcpu);
84b951a8
WD
1411
1412 return 1;
4055710b
MZ
1413 }
1414
3a949f4c 1415 trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_esr(vcpu),
7393b599 1416 kvm_vcpu_get_hfar(vcpu), fault_ipa);
94f8e641
CD
1417
1418 /* Check the stage-2 fault is trans. fault or write fault */
35307b9a
MZ
1419 if (fault_status != FSC_FAULT && fault_status != FSC_PERM &&
1420 fault_status != FSC_ACCESS) {
0496daa5
CD
1421 kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n",
1422 kvm_vcpu_trap_get_class(vcpu),
1423 (unsigned long)kvm_vcpu_trap_get_fault(vcpu),
3a949f4c 1424 (unsigned long)kvm_vcpu_get_esr(vcpu));
94f8e641
CD
1425 return -EFAULT;
1426 }
1427
1428 idx = srcu_read_lock(&vcpu->kvm->srcu);
1429
1430 gfn = fault_ipa >> PAGE_SHIFT;
98047888
CD
1431 memslot = gfn_to_memslot(vcpu->kvm, gfn);
1432 hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable);
a7d079ce 1433 write_fault = kvm_is_write_fault(vcpu);
98047888 1434 if (kvm_is_error_hva(hva) || (write_fault && !writable)) {
022c8328
WD
1435 /*
1436 * The guest has put either its instructions or its page-tables
1437 * somewhere it shouldn't have. Userspace won't be able to do
1438 * anything about this (there's no syndrome for a start), so
1439 * re-inject the abort back into the guest.
1440 */
94f8e641 1441 if (is_iabt) {
6d674e28
MZ
1442 ret = -ENOEXEC;
1443 goto out;
94f8e641
CD
1444 }
1445
c4ad98e4 1446 if (kvm_vcpu_abt_iss1tw(vcpu)) {
022c8328
WD
1447 kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu));
1448 ret = 1;
1449 goto out_unlock;
1450 }
1451
57c841f1
MZ
1452 /*
1453 * Check for a cache maintenance operation. Since we
1454 * ended-up here, we know it is outside of any memory
1455 * slot. But we can't find out if that is for a device,
1456 * or if the guest is just being stupid. The only thing
1457 * we know for sure is that this range cannot be cached.
1458 *
1459 * So let's assume that the guest is just being
1460 * cautious, and skip the instruction.
1461 */
54dc0d24 1462 if (kvm_is_error_hva(hva) && kvm_vcpu_dabt_is_cm(vcpu)) {
cdb5e02e 1463 kvm_incr_pc(vcpu);
57c841f1
MZ
1464 ret = 1;
1465 goto out_unlock;
1466 }
1467
cfe3950c
MZ
1468 /*
1469 * The IPA is reported as [MAX:12], so we need to
1470 * complement it with the bottom 12 bits from the
1471 * faulting VA. This is always 12 bits, irrespective
1472 * of the page size.
1473 */
1474 fault_ipa |= kvm_vcpu_get_hfar(vcpu) & ((1 << 12) - 1);
74cc7e0c 1475 ret = io_mem_abort(vcpu, fault_ipa);
94f8e641
CD
1476 goto out_unlock;
1477 }
1478
c3058d5d 1479 /* Userspace should not be able to register out-of-bounds IPAs */
e55cac5b 1480 VM_BUG_ON(fault_ipa >= kvm_phys_size(vcpu->kvm));
c3058d5d 1481
aeda9130
MZ
1482 if (fault_status == FSC_ACCESS) {
1483 handle_access_fault(vcpu, fault_ipa);
1484 ret = 1;
1485 goto out_unlock;
1486 }
1487
98047888 1488 ret = user_mem_abort(vcpu, fault_ipa, memslot, hva, fault_status);
94f8e641
CD
1489 if (ret == 0)
1490 ret = 1;
6d674e28
MZ
1491out:
1492 if (ret == -ENOEXEC) {
1493 kvm_inject_pabt(vcpu, kvm_vcpu_get_hfar(vcpu));
1494 ret = 1;
1495 }
94f8e641
CD
1496out_unlock:
1497 srcu_read_unlock(&vcpu->kvm->srcu, idx);
1498 return ret;
342cd0ab
CD
1499}
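/*
 * One detail from the abort path above: the reported IPA is only [MAX:12], so
 * the bottom 12 bits are taken from the faulting VA (HFAR) before the address
 * is handed to the MMIO emulation path. A tiny sketch with made-up register
 * values:
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t fault_ipa = 0x9000000;		/* HPFAR-derived, 4KiB granular */
	uint64_t hfar = 0xffff80000abc0604;	/* faulting VA                  */

	fault_ipa |= hfar & ((1ULL << 12) - 1);	/* add the page offset          */

	printf("MMIO access at IPA %#llx\n", (unsigned long long)fault_ipa);
	return 0;
}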
1500
cd4c7183 1501bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
d5d8184d 1502{
cd4c7183 1503 if (!kvm->arch.mmu.pgt)
fcb82839 1504 return false;
d5d8184d 1505
cd4c7183
SC
1506 __unmap_stage2_range(&kvm->arch.mmu, range->start << PAGE_SHIFT,
1507 (range->end - range->start) << PAGE_SHIFT,
1508 range->may_block);
b5331379 1509
fcb82839 1510 return false;
d5d8184d
CD
1511}
1512
cd4c7183 1513bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
d5d8184d 1514{
cd4c7183
SC
1515 kvm_pfn_t pfn = pte_pfn(range->pte);
1516
063deeb1 1517 if (!kvm->arch.mmu.pgt)
fcb82839 1518 return false;
d5d8184d 1519
cd4c7183 1520 WARN_ON(range->end - range->start != 1);
d5d8184d 1521
2dbf12ae
CM
1522 /*
1523 * If the page isn't tagged, defer to user_mem_abort() for sanitising
1524 * the MTE tags. The S2 pte should have been unmapped by
1525 * mmu_notifier_invalidate_range_end().
1526 */
1527 if (kvm_has_mte(kvm) && !page_mte_tagged(pfn_to_page(pfn)))
ea7fc1bb
SP
1528 return false;
1529
cd4c7183 1530 /*
25aa2869
YW
1531 * We've moved a page around, probably through CoW, so let's treat
1532 * it just like a translation fault and the map handler will clean
1533 * the cache to the PoC.
1534 *
e9edb17a 1535 * The MMU notifiers will have unmapped a huge PMD before calling
cd4c7183 1536 * ->change_pte() (which in turn calls kvm_set_spte_gfn()) and
e9edb17a
WD
1537 * therefore we never need to clear out a huge PMD through this
1538 * calling path and a memcache is not required.
15a49a44 1539 */
cd4c7183
SC
1540 kvm_pgtable_stage2_map(kvm->arch.mmu.pgt, range->start << PAGE_SHIFT,
1541 PAGE_SIZE, __pfn_to_phys(pfn),
1542 KVM_PGTABLE_PROT_R, NULL);
1543
fcb82839 1544 return false;
d5d8184d
CD
1545}
1546
cd4c7183 1547bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
d5d8184d 1548{
cd4c7183
SC
1549 u64 size = (range->end - range->start) << PAGE_SHIFT;
1550 kvm_pte_t kpte;
1551 pte_t pte;
d5d8184d 1552
e9edb17a 1553 if (!kvm->arch.mmu.pgt)
fcb82839 1554 return false;
d5d8184d 1555
35a63966 1556 WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
cd4c7183
SC
1557
1558 kpte = kvm_pgtable_stage2_mkold(kvm->arch.mmu.pgt,
1559 range->start << PAGE_SHIFT);
ee8efad7
WD
1560 pte = __pte(kpte);
1561 return pte_valid(pte) && pte_young(pte);
35307b9a
MZ
1562}
1563
cd4c7183 1564bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
35307b9a 1565{
063deeb1 1566 if (!kvm->arch.mmu.pgt)
fcb82839 1567 return false;
501b9185 1568
cd4c7183
SC
1569 return kvm_pgtable_stage2_is_young(kvm->arch.mmu.pgt,
1570 range->start << PAGE_SHIFT);
35307b9a
MZ
1571}
1572
342cd0ab
CD
1573phys_addr_t kvm_mmu_get_httbr(void)
1574{
0f9d09b8 1575 return __pa(hyp_pgtable->pgd);
342cd0ab
CD
1576}
1577
5a677ce0
MZ
1578phys_addr_t kvm_get_idmap_vector(void)
1579{
1580 return hyp_idmap_vector;
1581}
1582
0f9d09b8 1583static int kvm_map_idmap_text(void)
0535a3e2 1584{
0f9d09b8
WD
1585 unsigned long size = hyp_idmap_end - hyp_idmap_start;
1586 int err = __create_hyp_mappings(hyp_idmap_start, size, hyp_idmap_start,
1587 PAGE_HYP_EXEC);
0535a3e2
MZ
1588 if (err)
1589 kvm_err("Failed to idmap %lx-%lx\n",
1590 hyp_idmap_start, hyp_idmap_end);
1591
1592 return err;
1593}
1594
7aef0cbc
QP
1595static void *kvm_hyp_zalloc_page(void *arg)
1596{
1597 return (void *)get_zeroed_page(GFP_KERNEL);
1598}
1599
1600static struct kvm_pgtable_mm_ops kvm_hyp_mm_ops = {
1601 .zalloc_page = kvm_hyp_zalloc_page,
1602 .get_page = kvm_host_get_page,
1603 .put_page = kvm_host_put_page,
1604 .phys_to_virt = kvm_host_va,
1605 .virt_to_phys = kvm_host_pa,
1606};
1607
bfa79a80 1608int kvm_mmu_init(u32 *hyp_va_bits)
342cd0ab 1609{
2fb41059
MZ
1610 int err;
1611
0a78791c 1612 hyp_idmap_start = __pa_symbol(__hyp_idmap_text_start);
46fef158 1613 hyp_idmap_start = ALIGN_DOWN(hyp_idmap_start, PAGE_SIZE);
0a78791c 1614 hyp_idmap_end = __pa_symbol(__hyp_idmap_text_end);
46fef158 1615 hyp_idmap_end = ALIGN(hyp_idmap_end, PAGE_SIZE);
0a78791c 1616 hyp_idmap_vector = __pa_symbol(__kvm_hyp_init);
5a677ce0 1617
06f75a1f
AB
1618 /*
1619 * We rely on the linker script to ensure at build time that the HYP
1620 * init code does not cross a page boundary.
1621 */
1622 BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK);
5a677ce0 1623
bfa79a80
QP
1624 *hyp_va_bits = 64 - ((idmap_t0sz & TCR_T0SZ_MASK) >> TCR_T0SZ_OFFSET);
1625 kvm_debug("Using %u-bit virtual addresses at EL2\n", *hyp_va_bits);
b4ef0499
MZ
1626 kvm_debug("IDMAP page: %lx\n", hyp_idmap_start);
1627 kvm_debug("HYP VA range: %lx:%lx\n",
1628 kern_hyp_va(PAGE_OFFSET),
1629 kern_hyp_va((unsigned long)high_memory - 1));
eac378a9 1630
6c41a413 1631 if (hyp_idmap_start >= kern_hyp_va(PAGE_OFFSET) &&
ed57cac8 1632 hyp_idmap_start < kern_hyp_va((unsigned long)high_memory - 1) &&
d2896d4b 1633 hyp_idmap_start != (unsigned long)__hyp_idmap_text_start) {
eac378a9
MZ
1634 /*
 1635 * The idmap page is intersecting with the VA space,
 1636 * so it is not safe to continue further.
1637 */
1638 kvm_err("IDMAP intersecting with HYP VA, unable to continue\n");
1639 err = -EINVAL;
1640 goto out;
1641 }
1642
0f9d09b8
WD
1643 hyp_pgtable = kzalloc(sizeof(*hyp_pgtable), GFP_KERNEL);
1644 if (!hyp_pgtable) {
1645 kvm_err("Hyp mode page-table not allocated\n");
2fb41059
MZ
1646 err = -ENOMEM;
1647 goto out;
1648 }
1649
bfa79a80 1650 err = kvm_pgtable_hyp_init(hyp_pgtable, *hyp_va_bits, &kvm_hyp_mm_ops);
0f9d09b8
WD
1651 if (err)
1652 goto out_free_pgtable;
d5d8184d 1653
0f9d09b8
WD
1654 err = kvm_map_idmap_text();
1655 if (err)
1656 goto out_destroy_pgtable;
5a677ce0 1657
e3f019b3 1658 io_map_base = hyp_idmap_start;
d5d8184d 1659 return 0;
0f9d09b8
WD
1660
1661out_destroy_pgtable:
1662 kvm_pgtable_hyp_destroy(hyp_pgtable);
1663out_free_pgtable:
1664 kfree(hyp_pgtable);
1665 hyp_pgtable = NULL;
2fb41059 1666out:
2fb41059 1667 return err;
342cd0ab 1668}
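/*
 * The BUG_ON() in kvm_mmu_init() above uses an XOR trick to assert that the
 * HYP init code sits inside a single page: if the first and last byte of the
 * range share all address bits above the page offset, the XOR has nothing set
 * in PAGE_MASK. A standalone restatement, assuming 4KiB pages and invented
 * addresses:
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_MASK	(~((1ULL << PAGE_SHIFT) - 1))

static bool crosses_page(uint64_t start, uint64_t end)	/* end is exclusive */
{
	return ((start ^ (end - 1)) & PAGE_MASK) != 0;
}

int main(void)
{
	printf("[0x40000100, 0x40000f00): %s\n",
	       crosses_page(0x40000100, 0x40000f00) ? "crosses" : "fits");
	printf("[0x40000100, 0x40001100): %s\n",
	       crosses_page(0x40000100, 0x40001100) ? "crosses" : "fits");
	return 0;
}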
df6ce24f
EA
1669
1670void kvm_arch_commit_memory_region(struct kvm *kvm,
9d4c197c 1671 struct kvm_memory_slot *old,
f36f3f28 1672 const struct kvm_memory_slot *new,
df6ce24f
EA
1673 enum kvm_mr_change change)
1674{
c6473555
MS
1675 /*
1676 * At this point memslot has been committed and there is an
656012c7 1677 * allocated dirty_bitmap[]; dirty pages will be tracked while the
c6473555
MS
1678 * memory slot is write protected.
1679 */
509c594c 1680 if (change != KVM_MR_DELETE && new->flags & KVM_MEM_LOG_DIRTY_PAGES) {
c862626e
KZ
1681 /*
1682 * If we're with initial-all-set, we don't need to write
1683 * protect any pages because they're all reported as dirty.
1684 * Huge pages and normal pages will be write protect gradually.
1685 */
1686 if (!kvm_dirty_log_manual_protect_and_init_set(kvm)) {
509c594c 1687 kvm_mmu_wp_memory_region(kvm, new->id);
c862626e
KZ
1688 }
1689 }
df6ce24f
EA
1690}
1691
1692int kvm_arch_prepare_memory_region(struct kvm *kvm,
537a17b3
SC
1693 const struct kvm_memory_slot *old,
1694 struct kvm_memory_slot *new,
df6ce24f
EA
1695 enum kvm_mr_change change)
1696{
509c594c 1697 hva_t hva, reg_end;
8eef9123
AB
1698 int ret = 0;
1699
15a49a44
MS
1700 if (change != KVM_MR_CREATE && change != KVM_MR_MOVE &&
1701 change != KVM_MR_FLAGS_ONLY)
8eef9123
AB
1702 return 0;
1703
c3058d5d
CD
1704 /*
1705 * Prevent userspace from creating a memory region outside of the IPA
1706 * space addressable by the KVM guest IPA space.
1707 */
537a17b3 1708 if ((new->base_gfn + new->npages) > (kvm_phys_size(kvm) >> PAGE_SHIFT))
c3058d5d
CD
1709 return -EFAULT;
1710
509c594c
SC
1711 hva = new->userspace_addr;
1712 reg_end = hva + (new->npages << PAGE_SHIFT);
1713
89154dd5 1714 mmap_read_lock(current->mm);
8eef9123
AB
1715 /*
1716 * A memory region could potentially cover multiple VMAs, and any holes
fd6f17ba 1717 * between them, so iterate over all of them.
8eef9123
AB
1718 *
1719 * +--------------------------------------------+
1720 * +---------------+----------------+ +----------------+
1721 * | : VMA 1 | VMA 2 | | VMA 3 : |
1722 * +---------------+----------------+ +----------------+
1723 * | memory region |
1724 * +--------------------------------------------+
1725 */
1726 do {
c728fd4c 1727 struct vm_area_struct *vma;
8eef9123 1728
c728fd4c
GS
1729 vma = find_vma_intersection(current->mm, hva, reg_end);
1730 if (!vma)
8eef9123
AB
1731 break;
1732
ea7fc1bb
SP
1733 /*
1734 * VM_SHARED mappings are not allowed with MTE to avoid races
1735 * when updating the PG_mte_tagged page flag, see
1736 * sanitise_mte_tags for more details.
1737 */
6e6a8ef0
QP
1738 if (kvm_has_mte(kvm) && vma->vm_flags & VM_SHARED) {
1739 ret = -EINVAL;
1740 break;
1741 }
ea7fc1bb 1742
8eef9123 1743 if (vma->vm_flags & VM_PFNMAP) {
15a49a44 1744 /* IO region dirty page logging not allowed */
537a17b3 1745 if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) {
72f31048 1746 ret = -EINVAL;
8eef9123 1747 break;
fd6f17ba 1748 }
8eef9123 1749 }
fd6f17ba 1750 hva = min(reg_end, vma->vm_end);
8eef9123
AB
1751 } while (hva < reg_end);
1752
89154dd5 1753 mmap_read_unlock(current->mm);
8eef9123 1754 return ret;
df6ce24f
EA
1755}
1756
e96c81ee 1757void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
df6ce24f
EA
1758{
1759}
1760
15248258 1761void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
df6ce24f
EA
1762{
1763}
1764
1765void kvm_arch_flush_shadow_all(struct kvm *kvm)
1766{
a0e50aa3 1767 kvm_free_stage2_pgd(&kvm->arch.mmu);
df6ce24f
EA
1768}
1769
1770void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
1771 struct kvm_memory_slot *slot)
1772{
8eef9123
AB
1773 gpa_t gpa = slot->base_gfn << PAGE_SHIFT;
1774 phys_addr_t size = slot->npages << PAGE_SHIFT;
1775
fcc5bf89 1776 write_lock(&kvm->mmu_lock);
a0e50aa3 1777 unmap_stage2_range(&kvm->arch.mmu, gpa, size);
fcc5bf89 1778 write_unlock(&kvm->mmu_lock);
df6ce24f 1779}
3c1e7165
MZ
1780
1781/*
1782 * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized).
1783 *
1784 * Main problems:
1785 * - S/W ops are local to a CPU (not broadcast)
1786 * - We have line migration behind our back (speculation)
1787 * - System caches don't support S/W at all (damn!)
1788 *
1789 * In the face of the above, the best we can do is to try and convert
1790 * S/W ops to VA ops. Because the guest is not allowed to infer the
1791 * S/W to PA mapping, it can only use S/W to nuke the whole cache,
1792 * which is a rather good thing for us.
1793 *
1794 * Also, it is only used when turning caches on/off ("The expected
1795 * usage of the cache maintenance instructions that operate by set/way
1796 * is associated with the cache maintenance instructions associated
1797 * with the powerdown and powerup of caches, if this is required by
1798 * the implementation.").
1799 *
1800 * We use the following policy:
1801 *
1802 * - If we trap a S/W operation, we enable VM trapping to detect
1803 * caches being turned on/off, and do a full clean.
1804 *
1805 * - We flush the caches on both caches being turned on and off.
1806 *
1807 * - Once the caches are enabled, we stop trapping VM ops.
1808 */
1809void kvm_set_way_flush(struct kvm_vcpu *vcpu)
1810{
3df59d8d 1811 unsigned long hcr = *vcpu_hcr(vcpu);
3c1e7165
MZ
1812
1813 /*
1814 * If this is the first time we do a S/W operation
1815 * (i.e. HCR_TVM not set) flush the whole memory, and set the
1816 * VM trapping.
1817 *
1818 * Otherwise, rely on the VM trapping to wait for the MMU +
1819 * Caches to be turned off. At that point, we'll be able to
1820 * clean the caches again.
1821 */
1822 if (!(hcr & HCR_TVM)) {
1823 trace_kvm_set_way_flush(*vcpu_pc(vcpu),
1824 vcpu_has_cache_enabled(vcpu));
1825 stage2_flush_vm(vcpu->kvm);
3df59d8d 1826 *vcpu_hcr(vcpu) = hcr | HCR_TVM;
3c1e7165
MZ
1827 }
1828}
1829
1830void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled)
1831{
1832 bool now_enabled = vcpu_has_cache_enabled(vcpu);
1833
1834 /*
1835 * If switching the MMU+caches on, need to invalidate the caches.
1836 * If switching it off, need to clean the caches.
1837 * Clean + invalidate does the trick always.
1838 */
1839 if (now_enabled != was_enabled)
1840 stage2_flush_vm(vcpu->kvm);
1841
1842 /* Caches are now on, stop trapping VM ops (until a S/W op) */
1843 if (now_enabled)
3df59d8d 1844 *vcpu_hcr(vcpu) &= ~HCR_TVM;
3c1e7165
MZ
1845
1846 trace_kvm_toggle_cache(*vcpu_pc(vcpu), was_enabled, now_enabled);
1847}