KVM: arm64: Add support for stage-2 cache flushing in generic page-table
arch/arm64/kvm/mmu.c
d94d71cb 1// SPDX-License-Identifier: GPL-2.0-only
749cf76c
CD
2/*
3 * Copyright (C) 2012 - Virtual Open Systems and Columbia University
4 * Author: Christoffer Dall <c.dall@virtualopensystems.com>
749cf76c 5 */
342cd0ab
CD
6
7#include <linux/mman.h>
8#include <linux/kvm_host.h>
9#include <linux/io.h>
ad361f09 10#include <linux/hugetlb.h>
196f878a 11#include <linux/sched/signal.h>
45e96ea6 12#include <trace/events/kvm.h>
342cd0ab 13#include <asm/pgalloc.h>
94f8e641 14#include <asm/cacheflush.h>
342cd0ab
CD
15#include <asm/kvm_arm.h>
16#include <asm/kvm_mmu.h>
0f9d09b8 17#include <asm/kvm_pgtable.h>
0db5e022 18#include <asm/kvm_ras.h>
d5d8184d 19#include <asm/kvm_asm.h>
94f8e641 20#include <asm/kvm_emulate.h>
1e947bad 21#include <asm/virt.h>
d5d8184d
CD
22
23#include "trace.h"
342cd0ab 24
0f9d09b8 25static struct kvm_pgtable *hyp_pgtable;
342cd0ab
CD
26static DEFINE_MUTEX(kvm_hyp_pgd_mutex);
27
5a677ce0
MZ
28static unsigned long hyp_idmap_start;
29static unsigned long hyp_idmap_end;
30static phys_addr_t hyp_idmap_vector;
31
e3f019b3
MZ
32static unsigned long io_map_base;
33
15a49a44
MS
34#define KVM_S2PTE_FLAG_IS_IOMAP (1UL << 0)
35#define KVM_S2_FLAG_LOGGING_ACTIVE (1UL << 1)
36
6d674e28
MZ
37static bool is_iomap(unsigned long flags)
38{
39 return flags & KVM_S2PTE_FLAG_IS_IOMAP;
40}
41
52bae936
WD
42/*
43 * Release kvm_mmu_lock periodically if the memory region is large. Otherwise,
44 * we may see kernel panics with CONFIG_DETECT_HUNG_TASK,
 45 * CONFIG_LOCKUP_DETECTOR, CONFIG_LOCKDEP. Additionally, holding the lock too
 46 * long will also starve other vCPUs. We also have to make sure that the page
 47 * tables are not freed while we release the lock.
48 */
49static int stage2_apply_range(struct kvm *kvm, phys_addr_t addr,
50 phys_addr_t end,
51 int (*fn)(struct kvm_pgtable *, u64, u64),
52 bool resched)
53{
54 int ret;
55 u64 next;
56
57 do {
58 struct kvm_pgtable *pgt = kvm->arch.mmu.pgt;
59 if (!pgt)
60 return -EINVAL;
61
62 next = stage2_pgd_addr_end(kvm, addr, end);
63 ret = fn(pgt, addr, next - addr);
64 if (ret)
65 break;
66
67 if (resched && next != end)
68 cond_resched_lock(&kvm->mmu_lock);
69 } while (addr = next, addr != end);
70
71 return ret;
72}
73
cc38d61c
QP
74#define stage2_apply_range_resched(kvm, addr, end, fn) \
75 stage2_apply_range(kvm, addr, end, fn, true)
76
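/*
 * Callers that need to walk a potentially large IPA range are expected to
 * go through the resched variant, handing it one of the generic page-table
 * operations as the callback. A minimal sketch (assuming the generic
 * kvm_pgtable_stage2_flush() helper for cache maintenance):
 *
 *	stage2_apply_range_resched(kvm, addr, end, kvm_pgtable_stage2_flush);
 *
 * The write-protection path below uses the same pattern with
 * kvm_pgtable_stage2_wrprotect().
 */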
15a49a44
MS
77static bool memslot_is_logging(struct kvm_memory_slot *memslot)
78{
15a49a44 79 return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY);
7276030a
MS
80}
81
82/**
83 * kvm_flush_remote_tlbs() - flush all VM TLB entries for v7/8
84 * @kvm: pointer to kvm structure.
85 *
86 * Interface to HYP function to flush all VM TLB entries
87 */
88void kvm_flush_remote_tlbs(struct kvm *kvm)
89{
a0e50aa3 90 kvm_call_hyp(__kvm_tlb_flush_vmid, &kvm->arch.mmu);
15a49a44 91}
ad361f09 92
efaa5b93
MZ
93static void kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, phys_addr_t ipa,
94 int level)
d5d8184d 95{
efaa5b93 96 kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ipa, level);
d5d8184d
CD
97}
98
363ef89f
MZ
99/*
100 * D-Cache management functions. They take the page table entries by
101 * value, as they are flushing the cache using the kernel mapping (or
102 * kmap on 32bit).
103 */
104static void kvm_flush_dcache_pte(pte_t pte)
105{
106 __kvm_flush_dcache_pte(pte);
107}
108
109static void kvm_flush_dcache_pmd(pmd_t pmd)
110{
111 __kvm_flush_dcache_pmd(pmd);
112}
113
114static void kvm_flush_dcache_pud(pud_t pud)
115{
116 __kvm_flush_dcache_pud(pud);
117}
118
e6fab544
AB
119static bool kvm_is_device_pfn(unsigned long pfn)
120{
121 return !pfn_valid(pfn);
122}
123
15a49a44
MS
124/**
125 * stage2_dissolve_pmd() - clear and flush huge PMD entry
a0e50aa3 126 * @mmu: pointer to mmu structure to operate on
15a49a44
MS
127 * @addr: IPA
128 * @pmd: pmd pointer for IPA
129 *
8324c3d5 130 * Function clears a PMD entry and flushes the stage-1 and stage-2 TLBs for addr.
15a49a44 131 */
a0e50aa3 132static void stage2_dissolve_pmd(struct kvm_s2_mmu *mmu, phys_addr_t addr, pmd_t *pmd)
15a49a44 133{
bbb3b6b3 134 if (!pmd_thp_or_huge(*pmd))
15a49a44
MS
135 return;
136
137 pmd_clear(pmd);
efaa5b93 138 kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PMD_LEVEL);
15a49a44
MS
139 put_page(virt_to_page(pmd));
140}
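/*
 * Dissolving huge entries is only needed on the dirty-logging path (see
 * stage2_set_pte()): the block mapping is torn down so that the 4K pages
 * underneath it can be write-protected and tracked individually.
 */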
141
b8e0ba7c
PA
142/**
143 * stage2_dissolve_pud() - clear and flush huge PUD entry
a0e50aa3 144 * @mmu: pointer to mmu structure to operate on
b8e0ba7c
PA
145 * @addr: IPA
146 * @pud: pud pointer for IPA
147 *
8324c3d5 148 * Function clears a PUD entry and flushes the stage-1 and stage-2 TLBs for addr.
b8e0ba7c 149 */
a0e50aa3 150static void stage2_dissolve_pud(struct kvm_s2_mmu *mmu, phys_addr_t addr, pud_t *pudp)
b8e0ba7c 151{
a0e50aa3
CD
152 struct kvm *kvm = mmu->kvm;
153
b8e0ba7c
PA
154 if (!stage2_pud_huge(kvm, *pudp))
155 return;
156
157 stage2_pud_clear(kvm, pudp);
efaa5b93 158 kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PUD_LEVEL);
b8e0ba7c
PA
159 put_page(virt_to_page(pudp));
160}
161
a0e50aa3 162static void clear_stage2_pgd_entry(struct kvm_s2_mmu *mmu, pgd_t *pgd, phys_addr_t addr)
979acd5e 163{
a0e50aa3 164 struct kvm *kvm = mmu->kvm;
e9f63768 165 p4d_t *p4d_table __maybe_unused = stage2_p4d_offset(kvm, pgd, 0UL);
e55cac5b 166 stage2_pgd_clear(kvm, pgd);
efaa5b93 167 kvm_tlb_flush_vmid_ipa(mmu, addr, S2_NO_LEVEL_HINT);
e9f63768 168 stage2_p4d_free(kvm, p4d_table);
4f853a71 169 put_page(virt_to_page(pgd));
979acd5e
MZ
170}
171
a0e50aa3 172static void clear_stage2_p4d_entry(struct kvm_s2_mmu *mmu, p4d_t *p4d, phys_addr_t addr)
e9f63768 173{
a0e50aa3 174 struct kvm *kvm = mmu->kvm;
e9f63768
MR
175 pud_t *pud_table __maybe_unused = stage2_pud_offset(kvm, p4d, 0);
176 stage2_p4d_clear(kvm, p4d);
efaa5b93 177 kvm_tlb_flush_vmid_ipa(mmu, addr, S2_NO_LEVEL_HINT);
e9f63768
MR
178 stage2_pud_free(kvm, pud_table);
179 put_page(virt_to_page(p4d));
180}
181
a0e50aa3 182static void clear_stage2_pud_entry(struct kvm_s2_mmu *mmu, pud_t *pud, phys_addr_t addr)
342cd0ab 183{
a0e50aa3 184 struct kvm *kvm = mmu->kvm;
e55cac5b 185 pmd_t *pmd_table __maybe_unused = stage2_pmd_offset(kvm, pud, 0);
a0e50aa3 186
e55cac5b
SP
187 VM_BUG_ON(stage2_pud_huge(kvm, *pud));
188 stage2_pud_clear(kvm, pud);
efaa5b93 189 kvm_tlb_flush_vmid_ipa(mmu, addr, S2_NO_LEVEL_HINT);
e55cac5b 190 stage2_pmd_free(kvm, pmd_table);
4f728276
MZ
191 put_page(virt_to_page(pud));
192}
342cd0ab 193
a0e50aa3 194static void clear_stage2_pmd_entry(struct kvm_s2_mmu *mmu, pmd_t *pmd, phys_addr_t addr)
4f728276 195{
4f853a71 196 pte_t *pte_table = pte_offset_kernel(pmd, 0);
bbb3b6b3 197 VM_BUG_ON(pmd_thp_or_huge(*pmd));
4f853a71 198 pmd_clear(pmd);
efaa5b93 199 kvm_tlb_flush_vmid_ipa(mmu, addr, S2_NO_LEVEL_HINT);
14b94d07 200 free_page((unsigned long)pte_table);
4f728276
MZ
201 put_page(virt_to_page(pmd));
202}
203
88dc25e8
MZ
204static inline void kvm_set_pte(pte_t *ptep, pte_t new_pte)
205{
206 WRITE_ONCE(*ptep, new_pte);
207 dsb(ishst);
208}
209
210static inline void kvm_set_pmd(pmd_t *pmdp, pmd_t new_pmd)
211{
212 WRITE_ONCE(*pmdp, new_pmd);
213 dsb(ishst);
214}
215
0db9dd8a
MZ
216static inline void kvm_pmd_populate(pmd_t *pmdp, pte_t *ptep)
217{
218 kvm_set_pmd(pmdp, kvm_mk_pmd(ptep));
219}
220
221static inline void kvm_pud_populate(pud_t *pudp, pmd_t *pmdp)
222{
223 WRITE_ONCE(*pudp, kvm_mk_pud(pmdp));
224 dsb(ishst);
225}
226
e9f63768 227static inline void kvm_p4d_populate(p4d_t *p4dp, pud_t *pudp)
0db9dd8a 228{
e9f63768 229 WRITE_ONCE(*p4dp, kvm_mk_p4d(pudp));
0db9dd8a
MZ
230 dsb(ishst);
231}
232
e9f63768
MR
233static inline void kvm_pgd_populate(pgd_t *pgdp, p4d_t *p4dp)
234{
235#ifndef __PAGETABLE_P4D_FOLDED
236 WRITE_ONCE(*pgdp, kvm_mk_pgd(p4dp));
237 dsb(ishst);
238#endif
239}
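/*
 * The dsb(ishst) in the helpers above ensures that the new table entry is
 * visible to the page-table walker (and to the other CPUs in the Inner
 * Shareable domain) before any subsequent TLB maintenance or translation
 * depends on it.
 */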
240
363ef89f
MZ
241/*
242 * Unmapping vs dcache management:
243 *
244 * If a guest maps certain memory pages as uncached, all writes will
245 * bypass the data cache and go directly to RAM. However, the CPUs
246 * can still speculate reads (not writes) and fill cache lines with
247 * data.
248 *
249 * Those cache lines will be *clean* cache lines though, so a
250 * clean+invalidate operation is equivalent to an invalidate
251 * operation, because no cache lines are marked dirty.
252 *
253 * Those clean cache lines could be filled prior to an uncached write
254 * by the guest, and the cache coherent IO subsystem would therefore
255 * end up writing old data to disk.
256 *
257 * This is why right after unmapping a page/section and invalidating
52bae936
WD
258 * the corresponding TLBs, we flush to make sure the IO subsystem will
259 * never hit in the cache.
e48d53a9
MZ
260 *
261 * This is all avoided on systems that have ARM64_HAS_STAGE2_FWB, as
262 * we then fully enforce cacheability of RAM, no matter what the guest
263 * does.
363ef89f 264 */
a0e50aa3 265static void unmap_stage2_ptes(struct kvm_s2_mmu *mmu, pmd_t *pmd,
4f853a71 266 phys_addr_t addr, phys_addr_t end)
4f728276 267{
4f853a71
CD
268 phys_addr_t start_addr = addr;
269 pte_t *pte, *start_pte;
270
271 start_pte = pte = pte_offset_kernel(pmd, addr);
272 do {
273 if (!pte_none(*pte)) {
363ef89f
MZ
274 pte_t old_pte = *pte;
275
4f853a71 276 kvm_set_pte(pte, __pte(0));
efaa5b93 277 kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PTE_LEVEL);
363ef89f
MZ
278
279 /* No need to invalidate the cache for device mappings */
0de58f85 280 if (!kvm_is_device_pfn(pte_pfn(old_pte)))
363ef89f
MZ
281 kvm_flush_dcache_pte(old_pte);
282
283 put_page(virt_to_page(pte));
4f853a71
CD
284 }
285 } while (pte++, addr += PAGE_SIZE, addr != end);
286
a0e50aa3
CD
287 if (stage2_pte_table_empty(mmu->kvm, start_pte))
288 clear_stage2_pmd_entry(mmu, pmd, start_addr);
342cd0ab
CD
289}
290
a0e50aa3 291static void unmap_stage2_pmds(struct kvm_s2_mmu *mmu, pud_t *pud,
4f853a71 292 phys_addr_t addr, phys_addr_t end)
000d3996 293{
a0e50aa3 294 struct kvm *kvm = mmu->kvm;
4f853a71
CD
295 phys_addr_t next, start_addr = addr;
296 pmd_t *pmd, *start_pmd;
000d3996 297
e55cac5b 298 start_pmd = pmd = stage2_pmd_offset(kvm, pud, addr);
4f853a71 299 do {
e55cac5b 300 next = stage2_pmd_addr_end(kvm, addr, end);
4f853a71 301 if (!pmd_none(*pmd)) {
bbb3b6b3 302 if (pmd_thp_or_huge(*pmd)) {
363ef89f
MZ
303 pmd_t old_pmd = *pmd;
304
4f853a71 305 pmd_clear(pmd);
efaa5b93 306 kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PMD_LEVEL);
363ef89f
MZ
307
308 kvm_flush_dcache_pmd(old_pmd);
309
4f853a71
CD
310 put_page(virt_to_page(pmd));
311 } else {
a0e50aa3 312 unmap_stage2_ptes(mmu, pmd, addr, next);
4f853a71 313 }
ad361f09 314 }
4f853a71 315 } while (pmd++, addr = next, addr != end);
ad361f09 316
e55cac5b 317 if (stage2_pmd_table_empty(kvm, start_pmd))
a0e50aa3 318 clear_stage2_pud_entry(mmu, pud, start_addr);
4f853a71 319}
000d3996 320
a0e50aa3 321static void unmap_stage2_puds(struct kvm_s2_mmu *mmu, p4d_t *p4d,
4f853a71
CD
322 phys_addr_t addr, phys_addr_t end)
323{
a0e50aa3 324 struct kvm *kvm = mmu->kvm;
4f853a71
CD
325 phys_addr_t next, start_addr = addr;
326 pud_t *pud, *start_pud;
4f728276 327
e9f63768 328 start_pud = pud = stage2_pud_offset(kvm, p4d, addr);
4f853a71 329 do {
e55cac5b
SP
330 next = stage2_pud_addr_end(kvm, addr, end);
331 if (!stage2_pud_none(kvm, *pud)) {
332 if (stage2_pud_huge(kvm, *pud)) {
363ef89f
MZ
333 pud_t old_pud = *pud;
334
e55cac5b 335 stage2_pud_clear(kvm, pud);
efaa5b93 336 kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PUD_LEVEL);
363ef89f 337 kvm_flush_dcache_pud(old_pud);
4f853a71
CD
338 put_page(virt_to_page(pud));
339 } else {
a0e50aa3 340 unmap_stage2_pmds(mmu, pud, addr, next);
4f728276
MZ
341 }
342 }
4f853a71 343 } while (pud++, addr = next, addr != end);
4f728276 344
e55cac5b 345 if (stage2_pud_table_empty(kvm, start_pud))
a0e50aa3 346 clear_stage2_p4d_entry(mmu, p4d, start_addr);
e9f63768
MR
347}
348
a0e50aa3 349static void unmap_stage2_p4ds(struct kvm_s2_mmu *mmu, pgd_t *pgd,
e9f63768
MR
350 phys_addr_t addr, phys_addr_t end)
351{
a0e50aa3 352 struct kvm *kvm = mmu->kvm;
e9f63768
MR
353 phys_addr_t next, start_addr = addr;
354 p4d_t *p4d, *start_p4d;
355
356 start_p4d = p4d = stage2_p4d_offset(kvm, pgd, addr);
357 do {
358 next = stage2_p4d_addr_end(kvm, addr, end);
359 if (!stage2_p4d_none(kvm, *p4d))
a0e50aa3 360 unmap_stage2_puds(mmu, p4d, addr, next);
e9f63768
MR
361 } while (p4d++, addr = next, addr != end);
362
363 if (stage2_p4d_table_empty(kvm, start_p4d))
a0e50aa3 364 clear_stage2_pgd_entry(mmu, pgd, start_addr);
4f853a71
CD
365}
366
7a1c831e
SP
367/**
 368 * __unmap_stage2_range -- Clear stage2 page table entries to unmap a range
 369 * @mmu: The KVM stage-2 MMU pointer
 370 * @start: The intermediate physical base address of the range to unmap
 371 * @size: The size of the area to unmap
 * @may_block: Whether or not we are permitted to block
372 *
373 * Clear a range of stage-2 mappings, lowering the various ref-counts. Must
374 * be called while holding mmu_lock (unless for freeing the stage2 pgd before
375 * destroying the VM), otherwise another faulting VCPU may come in and mess
376 * with things behind our backs.
377 */
b5331379
WD
378static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size,
379 bool may_block)
4f853a71 380{
a0e50aa3 381 struct kvm *kvm = mmu->kvm;
52bae936 382 phys_addr_t end = start + size;
4f853a71 383
8b3405e3 384 assert_spin_locked(&kvm->mmu_lock);
47a91b72 385 WARN_ON(size & ~PAGE_MASK);
52bae936
WD
386 WARN_ON(stage2_apply_range(kvm, start, end, kvm_pgtable_stage2_unmap,
387 may_block));
000d3996
MZ
388}
389
b5331379
WD
390static void unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size)
391{
392 __unmap_stage2_range(mmu, start, size, true);
393}
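/*
 * Typical usage (see stage2_unmap_memslot() and the retry paths in
 * stage2_set_pmd_huge()/stage2_set_pud_huge()): the caller holds
 * kvm->mmu_lock around the unmap, e.g.
 *
 *	spin_lock(&kvm->mmu_lock);
 *	unmap_stage2_range(&kvm->arch.mmu, gpa, size);
 *	spin_unlock(&kvm->mmu_lock);
 */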
394
a0e50aa3 395static void stage2_flush_ptes(struct kvm_s2_mmu *mmu, pmd_t *pmd,
9d218a1f
MZ
396 phys_addr_t addr, phys_addr_t end)
397{
398 pte_t *pte;
399
400 pte = pte_offset_kernel(pmd, addr);
401 do {
0de58f85 402 if (!pte_none(*pte) && !kvm_is_device_pfn(pte_pfn(*pte)))
363ef89f 403 kvm_flush_dcache_pte(*pte);
9d218a1f
MZ
404 } while (pte++, addr += PAGE_SIZE, addr != end);
405}
406
a0e50aa3 407static void stage2_flush_pmds(struct kvm_s2_mmu *mmu, pud_t *pud,
9d218a1f
MZ
408 phys_addr_t addr, phys_addr_t end)
409{
a0e50aa3 410 struct kvm *kvm = mmu->kvm;
9d218a1f
MZ
411 pmd_t *pmd;
412 phys_addr_t next;
413
e55cac5b 414 pmd = stage2_pmd_offset(kvm, pud, addr);
9d218a1f 415 do {
e55cac5b 416 next = stage2_pmd_addr_end(kvm, addr, end);
9d218a1f 417 if (!pmd_none(*pmd)) {
bbb3b6b3 418 if (pmd_thp_or_huge(*pmd))
363ef89f
MZ
419 kvm_flush_dcache_pmd(*pmd);
420 else
a0e50aa3 421 stage2_flush_ptes(mmu, pmd, addr, next);
9d218a1f
MZ
422 }
423 } while (pmd++, addr = next, addr != end);
424}
425
a0e50aa3 426static void stage2_flush_puds(struct kvm_s2_mmu *mmu, p4d_t *p4d,
9d218a1f
MZ
427 phys_addr_t addr, phys_addr_t end)
428{
a0e50aa3 429 struct kvm *kvm = mmu->kvm;
9d218a1f
MZ
430 pud_t *pud;
431 phys_addr_t next;
432
e9f63768 433 pud = stage2_pud_offset(kvm, p4d, addr);
9d218a1f 434 do {
e55cac5b
SP
435 next = stage2_pud_addr_end(kvm, addr, end);
436 if (!stage2_pud_none(kvm, *pud)) {
437 if (stage2_pud_huge(kvm, *pud))
363ef89f
MZ
438 kvm_flush_dcache_pud(*pud);
439 else
a0e50aa3 440 stage2_flush_pmds(mmu, pud, addr, next);
9d218a1f
MZ
441 }
442 } while (pud++, addr = next, addr != end);
443}
444
a0e50aa3 445static void stage2_flush_p4ds(struct kvm_s2_mmu *mmu, pgd_t *pgd,
e9f63768
MR
446 phys_addr_t addr, phys_addr_t end)
447{
a0e50aa3 448 struct kvm *kvm = mmu->kvm;
e9f63768
MR
449 p4d_t *p4d;
450 phys_addr_t next;
451
452 p4d = stage2_p4d_offset(kvm, pgd, addr);
453 do {
454 next = stage2_p4d_addr_end(kvm, addr, end);
455 if (!stage2_p4d_none(kvm, *p4d))
a0e50aa3 456 stage2_flush_puds(mmu, p4d, addr, next);
e9f63768
MR
457 } while (p4d++, addr = next, addr != end);
458}
459
9d218a1f
MZ
460static void stage2_flush_memslot(struct kvm *kvm,
461 struct kvm_memory_slot *memslot)
462{
a0e50aa3 463 struct kvm_s2_mmu *mmu = &kvm->arch.mmu;
9d218a1f
MZ
464 phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
465 phys_addr_t end = addr + PAGE_SIZE * memslot->npages;
466 phys_addr_t next;
467 pgd_t *pgd;
468
a0e50aa3 469 pgd = mmu->pgd + stage2_pgd_index(kvm, addr);
9d218a1f 470 do {
e55cac5b
SP
471 next = stage2_pgd_addr_end(kvm, addr, end);
472 if (!stage2_pgd_none(kvm, *pgd))
a0e50aa3 473 stage2_flush_p4ds(mmu, pgd, addr, next);
48c963e3
JY
474
475 if (next != end)
476 cond_resched_lock(&kvm->mmu_lock);
9d218a1f
MZ
477 } while (pgd++, addr = next, addr != end);
478}
479
480/**
481 * stage2_flush_vm - Invalidate cache for pages mapped in stage 2
482 * @kvm: The struct kvm pointer
483 *
484 * Go through the stage 2 page tables and invalidate any cache lines
485 * backing memory already mapped to the VM.
486 */
3c1e7165 487static void stage2_flush_vm(struct kvm *kvm)
9d218a1f
MZ
488{
489 struct kvm_memslots *slots;
490 struct kvm_memory_slot *memslot;
491 int idx;
492
493 idx = srcu_read_lock(&kvm->srcu);
494 spin_lock(&kvm->mmu_lock);
495
496 slots = kvm_memslots(kvm);
497 kvm_for_each_memslot(memslot, slots)
498 stage2_flush_memslot(kvm, memslot);
499
500 spin_unlock(&kvm->mmu_lock);
501 srcu_read_unlock(&kvm->srcu, idx);
502}
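/*
 * This is typically driven by the guest toggling its caches on: any clean
 * lines speculatively allocated while the guest was running uncached must
 * be invalidated so that the guest does not observe stale data once
 * caching is enabled.
 */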
503
342cd0ab 504/**
4f728276 505 * free_hyp_pgds - free Hyp-mode page tables
342cd0ab 506 */
4f728276 507void free_hyp_pgds(void)
342cd0ab 508{
d157f4a5 509 mutex_lock(&kvm_hyp_pgd_mutex);
0f9d09b8
WD
510 if (hyp_pgtable) {
511 kvm_pgtable_hyp_destroy(hyp_pgtable);
512 kfree(hyp_pgtable);
26781f9c 513 }
342cd0ab
CD
514 mutex_unlock(&kvm_hyp_pgd_mutex);
515}
516
0f9d09b8
WD
517static int __create_hyp_mappings(unsigned long start, unsigned long size,
518 unsigned long phys, enum kvm_pgtable_prot prot)
342cd0ab 519{
0f9d09b8 520 int err;
342cd0ab 521
342cd0ab 522 mutex_lock(&kvm_hyp_pgd_mutex);
0f9d09b8 523 err = kvm_pgtable_hyp_map(hyp_pgtable, start, size, phys, prot);
342cd0ab 524 mutex_unlock(&kvm_hyp_pgd_mutex);
0f9d09b8 525
342cd0ab
CD
526 return err;
527}
528
40c2729b
CD
529static phys_addr_t kvm_kaddr_to_phys(void *kaddr)
530{
531 if (!is_vmalloc_addr(kaddr)) {
532 BUG_ON(!virt_addr_valid(kaddr));
533 return __pa(kaddr);
534 } else {
535 return page_to_phys(vmalloc_to_page(kaddr)) +
536 offset_in_page(kaddr);
537 }
538}
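/*
 * Addresses in the kernel linear map can be translated directly with
 * __pa(); vmalloc/module addresses are only reachable via their page
 * tables, hence the vmalloc_to_page() walk above.
 */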
539
342cd0ab 540/**
06e8c3b0 541 * create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode
342cd0ab
CD
542 * @from: The virtual kernel start address of the range
543 * @to: The virtual kernel end address of the range (exclusive)
c8dddecd 544 * @prot: The protection to be applied to this range
342cd0ab 545 *
06e8c3b0
MZ
546 * The same virtual address as the kernel virtual address is also used
547 * in Hyp-mode mapping (modulo HYP_PAGE_OFFSET) to the same underlying
548 * physical pages.
342cd0ab 549 */
0f9d09b8 550int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot)
342cd0ab 551{
40c2729b
CD
552 phys_addr_t phys_addr;
553 unsigned long virt_addr;
6c41a413
MZ
554 unsigned long start = kern_hyp_va((unsigned long)from);
555 unsigned long end = kern_hyp_va((unsigned long)to);
6060df84 556
1e947bad
MZ
557 if (is_kernel_in_hyp_mode())
558 return 0;
559
40c2729b
CD
560 start = start & PAGE_MASK;
561 end = PAGE_ALIGN(end);
6060df84 562
40c2729b
CD
563 for (virt_addr = start; virt_addr < end; virt_addr += PAGE_SIZE) {
564 int err;
6060df84 565
40c2729b 566 phys_addr = kvm_kaddr_to_phys(from + virt_addr - start);
0f9d09b8 567 err = __create_hyp_mappings(virt_addr, PAGE_SIZE, phys_addr,
c8dddecd 568 prot);
40c2729b
CD
569 if (err)
570 return err;
571 }
572
573 return 0;
342cd0ab
CD
574}
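/*
 * This is used at init time to mirror into EL2 the parts of the kernel
 * that hyp needs. An illustrative call (the section symbols are those
 * used by the hyp init code):
 *
 *	err = create_hyp_mappings(kvm_ksym_ref(__hyp_text_start),
 *				  kvm_ksym_ref(__hyp_text_end),
 *				  PAGE_HYP_EXEC);
 */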
575
dc2e4633 576static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size,
0f9d09b8
WD
577 unsigned long *haddr,
578 enum kvm_pgtable_prot prot)
342cd0ab 579{
e3f019b3
MZ
580 unsigned long base;
581 int ret = 0;
6060df84 582
e3f019b3 583 mutex_lock(&kvm_hyp_pgd_mutex);
6060df84 584
e3f019b3 585 /*
656012c7 586 * This assumes that we have enough space below the idmap
e3f019b3
MZ
587 * page to allocate our VAs. If not, the check below will
 588 * kick in. A potential alternative would be to detect that
589 * overflow and switch to an allocation above the idmap.
590 *
591 * The allocated size is always a multiple of PAGE_SIZE.
592 */
593 size = PAGE_ALIGN(size + offset_in_page(phys_addr));
594 base = io_map_base - size;
1bb32a44 595
e3f019b3
MZ
596 /*
597 * Verify that BIT(VA_BITS - 1) hasn't been flipped by
598 * allocating the new area, as it would indicate we've
599 * overflowed the idmap/IO address range.
600 */
601 if ((base ^ io_map_base) & BIT(VA_BITS - 1))
602 ret = -ENOMEM;
603 else
604 io_map_base = base;
605
606 mutex_unlock(&kvm_hyp_pgd_mutex);
607
608 if (ret)
609 goto out;
610
0f9d09b8 611 ret = __create_hyp_mappings(base, size, phys_addr, prot);
e3f019b3
MZ
612 if (ret)
613 goto out;
614
dc2e4633 615 *haddr = base + offset_in_page(phys_addr);
e3f019b3 616out:
dc2e4633
MZ
617 return ret;
618}
619
620/**
621 * create_hyp_io_mappings - Map IO into both kernel and HYP
622 * @phys_addr: The physical start address which gets mapped
623 * @size: Size of the region being mapped
624 * @kaddr: Kernel VA for this mapping
625 * @haddr: HYP VA for this mapping
626 */
627int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size,
628 void __iomem **kaddr,
629 void __iomem **haddr)
630{
631 unsigned long addr;
632 int ret;
633
634 *kaddr = ioremap(phys_addr, size);
635 if (!*kaddr)
636 return -ENOMEM;
637
638 if (is_kernel_in_hyp_mode()) {
639 *haddr = *kaddr;
640 return 0;
641 }
642
643 ret = __create_hyp_private_mapping(phys_addr, size,
644 &addr, PAGE_HYP_DEVICE);
1bb32a44
MZ
645 if (ret) {
646 iounmap(*kaddr);
647 *kaddr = NULL;
dc2e4633
MZ
648 *haddr = NULL;
649 return ret;
650 }
651
652 *haddr = (void __iomem *)addr;
653 return 0;
654}
655
656/**
657 * create_hyp_exec_mappings - Map an executable range into HYP
658 * @phys_addr: The physical start address which gets mapped
659 * @size: Size of the region being mapped
660 * @haddr: HYP VA for this mapping
661 */
662int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size,
663 void **haddr)
664{
665 unsigned long addr;
666 int ret;
667
668 BUG_ON(is_kernel_in_hyp_mode());
669
670 ret = __create_hyp_private_mapping(phys_addr, size,
671 &addr, PAGE_HYP_EXEC);
672 if (ret) {
673 *haddr = NULL;
1bb32a44
MZ
674 return ret;
675 }
676
dc2e4633 677 *haddr = (void *)addr;
1bb32a44 678 return 0;
342cd0ab
CD
679}
680
d5d8184d 681/**
a0e50aa3
CD
 682 * kvm_init_stage2_mmu - Initialise a S2 MMU structure
683 * @kvm: The pointer to the KVM structure
684 * @mmu: The pointer to the s2 MMU structure
d5d8184d 685 *
71233d05 686 * Allocates only the stage-2 HW PGD level table(s).
d5d8184d
CD
687 * Note we don't need locking here as this is only called when the VM is
688 * created, which can only be done once.
689 */
a0e50aa3 690int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu)
d5d8184d 691{
71233d05
WD
692 int cpu, err;
693 struct kvm_pgtable *pgt;
d5d8184d 694
71233d05 695 if (mmu->pgt != NULL) {
d5d8184d
CD
696 kvm_err("kvm_arch already initialized?\n");
697 return -EINVAL;
698 }
699
71233d05
WD
700 pgt = kzalloc(sizeof(*pgt), GFP_KERNEL);
701 if (!pgt)
a987370f
MZ
702 return -ENOMEM;
703
71233d05
WD
704 err = kvm_pgtable_stage2_init(pgt, kvm);
705 if (err)
706 goto out_free_pgtable;
e329fb75 707
a0e50aa3
CD
708 mmu->last_vcpu_ran = alloc_percpu(typeof(*mmu->last_vcpu_ran));
709 if (!mmu->last_vcpu_ran) {
71233d05
WD
710 err = -ENOMEM;
711 goto out_destroy_pgtable;
a0e50aa3
CD
712 }
713
714 for_each_possible_cpu(cpu)
715 *per_cpu_ptr(mmu->last_vcpu_ran, cpu) = -1;
716
717 mmu->kvm = kvm;
71233d05
WD
718 mmu->pgt = pgt;
719 mmu->pgd_phys = __pa(pgt->pgd);
720 mmu->pgd = (void *)pgt->pgd;
a0e50aa3 721 mmu->vmid.vmid_gen = 0;
d5d8184d 722 return 0;
71233d05
WD
723
724out_destroy_pgtable:
725 kvm_pgtable_stage2_destroy(pgt);
726out_free_pgtable:
727 kfree(pgt);
728 return err;
d5d8184d
CD
729}
730
957db105
CD
731static void stage2_unmap_memslot(struct kvm *kvm,
732 struct kvm_memory_slot *memslot)
733{
734 hva_t hva = memslot->userspace_addr;
735 phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
736 phys_addr_t size = PAGE_SIZE * memslot->npages;
737 hva_t reg_end = hva + size;
738
739 /*
740 * A memory region could potentially cover multiple VMAs, and any holes
741 * between them, so iterate over all of them to find out if we should
742 * unmap any of them.
743 *
744 * +--------------------------------------------+
745 * +---------------+----------------+ +----------------+
746 * | : VMA 1 | VMA 2 | | VMA 3 : |
747 * +---------------+----------------+ +----------------+
748 * | memory region |
749 * +--------------------------------------------+
750 */
751 do {
752 struct vm_area_struct *vma = find_vma(current->mm, hva);
753 hva_t vm_start, vm_end;
754
755 if (!vma || vma->vm_start >= reg_end)
756 break;
757
758 /*
759 * Take the intersection of this VMA with the memory region
760 */
761 vm_start = max(hva, vma->vm_start);
762 vm_end = min(reg_end, vma->vm_end);
763
764 if (!(vma->vm_flags & VM_PFNMAP)) {
765 gpa_t gpa = addr + (vm_start - memslot->userspace_addr);
a0e50aa3 766 unmap_stage2_range(&kvm->arch.mmu, gpa, vm_end - vm_start);
957db105
CD
767 }
768 hva = vm_end;
769 } while (hva < reg_end);
770}
771
772/**
773 * stage2_unmap_vm - Unmap Stage-2 RAM mappings
774 * @kvm: The struct kvm pointer
775 *
656012c7 776 * Go through the memregions and unmap any regular RAM
957db105
CD
777 * backing memory already mapped to the VM.
778 */
779void stage2_unmap_vm(struct kvm *kvm)
780{
781 struct kvm_memslots *slots;
782 struct kvm_memory_slot *memslot;
783 int idx;
784
785 idx = srcu_read_lock(&kvm->srcu);
89154dd5 786 mmap_read_lock(current->mm);
957db105
CD
787 spin_lock(&kvm->mmu_lock);
788
789 slots = kvm_memslots(kvm);
790 kvm_for_each_memslot(memslot, slots)
791 stage2_unmap_memslot(kvm, memslot);
792
793 spin_unlock(&kvm->mmu_lock);
89154dd5 794 mmap_read_unlock(current->mm);
957db105
CD
795 srcu_read_unlock(&kvm->srcu, idx);
796}
797
a0e50aa3 798void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
d5d8184d 799{
a0e50aa3 800 struct kvm *kvm = mmu->kvm;
71233d05 801 struct kvm_pgtable *pgt = NULL;
d5d8184d 802
8b3405e3 803 spin_lock(&kvm->mmu_lock);
71233d05
WD
804 pgt = mmu->pgt;
805 if (pgt) {
a0e50aa3 806 mmu->pgd = NULL;
71233d05
WD
807 mmu->pgd_phys = 0;
808 mmu->pgt = NULL;
809 free_percpu(mmu->last_vcpu_ran);
6c0d706b 810 }
8b3405e3
SP
811 spin_unlock(&kvm->mmu_lock);
812
71233d05
WD
813 if (pgt) {
814 kvm_pgtable_stage2_destroy(pgt);
815 kfree(pgt);
a0e50aa3 816 }
d5d8184d
CD
817}
818
a0e50aa3 819static p4d_t *stage2_get_p4d(struct kvm_s2_mmu *mmu, struct kvm_mmu_memory_cache *cache,
ad361f09 820 phys_addr_t addr)
d5d8184d 821{
a0e50aa3 822 struct kvm *kvm = mmu->kvm;
d5d8184d 823 pgd_t *pgd;
e9f63768 824 p4d_t *p4d;
d5d8184d 825
a0e50aa3 826 pgd = mmu->pgd + stage2_pgd_index(kvm, addr);
e55cac5b 827 if (stage2_pgd_none(kvm, *pgd)) {
38f791a4
CD
828 if (!cache)
829 return NULL;
c1a33aeb 830 p4d = kvm_mmu_memory_cache_alloc(cache);
e9f63768 831 stage2_pgd_populate(kvm, pgd, p4d);
38f791a4
CD
832 get_page(virt_to_page(pgd));
833 }
834
e9f63768
MR
835 return stage2_p4d_offset(kvm, pgd, addr);
836}
837
a0e50aa3 838static pud_t *stage2_get_pud(struct kvm_s2_mmu *mmu, struct kvm_mmu_memory_cache *cache,
e9f63768
MR
839 phys_addr_t addr)
840{
a0e50aa3 841 struct kvm *kvm = mmu->kvm;
e9f63768
MR
842 p4d_t *p4d;
843 pud_t *pud;
844
a0e50aa3 845 p4d = stage2_get_p4d(mmu, cache, addr);
e9f63768
MR
846 if (stage2_p4d_none(kvm, *p4d)) {
847 if (!cache)
848 return NULL;
c1a33aeb 849 pud = kvm_mmu_memory_cache_alloc(cache);
e9f63768
MR
850 stage2_p4d_populate(kvm, p4d, pud);
851 get_page(virt_to_page(p4d));
852 }
853
854 return stage2_pud_offset(kvm, p4d, addr);
38f791a4
CD
855}
856
a0e50aa3 857static pmd_t *stage2_get_pmd(struct kvm_s2_mmu *mmu, struct kvm_mmu_memory_cache *cache,
38f791a4
CD
858 phys_addr_t addr)
859{
a0e50aa3 860 struct kvm *kvm = mmu->kvm;
38f791a4
CD
861 pud_t *pud;
862 pmd_t *pmd;
863
a0e50aa3 864 pud = stage2_get_pud(mmu, cache, addr);
b8e0ba7c 865 if (!pud || stage2_pud_huge(kvm, *pud))
d6dbdd3c
MZ
866 return NULL;
867
e55cac5b 868 if (stage2_pud_none(kvm, *pud)) {
d5d8184d 869 if (!cache)
ad361f09 870 return NULL;
c1a33aeb 871 pmd = kvm_mmu_memory_cache_alloc(cache);
e55cac5b 872 stage2_pud_populate(kvm, pud, pmd);
d5d8184d 873 get_page(virt_to_page(pud));
c62ee2b2
MZ
874 }
875
e55cac5b 876 return stage2_pmd_offset(kvm, pud, addr);
ad361f09
CD
877}
878
a0e50aa3
CD
879static int stage2_set_pmd_huge(struct kvm_s2_mmu *mmu,
880 struct kvm_mmu_memory_cache *cache,
881 phys_addr_t addr, const pmd_t *new_pmd)
ad361f09
CD
882{
883 pmd_t *pmd, old_pmd;
884
3c3736cd 885retry:
a0e50aa3 886 pmd = stage2_get_pmd(mmu, cache, addr);
ad361f09 887 VM_BUG_ON(!pmd);
d5d8184d 888
ad361f09 889 old_pmd = *pmd;
3c3736cd
SP
890 /*
 891 * Multiple vcpus faulting on the same PMD entry can
892 * lead to them sequentially updating the PMD with the
893 * same value. Following the break-before-make
894 * (pmd_clear() followed by tlb_flush()) process can
895 * hinder forward progress due to refaults generated
896 * on missing translations.
897 *
898 * Skip updating the page table if the entry is
899 * unchanged.
900 */
901 if (pmd_val(old_pmd) == pmd_val(*new_pmd))
902 return 0;
903
d4b9e079 904 if (pmd_present(old_pmd)) {
86658b81 905 /*
3c3736cd
SP
906 * If we already have PTE level mapping for this block,
907 * we must unmap it to avoid inconsistent TLB state and
908 * leaking the table page. We could end up in this situation
909 * if the memory slot was marked for dirty logging and was
910 * reverted, leaving PTE level mappings for the pages accessed
911 * during the period. So, unmap the PTE level mapping for this
912 * block and retry, as we could have released the upper level
913 * table in the process.
86658b81 914 *
3c3736cd
SP
 915 * Normal THP split/merge follows mmu_notifier callbacks and is
 916 * handled accordingly.
86658b81 917 */
3c3736cd 918 if (!pmd_thp_or_huge(old_pmd)) {
a0e50aa3 919 unmap_stage2_range(mmu, addr & S2_PMD_MASK, S2_PMD_SIZE);
3c3736cd
SP
920 goto retry;
921 }
86658b81
PA
922 /*
923 * Mapping in huge pages should only happen through a
924 * fault. If a page is merged into a transparent huge
925 * page, the individual subpages of that huge page
926 * should be unmapped through MMU notifiers before we
927 * get here.
928 *
929 * Merging of CompoundPages is not supported; they
 930 * should be split first, unmapped, merged,
931 * and mapped back in on-demand.
932 */
3c3736cd 933 WARN_ON_ONCE(pmd_pfn(old_pmd) != pmd_pfn(*new_pmd));
d4b9e079 934 pmd_clear(pmd);
efaa5b93 935 kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PMD_LEVEL);
d4b9e079 936 } else {
ad361f09 937 get_page(virt_to_page(pmd));
d4b9e079
MZ
938 }
939
940 kvm_set_pmd(pmd, *new_pmd);
ad361f09
CD
941 return 0;
942}
943
a0e50aa3
CD
944static int stage2_set_pud_huge(struct kvm_s2_mmu *mmu,
945 struct kvm_mmu_memory_cache *cache,
b8e0ba7c
PA
946 phys_addr_t addr, const pud_t *new_pudp)
947{
a0e50aa3 948 struct kvm *kvm = mmu->kvm;
b8e0ba7c
PA
949 pud_t *pudp, old_pud;
950
3c3736cd 951retry:
a0e50aa3 952 pudp = stage2_get_pud(mmu, cache, addr);
b8e0ba7c
PA
953 VM_BUG_ON(!pudp);
954
955 old_pud = *pudp;
956
957 /*
958 * A large number of vcpus faulting on the same stage 2 entry,
3c3736cd
SP
959 * can lead to a refault due to the stage2_pud_clear()/tlb_flush().
960 * Skip updating the page tables if there is no change.
b8e0ba7c
PA
961 */
962 if (pud_val(old_pud) == pud_val(*new_pudp))
963 return 0;
964
965 if (stage2_pud_present(kvm, old_pud)) {
3c3736cd
SP
966 /*
967 * If we already have table level mapping for this block, unmap
968 * the range for this block and retry.
969 */
970 if (!stage2_pud_huge(kvm, old_pud)) {
a0e50aa3 971 unmap_stage2_range(mmu, addr & S2_PUD_MASK, S2_PUD_SIZE);
3c3736cd
SP
972 goto retry;
973 }
974
975 WARN_ON_ONCE(kvm_pud_pfn(old_pud) != kvm_pud_pfn(*new_pudp));
b8e0ba7c 976 stage2_pud_clear(kvm, pudp);
efaa5b93 977 kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PUD_LEVEL);
b8e0ba7c
PA
978 } else {
979 get_page(virt_to_page(pudp));
980 }
981
982 kvm_set_pud(pudp, *new_pudp);
983 return 0;
984}
985
86d1c55e
PA
986/*
987 * stage2_get_leaf_entry - walk the stage2 VM page tables and return
988 * true if a valid and present leaf-entry is found. A pointer to the
989 * leaf-entry is returned in the appropriate level variable - pudpp,
990 * pmdpp, ptepp.
991 */
a0e50aa3 992static bool stage2_get_leaf_entry(struct kvm_s2_mmu *mmu, phys_addr_t addr,
86d1c55e 993 pud_t **pudpp, pmd_t **pmdpp, pte_t **ptepp)
7a3796d2 994{
a0e50aa3 995 struct kvm *kvm = mmu->kvm;
86d1c55e 996 pud_t *pudp;
7a3796d2
MZ
997 pmd_t *pmdp;
998 pte_t *ptep;
999
86d1c55e
PA
1000 *pudpp = NULL;
1001 *pmdpp = NULL;
1002 *ptepp = NULL;
1003
a0e50aa3 1004 pudp = stage2_get_pud(mmu, NULL, addr);
86d1c55e
PA
1005 if (!pudp || stage2_pud_none(kvm, *pudp) || !stage2_pud_present(kvm, *pudp))
1006 return false;
1007
1008 if (stage2_pud_huge(kvm, *pudp)) {
1009 *pudpp = pudp;
1010 return true;
1011 }
1012
1013 pmdp = stage2_pmd_offset(kvm, pudp, addr);
7a3796d2
MZ
1014 if (!pmdp || pmd_none(*pmdp) || !pmd_present(*pmdp))
1015 return false;
1016
86d1c55e
PA
1017 if (pmd_thp_or_huge(*pmdp)) {
1018 *pmdpp = pmdp;
1019 return true;
1020 }
7a3796d2
MZ
1021
1022 ptep = pte_offset_kernel(pmdp, addr);
1023 if (!ptep || pte_none(*ptep) || !pte_present(*ptep))
1024 return false;
1025
86d1c55e
PA
1026 *ptepp = ptep;
1027 return true;
1028}
1029
0378daef 1030static bool stage2_is_exec(struct kvm_s2_mmu *mmu, phys_addr_t addr, unsigned long sz)
86d1c55e
PA
1031{
1032 pud_t *pudp;
1033 pmd_t *pmdp;
1034 pte_t *ptep;
1035 bool found;
1036
a0e50aa3 1037 found = stage2_get_leaf_entry(mmu, addr, &pudp, &pmdp, &ptep);
86d1c55e
PA
1038 if (!found)
1039 return false;
1040
1041 if (pudp)
b757b47a 1042 return sz <= PUD_SIZE && kvm_s2pud_exec(pudp);
86d1c55e 1043 else if (pmdp)
b757b47a 1044 return sz <= PMD_SIZE && kvm_s2pmd_exec(pmdp);
86d1c55e 1045 else
b757b47a 1046 return sz == PAGE_SIZE && kvm_s2pte_exec(ptep);
7a3796d2
MZ
1047}
1048
a0e50aa3
CD
1049static int stage2_set_pte(struct kvm_s2_mmu *mmu,
1050 struct kvm_mmu_memory_cache *cache,
15a49a44
MS
1051 phys_addr_t addr, const pte_t *new_pte,
1052 unsigned long flags)
ad361f09 1053{
a0e50aa3 1054 struct kvm *kvm = mmu->kvm;
b8e0ba7c 1055 pud_t *pud;
ad361f09
CD
1056 pmd_t *pmd;
1057 pte_t *pte, old_pte;
15a49a44
MS
1058 bool iomap = flags & KVM_S2PTE_FLAG_IS_IOMAP;
1059 bool logging_active = flags & KVM_S2_FLAG_LOGGING_ACTIVE;
1060
1061 VM_BUG_ON(logging_active && !cache);
ad361f09 1062
38f791a4 1063 /* Create stage-2 page table mapping - Levels 0 and 1 */
a0e50aa3 1064 pud = stage2_get_pud(mmu, cache, addr);
b8e0ba7c
PA
1065 if (!pud) {
1066 /*
1067 * Ignore calls from kvm_set_spte_hva for unallocated
1068 * address ranges.
1069 */
1070 return 0;
1071 }
1072
1073 /*
1074 * While dirty page logging - dissolve huge PUD, then continue
1075 * on to allocate page.
1076 */
1077 if (logging_active)
a0e50aa3 1078 stage2_dissolve_pud(mmu, addr, pud);
b8e0ba7c
PA
1079
1080 if (stage2_pud_none(kvm, *pud)) {
1081 if (!cache)
1082 return 0; /* ignore calls from kvm_set_spte_hva */
c1a33aeb 1083 pmd = kvm_mmu_memory_cache_alloc(cache);
b8e0ba7c
PA
1084 stage2_pud_populate(kvm, pud, pmd);
1085 get_page(virt_to_page(pud));
1086 }
1087
1088 pmd = stage2_pmd_offset(kvm, pud, addr);
ad361f09
CD
1089 if (!pmd) {
1090 /*
1091 * Ignore calls from kvm_set_spte_hva for unallocated
1092 * address ranges.
1093 */
1094 return 0;
1095 }
1096
15a49a44
MS
1097 /*
1098 * While dirty page logging - dissolve huge PMD, then continue on to
1099 * allocate page.
1100 */
1101 if (logging_active)
a0e50aa3 1102 stage2_dissolve_pmd(mmu, addr, pmd);
15a49a44 1103
ad361f09 1104 /* Create stage-2 page mappings - Level 2 */
d5d8184d
CD
1105 if (pmd_none(*pmd)) {
1106 if (!cache)
1107 return 0; /* ignore calls from kvm_set_spte_hva */
c1a33aeb 1108 pte = kvm_mmu_memory_cache_alloc(cache);
0db9dd8a 1109 kvm_pmd_populate(pmd, pte);
d5d8184d 1110 get_page(virt_to_page(pmd));
c62ee2b2
MZ
1111 }
1112
1113 pte = pte_offset_kernel(pmd, addr);
d5d8184d
CD
1114
1115 if (iomap && pte_present(*pte))
1116 return -EFAULT;
1117
1118 /* Create 2nd stage page table mapping - Level 3 */
1119 old_pte = *pte;
d4b9e079 1120 if (pte_present(old_pte)) {
976d34e2
PA
1121 /* Skip page table update if there is no change */
1122 if (pte_val(old_pte) == pte_val(*new_pte))
1123 return 0;
1124
d4b9e079 1125 kvm_set_pte(pte, __pte(0));
efaa5b93 1126 kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PTE_LEVEL);
d4b9e079 1127 } else {
d5d8184d 1128 get_page(virt_to_page(pte));
d4b9e079 1129 }
d5d8184d 1130
d4b9e079 1131 kvm_set_pte(pte, *new_pte);
d5d8184d
CD
1132 return 0;
1133}
d5d8184d 1134
06485053
CM
1135#ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
1136static int stage2_ptep_test_and_clear_young(pte_t *pte)
1137{
1138 if (pte_young(*pte)) {
1139 *pte = pte_mkold(*pte);
1140 return 1;
1141 }
d5d8184d
CD
1142 return 0;
1143}
06485053
CM
1144#else
1145static int stage2_ptep_test_and_clear_young(pte_t *pte)
1146{
1147 return __ptep_test_and_clear_young(pte);
1148}
1149#endif
1150
1151static int stage2_pmdp_test_and_clear_young(pmd_t *pmd)
1152{
1153 return stage2_ptep_test_and_clear_young((pte_t *)pmd);
1154}
d5d8184d 1155
35a63966
PA
1156static int stage2_pudp_test_and_clear_young(pud_t *pud)
1157{
1158 return stage2_ptep_test_and_clear_young((pte_t *)pud);
1159}
1160
d5d8184d
CD
1161/**
1162 * kvm_phys_addr_ioremap - map a device range to guest IPA
1163 *
1164 * @kvm: The KVM pointer
1165 * @guest_ipa: The IPA at which to insert the mapping
1166 * @pa: The physical address of the device
 1167 * @size: The size of the mapping
 * @writable: Whether or not to create a writable mapping
1168 */
1169int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
c40f2f8f 1170 phys_addr_t pa, unsigned long size, bool writable)
d5d8184d 1171{
02bbd374 1172 phys_addr_t addr;
d5d8184d 1173 int ret = 0;
c1a33aeb 1174 struct kvm_mmu_memory_cache cache = { 0, __GFP_ZERO, NULL, };
02bbd374
WD
1175 struct kvm_pgtable *pgt = kvm->arch.mmu.pgt;
1176 enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_DEVICE |
1177 KVM_PGTABLE_PROT_R |
1178 (writable ? KVM_PGTABLE_PROT_W : 0);
d5d8184d 1179
02bbd374
WD
1180 size += offset_in_page(guest_ipa);
1181 guest_ipa &= PAGE_MASK;
c40f2f8f 1182
02bbd374 1183 for (addr = guest_ipa; addr < guest_ipa + size; addr += PAGE_SIZE) {
c1a33aeb
SC
1184 ret = kvm_mmu_topup_memory_cache(&cache,
1185 kvm_mmu_cache_min_pages(kvm));
d5d8184d 1186 if (ret)
02bbd374
WD
1187 break;
1188
d5d8184d 1189 spin_lock(&kvm->mmu_lock);
02bbd374
WD
1190 ret = kvm_pgtable_stage2_map(pgt, addr, PAGE_SIZE, pa, prot,
1191 &cache);
d5d8184d
CD
1192 spin_unlock(&kvm->mmu_lock);
1193 if (ret)
02bbd374 1194 break;
d5d8184d 1195
02bbd374 1196 pa += PAGE_SIZE;
d5d8184d
CD
1197 }
1198
c1a33aeb 1199 kvm_mmu_free_memory_cache(&cache);
d5d8184d
CD
1200 return ret;
1201}
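/*
 * The main user of this is the vgic code, which maps the GIC virtual CPU
 * interface into the guest's IPA space as a writable device region
 * (sketch only; the base/size names below are placeholders):
 *
 *	ret = kvm_phys_addr_ioremap(kvm, guest_cpuif_base, host_cpuif_base,
 *				    cpuif_size, true);
 */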
1202
c6473555
MS
1203/**
1204 * stage2_wp_ptes - write protect PMD range
1205 * @pmd: pointer to pmd entry
1206 * @addr: range start address
1207 * @end: range end address
1208 */
1209static void stage2_wp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end)
1210{
1211 pte_t *pte;
1212
1213 pte = pte_offset_kernel(pmd, addr);
1214 do {
1215 if (!pte_none(*pte)) {
1216 if (!kvm_s2pte_readonly(pte))
1217 kvm_set_s2pte_readonly(pte);
1218 }
1219 } while (pte++, addr += PAGE_SIZE, addr != end);
1220}
1221
1222/**
1223 * stage2_wp_pmds - write protect PUD range
e55cac5b 1224 * @mmu: the KVM stage-2 MMU pointer
c6473555
MS
1225 * @pud: pointer to pud entry
1226 * @addr: range start address
1227 * @end: range end address
1228 */
a0e50aa3 1229static void stage2_wp_pmds(struct kvm_s2_mmu *mmu, pud_t *pud,
e55cac5b 1230 phys_addr_t addr, phys_addr_t end)
c6473555 1231{
a0e50aa3 1232 struct kvm *kvm = mmu->kvm;
c6473555
MS
1233 pmd_t *pmd;
1234 phys_addr_t next;
1235
e55cac5b 1236 pmd = stage2_pmd_offset(kvm, pud, addr);
c6473555
MS
1237
1238 do {
e55cac5b 1239 next = stage2_pmd_addr_end(kvm, addr, end);
c6473555 1240 if (!pmd_none(*pmd)) {
bbb3b6b3 1241 if (pmd_thp_or_huge(*pmd)) {
c6473555
MS
1242 if (!kvm_s2pmd_readonly(pmd))
1243 kvm_set_s2pmd_readonly(pmd);
1244 } else {
1245 stage2_wp_ptes(pmd, addr, next);
1246 }
1247 }
1248 } while (pmd++, addr = next, addr != end);
1249}
1250
1251/**
e9f63768 1252 * stage2_wp_puds - write protect P4D range
a0e50aa3 1253 * @p4d: pointer to p4d entry
8324c3d5
ZY
1254 * @addr: range start address
1255 * @end: range end address
1256 */
a0e50aa3 1257static void stage2_wp_puds(struct kvm_s2_mmu *mmu, p4d_t *p4d,
e55cac5b 1258 phys_addr_t addr, phys_addr_t end)
c6473555 1259{
a0e50aa3 1260 struct kvm *kvm = mmu->kvm;
c6473555
MS
1261 pud_t *pud;
1262 phys_addr_t next;
1263
e9f63768 1264 pud = stage2_pud_offset(kvm, p4d, addr);
c6473555 1265 do {
e55cac5b
SP
1266 next = stage2_pud_addr_end(kvm, addr, end);
1267 if (!stage2_pud_none(kvm, *pud)) {
4ea5af53
PA
1268 if (stage2_pud_huge(kvm, *pud)) {
1269 if (!kvm_s2pud_readonly(pud))
1270 kvm_set_s2pud_readonly(pud);
1271 } else {
a0e50aa3 1272 stage2_wp_pmds(mmu, pud, addr, next);
4ea5af53 1273 }
c6473555
MS
1274 }
1275 } while (pud++, addr = next, addr != end);
1276}
1277
e9f63768
MR
1278/**
1279 * stage2_wp_p4ds - write protect PGD range
1280 * @pgd: pointer to pgd entry
1281 * @addr: range start address
1282 * @end: range end address
1283 */
a0e50aa3 1284static void stage2_wp_p4ds(struct kvm_s2_mmu *mmu, pgd_t *pgd,
e9f63768
MR
1285 phys_addr_t addr, phys_addr_t end)
1286{
a0e50aa3 1287 struct kvm *kvm = mmu->kvm;
e9f63768
MR
1288 p4d_t *p4d;
1289 phys_addr_t next;
1290
1291 p4d = stage2_p4d_offset(kvm, pgd, addr);
1292 do {
1293 next = stage2_p4d_addr_end(kvm, addr, end);
1294 if (!stage2_p4d_none(kvm, *p4d))
a0e50aa3 1295 stage2_wp_puds(mmu, p4d, addr, next);
e9f63768
MR
1296 } while (p4d++, addr = next, addr != end);
1297}
1298
c6473555
MS
1299/**
1300 * stage2_wp_range() - write protect stage2 memory region range
 1301 * @mmu: The KVM stage-2 MMU pointer
1302 * @addr: Start address of range
1303 * @end: End address of range
1304 */
a0e50aa3 1305static void stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end)
c6473555 1306{
a0e50aa3 1307 struct kvm *kvm = mmu->kvm;
cc38d61c 1308 stage2_apply_range_resched(kvm, addr, end, kvm_pgtable_stage2_wrprotect);
c6473555
MS
1309}
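/*
 * Write protection is driven entirely through the generic page-table code:
 * stage2_apply_range_resched() splits the walk per PGD entry and hands each
 * chunk to kvm_pgtable_stage2_wrprotect(), giving up the mmu_lock between
 * chunks when rescheduling is needed.
 */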
1310
1311/**
1312 * kvm_mmu_wp_memory_region() - write protect stage 2 entries for memory slot
1313 * @kvm: The KVM pointer
1314 * @slot: The memory slot to write protect
1315 *
1316 * Called to start logging dirty pages after memory region
1317 * KVM_MEM_LOG_DIRTY_PAGES operation is called. After this function returns
4ea5af53 1318 * all present PUD, PMD and PTEs are write protected in the memory region.
c6473555
MS
1319 * Afterwards read of dirty page log can be called.
1320 *
1321 * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired,
1322 * serializing operations for VM memory regions.
1323 */
1324void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
1325{
9f6b8029
PB
1326 struct kvm_memslots *slots = kvm_memslots(kvm);
1327 struct kvm_memory_slot *memslot = id_to_memslot(slots, slot);
0577d1ab
SC
1328 phys_addr_t start, end;
1329
1330 if (WARN_ON_ONCE(!memslot))
1331 return;
1332
1333 start = memslot->base_gfn << PAGE_SHIFT;
1334 end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
c6473555
MS
1335
1336 spin_lock(&kvm->mmu_lock);
a0e50aa3 1337 stage2_wp_range(&kvm->arch.mmu, start, end);
c6473555
MS
1338 spin_unlock(&kvm->mmu_lock);
1339 kvm_flush_remote_tlbs(kvm);
1340}
53c810c3
MS
1341
1342/**
3b0f1d01 1343 * kvm_mmu_write_protect_pt_masked() - write protect dirty pages
53c810c3
MS
1344 * @kvm: The KVM pointer
1345 * @slot: The memory slot associated with mask
1346 * @gfn_offset: The gfn offset in memory slot
1347 * @mask: The mask of dirty pages at offset 'gfn_offset' in this memory
1348 * slot to be write protected
1349 *
 1350 * Walks the bits set in mask and write protects the associated PTEs. Caller must
1351 * acquire kvm_mmu_lock.
1352 */
3b0f1d01 1353static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
53c810c3
MS
1354 struct kvm_memory_slot *slot,
1355 gfn_t gfn_offset, unsigned long mask)
1356{
1357 phys_addr_t base_gfn = slot->base_gfn + gfn_offset;
1358 phys_addr_t start = (base_gfn + __ffs(mask)) << PAGE_SHIFT;
1359 phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT;
1360
a0e50aa3 1361 stage2_wp_range(&kvm->arch.mmu, start, end);
53c810c3 1362}
c6473555 1363
3b0f1d01
KH
1364/*
1365 * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
1366 * dirty pages.
1367 *
1368 * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
1369 * enable dirty logging for them.
1370 */
1371void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
1372 struct kvm_memory_slot *slot,
1373 gfn_t gfn_offset, unsigned long mask)
1374{
1375 kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
1376}
1377
17ab9d57 1378static void clean_dcache_guest_page(kvm_pfn_t pfn, unsigned long size)
0d3e4d4f 1379{
17ab9d57 1380 __clean_dcache_guest_page(pfn, size);
a15f6939
MZ
1381}
1382
17ab9d57 1383static void invalidate_icache_guest_page(kvm_pfn_t pfn, unsigned long size)
a15f6939 1384{
17ab9d57 1385 __invalidate_icache_guest_page(pfn, size);
0d3e4d4f
MZ
1386}
1387
1559b758 1388static void kvm_send_hwpoison_signal(unsigned long address, short lsb)
196f878a 1389{
795a8371 1390 send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current);
196f878a
JM
1391}
1392
a80868f3
SP
1393static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot,
1394 unsigned long hva,
1395 unsigned long map_size)
6794ad54 1396{
c2be79a0 1397 gpa_t gpa_start;
6794ad54
CD
1398 hva_t uaddr_start, uaddr_end;
1399 size_t size;
1400
9f283614
SP
1401 /* The memslot and the VMA are guaranteed to be aligned to PAGE_SIZE */
1402 if (map_size == PAGE_SIZE)
1403 return true;
1404
6794ad54
CD
1405 size = memslot->npages * PAGE_SIZE;
1406
1407 gpa_start = memslot->base_gfn << PAGE_SHIFT;
6794ad54
CD
1408
1409 uaddr_start = memslot->userspace_addr;
1410 uaddr_end = uaddr_start + size;
1411
1412 /*
1413 * Pages belonging to memslots that don't have the same alignment
a80868f3
SP
1414 * within a PMD/PUD for userspace and IPA cannot be mapped with stage-2
1415 * PMD/PUD entries, because we'll end up mapping the wrong pages.
6794ad54
CD
1416 *
1417 * Consider a layout like the following:
1418 *
1419 * memslot->userspace_addr:
1420 * +-----+--------------------+--------------------+---+
a80868f3 1421 * |abcde|fgh Stage-1 block | Stage-1 block tv|xyz|
6794ad54
CD
1422 * +-----+--------------------+--------------------+---+
1423 *
9f283614 1424 * memslot->base_gfn << PAGE_SHIFT:
6794ad54 1425 * +---+--------------------+--------------------+-----+
a80868f3 1426 * |abc|def Stage-2 block | Stage-2 block |tvxyz|
6794ad54
CD
1427 * +---+--------------------+--------------------+-----+
1428 *
a80868f3 1429 * If we create those stage-2 blocks, we'll end up with this incorrect
6794ad54
CD
1430 * mapping:
1431 * d -> f
1432 * e -> g
1433 * f -> h
1434 */
a80868f3 1435 if ((gpa_start & (map_size - 1)) != (uaddr_start & (map_size - 1)))
6794ad54
CD
1436 return false;
1437
1438 /*
1439 * Next, let's make sure we're not trying to map anything not covered
a80868f3
SP
1440 * by the memslot. This means we have to prohibit block size mappings
1441 * for the beginning and end of a non-block aligned and non-block sized
6794ad54
CD
1442 * memory slot (illustrated by the head and tail parts of the
1443 * userspace view above containing pages 'abcde' and 'xyz',
1444 * respectively).
1445 *
1446 * Note that it doesn't matter if we do the check using the
1447 * userspace_addr or the base_gfn, as both are equally aligned (per
1448 * the check above) and equally sized.
1449 */
a80868f3
SP
1450 return (hva & ~(map_size - 1)) >= uaddr_start &&
1451 (hva & ~(map_size - 1)) + map_size <= uaddr_end;
6794ad54
CD
1452}
1453
0529c902
SP
1454/*
1455 * Check if the given hva is backed by a transparent huge page (THP) and
1456 * whether it can be mapped using block mapping in stage2. If so, adjust
1457 * the stage2 PFN and IPA accordingly. Only PMD_SIZE THPs are currently
1458 * supported. This will need to be updated to support other THP sizes.
1459 *
1460 * Returns the size of the mapping.
1461 */
1462static unsigned long
1463transparent_hugepage_adjust(struct kvm_memory_slot *memslot,
1464 unsigned long hva, kvm_pfn_t *pfnp,
1465 phys_addr_t *ipap)
1466{
1467 kvm_pfn_t pfn = *pfnp;
1468
1469 /*
1470 * Make sure the adjustment is done only for THP pages. Also make
1471 * sure that the HVA and IPA are sufficiently aligned and that the
1472 * block map is contained within the memslot.
1473 */
1474 if (kvm_is_transparent_hugepage(pfn) &&
1475 fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) {
1476 /*
1477 * The address we faulted on is backed by a transparent huge
1478 * page. However, because we map the compound huge page and
1479 * not the individual tail page, we need to transfer the
1480 * refcount to the head page. We have to be careful that the
1481 * THP doesn't start to split while we are adjusting the
1482 * refcounts.
1483 *
1484 * We are sure this doesn't happen, because mmu_notifier_retry
1485 * was successful and we are holding the mmu_lock, so if this
1486 * THP is trying to split, it will be blocked in the mmu
1487 * notifier before touching any of the pages, specifically
1488 * before being able to call __split_huge_page_refcount().
1489 *
1490 * We can therefore safely transfer the refcount from PG_tail
1491 * to PG_head and switch the pfn from a tail page to the head
1492 * page accordingly.
1493 */
1494 *ipap &= PMD_MASK;
1495 kvm_release_pfn_clean(pfn);
1496 pfn &= ~(PTRS_PER_PMD - 1);
1497 kvm_get_pfn(pfn);
1498 *pfnp = pfn;
1499
1500 return PMD_SIZE;
1501 }
1502
1503 /* Use page mapping if we cannot use block mapping. */
1504 return PAGE_SIZE;
1505}
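/*
 * user_mem_abort() only calls this when it would otherwise fall back to a
 * PAGE_SIZE mapping (!force_pte), giving THP-backed memory a chance to be
 * mapped with a PMD-level block instead.
 */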
1506
94f8e641 1507static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
98047888 1508 struct kvm_memory_slot *memslot, unsigned long hva,
94f8e641
CD
1509 unsigned long fault_status)
1510{
94f8e641 1511 int ret;
6396b852
PA
1512 bool write_fault, writable, force_pte = false;
1513 bool exec_fault, needs_exec;
94f8e641 1514 unsigned long mmu_seq;
ad361f09 1515 gfn_t gfn = fault_ipa >> PAGE_SHIFT;
ad361f09 1516 struct kvm *kvm = vcpu->kvm;
94f8e641 1517 struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
ad361f09 1518 struct vm_area_struct *vma;
1559b758 1519 short vma_shift;
ba049e93 1520 kvm_pfn_t pfn;
b8865767 1521 pgprot_t mem_type = PAGE_S2;
15a49a44 1522 bool logging_active = memslot_is_logging(memslot);
3f58bf63 1523 unsigned long vma_pagesize, flags = 0;
a0e50aa3 1524 struct kvm_s2_mmu *mmu = vcpu->arch.hw_mmu;
94f8e641 1525
a7d079ce 1526 write_fault = kvm_is_write_fault(vcpu);
d0e22b4a
MZ
1527 exec_fault = kvm_vcpu_trap_is_iabt(vcpu);
1528 VM_BUG_ON(write_fault && exec_fault);
1529
1530 if (fault_status == FSC_PERM && !write_fault && !exec_fault) {
94f8e641
CD
1531 kvm_err("Unexpected L2 read permission error\n");
1532 return -EFAULT;
1533 }
1534
ad361f09 1535 /* Let's check if we will get back a huge page backed by hugetlbfs */
89154dd5 1536 mmap_read_lock(current->mm);
ad361f09 1537 vma = find_vma_intersection(current->mm, hva, hva + 1);
37b54408
AB
1538 if (unlikely(!vma)) {
1539 kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
89154dd5 1540 mmap_read_unlock(current->mm);
37b54408
AB
1541 return -EFAULT;
1542 }
1543
1559b758
JM
1544 if (is_vm_hugetlb_page(vma))
1545 vma_shift = huge_page_shift(hstate_vma(vma));
1546 else
1547 vma_shift = PAGE_SHIFT;
1548
1549 vma_pagesize = 1ULL << vma_shift;
a80868f3 1550 if (logging_active ||
6d674e28 1551 (vma->vm_flags & VM_PFNMAP) ||
a80868f3
SP
1552 !fault_supports_stage2_huge_mapping(memslot, hva, vma_pagesize)) {
1553 force_pte = true;
1554 vma_pagesize = PAGE_SIZE;
1555 }
1556
b8e0ba7c 1557 /*
280cebfd
SP
 1558 * The stage2 has a minimum of 2 levels of page tables (for arm64, see
 1559 * kvm_arm_setup_stage2()). Hence, we are guaranteed that we can
 1560 * use PMD_SIZE huge mappings (even when the PMD is folded into the PGD).
 1561 * As for PUD huge mappings, we must make sure that we have at least
 1562 * 3 levels, i.e., that the PMD is not folded.
b8e0ba7c 1563 */
a80868f3
SP
1564 if (vma_pagesize == PMD_SIZE ||
1565 (vma_pagesize == PUD_SIZE && kvm_stage2_has_pmd(kvm)))
b8e0ba7c 1566 gfn = (fault_ipa & huge_page_mask(hstate_vma(vma))) >> PAGE_SHIFT;
89154dd5 1567 mmap_read_unlock(current->mm);
ad361f09 1568
94f8e641 1569 /* We need minimum second+third level pages */
c1a33aeb 1570 ret = kvm_mmu_topup_memory_cache(memcache, kvm_mmu_cache_min_pages(kvm));
94f8e641
CD
1571 if (ret)
1572 return ret;
1573
1574 mmu_seq = vcpu->kvm->mmu_notifier_seq;
1575 /*
1576 * Ensure the read of mmu_notifier_seq happens before we call
1577 * gfn_to_pfn_prot (which calls get_user_pages), so that we don't risk
1578 * the page we just got a reference to gets unmapped before we have a
1579 * chance to grab the mmu_lock, which ensure that if the page gets
1580 * unmapped afterwards, the call to kvm_unmap_hva will take it away
1581 * from us again properly. This smp_rmb() interacts with the smp_wmb()
1582 * in kvm_mmu_notifier_invalidate_<page|range_end>.
1583 */
1584 smp_rmb();
1585
ad361f09 1586 pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable);
196f878a 1587 if (pfn == KVM_PFN_ERR_HWPOISON) {
1559b758 1588 kvm_send_hwpoison_signal(hva, vma_shift);
196f878a
JM
1589 return 0;
1590 }
9ac71595 1591 if (is_error_noslot_pfn(pfn))
94f8e641
CD
1592 return -EFAULT;
1593
15a49a44 1594 if (kvm_is_device_pfn(pfn)) {
b8865767 1595 mem_type = PAGE_S2_DEVICE;
15a49a44
MS
1596 flags |= KVM_S2PTE_FLAG_IS_IOMAP;
1597 } else if (logging_active) {
1598 /*
1599 * Faults on pages in a memslot with logging enabled
1600 * should not be mapped with huge pages (it introduces churn
1601 * and performance degradation), so force a pte mapping.
1602 */
15a49a44
MS
1603 flags |= KVM_S2_FLAG_LOGGING_ACTIVE;
1604
1605 /*
1606 * Only actually map the page as writable if this was a write
1607 * fault.
1608 */
1609 if (!write_fault)
1610 writable = false;
1611 }
b8865767 1612
6d674e28
MZ
1613 if (exec_fault && is_iomap(flags))
1614 return -ENOEXEC;
1615
ad361f09
CD
1616 spin_lock(&kvm->mmu_lock);
1617 if (mmu_notifier_retry(kvm, mmu_seq))
94f8e641 1618 goto out_unlock;
15a49a44 1619
0529c902
SP
1620 /*
1621 * If we are not forced to use page mapping, check if we are
1622 * backed by a THP and thus use block mapping if possible.
1623 */
1624 if (vma_pagesize == PAGE_SIZE && !force_pte)
1625 vma_pagesize = transparent_hugepage_adjust(memslot, hva,
1626 &pfn, &fault_ipa);
3f58bf63
PA
1627 if (writable)
1628 kvm_set_pfn_dirty(pfn);
ad361f09 1629
6d674e28 1630 if (fault_status != FSC_PERM && !is_iomap(flags))
3f58bf63
PA
1631 clean_dcache_guest_page(pfn, vma_pagesize);
1632
1633 if (exec_fault)
1634 invalidate_icache_guest_page(pfn, vma_pagesize);
1635
6396b852
PA
1636 /*
1637 * If we took an execution fault we have made the
1638 * icache/dcache coherent above and should now let the s2
1639 * mapping be executable.
1640 *
1641 * Write faults (!exec_fault && FSC_PERM) are orthogonal to
1642 * execute permissions, and we preserve whatever we have.
1643 */
1644 needs_exec = exec_fault ||
b757b47a 1645 (fault_status == FSC_PERM &&
0378daef 1646 stage2_is_exec(mmu, fault_ipa, vma_pagesize));
6396b852 1647
b8e0ba7c
PA
1648 if (vma_pagesize == PUD_SIZE) {
1649 pud_t new_pud = kvm_pfn_pud(pfn, mem_type);
1650
1651 new_pud = kvm_pud_mkhuge(new_pud);
1652 if (writable)
1653 new_pud = kvm_s2pud_mkwrite(new_pud);
1654
1655 if (needs_exec)
1656 new_pud = kvm_s2pud_mkexec(new_pud);
1657
a0e50aa3 1658 ret = stage2_set_pud_huge(mmu, memcache, fault_ipa, &new_pud);
b8e0ba7c 1659 } else if (vma_pagesize == PMD_SIZE) {
f8df7338
PA
1660 pmd_t new_pmd = kvm_pfn_pmd(pfn, mem_type);
1661
1662 new_pmd = kvm_pmd_mkhuge(new_pmd);
1663
3f58bf63 1664 if (writable)
06485053 1665 new_pmd = kvm_s2pmd_mkwrite(new_pmd);
d0e22b4a 1666
6396b852 1667 if (needs_exec)
d0e22b4a 1668 new_pmd = kvm_s2pmd_mkexec(new_pmd);
a15f6939 1669
a0e50aa3 1670 ret = stage2_set_pmd_huge(mmu, memcache, fault_ipa, &new_pmd);
ad361f09 1671 } else {
f8df7338 1672 pte_t new_pte = kvm_pfn_pte(pfn, mem_type);
15a49a44 1673
ad361f09 1674 if (writable) {
06485053 1675 new_pte = kvm_s2pte_mkwrite(new_pte);
15a49a44 1676 mark_page_dirty(kvm, gfn);
ad361f09 1677 }
a9c0e12e 1678
6396b852 1679 if (needs_exec)
d0e22b4a 1680 new_pte = kvm_s2pte_mkexec(new_pte);
a15f6939 1681
a0e50aa3 1682 ret = stage2_set_pte(mmu, memcache, fault_ipa, &new_pte, flags);
94f8e641 1683 }
ad361f09 1684
94f8e641 1685out_unlock:
ad361f09 1686 spin_unlock(&kvm->mmu_lock);
35307b9a 1687 kvm_set_pfn_accessed(pfn);
94f8e641 1688 kvm_release_pfn_clean(pfn);
ad361f09 1689 return ret;
94f8e641
CD
1690}
1691
ee8efad7 1692/* Resolve the access fault by making the page young again. */
aeda9130
MZ
1693static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
1694{
ee8efad7
WD
1695 pte_t pte;
1696 kvm_pte_t kpte;
1697 struct kvm_s2_mmu *mmu;
aeda9130
MZ
1698
1699 trace_kvm_access_fault(fault_ipa);
1700
1701 spin_lock(&vcpu->kvm->mmu_lock);
ee8efad7
WD
1702 mmu = vcpu->arch.hw_mmu;
1703 kpte = kvm_pgtable_stage2_mkyoung(mmu->pgt, fault_ipa);
aeda9130 1704 spin_unlock(&vcpu->kvm->mmu_lock);
ee8efad7
WD
1705
1706 pte = __pte(kpte);
1707 if (pte_valid(pte))
1708 kvm_set_pfn_accessed(pte_pfn(pte));
aeda9130
MZ
1709}
1710
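/*
 * Toy model of what "making the page young again" means: the stage-2
 * descriptor carries an access flag (AF, bit 10 in the ARM descriptors);
 * an access fault is resolved simply by setting it again. The TOY_* layout
 * is made up and is not the kernel's kvm_pte_t.
 */
#include <stdint.h>
#include <stdio.h>

#define TOY_PTE_VALID	(1ULL << 0)
#define TOY_PTE_AF	(1ULL << 10)

static uint64_t toy_mkyoung(uint64_t pte)
{
	if (pte & TOY_PTE_VALID)
		pte |= TOY_PTE_AF;	/* young again: the next access won't fault */
	return pte;
}

int main(void)
{
	uint64_t pte = TOY_PTE_VALID;	/* valid but old -> access fault */

	pte = toy_mkyoung(pte);
	printf("AF set after fault: %s\n", (pte & TOY_PTE_AF) ? "yes" : "no");
	return 0;
}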
94f8e641
CD
1711/**
1712 * kvm_handle_guest_abort - handles all 2nd stage aborts
1713 * @vcpu: the VCPU pointer
94f8e641
CD
1714 *
1715 * Any abort that gets to the host is almost guaranteed to be caused by a
1716 * missing second stage translation table entry, which can mean either that the
1717 * guest simply needs more memory and we must allocate an appropriate page, or
1718 * that the guest tried to access I/O memory, which is emulated by user
1719 * space. The distinction is based on the IPA causing the fault and whether this
1720 * memory region has been registered as standard RAM by user space.
1721 */
74cc7e0c 1722int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
342cd0ab 1723{
94f8e641
CD
1724 unsigned long fault_status;
1725 phys_addr_t fault_ipa;
1726 struct kvm_memory_slot *memslot;
98047888
CD
1727 unsigned long hva;
1728 bool is_iabt, write_fault, writable;
94f8e641
CD
1729 gfn_t gfn;
1730 int ret, idx;
1731
621f48e4
TB
1732 fault_status = kvm_vcpu_trap_get_fault_type(vcpu);
1733
1734 fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);
bb428921 1735 is_iabt = kvm_vcpu_trap_is_iabt(vcpu);
621f48e4 1736
bb428921 1737 /* Synchronous External Abort? */
c9a636f2 1738 if (kvm_vcpu_abt_issea(vcpu)) {
bb428921
JM
1739 /*
1740 * For RAS the host kernel may handle this abort.
1741 * There is no need to pass the error into the guest.
1742 */
84b951a8 1743 if (kvm_handle_guest_sea(fault_ipa, kvm_vcpu_get_esr(vcpu)))
bb428921 1744 kvm_inject_vabt(vcpu);
84b951a8
WD
1745
1746 return 1;
4055710b
MZ
1747 }
1748
3a949f4c 1749 trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_esr(vcpu),
7393b599 1750 kvm_vcpu_get_hfar(vcpu), fault_ipa);
94f8e641
CD
1751
1752 /* Check that the stage-2 fault is a translation, permission or access fault */
35307b9a
MZ
1753 if (fault_status != FSC_FAULT && fault_status != FSC_PERM &&
1754 fault_status != FSC_ACCESS) {
0496daa5
CD
1755 kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n",
1756 kvm_vcpu_trap_get_class(vcpu),
1757 (unsigned long)kvm_vcpu_trap_get_fault(vcpu),
3a949f4c 1758 (unsigned long)kvm_vcpu_get_esr(vcpu));
94f8e641
CD
1759 return -EFAULT;
1760 }
1761
1762 idx = srcu_read_lock(&vcpu->kvm->srcu);
1763
1764 gfn = fault_ipa >> PAGE_SHIFT;
98047888
CD
1765 memslot = gfn_to_memslot(vcpu->kvm, gfn);
1766 hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable);
a7d079ce 1767 write_fault = kvm_is_write_fault(vcpu);
98047888 1768 if (kvm_is_error_hva(hva) || (write_fault && !writable)) {
022c8328
WD
1769 /*
1770 * The guest has put either its instructions or its page-tables
1771 * somewhere it shouldn't have. Userspace won't be able to do
1772 * anything about this (there's no syndrome for a start), so
1773 * re-inject the abort back into the guest.
1774 */
94f8e641 1775 if (is_iabt) {
6d674e28
MZ
1776 ret = -ENOEXEC;
1777 goto out;
94f8e641
CD
1778 }
1779
022c8328
WD
1780 if (kvm_vcpu_dabt_iss1tw(vcpu)) {
1781 kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu));
1782 ret = 1;
1783 goto out_unlock;
1784 }
1785
57c841f1
MZ
1786 /*
1787 * Check for a cache maintenance operation. Since we
1788 * ended up here, we know it is outside of any memory
1789 * slot. But we can't find out if that is for a device,
1790 * or if the guest is just being stupid. The only thing
1791 * we know for sure is that this range cannot be cached.
1792 *
1793 * So let's assume that the guest is just being
1794 * cautious, and skip the instruction.
1795 */
54dc0d24 1796 if (kvm_is_error_hva(hva) && kvm_vcpu_dabt_is_cm(vcpu)) {
57c841f1
MZ
1797 kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu));
1798 ret = 1;
1799 goto out_unlock;
1800 }
1801
cfe3950c
MZ
1802 /*
1803 * The IPA is reported as [MAX:12], so we need to
1804 * complement it with the bottom 12 bits from the
1805 * faulting VA. This is always 12 bits, irrespective
1806 * of the page size.
1807 */
1808 fault_ipa |= kvm_vcpu_get_hfar(vcpu) & ((1 << 12) - 1);
74cc7e0c 1809 ret = io_mem_abort(vcpu, fault_ipa);
94f8e641
CD
1810 goto out_unlock;
1811 }
1812
c3058d5d 1813 /* Userspace should not be able to register out-of-bounds IPAs */
e55cac5b 1814 VM_BUG_ON(fault_ipa >= kvm_phys_size(vcpu->kvm));
c3058d5d 1815
aeda9130
MZ
1816 if (fault_status == FSC_ACCESS) {
1817 handle_access_fault(vcpu, fault_ipa);
1818 ret = 1;
1819 goto out_unlock;
1820 }
1821
98047888 1822 ret = user_mem_abort(vcpu, fault_ipa, memslot, hva, fault_status);
94f8e641
CD
1823 if (ret == 0)
1824 ret = 1;
6d674e28
MZ
1825out:
1826 if (ret == -ENOEXEC) {
1827 kvm_inject_pabt(vcpu, kvm_vcpu_get_hfar(vcpu));
1828 ret = 1;
1829 }
94f8e641
CD
1830out_unlock:
1831 srcu_read_unlock(&vcpu->kvm->srcu, idx);
1832 return ret;
342cd0ab
CD
1833}
1834
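/*
 * Standalone sketch of the IPA fix-up done just before io_mem_abort():
 * the hardware reports a page-aligned IPA ([MAX:12]), so the bottom 12
 * bits are spliced in from the faulting VA. The sample values are made up.
 */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t fault_ipa = 0x09010000ULL;		/* page-aligned IPA */
	uint64_t far       = 0xffff80000abc0123ULL;	/* faulting virtual address */

	fault_ipa |= far & ((1ULL << 12) - 1);		/* add the page offset */

	printf("complete IPA: 0x%" PRIx64 "\n", fault_ipa);	/* prints 0x9010123 */
	return 0;
}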
1d2ebacc
MZ
1835static int handle_hva_to_gpa(struct kvm *kvm,
1836 unsigned long start,
1837 unsigned long end,
1838 int (*handler)(struct kvm *kvm,
056aad67
SP
1839 gpa_t gpa, u64 size,
1840 void *data),
1d2ebacc 1841 void *data)
d5d8184d
CD
1842{
1843 struct kvm_memslots *slots;
1844 struct kvm_memory_slot *memslot;
1d2ebacc 1845 int ret = 0;
d5d8184d
CD
1846
1847 slots = kvm_memslots(kvm);
1848
1849 /* we only care about the pages that the guest sees */
1850 kvm_for_each_memslot(memslot, slots) {
1851 unsigned long hva_start, hva_end;
056aad67 1852 gfn_t gpa;
d5d8184d
CD
1853
1854 hva_start = max(start, memslot->userspace_addr);
1855 hva_end = min(end, memslot->userspace_addr +
1856 (memslot->npages << PAGE_SHIFT));
1857 if (hva_start >= hva_end)
1858 continue;
1859
056aad67
SP
1860 gpa = hva_to_gfn_memslot(hva_start, memslot) << PAGE_SHIFT;
1861 ret |= handler(kvm, gpa, (u64)(hva_end - hva_start), data);
d5d8184d 1862 }
1d2ebacc
MZ
1863
1864 return ret;
d5d8184d
CD
1865}
1866
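/*
 * Standalone sketch of the clamping done by handle_hva_to_gpa() above:
 * the notifier range is intersected with one memslot's HVA window and the
 * start of the overlap is translated to a guest physical address. The
 * struct is a cut-down stand-in, not the kernel's kvm_memory_slot.
 */
#include <stdint.h>
#include <stdio.h>

#define TOY_PAGE_SHIFT 12

struct toy_slot {
	uint64_t userspace_addr;	/* HVA where the slot is mapped */
	uint64_t base_gfn;		/* first guest frame number */
	uint64_t npages;
};

static void walk_one(const struct toy_slot *slot, uint64_t start, uint64_t end)
{
	uint64_t hva_start, hva_end, gpa;

	hva_start = start > slot->userspace_addr ? start : slot->userspace_addr;
	hva_end = slot->userspace_addr + (slot->npages << TOY_PAGE_SHIFT);
	if (end < hva_end)
		hva_end = end;
	if (hva_start >= hva_end)
		return;				/* no overlap with this slot */

	gpa = (slot->base_gfn +
	       ((hva_start - slot->userspace_addr) >> TOY_PAGE_SHIFT)) << TOY_PAGE_SHIFT;
	printf("handler(gpa=0x%llx, size=0x%llx)\n",
	       (unsigned long long)gpa,
	       (unsigned long long)(hva_end - hva_start));
}

int main(void)
{
	/* slot: 512 pages mapped at HVA 0x7f0000000000, guest PA 0x40000 */
	struct toy_slot slot = { 0x7f0000000000ULL, 0x40, 512 };

	walk_one(&slot, 0x7f0000001000ULL, 0x7f0000003000ULL);
	return 0;
}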
056aad67 1867static int kvm_unmap_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
d5d8184d 1868{
b5331379
WD
1869 unsigned flags = *(unsigned *)data;
1870 bool may_block = flags & MMU_NOTIFIER_RANGE_BLOCKABLE;
1871
1872 __unmap_stage2_range(&kvm->arch.mmu, gpa, size, may_block);
1d2ebacc 1873 return 0;
d5d8184d
CD
1874}
1875
d5d8184d 1876int kvm_unmap_hva_range(struct kvm *kvm,
fdfe7cbd 1877 unsigned long start, unsigned long end, unsigned flags)
d5d8184d 1878{
a0e50aa3 1879 if (!kvm->arch.mmu.pgd)
d5d8184d
CD
1880 return 0;
1881
1882 trace_kvm_unmap_hva_range(start, end);
b5331379 1883 handle_hva_to_gpa(kvm, start, end, &kvm_unmap_hva_handler, &flags);
d5d8184d
CD
1884 return 0;
1885}
1886
056aad67 1887static int kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
d5d8184d 1888{
e9edb17a 1889 kvm_pfn_t *pfn = (kvm_pfn_t *)data;
d5d8184d 1890
056aad67 1891 WARN_ON(size != PAGE_SIZE);
e9edb17a 1892
15a49a44 1893 /*
e9edb17a
WD
1894 * The MMU notifiers will have unmapped a huge PMD before calling
1895 * ->change_pte() (which in turn calls kvm_set_spte_hva()) and
1896 * therefore we never need to clear out a huge PMD through this
1897 * calling path and a memcache is not required.
15a49a44 1898 */
e9edb17a
WD
1899 kvm_pgtable_stage2_map(kvm->arch.mmu.pgt, gpa, PAGE_SIZE,
1900 __pfn_to_phys(*pfn), KVM_PGTABLE_PROT_R, NULL);
1d2ebacc 1901 return 0;
d5d8184d
CD
1902}
1903
748c0e31 1904int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
d5d8184d
CD
1905{
1906 unsigned long end = hva + PAGE_SIZE;
694556d5 1907 kvm_pfn_t pfn = pte_pfn(pte);
d5d8184d 1908
e9edb17a 1909 if (!kvm->arch.mmu.pgt)
748c0e31 1910 return 0;
d5d8184d
CD
1911
1912 trace_kvm_set_spte_hva(hva);
694556d5
MZ
1913
1914 /*
1915 * We've moved a page around, probably through CoW, so let's treat it
1916 * just like a translation fault and clean the cache to the PoC.
1917 */
1918 clean_dcache_guest_page(pfn, PAGE_SIZE);
e9edb17a 1919 handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &pfn);
748c0e31 1920 return 0;
d5d8184d
CD
1921}
1922
056aad67 1923static int kvm_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
35307b9a 1924{
ee8efad7
WD
1925 pte_t pte;
1926 kvm_pte_t kpte;
35307b9a 1927
35a63966 1928 WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
ee8efad7
WD
1929 kpte = kvm_pgtable_stage2_mkold(kvm->arch.mmu.pgt, gpa);
1930 pte = __pte(kpte);
1931 return pte_valid(pte) && pte_young(pte);
35307b9a
MZ
1932}
1933
056aad67 1934static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
35307b9a 1935{
35a63966 1936 WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
ee8efad7 1937 return kvm_pgtable_stage2_is_young(kvm->arch.mmu.pgt, gpa);
35307b9a
MZ
1938}
1939
1940int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
1941{
a0e50aa3 1942 if (!kvm->arch.mmu.pgd)
7e5a6722 1943 return 0;
35307b9a
MZ
1944 trace_kvm_age_hva(start, end);
1945 return handle_hva_to_gpa(kvm, start, end, kvm_age_hva_handler, NULL);
1946}
1947
1948int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
1949{
a0e50aa3 1950 if (!kvm->arch.mmu.pgd)
7e5a6722 1951 return 0;
35307b9a 1952 trace_kvm_test_age_hva(hva);
cf2d23e0
GS
1953 return handle_hva_to_gpa(kvm, hva, hva + PAGE_SIZE,
1954 kvm_test_age_hva_handler, NULL);
35307b9a
MZ
1955}
1956
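/*
 * Toy sketch (not the kernel API) of the two aging operations wired up
 * above: kvm_age_hva() clears the access flag and reports whether the page
 * had been touched since the last pass, while kvm_test_age_hva() only
 * queries it without clearing.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define TOY_PTE_AF (1ULL << 10)

static bool toy_mkold(uint64_t *pte)	/* age: clear AF, report the old value */
{
	bool was_young = *pte & TOY_PTE_AF;

	*pte &= ~TOY_PTE_AF;
	return was_young;
}

static bool toy_is_young(uint64_t pte)	/* test-age: query only */
{
	return pte & TOY_PTE_AF;
}

int main(void)
{
	uint64_t pte = TOY_PTE_AF;

	printf("young before aging: %d\n", toy_is_young(pte));
	printf("mkold returned:     %d\n", toy_mkold(&pte));
	printf("young after aging:  %d\n", toy_is_young(pte));
	return 0;
}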
342cd0ab
CD
1957phys_addr_t kvm_mmu_get_httbr(void)
1958{
0f9d09b8 1959 return __pa(hyp_pgtable->pgd);
342cd0ab
CD
1960}
1961
5a677ce0
MZ
1962phys_addr_t kvm_get_idmap_vector(void)
1963{
1964 return hyp_idmap_vector;
1965}
1966
0f9d09b8 1967static int kvm_map_idmap_text(void)
0535a3e2 1968{
0f9d09b8
WD
1969 unsigned long size = hyp_idmap_end - hyp_idmap_start;
1970 int err = __create_hyp_mappings(hyp_idmap_start, size, hyp_idmap_start,
1971 PAGE_HYP_EXEC);
0535a3e2
MZ
1972 if (err)
1973 kvm_err("Failed to idmap %lx-%lx\n",
1974 hyp_idmap_start, hyp_idmap_end);
1975
1976 return err;
1977}
1978
342cd0ab
CD
1979int kvm_mmu_init(void)
1980{
2fb41059 1981 int err;
0f9d09b8 1982 u32 hyp_va_bits;
2fb41059 1983
0a78791c 1984 hyp_idmap_start = __pa_symbol(__hyp_idmap_text_start);
46fef158 1985 hyp_idmap_start = ALIGN_DOWN(hyp_idmap_start, PAGE_SIZE);
0a78791c 1986 hyp_idmap_end = __pa_symbol(__hyp_idmap_text_end);
46fef158 1987 hyp_idmap_end = ALIGN(hyp_idmap_end, PAGE_SIZE);
0a78791c 1988 hyp_idmap_vector = __pa_symbol(__kvm_hyp_init);
5a677ce0 1989
06f75a1f
AB
1990 /*
1991 * We rely on the linker script to ensure at build time that the HYP
1992 * init code does not cross a page boundary.
1993 */
1994 BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK);
5a677ce0 1995
0f9d09b8
WD
1996 hyp_va_bits = 64 - ((idmap_t0sz & TCR_T0SZ_MASK) >> TCR_T0SZ_OFFSET);
1997 kvm_debug("Using %u-bit virtual addresses at EL2\n", hyp_va_bits);
b4ef0499
MZ
1998 kvm_debug("IDMAP page: %lx\n", hyp_idmap_start);
1999 kvm_debug("HYP VA range: %lx:%lx\n",
2000 kern_hyp_va(PAGE_OFFSET),
2001 kern_hyp_va((unsigned long)high_memory - 1));
eac378a9 2002
6c41a413 2003 if (hyp_idmap_start >= kern_hyp_va(PAGE_OFFSET) &&
ed57cac8 2004 hyp_idmap_start < kern_hyp_va((unsigned long)high_memory - 1) &&
d2896d4b 2005 hyp_idmap_start != (unsigned long)__hyp_idmap_text_start) {
eac378a9
MZ
2006 /*
2007 * The idmap page is intersecting with the VA space,
2008 * it is not safe to continue further.
2009 */
2010 kvm_err("IDMAP intersecting with HYP VA, unable to continue\n");
2011 err = -EINVAL;
2012 goto out;
2013 }
2014
0f9d09b8
WD
2015 hyp_pgtable = kzalloc(sizeof(*hyp_pgtable), GFP_KERNEL);
2016 if (!hyp_pgtable) {
2017 kvm_err("Hyp mode page-table not allocated\n");
2fb41059
MZ
2018 err = -ENOMEM;
2019 goto out;
2020 }
2021
0f9d09b8
WD
2022 err = kvm_pgtable_hyp_init(hyp_pgtable, hyp_va_bits);
2023 if (err)
2024 goto out_free_pgtable;
d5d8184d 2025
0f9d09b8
WD
2026 err = kvm_map_idmap_text();
2027 if (err)
2028 goto out_destroy_pgtable;
5a677ce0 2029
e3f019b3 2030 io_map_base = hyp_idmap_start;
d5d8184d 2031 return 0;
0f9d09b8
WD
2032
2033out_destroy_pgtable:
2034 kvm_pgtable_hyp_destroy(hyp_pgtable);
2035out_free_pgtable:
2036 kfree(hyp_pgtable);
2037 hyp_pgtable = NULL;
2fb41059 2038out:
2fb41059 2039 return err;
342cd0ab 2040}
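/*
 * Standalone sketch of the hyp_va_bits computation in kvm_mmu_init():
 * the EL2 VA width is 64 - T0SZ. The mask/offset values below are local
 * stand-ins for TCR_T0SZ_MASK/TCR_T0SZ_OFFSET (T0SZ lives in bits [5:0]),
 * and the sample idmap_t0sz value is made up.
 */
#include <stdint.h>
#include <stdio.h>

#define TOY_T0SZ_OFFSET	0
#define TOY_T0SZ_MASK	(0x3fULL << TOY_T0SZ_OFFSET)

int main(void)
{
	uint64_t idmap_t0sz = 16;	/* T0SZ=16 -> 48-bit virtual addresses */
	unsigned int hyp_va_bits;

	hyp_va_bits = 64 - ((idmap_t0sz & TOY_T0SZ_MASK) >> TOY_T0SZ_OFFSET);
	printf("Using %u-bit virtual addresses at EL2\n", hyp_va_bits);
	return 0;
}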
df6ce24f
EA
2041
2042void kvm_arch_commit_memory_region(struct kvm *kvm,
09170a49 2043 const struct kvm_userspace_memory_region *mem,
9d4c197c 2044 struct kvm_memory_slot *old,
f36f3f28 2045 const struct kvm_memory_slot *new,
df6ce24f
EA
2046 enum kvm_mr_change change)
2047{
c6473555
MS
2048 /*
2049 * At this point memslot has been committed and there is an
656012c7 2050 * allocated dirty_bitmap[]; dirty pages will be tracked while the
c6473555
MS
2051 * memory slot is write protected.
2052 */
c862626e
KZ
2053 if (change != KVM_MR_DELETE && mem->flags & KVM_MEM_LOG_DIRTY_PAGES) {
2054 /*
2055 * If we're using initial-all-set, we don't need to write
2056 * protect any pages because they're all reported as dirty.
2057 * Huge pages and normal pages will be write protected gradually.
2058 */
2059 if (!kvm_dirty_log_manual_protect_and_init_set(kvm)) {
2060 kvm_mmu_wp_memory_region(kvm, mem->slot);
2061 }
2062 }
df6ce24f
EA
2063}
2064
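/*
 * Minimal sketch of the decision taken in kvm_arch_commit_memory_region():
 * if the slot still has dirty logging enabled, write-protect it now unless
 * userspace requested manual protection with the bitmap initially all set,
 * in which case pages are write-protected later, as userspace clears the
 * dirty log. The flag names below are illustrative, not the kernel's.
 */
#include <stdbool.h>
#include <stdio.h>

static bool should_write_protect_now(bool deleting_slot, bool log_dirty_pages,
				     bool manual_protect_init_all_set)
{
	if (deleting_slot || !log_dirty_pages)
		return false;
	return !manual_protect_init_all_set;
}

int main(void)
{
	printf("plain dirty logging     -> wp now: %d\n",
	       should_write_protect_now(false, true, false));
	printf("initial-all-set logging -> wp now: %d\n",
	       should_write_protect_now(false, true, true));
	return 0;
}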
2065int kvm_arch_prepare_memory_region(struct kvm *kvm,
2066 struct kvm_memory_slot *memslot,
09170a49 2067 const struct kvm_userspace_memory_region *mem,
df6ce24f
EA
2068 enum kvm_mr_change change)
2069{
8eef9123
AB
2070 hva_t hva = mem->userspace_addr;
2071 hva_t reg_end = hva + mem->memory_size;
2072 bool writable = !(mem->flags & KVM_MEM_READONLY);
2073 int ret = 0;
2074
15a49a44
MS
2075 if (change != KVM_MR_CREATE && change != KVM_MR_MOVE &&
2076 change != KVM_MR_FLAGS_ONLY)
8eef9123
AB
2077 return 0;
2078
c3058d5d
CD
2079 /*
2080 * Prevent userspace from creating a memory region outside of the IPA
2081 * space addressable by the KVM guest.
2082 */
2083 if (memslot->base_gfn + memslot->npages >=
e55cac5b 2084 (kvm_phys_size(kvm) >> PAGE_SHIFT))
c3058d5d
CD
2085 return -EFAULT;
2086
89154dd5 2087 mmap_read_lock(current->mm);
8eef9123
AB
2088 /*
2089 * A memory region could potentially cover multiple VMAs, and any holes
2090 * between them, so iterate over all of them to find out if we can map
2091 * any of them right now.
2092 *
2093 * +--------------------------------------------+
2094 * +---------------+----------------+ +----------------+
2095 * | : VMA 1 | VMA 2 | | VMA 3 : |
2096 * +---------------+----------------+ +----------------+
2097 * | memory region |
2098 * +--------------------------------------------+
2099 */
2100 do {
2101 struct vm_area_struct *vma = find_vma(current->mm, hva);
2102 hva_t vm_start, vm_end;
2103
2104 if (!vma || vma->vm_start >= reg_end)
2105 break;
2106
8eef9123
AB
2107 /*
2108 * Take the intersection of this VMA with the memory region
2109 */
2110 vm_start = max(hva, vma->vm_start);
2111 vm_end = min(reg_end, vma->vm_end);
2112
2113 if (vma->vm_flags & VM_PFNMAP) {
2114 gpa_t gpa = mem->guest_phys_addr +
2115 (vm_start - mem->userspace_addr);
ca09f02f
MM
2116 phys_addr_t pa;
2117
2118 pa = (phys_addr_t)vma->vm_pgoff << PAGE_SHIFT;
2119 pa += vm_start - vma->vm_start;
8eef9123 2120
15a49a44 2121 /* IO region dirty page logging not allowed */
72f31048
MZ
2122 if (memslot->flags & KVM_MEM_LOG_DIRTY_PAGES) {
2123 ret = -EINVAL;
2124 goto out;
2125 }
15a49a44 2126
8eef9123
AB
2127 ret = kvm_phys_addr_ioremap(kvm, gpa, pa,
2128 vm_end - vm_start,
2129 writable);
2130 if (ret)
2131 break;
2132 }
2133 hva = vm_end;
2134 } while (hva < reg_end);
2135
15a49a44 2136 if (change == KVM_MR_FLAGS_ONLY)
72f31048 2137 goto out;
15a49a44 2138
849260c7
AB
2139 spin_lock(&kvm->mmu_lock);
2140 if (ret)
a0e50aa3 2141 unmap_stage2_range(&kvm->arch.mmu, mem->guest_phys_addr, mem->memory_size);
849260c7
AB
2142 else
2143 stage2_flush_memslot(kvm, memslot);
2144 spin_unlock(&kvm->mmu_lock);
72f31048 2145out:
89154dd5 2146 mmap_read_unlock(current->mm);
8eef9123 2147 return ret;
df6ce24f
EA
2148}
2149
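/*
 * Standalone sketch of the VMA walk pictured in the diagram inside
 * kvm_arch_prepare_memory_region(): a memslot may span several VMAs with
 * holes between them, so each iteration intersects the slot's HVA range
 * with one VMA. The array stands in for find_vma(); addresses are made up.
 */
#include <stdint.h>
#include <stdio.h>

struct toy_vma {
	uint64_t vm_start, vm_end;
};

int main(void)
{
	const struct toy_vma vmas[] = {
		{ 0x1000, 0x4000 }, { 0x4000, 0x6000 }, { 0x8000, 0xa000 },
	};
	uint64_t hva = 0x2000, reg_end = 0x9000;	/* the memory region */
	unsigned int i;

	for (i = 0; i < 3 && hva < reg_end; i++) {
		const struct toy_vma *vma = &vmas[i];
		uint64_t vm_start, vm_end;

		if (vma->vm_start >= reg_end)
			break;

		/* take the intersection of this VMA with the memory region */
		vm_start = hva > vma->vm_start ? hva : vma->vm_start;
		vm_end = reg_end < vma->vm_end ? reg_end : vma->vm_end;
		if (vm_start < vm_end)
			printf("map [0x%llx, 0x%llx)\n",
			       (unsigned long long)vm_start,
			       (unsigned long long)vm_end);

		hva = vma->vm_end;
	}
	return 0;
}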
e96c81ee 2150void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
df6ce24f
EA
2151{
2152}
2153
15248258 2154void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
df6ce24f
EA
2155{
2156}
2157
2158void kvm_arch_flush_shadow_all(struct kvm *kvm)
2159{
a0e50aa3 2160 kvm_free_stage2_pgd(&kvm->arch.mmu);
df6ce24f
EA
2161}
2162
2163void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
2164 struct kvm_memory_slot *slot)
2165{
8eef9123
AB
2166 gpa_t gpa = slot->base_gfn << PAGE_SHIFT;
2167 phys_addr_t size = slot->npages << PAGE_SHIFT;
2168
2169 spin_lock(&kvm->mmu_lock);
a0e50aa3 2170 unmap_stage2_range(&kvm->arch.mmu, gpa, size);
8eef9123 2171 spin_unlock(&kvm->mmu_lock);
df6ce24f 2172}
3c1e7165
MZ
2173
2174/*
2175 * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized).
2176 *
2177 * Main problems:
2178 * - S/W ops are local to a CPU (not broadcast)
2179 * - We have line migration behind our back (speculation)
2180 * - System caches don't support S/W at all (damn!)
2181 *
2182 * In the face of the above, the best we can do is to try and convert
2183 * S/W ops to VA ops. Because the guest is not allowed to infer the
2184 * S/W to PA mapping, it can only use S/W to nuke the whole cache,
2185 * which is a rather good thing for us.
2186 *
2187 * Also, it is only used when turning caches on/off ("The expected
2188 * usage of the cache maintenance instructions that operate by set/way
2189 * is associated with the cache maintenance instructions associated
2190 * with the powerdown and powerup of caches, if this is required by
2191 * the implementation.").
2192 *
2193 * We use the following policy:
2194 *
2195 * - If we trap a S/W operation, we enable VM trapping to detect
2196 * caches being turned on/off, and do a full clean.
2197 *
2198 * - We flush the caches when the caches are turned either on or off.
2199 *
2200 * - Once the caches are enabled, we stop trapping VM ops.
2201 */
2202void kvm_set_way_flush(struct kvm_vcpu *vcpu)
2203{
3df59d8d 2204 unsigned long hcr = *vcpu_hcr(vcpu);
3c1e7165
MZ
2205
2206 /*
2207 * If this is the first time we do a S/W operation
2208 * (i.e. HCR_TVM not set) flush the whole of guest memory and
2209 * enable VM trapping.
2210 *
2211 * Otherwise, rely on the VM trapping to wait for the MMU +
2212 * Caches to be turned off. At that point, we'll be able to
2213 * clean the caches again.
2214 */
2215 if (!(hcr & HCR_TVM)) {
2216 trace_kvm_set_way_flush(*vcpu_pc(vcpu),
2217 vcpu_has_cache_enabled(vcpu));
2218 stage2_flush_vm(vcpu->kvm);
3df59d8d 2219 *vcpu_hcr(vcpu) = hcr | HCR_TVM;
3c1e7165
MZ
2220 }
2221}
2222
2223void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled)
2224{
2225 bool now_enabled = vcpu_has_cache_enabled(vcpu);
2226
2227 /*
2228 * If switching the MMU+caches on, we need to invalidate the caches.
2229 * If switching them off, we need to clean the caches.
2230 * Clean + invalidate always does the trick.
2231 */
2232 if (now_enabled != was_enabled)
2233 stage2_flush_vm(vcpu->kvm);
2234
2235 /* Caches are now on, stop trapping VM ops (until a S/W op) */
2236 if (now_enabled)
3df59d8d 2237 *vcpu_hcr(vcpu) &= ~HCR_TVM;
3c1e7165
MZ
2238
2239 trace_kvm_toggle_cache(*vcpu_pc(vcpu), was_enabled, now_enabled);
2240}
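/*
 * Minimal model of the set/way trapping policy described above: a S/W op
 * triggers a full clean of guest memory and turns on VM-register trapping
 * (HCR_TVM); once the guest re-enables its caches, trapping is dropped
 * again until the next S/W op. This is an illustration only, not the
 * kernel's vcpu state.
 */
#include <stdbool.h>
#include <stdio.h>

struct toy_vcpu {
	bool tvm;		/* trapping VM register writes (HCR_TVM)? */
	bool cache_enabled;
};

static void on_set_way_op(struct toy_vcpu *v)
{
	if (!v->tvm) {
		printf("S/W op: flush the whole VM, enable TVM\n");
		v->tvm = true;
	}
}

static void on_cache_toggle(struct toy_vcpu *v, bool now_enabled)
{
	if (now_enabled != v->cache_enabled)
		printf("cache state changed: flush the whole VM\n");
	v->cache_enabled = now_enabled;
	if (now_enabled)
		v->tvm = false;	/* caches are on: stop trapping VM ops */
}

int main(void)
{
	struct toy_vcpu v = { .tvm = false, .cache_enabled = true };

	on_set_way_op(&v);		/* guest starts S/W maintenance */
	on_cache_toggle(&v, false);	/* caches off: clean */
	on_cache_toggle(&v, true);	/* caches on: invalidate, untrap */
	return 0;
}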