Merge branches 'pm-cpuidle', 'pm-sleep' and 'pm-powercap'
[linux-block.git] / arch / x86 / virt / svm / sev.c
CommitLineData
216d106c
BS
1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * AMD SVM-SEV Host Support.
4 *
5 * Copyright (C) 2023 Advanced Micro Devices, Inc.
6 *
7 * Author: Ashish Kalra <ashish.kalra@amd.com>
8 *
9 */
10
11#include <linux/cc_platform.h>
12#include <linux/printk.h>
13#include <linux/mm_types.h>
14#include <linux/set_memory.h>
15#include <linux/memblock.h>
16#include <linux/kernel.h>
17#include <linux/mm.h>
18#include <linux/cpumask.h>
19#include <linux/iommu.h>
20#include <linux/amd-iommu.h>
21
22#include <asm/sev.h>
23#include <asm/processor.h>
24#include <asm/setup.h>
25#include <asm/svm.h>
26#include <asm/smp.h>
27#include <asm/cpu.h>
28#include <asm/apic.h>
29#include <asm/cpuid.h>
30#include <asm/cmdline.h>
31#include <asm/iommu.h>
32
33/*
34 * The RMP entry format is not architectural. The format is defined in PPR
35 * Family 19h Model 01h, Rev B1 processor.
36 */
37struct rmpentry {
1f568d36
BS
38 union {
39 struct {
40 u64 assigned : 1,
41 pagesize : 1,
42 immutable : 1,
43 rsvd1 : 9,
44 gpa : 39,
45 asid : 10,
46 vmsa : 1,
47 validated : 1,
48 rsvd2 : 1;
49 };
50 u64 lo;
51 };
52 u64 hi;
216d106c
BS
53} __packed;
54
55/*
56 * The first 16KB from the RMP_BASE is used by the processor for the
57 * bookkeeping, the range needs to be added during the RMP entry lookup.
58 */
59#define RMPTABLE_CPU_BOOKKEEPING_SZ 0x4000
60
94b36bc2
BS
61/* Mask to apply to a PFN to get the first PFN of a 2MB page */
62#define PFN_PMD_MASK GENMASK_ULL(63, PMD_SHIFT - PAGE_SHIFT)
63
216d106c
BS
64static u64 probed_rmp_base, probed_rmp_size;
65static struct rmpentry *rmptable __ro_after_init;
66static u64 rmptable_max_pfn __ro_after_init;
67
8dac6429
AK
68static LIST_HEAD(snp_leaked_pages_list);
69static DEFINE_SPINLOCK(snp_leaked_pages_list_lock);
70
71static unsigned long snp_nr_leaked_pages;
72
216d106c
BS
73#undef pr_fmt
74#define pr_fmt(fmt) "SEV-SNP: " fmt
75
76static int __mfd_enable(unsigned int cpu)
77{
78 u64 val;
79
0ecaefb3 80 if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
216d106c
BS
81 return 0;
82
83 rdmsrl(MSR_AMD64_SYSCFG, val);
84
85 val |= MSR_AMD64_SYSCFG_MFDM;
86
87 wrmsrl(MSR_AMD64_SYSCFG, val);
88
89 return 0;
90}
91
92static __init void mfd_enable(void *arg)
93{
94 __mfd_enable(smp_processor_id());
95}
96
97static int __snp_enable(unsigned int cpu)
98{
99 u64 val;
100
0ecaefb3 101 if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
216d106c
BS
102 return 0;
103
104 rdmsrl(MSR_AMD64_SYSCFG, val);
105
106 val |= MSR_AMD64_SYSCFG_SNP_EN;
107 val |= MSR_AMD64_SYSCFG_SNP_VMPL_EN;
108
109 wrmsrl(MSR_AMD64_SYSCFG, val);
110
111 return 0;
112}
113
114static __init void snp_enable(void *arg)
115{
116 __snp_enable(smp_processor_id());
117}
118
119#define RMP_ADDR_MASK GENMASK_ULL(51, 13)
120
121bool snp_probe_rmptable_info(void)
122{
123 u64 max_rmp_pfn, calc_rmp_sz, rmp_sz, rmp_base, rmp_end;
124
125 rdmsrl(MSR_AMD64_RMP_BASE, rmp_base);
126 rdmsrl(MSR_AMD64_RMP_END, rmp_end);
127
128 if (!(rmp_base & RMP_ADDR_MASK) || !(rmp_end & RMP_ADDR_MASK)) {
129 pr_err("Memory for the RMP table has not been reserved by BIOS\n");
130 return false;
131 }
132
133 if (rmp_base > rmp_end) {
134 pr_err("RMP configuration not valid: base=%#llx, end=%#llx\n", rmp_base, rmp_end);
135 return false;
136 }
137
138 rmp_sz = rmp_end - rmp_base + 1;
139
140 /*
141 * Calculate the amount the memory that must be reserved by the BIOS to
142 * address the whole RAM, including the bookkeeping area. The RMP itself
143 * must also be covered.
144 */
145 max_rmp_pfn = max_pfn;
146 if (PHYS_PFN(rmp_end) > max_pfn)
147 max_rmp_pfn = PHYS_PFN(rmp_end);
148
149 calc_rmp_sz = (max_rmp_pfn << 4) + RMPTABLE_CPU_BOOKKEEPING_SZ;
150
151 if (calc_rmp_sz > rmp_sz) {
152 pr_err("Memory reserved for the RMP table does not cover full system RAM (expected 0x%llx got 0x%llx)\n",
153 calc_rmp_sz, rmp_sz);
154 return false;
155 }
156
157 probed_rmp_base = rmp_base;
158 probed_rmp_size = rmp_sz;
159
160 pr_info("RMP table physical range [0x%016llx - 0x%016llx]\n",
161 probed_rmp_base, probed_rmp_base + probed_rmp_size - 1);
162
163 return true;
164}
165
400fea4b
AK
166static void __init __snp_fixup_e820_tables(u64 pa)
167{
168 if (IS_ALIGNED(pa, PMD_SIZE))
169 return;
170
171 /*
172 * Handle cases where the RMP table placement by the BIOS is not
173 * 2M aligned and the kexec kernel could try to allocate
174 * from within that chunk which then causes a fatal RMP fault.
175 *
176 * The e820_table needs to be updated as it is converted to
177 * kernel memory resources and used by KEXEC_FILE_LOAD syscall
178 * to load kexec segments.
179 *
180 * The e820_table_firmware needs to be updated as it is exposed
181 * to sysfs and used by the KEXEC_LOAD syscall to load kexec
182 * segments.
183 *
184 * The e820_table_kexec needs to be updated as it passed to
185 * the kexec-ed kernel.
186 */
187 pa = ALIGN_DOWN(pa, PMD_SIZE);
188 if (e820__mapped_any(pa, pa + PMD_SIZE, E820_TYPE_RAM)) {
189 pr_info("Reserving start/end of RMP table on a 2MB boundary [0x%016llx]\n", pa);
190 e820__range_update(pa, PMD_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED);
191 e820__range_update_table(e820_table_kexec, pa, PMD_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED);
192 e820__range_update_table(e820_table_firmware, pa, PMD_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED);
193 }
194}
195
196void __init snp_fixup_e820_tables(void)
197{
198 __snp_fixup_e820_tables(probed_rmp_base);
199 __snp_fixup_e820_tables(probed_rmp_base + probed_rmp_size);
200}
201
216d106c
BS
202/*
203 * Do the necessary preparations which are verified by the firmware as
204 * described in the SNP_INIT_EX firmware command description in the SNP
205 * firmware ABI spec.
206 */
207static int __init snp_rmptable_init(void)
208{
209 void *rmptable_start;
210 u64 rmptable_size;
211 u64 val;
212
0ecaefb3 213 if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
216d106c
BS
214 return 0;
215
216 if (!amd_iommu_snp_en)
0ecaefb3 217 goto nosnp;
216d106c
BS
218
219 if (!probed_rmp_size)
220 goto nosnp;
221
222 rmptable_start = memremap(probed_rmp_base, probed_rmp_size, MEMREMAP_WB);
223 if (!rmptable_start) {
224 pr_err("Failed to map RMP table\n");
225 return 1;
226 }
227
228 /*
229 * Check if SEV-SNP is already enabled, this can happen in case of
230 * kexec boot.
231 */
232 rdmsrl(MSR_AMD64_SYSCFG, val);
233 if (val & MSR_AMD64_SYSCFG_SNP_EN)
234 goto skip_enable;
235
236 memset(rmptable_start, 0, probed_rmp_size);
237
238 /* Flush the caches to ensure that data is written before SNP is enabled. */
239 wbinvd_on_all_cpus();
240
241 /* MtrrFixDramModEn must be enabled on all the CPUs prior to enabling SNP. */
242 on_each_cpu(mfd_enable, NULL, 1);
243
244 on_each_cpu(snp_enable, NULL, 1);
245
246skip_enable:
247 rmptable_start += RMPTABLE_CPU_BOOKKEEPING_SZ;
248 rmptable_size = probed_rmp_size - RMPTABLE_CPU_BOOKKEEPING_SZ;
249
250 rmptable = (struct rmpentry *)rmptable_start;
251 rmptable_max_pfn = rmptable_size / sizeof(struct rmpentry) - 1;
252
253 cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/rmptable_init:online", __snp_enable, NULL);
254
8ef97958
AK
255 /*
256 * Setting crash_kexec_post_notifiers to 'true' to ensure that SNP panic
257 * notifier is invoked to do SNP IOMMU shutdown before kdump.
258 */
259 crash_kexec_post_notifiers = true;
260
216d106c
BS
261 return 0;
262
263nosnp:
0ecaefb3 264 cc_platform_clear(CC_ATTR_HOST_SEV_SNP);
216d106c
BS
265 return -ENOSYS;
266}
267
268/*
269 * This must be called after the IOMMU has been initialized.
270 */
271device_initcall(snp_rmptable_init);
94b36bc2
BS
272
273static struct rmpentry *get_rmpentry(u64 pfn)
274{
275 if (WARN_ON_ONCE(pfn > rmptable_max_pfn))
276 return ERR_PTR(-EFAULT);
277
278 return &rmptable[pfn];
279}
280
281static struct rmpentry *__snp_lookup_rmpentry(u64 pfn, int *level)
282{
283 struct rmpentry *large_entry, *entry;
284
0ecaefb3 285 if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
94b36bc2
BS
286 return ERR_PTR(-ENODEV);
287
288 entry = get_rmpentry(pfn);
289 if (IS_ERR(entry))
290 return entry;
291
292 /*
293 * Find the authoritative RMP entry for a PFN. This can be either a 4K
294 * RMP entry or a special large RMP entry that is authoritative for a
295 * whole 2M area.
296 */
297 large_entry = get_rmpentry(pfn & PFN_PMD_MASK);
298 if (IS_ERR(large_entry))
299 return large_entry;
300
301 *level = RMP_TO_PG_LEVEL(large_entry->pagesize);
302
303 return entry;
304}
305
306int snp_lookup_rmpentry(u64 pfn, bool *assigned, int *level)
307{
308 struct rmpentry *e;
309
310 e = __snp_lookup_rmpentry(pfn, level);
311 if (IS_ERR(e))
312 return PTR_ERR(e);
313
314 *assigned = !!e->assigned;
315 return 0;
316}
317EXPORT_SYMBOL_GPL(snp_lookup_rmpentry);
1f568d36
BS
318
319/*
320 * Dump the raw RMP entry for a particular PFN. These bits are documented in the
321 * PPR for a particular CPU model and provide useful information about how a
322 * particular PFN is being utilized by the kernel/firmware at the time certain
323 * unexpected events occur, such as RMP faults.
324 */
325static void dump_rmpentry(u64 pfn)
326{
327 u64 pfn_i, pfn_end;
328 struct rmpentry *e;
329 int level;
330
331 e = __snp_lookup_rmpentry(pfn, &level);
332 if (IS_ERR(e)) {
333 pr_err("Failed to read RMP entry for PFN 0x%llx, error %ld\n",
334 pfn, PTR_ERR(e));
335 return;
336 }
337
338 if (e->assigned) {
339 pr_info("PFN 0x%llx, RMP entry: [0x%016llx - 0x%016llx]\n",
340 pfn, e->lo, e->hi);
341 return;
342 }
343
344 /*
345 * If the RMP entry for a particular PFN is not in an assigned state,
346 * then it is sometimes useful to get an idea of whether or not any RMP
347 * entries for other PFNs within the same 2MB region are assigned, since
348 * those too can affect the ability to access a particular PFN in
349 * certain situations, such as when the PFN is being accessed via a 2MB
350 * mapping in the host page table.
351 */
352 pfn_i = ALIGN_DOWN(pfn, PTRS_PER_PMD);
353 pfn_end = pfn_i + PTRS_PER_PMD;
354
355 pr_info("PFN 0x%llx unassigned, dumping non-zero entries in 2M PFN region: [0x%llx - 0x%llx]\n",
356 pfn, pfn_i, pfn_end);
357
358 while (pfn_i < pfn_end) {
359 e = __snp_lookup_rmpentry(pfn_i, &level);
360 if (IS_ERR(e)) {
361 pr_err("Error %ld reading RMP entry for PFN 0x%llx\n",
362 PTR_ERR(e), pfn_i);
363 pfn_i++;
364 continue;
365 }
366
367 if (e->lo || e->hi)
368 pr_info("PFN: 0x%llx, [0x%016llx - 0x%016llx]\n", pfn_i, e->lo, e->hi);
369 pfn_i++;
370 }
371}
372
373void snp_dump_hva_rmpentry(unsigned long hva)
374{
375 unsigned long paddr;
376 unsigned int level;
377 pgd_t *pgd;
378 pte_t *pte;
379
380 pgd = __va(read_cr3_pa());
381 pgd += pgd_index(hva);
382 pte = lookup_address_in_pgd(pgd, hva, &level);
383
384 if (!pte) {
385 pr_err("Can't dump RMP entry for HVA %lx: no PTE/PFN found\n", hva);
386 return;
387 }
388
389 paddr = PFN_PHYS(pte_pfn(*pte)) | (hva & ~page_level_mask(level));
390 dump_rmpentry(PHYS_PFN(paddr));
391}
2c35819e
BS
392
393/*
394 * PSMASH a 2MB aligned page into 4K pages in the RMP table while preserving the
395 * Validated bit.
396 */
397int psmash(u64 pfn)
398{
399 unsigned long paddr = pfn << PAGE_SHIFT;
400 int ret;
401
0ecaefb3 402 if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
2c35819e
BS
403 return -ENODEV;
404
405 if (!pfn_valid(pfn))
406 return -EINVAL;
407
408 /* Binutils version 2.36 supports the PSMASH mnemonic. */
409 asm volatile(".byte 0xF3, 0x0F, 0x01, 0xFF"
410 : "=a" (ret)
411 : "a" (paddr)
412 : "memory", "cc");
413
414 return ret;
415}
416EXPORT_SYMBOL_GPL(psmash);
417
661b1c61
MR
418/*
419 * If the kernel uses a 2MB or larger directmap mapping to write to an address,
420 * and that mapping contains any 4KB pages that are set to private in the RMP
421 * table, an RMP #PF will trigger and cause a host crash. Hypervisor code that
422 * owns the PFNs being transitioned will never attempt such a write, but other
423 * kernel tasks writing to other PFNs in the range may trigger these checks
424 * inadvertently due a large directmap mapping that happens to overlap such a
425 * PFN.
426 *
427 * Prevent this by splitting any 2MB+ mappings that might end up containing a
428 * mix of private/shared PFNs as a result of a subsequent RMPUPDATE for the
429 * PFN/rmp_level passed in.
430 *
431 * Note that there is no attempt here to scan all the RMP entries for the 2MB
432 * physical range, since it would only be worthwhile in determining if a
433 * subsequent RMPUPDATE for a 4KB PFN would result in all the entries being of
434 * the same shared/private state, thus avoiding the need to split the mapping.
435 * But that would mean the entries are currently in a mixed state, and so the
436 * mapping would have already been split as a result of prior transitions.
437 * And since the 4K split is only done if the mapping is 2MB+, and there isn't
438 * currently a mechanism in place to restore 2MB+ mappings, such a check would
439 * not provide any usable benefit.
440 *
441 * More specifics on how these checks are carried out can be found in APM
442 * Volume 2, "RMP and VMPL Access Checks".
443 */
444static int adjust_direct_map(u64 pfn, int rmp_level)
445{
446 unsigned long vaddr;
447 unsigned int level;
448 int npages, ret;
449 pte_t *pte;
450
451 /*
452 * pfn_to_kaddr() will return a vaddr only within the direct
453 * map range.
454 */
455 vaddr = (unsigned long)pfn_to_kaddr(pfn);
456
457 /* Only 4KB/2MB RMP entries are supported by current hardware. */
458 if (WARN_ON_ONCE(rmp_level > PG_LEVEL_2M))
459 return -EINVAL;
460
461 if (!pfn_valid(pfn))
462 return -EINVAL;
463
464 if (rmp_level == PG_LEVEL_2M &&
465 (!IS_ALIGNED(pfn, PTRS_PER_PMD) || !pfn_valid(pfn + PTRS_PER_PMD - 1)))
466 return -EINVAL;
467
468 /*
469 * If an entire 2MB physical range is being transitioned, then there is
470 * no risk of RMP #PFs due to write accesses from overlapping mappings,
471 * since even accesses from 1GB mappings will be treated as 2MB accesses
472 * as far as RMP table checks are concerned.
473 */
474 if (rmp_level == PG_LEVEL_2M)
475 return 0;
476
477 pte = lookup_address(vaddr, &level);
478 if (!pte || pte_none(*pte))
479 return 0;
480
481 if (level == PG_LEVEL_4K)
482 return 0;
483
484 npages = page_level_size(rmp_level) / PAGE_SIZE;
485 ret = set_memory_4k(vaddr, npages);
486 if (ret)
487 pr_warn("Failed to split direct map for PFN 0x%llx, ret: %d\n",
488 pfn, ret);
489
490 return ret;
491}
492
2c35819e
BS
493/*
494 * It is expected that those operations are seldom enough so that no mutual
495 * exclusion of updaters is needed and thus the overlap error condition below
496 * should happen very rarely and would get resolved relatively quickly by
497 * the firmware.
498 *
499 * If not, one could consider introducing a mutex or so here to sync concurrent
500 * RMP updates and thus diminish the amount of cases where firmware needs to
501 * lock 2M ranges to protect against concurrent updates.
502 *
503 * The optimal solution would be range locking to avoid locking disjoint
504 * regions unnecessarily but there's no support for that yet.
505 */
506static int rmpupdate(u64 pfn, struct rmp_state *state)
507{
508 unsigned long paddr = pfn << PAGE_SHIFT;
661b1c61 509 int ret, level;
2c35819e 510
0ecaefb3 511 if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
2c35819e
BS
512 return -ENODEV;
513
661b1c61
MR
514 level = RMP_TO_PG_LEVEL(state->pagesize);
515
516 if (adjust_direct_map(pfn, level))
517 return -EFAULT;
518
2c35819e
BS
519 do {
520 /* Binutils version 2.36 supports the RMPUPDATE mnemonic. */
521 asm volatile(".byte 0xF2, 0x0F, 0x01, 0xFE"
522 : "=a" (ret)
523 : "a" (paddr), "c" ((unsigned long)state)
524 : "memory", "cc");
525 } while (ret == RMPUPDATE_FAIL_OVERLAP);
526
527 if (ret) {
661b1c61
MR
528 pr_err("RMPUPDATE failed for PFN %llx, pg_level: %d, ret: %d\n",
529 pfn, level, ret);
2c35819e
BS
530 dump_rmpentry(pfn);
531 dump_stack();
532 return -EFAULT;
533 }
534
535 return 0;
536}
537
538/* Transition a page to guest-owned/private state in the RMP table. */
539int rmp_make_private(u64 pfn, u64 gpa, enum pg_level level, u32 asid, bool immutable)
540{
541 struct rmp_state state;
542
543 memset(&state, 0, sizeof(state));
544 state.assigned = 1;
545 state.asid = asid;
546 state.immutable = immutable;
547 state.gpa = gpa;
548 state.pagesize = PG_LEVEL_TO_RMP(level);
549
550 return rmpupdate(pfn, &state);
551}
552EXPORT_SYMBOL_GPL(rmp_make_private);
553
554/* Transition a page to hypervisor-owned/shared state in the RMP table. */
555int rmp_make_shared(u64 pfn, enum pg_level level)
556{
557 struct rmp_state state;
558
559 memset(&state, 0, sizeof(state));
560 state.pagesize = PG_LEVEL_TO_RMP(level);
561
562 return rmpupdate(pfn, &state);
563}
564EXPORT_SYMBOL_GPL(rmp_make_shared);
8dac6429
AK
565
566void snp_leak_pages(u64 pfn, unsigned int npages)
567{
568 struct page *page = pfn_to_page(pfn);
569
570 pr_warn("Leaking PFN range 0x%llx-0x%llx\n", pfn, pfn + npages);
571
572 spin_lock(&snp_leaked_pages_list_lock);
573 while (npages--) {
574
575 /*
576 * Reuse the page's buddy list for chaining into the leaked
577 * pages list. This page should not be on a free list currently
578 * and is also unsafe to be added to a free list.
579 */
580 if (likely(!PageCompound(page)) ||
581
582 /*
583 * Skip inserting tail pages of compound page as
584 * page->buddy_list of tail pages is not usable.
585 */
586 (PageHead(page) && compound_nr(page) <= npages))
587 list_add_tail(&page->buddy_list, &snp_leaked_pages_list);
588
589 dump_rmpentry(pfn);
590 snp_nr_leaked_pages++;
591 pfn++;
592 page++;
593 }
594 spin_unlock(&snp_leaked_pages_list_lock);
595}
596EXPORT_SYMBOL_GPL(snp_leak_pages);
0ecaefb3
BPA
597
598void kdump_sev_callback(void)
599{
600 /*
601 * Do wbinvd() on remote CPUs when SNP is enabled in order to
602 * safely do SNP_SHUTDOWN on the local CPU.
603 */
604 if (cc_platform_has(CC_ATTR_HOST_SEV_SNP))
605 wbinvd();
606}