arch/x86/virt/svm/sev.c

   1 // SPDX-License-Identifier: GPL-2.0-only
   2 /*
   3  * AMD SVM-SEV Host Support.
   4  *
   5  * Copyright (C) 2023 Advanced Micro Devices, Inc.
   6  *
   7  * Author: Ashish Kalra <ashish.kalra@amd.com>
   8  *
   9  */
  10
  11 #include <linux/cc_platform.h>
  12 #include <linux/printk.h>
  13 #include <linux/mm_types.h>
  14 #include <linux/set_memory.h>
  15 #include <linux/memblock.h>
  16 #include <linux/kernel.h>
  17 #include <linux/mm.h>
  18 #include <linux/cpumask.h>
  19 #include <linux/iommu.h>
  20 #include <linux/amd-iommu.h>
  21
  22 #include <asm/sev.h>
  23 #include <asm/processor.h>
  24 #include <asm/setup.h>
  25 #include <asm/svm.h>
  26 #include <asm/smp.h>
  27 #include <asm/cpu.h>
  28 #include <asm/apic.h>
  29 #include <asm/cpuid.h>
  30 #include <asm/cmdline.h>
  31 #include <asm/iommu.h>
  32
  33 /*
  34  * The RMP entry format is not architectural. The format is defined in PPR
  35  * Family 19h Model 01h, Rev B1 processor.
  36  */
  37 struct rmpentry {
  38         union {
  39                 struct {
  40                         u64 assigned    : 1,
  41                             pagesize    : 1,
  42                             immutable   : 1,
  43                             rsvd1       : 9,
  44                             gpa         : 39,
  45                             asid        : 10,
  46                             vmsa        : 1,
  47                             validated   : 1,
  48                             rsvd2       : 1;
  49                 };
  50                 u64 lo;
  51         };
  52         u64 hi;
  53 } __packed;
  54
/*
 * The first 16KB from the RMP_BASE is used by the processor for
 * bookkeeping, so this offset must be added to the start of the mapping to
 * reach the first RMP entry.
 */
#define RMPTABLE_CPU_BOOKKEEPING_SZ     0x4000

/* Mask to apply to a PFN to get the first PFN of a 2MB page */
#define PFN_PMD_MASK    GENMASK_ULL(63, PMD_SHIFT - PAGE_SHIFT)

/* Physical base and size of the RMP, recorded by snp_probe_rmptable_info(). */
static u64 probed_rmp_base, probed_rmp_size;
/* Mapped RMP entries (bookkeeping area skipped), set by snp_rmptable_init(). */
static struct rmpentry *rmptable __ro_after_init;
/* Highest PFN that has an entry in rmptable. */
static u64 rmptable_max_pfn __ro_after_init;

/* Pages handed to snp_leak_pages(), chained through page->buddy_list. */
static LIST_HEAD(snp_leaked_pages_list);
static DEFINE_SPINLOCK(snp_leaked_pages_list_lock);

/* Number of pages passed through snp_leak_pages() so far. */
static unsigned long snp_nr_leaked_pages;

/* Prefix every pr_*() message emitted below this point with "SEV-SNP: ". */
#undef pr_fmt
#define pr_fmt(fmt)     "SEV-SNP: " fmt
  75
  76 static int __mfd_enable(unsigned int cpu)
  77 {
  78         u64 val;
  79
  80         if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP))
  81                 return 0;
  82
  83         rdmsrl(MSR_AMD64_SYSCFG, val);
  84
  85         val |= MSR_AMD64_SYSCFG_MFDM;
  86
  87         wrmsrl(MSR_AMD64_SYSCFG, val);
  88
  89         return 0;
  90 }
  91
  92 static __init void mfd_enable(void *arg)
  93 {
  94         __mfd_enable(smp_processor_id());
  95 }
  96
  97 static int __snp_enable(unsigned int cpu)
  98 {
  99         u64 val;
 100
 101         if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP))
 102                 return 0;
 103
 104         rdmsrl(MSR_AMD64_SYSCFG, val);
 105
 106         val |= MSR_AMD64_SYSCFG_SNP_EN;
 107         val |= MSR_AMD64_SYSCFG_SNP_VMPL_EN;
 108
 109         wrmsrl(MSR_AMD64_SYSCFG, val);
 110
 111         return 0;
 112 }
 113
 114 static __init void snp_enable(void *arg)
 115 {
 116         __snp_enable(smp_processor_id());
 117 }
 118
 119 #define RMP_ADDR_MASK GENMASK_ULL(51, 13)
 120
 121 bool snp_probe_rmptable_info(void)
 122 {
 123         u64 max_rmp_pfn, calc_rmp_sz, rmp_sz, rmp_base, rmp_end;
 124
 125         rdmsrl(MSR_AMD64_RMP_BASE, rmp_base);
 126         rdmsrl(MSR_AMD64_RMP_END, rmp_end);
 127
 128         if (!(rmp_base & RMP_ADDR_MASK) || !(rmp_end & RMP_ADDR_MASK)) {
 129                 pr_err("Memory for the RMP table has not been reserved by BIOS\n");
 130                 return false;
 131         }
 132
 133         if (rmp_base > rmp_end) {
 134                 pr_err("RMP configuration not valid: base=%#llx, end=%#llx\n", rmp_base, rmp_end);
 135                 return false;
 136         }
 137
 138         rmp_sz = rmp_end - rmp_base + 1;
 139
 140         /*
 141          * Calculate the amount the memory that must be reserved by the BIOS to
 142          * address the whole RAM, including the bookkeeping area. The RMP itself
 143          * must also be covered.
 144          */
 145         max_rmp_pfn = max_pfn;
 146         if (PHYS_PFN(rmp_end) > max_pfn)
 147                 max_rmp_pfn = PHYS_PFN(rmp_end);
 148
 149         calc_rmp_sz = (max_rmp_pfn << 4) + RMPTABLE_CPU_BOOKKEEPING_SZ;
 150
 151         if (calc_rmp_sz > rmp_sz) {
 152                 pr_err("Memory reserved for the RMP table does not cover full system RAM (expected 0x%llx got 0x%llx)\n",
 153                        calc_rmp_sz, rmp_sz);
 154                 return false;
 155         }
 156
 157         probed_rmp_base = rmp_base;
 158         probed_rmp_size = rmp_sz;
 159
 160         pr_info("RMP table physical range [0x%016llx - 0x%016llx]\n",
 161                 probed_rmp_base, probed_rmp_base + probed_rmp_size - 1);
 162
 163         return true;
 164 }
 165
 166 /*
 167  * Do the necessary preparations which are verified by the firmware as
 168  * described in the SNP_INIT_EX firmware command description in the SNP
 169  * firmware ABI spec.
 170  */
 171 static int __init snp_rmptable_init(void)
 172 {
 173         void *rmptable_start;
 174         u64 rmptable_size;
 175         u64 val;
 176
 177         if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP))
 178                 return 0;
 179
 180         if (!amd_iommu_snp_en)
 181                 return 0;
 182
 183         if (!probed_rmp_size)
 184                 goto nosnp;
 185
 186         rmptable_start = memremap(probed_rmp_base, probed_rmp_size, MEMREMAP_WB);
 187         if (!rmptable_start) {
 188                 pr_err("Failed to map RMP table\n");
 189                 return 1;
 190         }
 191
 192         /*
 193          * Check if SEV-SNP is already enabled, this can happen in case of
 194          * kexec boot.
 195          */
 196         rdmsrl(MSR_AMD64_SYSCFG, val);
 197         if (val & MSR_AMD64_SYSCFG_SNP_EN)
 198                 goto skip_enable;
 199
 200         memset(rmptable_start, 0, probed_rmp_size);
 201
 202         /* Flush the caches to ensure that data is written before SNP is enabled. */
 203         wbinvd_on_all_cpus();
 204
 205         /* MtrrFixDramModEn must be enabled on all the CPUs prior to enabling SNP. */
 206         on_each_cpu(mfd_enable, NULL, 1);
 207
 208         on_each_cpu(snp_enable, NULL, 1);
 209
 210 skip_enable:
 211         rmptable_start += RMPTABLE_CPU_BOOKKEEPING_SZ;
 212         rmptable_size = probed_rmp_size - RMPTABLE_CPU_BOOKKEEPING_SZ;
 213
 214         rmptable = (struct rmpentry *)rmptable_start;
 215         rmptable_max_pfn = rmptable_size / sizeof(struct rmpentry) - 1;
 216
 217         cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/rmptable_init:online", __snp_enable, NULL);
 218
 219         /*
 220          * Setting crash_kexec_post_notifiers to 'true' to ensure that SNP panic
 221          * notifier is invoked to do SNP IOMMU shutdown before kdump.
 222          */
 223         crash_kexec_post_notifiers = true;
 224
 225         return 0;
 226
 227 nosnp:
 228         setup_clear_cpu_cap(X86_FEATURE_SEV_SNP);
 229         return -ENOSYS;
 230 }
 231
 232 /*
 233  * This must be called after the IOMMU has been initialized.
 234  */
 235 device_initcall(snp_rmptable_init);
 236
 237 static struct rmpentry *get_rmpentry(u64 pfn)
 238 {
 239         if (WARN_ON_ONCE(pfn > rmptable_max_pfn))
 240                 return ERR_PTR(-EFAULT);
 241
 242         return &rmptable[pfn];
 243 }
 244
 245 static struct rmpentry *__snp_lookup_rmpentry(u64 pfn, int *level)
 246 {
 247         struct rmpentry *large_entry, *entry;
 248
 249         if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP))
 250                 return ERR_PTR(-ENODEV);
 251
 252         entry = get_rmpentry(pfn);
 253         if (IS_ERR(entry))
 254                 return entry;
 255
 256         /*
 257          * Find the authoritative RMP entry for a PFN. This can be either a 4K
 258          * RMP entry or a special large RMP entry that is authoritative for a
 259          * whole 2M area.
 260          */
 261         large_entry = get_rmpentry(pfn & PFN_PMD_MASK);
 262         if (IS_ERR(large_entry))
 263                 return large_entry;
 264
 265         *level = RMP_TO_PG_LEVEL(large_entry->pagesize);
 266
 267         return entry;
 268 }
 269
 270 int snp_lookup_rmpentry(u64 pfn, bool *assigned, int *level)
 271 {
 272         struct rmpentry *e;
 273
 274         e = __snp_lookup_rmpentry(pfn, level);
 275         if (IS_ERR(e))
 276                 return PTR_ERR(e);
 277
 278         *assigned = !!e->assigned;
 279         return 0;
 280 }
 281 EXPORT_SYMBOL_GPL(snp_lookup_rmpentry);
 282
 283 /*
 284  * Dump the raw RMP entry for a particular PFN. These bits are documented in the
 285  * PPR for a particular CPU model and provide useful information about how a
 286  * particular PFN is being utilized by the kernel/firmware at the time certain
 287  * unexpected events occur, such as RMP faults.
 288  */
 289 static void dump_rmpentry(u64 pfn)
 290 {
 291         u64 pfn_i, pfn_end;
 292         struct rmpentry *e;
 293         int level;
 294
 295         e = __snp_lookup_rmpentry(pfn, &level);
 296         if (IS_ERR(e)) {
 297                 pr_err("Failed to read RMP entry for PFN 0x%llx, error %ld\n",
 298                        pfn, PTR_ERR(e));
 299                 return;
 300         }
 301
 302         if (e->assigned) {
 303                 pr_info("PFN 0x%llx, RMP entry: [0x%016llx - 0x%016llx]\n",
 304                         pfn, e->lo, e->hi);
 305                 return;
 306         }
 307
 308         /*
 309          * If the RMP entry for a particular PFN is not in an assigned state,
 310          * then it is sometimes useful to get an idea of whether or not any RMP
 311          * entries for other PFNs within the same 2MB region are assigned, since
 312          * those too can affect the ability to access a particular PFN in
 313          * certain situations, such as when the PFN is being accessed via a 2MB
 314          * mapping in the host page table.
 315          */
 316         pfn_i = ALIGN_DOWN(pfn, PTRS_PER_PMD);
 317         pfn_end = pfn_i + PTRS_PER_PMD;
 318
 319         pr_info("PFN 0x%llx unassigned, dumping non-zero entries in 2M PFN region: [0x%llx - 0x%llx]\n",
 320                 pfn, pfn_i, pfn_end);
 321
 322         while (pfn_i < pfn_end) {
 323                 e = __snp_lookup_rmpentry(pfn_i, &level);
 324                 if (IS_ERR(e)) {
 325                         pr_err("Error %ld reading RMP entry for PFN 0x%llx\n",
 326                                PTR_ERR(e), pfn_i);
 327                         pfn_i++;
 328                         continue;
 329                 }
 330
 331                 if (e->lo || e->hi)
 332                         pr_info("PFN: 0x%llx, [0x%016llx - 0x%016llx]\n", pfn_i, e->lo, e->hi);
 333                 pfn_i++;
 334         }
 335 }
 336
 337 void snp_dump_hva_rmpentry(unsigned long hva)
 338 {
 339         unsigned long paddr;
 340         unsigned int level;
 341         pgd_t *pgd;
 342         pte_t *pte;
 343
 344         pgd = __va(read_cr3_pa());
 345         pgd += pgd_index(hva);
 346         pte = lookup_address_in_pgd(pgd, hva, &level);
 347
 348         if (!pte) {
 349                 pr_err("Can't dump RMP entry for HVA %lx: no PTE/PFN found\n", hva);
 350                 return;
 351         }
 352
 353         paddr = PFN_PHYS(pte_pfn(*pte)) | (hva & ~page_level_mask(level));
 354         dump_rmpentry(PHYS_PFN(paddr));
 355 }
 356
 357 /*
 358  * PSMASH a 2MB aligned page into 4K pages in the RMP table while preserving the
 359  * Validated bit.
 360  */
 361 int psmash(u64 pfn)
 362 {
 363         unsigned long paddr = pfn << PAGE_SHIFT;
 364         int ret;
 365
 366         if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP))
 367                 return -ENODEV;
 368
 369         if (!pfn_valid(pfn))
 370                 return -EINVAL;
 371
 372         /* Binutils version 2.36 supports the PSMASH mnemonic. */
 373         asm volatile(".byte 0xF3, 0x0F, 0x01, 0xFF"
 374                       : "=a" (ret)
 375                       : "a" (paddr)
 376                       : "memory", "cc");
 377
 378         return ret;
 379 }
 380 EXPORT_SYMBOL_GPL(psmash);
 381
 382 /*
 383  * If the kernel uses a 2MB or larger directmap mapping to write to an address,
 384  * and that mapping contains any 4KB pages that are set to private in the RMP
 385  * table, an RMP #PF will trigger and cause a host crash. Hypervisor code that
 386  * owns the PFNs being transitioned will never attempt such a write, but other
 387  * kernel tasks writing to other PFNs in the range may trigger these checks
 388  * inadvertently due a large directmap mapping that happens to overlap such a
 389  * PFN.
 390  *
 391  * Prevent this by splitting any 2MB+ mappings that might end up containing a
 392  * mix of private/shared PFNs as a result of a subsequent RMPUPDATE for the
 393  * PFN/rmp_level passed in.
 394  *
 395  * Note that there is no attempt here to scan all the RMP entries for the 2MB
 396  * physical range, since it would only be worthwhile in determining if a
 397  * subsequent RMPUPDATE for a 4KB PFN would result in all the entries being of
 398  * the same shared/private state, thus avoiding the need to split the mapping.
 399  * But that would mean the entries are currently in a mixed state, and so the
 400  * mapping would have already been split as a result of prior transitions.
 401  * And since the 4K split is only done if the mapping is 2MB+, and there isn't
 402  * currently a mechanism in place to restore 2MB+ mappings, such a check would
 403  * not provide any usable benefit.
 404  *
 405  * More specifics on how these checks are carried out can be found in APM
 406  * Volume 2, "RMP and VMPL Access Checks".
 407  */
 408 static int adjust_direct_map(u64 pfn, int rmp_level)
 409 {
 410         unsigned long vaddr;
 411         unsigned int level;
 412         int npages, ret;
 413         pte_t *pte;
 414
 415         /*
 416          * pfn_to_kaddr() will return a vaddr only within the direct
 417          * map range.
 418          */
 419         vaddr = (unsigned long)pfn_to_kaddr(pfn);
 420
 421         /* Only 4KB/2MB RMP entries are supported by current hardware. */
 422         if (WARN_ON_ONCE(rmp_level > PG_LEVEL_2M))
 423                 return -EINVAL;
 424
 425         if (!pfn_valid(pfn))
 426                 return -EINVAL;
 427
 428         if (rmp_level == PG_LEVEL_2M &&
 429             (!IS_ALIGNED(pfn, PTRS_PER_PMD) || !pfn_valid(pfn + PTRS_PER_PMD - 1)))
 430                 return -EINVAL;
 431
 432         /*
 433          * If an entire 2MB physical range is being transitioned, then there is
 434          * no risk of RMP #PFs due to write accesses from overlapping mappings,
 435          * since even accesses from 1GB mappings will be treated as 2MB accesses
 436          * as far as RMP table checks are concerned.
 437          */
 438         if (rmp_level == PG_LEVEL_2M)
 439                 return 0;
 440
 441         pte = lookup_address(vaddr, &level);
 442         if (!pte || pte_none(*pte))
 443                 return 0;
 444
 445         if (level == PG_LEVEL_4K)
 446                 return 0;
 447
 448         npages = page_level_size(rmp_level) / PAGE_SIZE;
 449         ret = set_memory_4k(vaddr, npages);
 450         if (ret)
 451                 pr_warn("Failed to split direct map for PFN 0x%llx, ret: %d\n",
 452                         pfn, ret);
 453
 454         return ret;
 455 }
 456
 457 /*
 458  * It is expected that those operations are seldom enough so that no mutual
 459  * exclusion of updaters is needed and thus the overlap error condition below
 460  * should happen very rarely and would get resolved relatively quickly by
 461  * the firmware.
 462  *
 463  * If not, one could consider introducing a mutex or so here to sync concurrent
 464  * RMP updates and thus diminish the amount of cases where firmware needs to
 465  * lock 2M ranges to protect against concurrent updates.
 466  *
 467  * The optimal solution would be range locking to avoid locking disjoint
 468  * regions unnecessarily but there's no support for that yet.
 469  */
 470 static int rmpupdate(u64 pfn, struct rmp_state *state)
 471 {
 472         unsigned long paddr = pfn << PAGE_SHIFT;
 473         int ret, level;
 474
 475         if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP))
 476                 return -ENODEV;
 477
 478         level = RMP_TO_PG_LEVEL(state->pagesize);
 479
 480         if (adjust_direct_map(pfn, level))
 481                 return -EFAULT;
 482
 483         do {
 484                 /* Binutils version 2.36 supports the RMPUPDATE mnemonic. */
 485                 asm volatile(".byte 0xF2, 0x0F, 0x01, 0xFE"
 486                              : "=a" (ret)
 487                              : "a" (paddr), "c" ((unsigned long)state)
 488                              : "memory", "cc");
 489         } while (ret == RMPUPDATE_FAIL_OVERLAP);
 490
 491         if (ret) {
 492                 pr_err("RMPUPDATE failed for PFN %llx, pg_level: %d, ret: %d\n",
 493                        pfn, level, ret);
 494                 dump_rmpentry(pfn);
 495                 dump_stack();
 496                 return -EFAULT;
 497         }
 498
 499         return 0;
 500 }
 501
 502 /* Transition a page to guest-owned/private state in the RMP table. */
 503 int rmp_make_private(u64 pfn, u64 gpa, enum pg_level level, u32 asid, bool immutable)
 504 {
 505         struct rmp_state state;
 506
 507         memset(&state, 0, sizeof(state));
 508         state.assigned = 1;
 509         state.asid = asid;
 510         state.immutable = immutable;
 511         state.gpa = gpa;
 512         state.pagesize = PG_LEVEL_TO_RMP(level);
 513
 514         return rmpupdate(pfn, &state);
 515 }
 516 EXPORT_SYMBOL_GPL(rmp_make_private);
 517
 518 /* Transition a page to hypervisor-owned/shared state in the RMP table. */
 519 int rmp_make_shared(u64 pfn, enum pg_level level)
 520 {
 521         struct rmp_state state;
 522
 523         memset(&state, 0, sizeof(state));
 524         state.pagesize = PG_LEVEL_TO_RMP(level);
 525
 526         return rmpupdate(pfn, &state);
 527 }
 528 EXPORT_SYMBOL_GPL(rmp_make_shared);
 529
 530 void snp_leak_pages(u64 pfn, unsigned int npages)
 531 {
 532         struct page *page = pfn_to_page(pfn);
 533
 534         pr_warn("Leaking PFN range 0x%llx-0x%llx\n", pfn, pfn + npages);
 535
 536         spin_lock(&snp_leaked_pages_list_lock);
 537         while (npages--) {
 538
 539                 /*
 540                  * Reuse the page's buddy list for chaining into the leaked
 541                  * pages list. This page should not be on a free list currently
 542                  * and is also unsafe to be added to a free list.
 543                  */
 544                 if (likely(!PageCompound(page)) ||
 545
 546                         /*
 547                          * Skip inserting tail pages of compound page as
 548                          * page->buddy_list of tail pages is not usable.
 549                          */
 550                     (PageHead(page) && compound_nr(page) <= npages))
 551                         list_add_tail(&page->buddy_list, &snp_leaked_pages_list);
 552
 553                 dump_rmpentry(pfn);
 554                 snp_nr_leaked_pages++;
 555                 pfn++;
 556                 page++;
 557         }
 558         spin_unlock(&snp_leaked_pages_list_lock);
 559 }
 560 EXPORT_SYMBOL_GPL(snp_leak_pages);