mm/pagewalk.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 #include <linux/pagewalk.h>
   3 #include <linux/highmem.h>
   4 #include <linux/sched.h>
   5 #include <linux/hugetlb.h>
   6
   7 /*
   8  * We want to know the real level where a entry is located ignoring any
   9  * folding of levels which may be happening. For example if p4d is folded then
  10  * a missing entry found at level 1 (p4d) is actually at level 0 (pgd).
  11  */
  12 static int real_depth(int depth)
  13 {
  14         if (depth == 3 && PTRS_PER_PMD == 1)
  15                 depth = 2;
  16         if (depth == 2 && PTRS_PER_PUD == 1)
  17                 depth = 1;
  18         if (depth == 1 && PTRS_PER_P4D == 1)
  19                 depth = 0;
  20         return depth;
  21 }
  22
  23 static int walk_pte_range_inner(pte_t *pte, unsigned long addr,
  24                                 unsigned long end, struct mm_walk *walk)
  25 {
  26         const struct mm_walk_ops *ops = walk->ops;
  27         int err = 0;
  28
  29         for (;;) {
  30                 err = ops->pte_entry(pte, addr, addr + PAGE_SIZE, walk);
  31                 if (err)
  32                        break;
  33                 if (addr >= end - PAGE_SIZE)
  34                         break;
  35                 addr += PAGE_SIZE;
  36                 pte++;
  37         }
  38         return err;
  39 }
  40
  41 static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
  42                           struct mm_walk *walk)
  43 {
  44         pte_t *pte;
  45         int err = 0;
  46         spinlock_t *ptl;
  47
  48         if (walk->no_vma) {
  49                 /*
  50                  * pte_offset_map() might apply user-specific validation.
  51                  * Indeed, on x86_64 the pmd entries set up by init_espfix_ap()
  52                  * fit its pmd_bad() check (_PAGE_NX set and _PAGE_RW clear),
  53                  * and CONFIG_EFI_PGT_DUMP efi_mm goes so far as to walk them.
  54                  */
  55                 if (walk->mm == &init_mm || addr >= TASK_SIZE)
  56                         pte = pte_offset_kernel(pmd, addr);
  57                 else
  58                         pte = pte_offset_map(pmd, addr);
  59                 if (pte) {
  60                         err = walk_pte_range_inner(pte, addr, end, walk);
  61                         if (walk->mm != &init_mm && addr < TASK_SIZE)
  62                                 pte_unmap(pte);
  63                 }
  64         } else {
  65                 pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
  66                 if (pte) {
  67                         err = walk_pte_range_inner(pte, addr, end, walk);
  68                         pte_unmap_unlock(pte, ptl);
  69                 }
  70         }
  71         if (!pte)
  72                 walk->action = ACTION_AGAIN;
  73         return err;
  74 }
  75
  76 static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
  77                           struct mm_walk *walk)
  78 {
  79         pmd_t *pmd;
  80         unsigned long next;
  81         const struct mm_walk_ops *ops = walk->ops;
  82         int err = 0;
  83         int depth = real_depth(3);
  84
  85         pmd = pmd_offset(pud, addr);
  86         do {
  87 again:
  88                 next = pmd_addr_end(addr, end);
  89                 if (pmd_none(*pmd)) {
  90                         if (ops->pte_hole)
  91                                 err = ops->pte_hole(addr, next, depth, walk);
  92                         if (err)
  93                                 break;
  94                         continue;
  95                 }
  96
  97                 walk->action = ACTION_SUBTREE;
  98
  99                 /*
 100                  * This implies that each ->pmd_entry() handler
 101                  * needs to know about pmd_trans_huge() pmds
 102                  */
 103                 if (ops->pmd_entry)
 104                         err = ops->pmd_entry(pmd, addr, next, walk);
 105                 if (err)
 106                         break;
 107
 108                 if (walk->action == ACTION_AGAIN)
 109                         goto again;
 110
 111                 /*
 112                  * Check this here so we only break down trans_huge
 113                  * pages when we _need_ to
 114                  */
 115                 if ((!walk->vma && (pmd_leaf(*pmd) || !pmd_present(*pmd))) ||
 116                     walk->action == ACTION_CONTINUE ||
 117                     !(ops->pte_entry))
 118                         continue;
 119
 120                 if (walk->vma)
 121                         split_huge_pmd(walk->vma, pmd, addr);
 122
 123                 err = walk_pte_range(pmd, addr, next, walk);
 124                 if (err)
 125                         break;
 126
 127                 if (walk->action == ACTION_AGAIN)
 128                         goto again;
 129
 130         } while (pmd++, addr = next, addr != end);
 131
 132         return err;
 133 }
 134
 135 static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
 136                           struct mm_walk *walk)
 137 {
 138         pud_t *pud;
 139         unsigned long next;
 140         const struct mm_walk_ops *ops = walk->ops;
 141         int err = 0;
 142         int depth = real_depth(2);
 143
 144         pud = pud_offset(p4d, addr);
 145         do {
 146  again:
 147                 next = pud_addr_end(addr, end);
 148                 if (pud_none(*pud)) {
 149                         if (ops->pte_hole)
 150                                 err = ops->pte_hole(addr, next, depth, walk);
 151                         if (err)
 152                                 break;
 153                         continue;
 154                 }
 155
 156                 walk->action = ACTION_SUBTREE;
 157
 158                 if (ops->pud_entry)
 159                         err = ops->pud_entry(pud, addr, next, walk);
 160                 if (err)
 161                         break;
 162
 163                 if (walk->action == ACTION_AGAIN)
 164                         goto again;
 165
 166                 if ((!walk->vma && (pud_leaf(*pud) || !pud_present(*pud))) ||
 167                     walk->action == ACTION_CONTINUE ||
 168                     !(ops->pmd_entry || ops->pte_entry))
 169                         continue;
 170
 171                 if (walk->vma)
 172                         split_huge_pud(walk->vma, pud, addr);
 173                 if (pud_none(*pud))
 174                         goto again;
 175
 176                 err = walk_pmd_range(pud, addr, next, walk);
 177                 if (err)
 178                         break;
 179         } while (pud++, addr = next, addr != end);
 180
 181         return err;
 182 }
 183
 184 static int walk_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
 185                           struct mm_walk *walk)
 186 {
 187         p4d_t *p4d;
 188         unsigned long next;
 189         const struct mm_walk_ops *ops = walk->ops;
 190         int err = 0;
 191         int depth = real_depth(1);
 192
 193         p4d = p4d_offset(pgd, addr);
 194         do {
 195                 next = p4d_addr_end(addr, end);
 196                 if (p4d_none_or_clear_bad(p4d)) {
 197                         if (ops->pte_hole)
 198                                 err = ops->pte_hole(addr, next, depth, walk);
 199                         if (err)
 200                                 break;
 201                         continue;
 202                 }
 203                 if (ops->p4d_entry) {
 204                         err = ops->p4d_entry(p4d, addr, next, walk);
 205                         if (err)
 206                                 break;
 207                 }
 208                 if (ops->pud_entry || ops->pmd_entry || ops->pte_entry)
 209                         err = walk_pud_range(p4d, addr, next, walk);
 210                 if (err)
 211                         break;
 212         } while (p4d++, addr = next, addr != end);
 213
 214         return err;
 215 }
 216
 217 static int walk_pgd_range(unsigned long addr, unsigned long end,
 218                           struct mm_walk *walk)
 219 {
 220         pgd_t *pgd;
 221         unsigned long next;
 222         const struct mm_walk_ops *ops = walk->ops;
 223         int err = 0;
 224
 225         if (walk->pgd)
 226                 pgd = walk->pgd + pgd_index(addr);
 227         else
 228                 pgd = pgd_offset(walk->mm, addr);
 229         do {
 230                 next = pgd_addr_end(addr, end);
 231                 if (pgd_none_or_clear_bad(pgd)) {
 232                         if (ops->pte_hole)
 233                                 err = ops->pte_hole(addr, next, 0, walk);
 234                         if (err)
 235                                 break;
 236                         continue;
 237                 }
 238                 if (ops->pgd_entry) {
 239                         err = ops->pgd_entry(pgd, addr, next, walk);
 240                         if (err)
 241                                 break;
 242                 }
 243                 if (ops->p4d_entry || ops->pud_entry || ops->pmd_entry || ops->pte_entry)
 244                         err = walk_p4d_range(pgd, addr, next, walk);
 245                 if (err)
 246                         break;
 247         } while (pgd++, addr = next, addr != end);
 248
 249         return err;
 250 }
 251
 252 #ifdef CONFIG_HUGETLB_PAGE
 253 static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr,
 254                                        unsigned long end)
 255 {
 256         unsigned long boundary = (addr & huge_page_mask(h)) + huge_page_size(h);
 257         return boundary < end ? boundary : end;
 258 }
 259
 260 static int walk_hugetlb_range(unsigned long addr, unsigned long end,
 261                               struct mm_walk *walk)
 262 {
 263         struct vm_area_struct *vma = walk->vma;
 264         struct hstate *h = hstate_vma(vma);
 265         unsigned long next;
 266         unsigned long hmask = huge_page_mask(h);
 267         unsigned long sz = huge_page_size(h);
 268         pte_t *pte;
 269         const struct mm_walk_ops *ops = walk->ops;
 270         int err = 0;
 271
 272         hugetlb_vma_lock_read(vma);
 273         do {
 274                 next = hugetlb_entry_end(h, addr, end);
 275                 pte = hugetlb_walk(vma, addr & hmask, sz);
 276                 if (pte)
 277                         err = ops->hugetlb_entry(pte, hmask, addr, next, walk);
 278                 else if (ops->pte_hole)
 279                         err = ops->pte_hole(addr, next, -1, walk);
 280                 if (err)
 281                         break;
 282         } while (addr = next, addr != end);
 283         hugetlb_vma_unlock_read(vma);
 284
 285         return err;
 286 }
 287
 288 #else /* CONFIG_HUGETLB_PAGE */
 289 static int walk_hugetlb_range(unsigned long addr, unsigned long end,
 290                               struct mm_walk *walk)
 291 {
 292         return 0;
 293 }
 294
 295 #endif /* CONFIG_HUGETLB_PAGE */
 296
 297 /*
 298  * Decide whether we really walk over the current vma on [@start, @end)
 299  * or skip it via the returned value. Return 0 if we do walk over the
 300  * current vma, and return 1 if we skip the vma. Negative values means
 301  * error, where we abort the current walk.
 302  */
 303 static int walk_page_test(unsigned long start, unsigned long end,
 304                         struct mm_walk *walk)
 305 {
 306         struct vm_area_struct *vma = walk->vma;
 307         const struct mm_walk_ops *ops = walk->ops;
 308
 309         if (ops->test_walk)
 310                 return ops->test_walk(start, end, walk);
 311
 312         /*
 313          * vma(VM_PFNMAP) doesn't have any valid struct pages behind VM_PFNMAP
 314          * range, so we don't walk over it as we do for normal vmas. However,
 315          * Some callers are interested in handling hole range and they don't
 316          * want to just ignore any single address range. Such users certainly
 317          * define their ->pte_hole() callbacks, so let's delegate them to handle
 318          * vma(VM_PFNMAP).
 319          */
 320         if (vma->vm_flags & VM_PFNMAP) {
 321                 int err = 1;
 322                 if (ops->pte_hole)
 323                         err = ops->pte_hole(start, end, -1, walk);
 324                 return err ? err : 1;
 325         }
 326         return 0;
 327 }
 328
 329 static int __walk_page_range(unsigned long start, unsigned long end,
 330                         struct mm_walk *walk)
 331 {
 332         int err = 0;
 333         struct vm_area_struct *vma = walk->vma;
 334         const struct mm_walk_ops *ops = walk->ops;
 335
 336         if (ops->pre_vma) {
 337                 err = ops->pre_vma(start, end, walk);
 338                 if (err)
 339                         return err;
 340         }
 341
 342         if (is_vm_hugetlb_page(vma)) {
 343                 if (ops->hugetlb_entry)
 344                         err = walk_hugetlb_range(start, end, walk);
 345         } else
 346                 err = walk_pgd_range(start, end, walk);
 347
 348         if (ops->post_vma)
 349                 ops->post_vma(walk);
 350
 351         return err;
 352 }
 353
 354 static inline void process_mm_walk_lock(struct mm_struct *mm,
 355                                         enum page_walk_lock walk_lock)
 356 {
 357         if (walk_lock == PGWALK_RDLOCK)
 358                 mmap_assert_locked(mm);
 359         else
 360                 mmap_assert_write_locked(mm);
 361 }
 362
 363 static inline void process_vma_walk_lock(struct vm_area_struct *vma,
 364                                          enum page_walk_lock walk_lock)
 365 {
 366 #ifdef CONFIG_PER_VMA_LOCK
 367         switch (walk_lock) {
 368         case PGWALK_WRLOCK:
 369                 vma_start_write(vma);
 370                 break;
 371         case PGWALK_WRLOCK_VERIFY:
 372                 vma_assert_write_locked(vma);
 373                 break;
 374         case PGWALK_RDLOCK:
 375                 /* PGWALK_RDLOCK is handled by process_mm_walk_lock */
 376                 break;
 377         }
 378 #endif
 379 }
 380
 381 /**
 382  * walk_page_range - walk page table with caller specific callbacks
 383  * @mm:         mm_struct representing the target process of page table walk
 384  * @start:      start address of the virtual address range
 385  * @end:        end address of the virtual address range
 386  * @ops:        operation to call during the walk
 387  * @private:    private data for callbacks' usage
 388  *
 389  * Recursively walk the page table tree of the process represented by @mm
 390  * within the virtual address range [@start, @end). During walking, we can do
 391  * some caller-specific works for each entry, by setting up pmd_entry(),
 392  * pte_entry(), and/or hugetlb_entry(). If you don't set up for some of these
 393  * callbacks, the associated entries/pages are just ignored.
 394  * The return values of these callbacks are commonly defined like below:
 395  *
 396  *  - 0  : succeeded to handle the current entry, and if you don't reach the
 397  *         end address yet, continue to walk.
 398  *  - >0 : succeeded to handle the current entry, and return to the caller
 399  *         with caller specific value.
 400  *  - <0 : failed to handle the current entry, and return to the caller
 401  *         with error code.
 402  *
 403  * Before starting to walk page table, some callers want to check whether
 404  * they really want to walk over the current vma, typically by checking
 405  * its vm_flags. walk_page_test() and @ops->test_walk() are used for this
 406  * purpose.
 407  *
 408  * If operations need to be staged before and committed after a vma is walked,
 409  * there are two callbacks, pre_vma() and post_vma(). Note that post_vma(),
 410  * since it is intended to handle commit-type operations, can't return any
 411  * errors.
 412  *
 413  * struct mm_walk keeps current values of some common data like vma and pmd,
 414  * which are useful for the access from callbacks. If you want to pass some
 415  * caller-specific data to callbacks, @private should be helpful.
 416  *
 417  * Locking:
 418  *   Callers of walk_page_range() and walk_page_vma() should hold @mm->mmap_lock,
 419  *   because these function traverse vma list and/or access to vma's data.
 420  */
 421 int walk_page_range(struct mm_struct *mm, unsigned long start,
 422                 unsigned long end, const struct mm_walk_ops *ops,
 423                 void *private)
 424 {
 425         int err = 0;
 426         unsigned long next;
 427         struct vm_area_struct *vma;
 428         struct mm_walk walk = {
 429                 .ops            = ops,
 430                 .mm             = mm,
 431                 .private        = private,
 432         };
 433
 434         if (start >= end)
 435                 return -EINVAL;
 436
 437         if (!walk.mm)
 438                 return -EINVAL;
 439
 440         process_mm_walk_lock(walk.mm, ops->walk_lock);
 441
 442         vma = find_vma(walk.mm, start);
 443         do {
 444                 if (!vma) { /* after the last vma */
 445                         walk.vma = NULL;
 446                         next = end;
 447                         if (ops->pte_hole)
 448                                 err = ops->pte_hole(start, next, -1, &walk);
 449                 } else if (start < vma->vm_start) { /* outside vma */
 450                         walk.vma = NULL;
 451                         next = min(end, vma->vm_start);
 452                         if (ops->pte_hole)
 453                                 err = ops->pte_hole(start, next, -1, &walk);
 454                 } else { /* inside vma */
 455                         process_vma_walk_lock(vma, ops->walk_lock);
 456                         walk.vma = vma;
 457                         next = min(end, vma->vm_end);
 458                         vma = find_vma(mm, vma->vm_end);
 459
 460                         err = walk_page_test(start, next, &walk);
 461                         if (err > 0) {
 462                                 /*
 463                                  * positive return values are purely for
 464                                  * controlling the pagewalk, so should never
 465                                  * be passed to the callers.
 466                                  */
 467                                 err = 0;
 468                                 continue;
 469                         }
 470                         if (err < 0)
 471                                 break;
 472                         err = __walk_page_range(start, next, &walk);
 473                 }
 474                 if (err)
 475                         break;
 476         } while (start = next, start < end);
 477         return err;
 478 }
 479
 480 /**
 481  * walk_page_range_novma - walk a range of pagetables not backed by a vma
 482  * @mm:         mm_struct representing the target process of page table walk
 483  * @start:      start address of the virtual address range
 484  * @end:        end address of the virtual address range
 485  * @ops:        operation to call during the walk
 486  * @pgd:        pgd to walk if different from mm->pgd
 487  * @private:    private data for callbacks' usage
 488  *
 489  * Similar to walk_page_range() but can walk any page tables even if they are
 490  * not backed by VMAs. Because 'unusual' entries may be walked this function
 491  * will also not lock the PTEs for the pte_entry() callback. This is useful for
 492  * walking the kernel pages tables or page tables for firmware.
 493  *
 494  * Note: Be careful to walk the kernel pages tables, the caller may be need to
 495  * take other effective approache (mmap lock may be insufficient) to prevent
 496  * the intermediate kernel page tables belonging to the specified address range
 497  * from being freed (e.g. memory hot-remove).
 498  */
 499 int walk_page_range_novma(struct mm_struct *mm, unsigned long start,
 500                           unsigned long end, const struct mm_walk_ops *ops,
 501                           pgd_t *pgd,
 502                           void *private)
 503 {
 504         struct mm_walk walk = {
 505                 .ops            = ops,
 506                 .mm             = mm,
 507                 .pgd            = pgd,
 508                 .private        = private,
 509                 .no_vma         = true
 510         };
 511
 512         if (start >= end || !walk.mm)
 513                 return -EINVAL;
 514
 515         /*
 516          * 1) For walking the user virtual address space:
 517          *
 518          * The mmap lock protects the page walker from changes to the page
 519          * tables during the walk.  However a read lock is insufficient to
 520          * protect those areas which don't have a VMA as munmap() detaches
 521          * the VMAs before downgrading to a read lock and actually tearing
 522          * down PTEs/page tables. In which case, the mmap write lock should
 523          * be hold.
 524          *
 525          * 2) For walking the kernel virtual address space:
 526          *
 527          * The kernel intermediate page tables usually do not be freed, so
 528          * the mmap map read lock is sufficient. But there are some exceptions.
 529          * E.g. memory hot-remove. In which case, the mmap lock is insufficient
 530          * to prevent the intermediate kernel pages tables belonging to the
 531          * specified address range from being freed. The caller should take
 532          * other actions to prevent this race.
 533          */
 534         if (mm == &init_mm)
 535                 mmap_assert_locked(walk.mm);
 536         else
 537                 mmap_assert_write_locked(walk.mm);
 538
 539         return walk_pgd_range(start, end, &walk);
 540 }
 541
 542 int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start,
 543                         unsigned long end, const struct mm_walk_ops *ops,
 544                         void *private)
 545 {
 546         struct mm_walk walk = {
 547                 .ops            = ops,
 548                 .mm             = vma->vm_mm,
 549                 .vma            = vma,
 550                 .private        = private,
 551         };
 552
 553         if (start >= end || !walk.mm)
 554                 return -EINVAL;
 555         if (start < vma->vm_start || end > vma->vm_end)
 556                 return -EINVAL;
 557
 558         process_mm_walk_lock(walk.mm, ops->walk_lock);
 559         process_vma_walk_lock(vma, ops->walk_lock);
 560         return __walk_page_range(start, end, &walk);
 561 }
 562
 563 int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops,
 564                 void *private)
 565 {
 566         struct mm_walk walk = {
 567                 .ops            = ops,
 568                 .mm             = vma->vm_mm,
 569                 .vma            = vma,
 570                 .private        = private,
 571         };
 572
 573         if (!walk.mm)
 574                 return -EINVAL;
 575
 576         process_mm_walk_lock(walk.mm, ops->walk_lock);
 577         process_vma_walk_lock(vma, ops->walk_lock);
 578         return __walk_page_range(vma->vm_start, vma->vm_end, &walk);
 579 }
 580
 581 /**
 582  * walk_page_mapping - walk all memory areas mapped into a struct address_space.
 583  * @mapping: Pointer to the struct address_space
 584  * @first_index: First page offset in the address_space
 585  * @nr: Number of incremental page offsets to cover
 586  * @ops:        operation to call during the walk
 587  * @private:    private data for callbacks' usage
 588  *
 589  * This function walks all memory areas mapped into a struct address_space.
 590  * The walk is limited to only the given page-size index range, but if
 591  * the index boundaries cross a huge page-table entry, that entry will be
 592  * included.
 593  *
 594  * Also see walk_page_range() for additional information.
 595  *
 596  * Locking:
 597  *   This function can't require that the struct mm_struct::mmap_lock is held,
 598  *   since @mapping may be mapped by multiple processes. Instead
 599  *   @mapping->i_mmap_rwsem must be held. This might have implications in the
 600  *   callbacks, and it's up tho the caller to ensure that the
 601  *   struct mm_struct::mmap_lock is not needed.
 602  *
 603  *   Also this means that a caller can't rely on the struct
 604  *   vm_area_struct::vm_flags to be constant across a call,
 605  *   except for immutable flags. Callers requiring this shouldn't use
 606  *   this function.
 607  *
 608  * Return: 0 on success, negative error code on failure, positive number on
 609  * caller defined premature termination.
 610  */
 611 int walk_page_mapping(struct address_space *mapping, pgoff_t first_index,
 612                       pgoff_t nr, const struct mm_walk_ops *ops,
 613                       void *private)
 614 {
 615         struct mm_walk walk = {
 616                 .ops            = ops,
 617                 .private        = private,
 618         };
 619         struct vm_area_struct *vma;
 620         pgoff_t vba, vea, cba, cea;
 621         unsigned long start_addr, end_addr;
 622         int err = 0;
 623
 624         lockdep_assert_held(&mapping->i_mmap_rwsem);
 625         vma_interval_tree_foreach(vma, &mapping->i_mmap, first_index,
 626                                   first_index + nr - 1) {
 627                 /* Clip to the vma */
 628                 vba = vma->vm_pgoff;
 629                 vea = vba + vma_pages(vma);
 630                 cba = first_index;
 631                 cba = max(cba, vba);
 632                 cea = first_index + nr;
 633                 cea = min(cea, vea);
 634
 635                 start_addr = ((cba - vba) << PAGE_SHIFT) + vma->vm_start;
 636                 end_addr = ((cea - vba) << PAGE_SHIFT) + vma->vm_start;
 637                 if (start_addr >= end_addr)
 638                         continue;
 639
 640                 walk.vma = vma;
 641                 walk.mm = vma->vm_mm;
 642
 643                 err = walk_page_test(vma->vm_start, vma->vm_end, &walk);
 644                 if (err > 0) {
 645                         err = 0;
 646                         break;
 647                 } else if (err < 0)
 648                         break;
 649
 650                 err = __walk_page_range(start_addr, end_addr, &walk);
 651                 if (err)
 652                         break;
 653         }
 654
 655         return err;
 656 }