arch/powerpc/mm/hugetlbpage.c

   1 /*
   2  * PPC Huge TLB Page Support for Kernel.
   3  *
   4  * Copyright (C) 2003 David Gibson, IBM Corporation.
   5  * Copyright (C) 2011 Becky Bruce, Freescale Semiconductor
   6  *
   7  * Based on the IA-32 version:
   8  * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
   9  */
  10
  11 #include <linux/mm.h>
  12 #include <linux/io.h>
  13 #include <linux/slab.h>
  14 #include <linux/hugetlb.h>
  15 #include <linux/export.h>
  16 #include <linux/of_fdt.h>
  17 #include <linux/memblock.h>
  18 #include <linux/bootmem.h>
  19 #include <linux/moduleparam.h>
  20 #include <linux/swap.h>
  21 #include <linux/swapops.h>
  22 #include <linux/kmemleak.h>
  23 #include <asm/pgtable.h>
  24 #include <asm/pgalloc.h>
  25 #include <asm/tlb.h>
  26 #include <asm/setup.h>
  27 #include <asm/hugetlb.h>
  28 #include <asm/pte-walk.h>
  29
  30
  31 #ifdef CONFIG_HUGETLB_PAGE
  32
/* log2 of the huge page sizes that are referred to by name in this file. */
#define PAGE_SHIFT_64K  16
#define PAGE_SHIFT_512K 19
#define PAGE_SHIFT_8M   23
#define PAGE_SHIFT_16M  24
#define PAGE_SHIFT_16G  34

/* When true, hugetlbpage_init() logs a message and skips all setup. */
bool hugetlb_disabled = false;

/* Default huge page shift; assigned at the end of hugetlbpage_init(). */
unsigned int HPAGE_SHIFT;
EXPORT_SYMBOL(HPAGE_SHIFT);

/* True when a huge page directory entry is empty, i.e. its raw value is 0. */
#define hugepd_none(hpd)        (hpd_val(hpd) == 0)
  45
  46 pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz)
  47 {
  48         /*
  49          * Only called for hugetlbfs pages, hence can ignore THP and the
  50          * irq disabled walk.
  51          */
  52         return __find_linux_pte(mm->pgd, addr, NULL, NULL);
  53 }
  54
  55 static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
  56                            unsigned long address, unsigned int pdshift,
  57                            unsigned int pshift, spinlock_t *ptl)
  58 {
  59         struct kmem_cache *cachep;
  60         pte_t *new;
  61         int i;
  62         int num_hugepd;
  63
  64         if (pshift >= pdshift) {
  65                 cachep = hugepte_cache;
  66                 num_hugepd = 1 << (pshift - pdshift);
  67         } else {
  68                 cachep = PGT_CACHE(pdshift - pshift);
  69                 num_hugepd = 1;
  70         }
  71
  72         new = kmem_cache_zalloc(cachep, pgtable_gfp_flags(mm, GFP_KERNEL));
  73
  74         BUG_ON(pshift > HUGEPD_SHIFT_MASK);
  75         BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK);
  76
  77         if (! new)
  78                 return -ENOMEM;
  79
  80         /*
  81          * Make sure other cpus find the hugepd set only after a
  82          * properly initialized page table is visible to them.
  83          * For more details look for comment in __pte_alloc().
  84          */
  85         smp_wmb();
  86
  87         spin_lock(ptl);
  88         /*
  89          * We have multiple higher-level entries that point to the same
  90          * actual pte location.  Fill in each as we go and backtrack on error.
  91          * We need all of these so the DTLB pgtable walk code can find the
  92          * right higher-level entry without knowing if it's a hugepage or not.
  93          */
  94         for (i = 0; i < num_hugepd; i++, hpdp++) {
  95                 if (unlikely(!hugepd_none(*hpdp)))
  96                         break;
  97                 else {
  98 #ifdef CONFIG_PPC_BOOK3S_64
  99                         *hpdp = __hugepd(__pa(new) | HUGEPD_VAL_BITS |
 100                                          (shift_to_mmu_psize(pshift) << 2));
 101 #elif defined(CONFIG_PPC_8xx)
 102                         *hpdp = __hugepd(__pa(new) | _PMD_USER |
 103                                          (pshift == PAGE_SHIFT_8M ? _PMD_PAGE_8M :
 104                                           _PMD_PAGE_512K) | _PMD_PRESENT);
 105 #else
 106                         /* We use the old format for PPC_FSL_BOOK3E */
 107                         *hpdp = __hugepd(((unsigned long)new & ~PD_HUGE) | pshift);
 108 #endif
 109                 }
 110         }
 111         /* If we bailed from the for loop early, an error occurred, clean up */
 112         if (i < num_hugepd) {
 113                 for (i = i - 1 ; i >= 0; i--, hpdp--)
 114                         *hpdp = __hugepd(0);
 115                 kmem_cache_free(cachep, new);
 116         } else {
 117                 kmemleak_ignore(new);
 118         }
 119         spin_unlock(ptl);
 120         return 0;
 121 }
 122
 123 /*
 124  * At this point we do the placement change only for BOOK3S 64. This would
 125  * possibly work on other subarchs.
 126  */
 127 pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz)
 128 {
 129         pgd_t *pg;
 130         pud_t *pu;
 131         pmd_t *pm;
 132         hugepd_t *hpdp = NULL;
 133         unsigned pshift = __ffs(sz);
 134         unsigned pdshift = PGDIR_SHIFT;
 135         spinlock_t *ptl;
 136
 137         addr &= ~(sz-1);
 138         pg = pgd_offset(mm, addr);
 139
 140 #ifdef CONFIG_PPC_BOOK3S_64
 141         if (pshift == PGDIR_SHIFT)
 142                 /* 16GB huge page */
 143                 return (pte_t *) pg;
 144         else if (pshift > PUD_SHIFT) {
 145                 /*
 146                  * We need to use hugepd table
 147                  */
 148                 ptl = &mm->page_table_lock;
 149                 hpdp = (hugepd_t *)pg;
 150         } else {
 151                 pdshift = PUD_SHIFT;
 152                 pu = pud_alloc(mm, pg, addr);
 153                 if (pshift == PUD_SHIFT)
 154                         return (pte_t *)pu;
 155                 else if (pshift > PMD_SHIFT) {
 156                         ptl = pud_lockptr(mm, pu);
 157                         hpdp = (hugepd_t *)pu;
 158                 } else {
 159                         pdshift = PMD_SHIFT;
 160                         pm = pmd_alloc(mm, pu, addr);
 161                         if (pshift == PMD_SHIFT)
 162                                 /* 16MB hugepage */
 163                                 return (pte_t *)pm;
 164                         else {
 165                                 ptl = pmd_lockptr(mm, pm);
 166                                 hpdp = (hugepd_t *)pm;
 167                         }
 168                 }
 169         }
 170 #else
 171         if (pshift >= PGDIR_SHIFT) {
 172                 ptl = &mm->page_table_lock;
 173                 hpdp = (hugepd_t *)pg;
 174         } else {
 175                 pdshift = PUD_SHIFT;
 176                 pu = pud_alloc(mm, pg, addr);
 177                 if (pshift >= PUD_SHIFT) {
 178                         ptl = pud_lockptr(mm, pu);
 179                         hpdp = (hugepd_t *)pu;
 180                 } else {
 181                         pdshift = PMD_SHIFT;
 182                         pm = pmd_alloc(mm, pu, addr);
 183                         ptl = pmd_lockptr(mm, pm);
 184                         hpdp = (hugepd_t *)pm;
 185                 }
 186         }
 187 #endif
 188         if (!hpdp)
 189                 return NULL;
 190
 191         BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp));
 192
 193         if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr,
 194                                                   pdshift, pshift, ptl))
 195                 return NULL;
 196
 197         return hugepte_offset(*hpdp, addr, pdshift);
 198 }
 199
 200 #ifdef CONFIG_PPC_BOOK3S_64
/*
 * Tracks gpages after the device tree is scanned and before the
 * huge_boot_pages list is ready on pseries.
 *
 * gpage_freearray is used as a stack of physical addresses: entries are
 * pushed by pseries_add_gpage() and popped by
 * pseries_alloc_bootmem_huge_page(); nr_gpages is the number of valid
 * entries.
 */
#define MAX_NUMBER_GPAGES       1024
__initdata static u64 gpage_freearray[MAX_NUMBER_GPAGES];
__initdata static unsigned nr_gpages;
 208
 209 /*
 210  * Build list of addresses of gigantic pages.  This function is used in early
 211  * boot before the buddy allocator is setup.
 212  */
 213 void __init pseries_add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages)
 214 {
 215         if (!addr)
 216                 return;
 217         while (number_of_pages > 0) {
 218                 gpage_freearray[nr_gpages] = addr;
 219                 nr_gpages++;
 220                 number_of_pages--;
 221                 addr += page_size;
 222         }
 223 }
 224
 225 int __init pseries_alloc_bootmem_huge_page(struct hstate *hstate)
 226 {
 227         struct huge_bootmem_page *m;
 228         if (nr_gpages == 0)
 229                 return 0;
 230         m = phys_to_virt(gpage_freearray[--nr_gpages]);
 231         gpage_freearray[nr_gpages] = 0;
 232         list_add(&m->list, &huge_boot_pages);
 233         m->hstate = hstate;
 234         return 1;
 235 }
 236 #endif
 237
 238
 239 int __init alloc_bootmem_huge_page(struct hstate *h)
 240 {
 241
 242 #ifdef CONFIG_PPC_BOOK3S_64
 243         if (firmware_has_feature(FW_FEATURE_LPAR) && !radix_enabled())
 244                 return pseries_alloc_bootmem_huge_page(h);
 245 #endif
 246         return __alloc_bootmem_huge_page(h);
 247 }
 248
 249 #if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
/*
 * Number of entries a one-page batch can hold after its header.
 * NOTE(review): the divisor is sizeof(pte_t) although the slots are
 * void pointers; this is only safe while sizeof(pte_t) >= sizeof(void *)
 * -- confirm for every configuration that builds this code.
 */
#define HUGEPD_FREELIST_SIZE \
        ((PAGE_SIZE - sizeof(struct hugepd_freelist)) / sizeof(pte_t))

/*
 * A batch of hugepte tables waiting to be freed from an RCU callback.
 * The structure sits at the start of a page obtained with
 * __get_free_page() and ptes[] occupies the remainder of that page.
 */
struct hugepd_freelist {
        struct rcu_head rcu;    /* handed to call_rcu_sched() once full */
        unsigned int index;     /* number of used slots in ptes[] */
        void *ptes[0];          /* tables to pass to kmem_cache_free() */
};

/* Batch currently being filled on each cpu, or NULL if there is none. */
static DEFINE_PER_CPU(struct hugepd_freelist *, hugepd_freelist_cur);
 260
 261 static void hugepd_free_rcu_callback(struct rcu_head *head)
 262 {
 263         struct hugepd_freelist *batch =
 264                 container_of(head, struct hugepd_freelist, rcu);
 265         unsigned int i;
 266
 267         for (i = 0; i < batch->index; i++)
 268                 kmem_cache_free(hugepte_cache, batch->ptes[i]);
 269
 270         free_page((unsigned long)batch);
 271 }
 272
 273 static void hugepd_free(struct mmu_gather *tlb, void *hugepte)
 274 {
 275         struct hugepd_freelist **batchp;
 276
 277         batchp = &get_cpu_var(hugepd_freelist_cur);
 278
 279         if (atomic_read(&tlb->mm->mm_users) < 2 ||
 280             mm_is_thread_local(tlb->mm)) {
 281                 kmem_cache_free(hugepte_cache, hugepte);
 282                 put_cpu_var(hugepd_freelist_cur);
 283                 return;
 284         }
 285
 286         if (*batchp == NULL) {
 287                 *batchp = (struct hugepd_freelist *)__get_free_page(GFP_ATOMIC);
 288                 (*batchp)->index = 0;
 289         }
 290
 291         (*batchp)->ptes[(*batchp)->index++] = hugepte;
 292         if ((*batchp)->index == HUGEPD_FREELIST_SIZE) {
 293                 call_rcu_sched(&(*batchp)->rcu, hugepd_free_rcu_callback);
 294                 *batchp = NULL;
 295         }
 296         put_cpu_var(hugepd_freelist_cur);
 297 }
 298 #else
/*
 * Empty stand-in used when neither CONFIG_PPC_FSL_BOOK3E nor
 * CONFIG_PPC_8xx is set, so that free_hugepd_range() still compiles.
 */
static inline void hugepd_free(struct mmu_gather *tlb, void *hugepte) {}
 300 #endif
 301
 302 static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshift,
 303                               unsigned long start, unsigned long end,
 304                               unsigned long floor, unsigned long ceiling)
 305 {
 306         pte_t *hugepte = hugepd_page(*hpdp);
 307         int i;
 308
 309         unsigned long pdmask = ~((1UL << pdshift) - 1);
 310         unsigned int num_hugepd = 1;
 311         unsigned int shift = hugepd_shift(*hpdp);
 312
 313         /* Note: On fsl the hpdp may be the first of several */
 314         if (shift > pdshift)
 315                 num_hugepd = 1 << (shift - pdshift);
 316
 317         start &= pdmask;
 318         if (start < floor)
 319                 return;
 320         if (ceiling) {
 321                 ceiling &= pdmask;
 322                 if (! ceiling)
 323                         return;
 324         }
 325         if (end - 1 > ceiling - 1)
 326                 return;
 327
 328         for (i = 0; i < num_hugepd; i++, hpdp++)
 329                 *hpdp = __hugepd(0);
 330
 331         if (shift >= pdshift)
 332                 hugepd_free(tlb, hugepte);
 333         else
 334                 pgtable_free_tlb(tlb, hugepte,
 335                                  get_hugepd_cache_index(pdshift - shift));
 336 }
 337
 338 static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
 339                                    unsigned long addr, unsigned long end,
 340                                    unsigned long floor, unsigned long ceiling)
 341 {
 342         pmd_t *pmd;
 343         unsigned long next;
 344         unsigned long start;
 345
 346         start = addr;
 347         do {
 348                 unsigned long more;
 349
 350                 pmd = pmd_offset(pud, addr);
 351                 next = pmd_addr_end(addr, end);
 352                 if (!is_hugepd(__hugepd(pmd_val(*pmd)))) {
 353                         /*
 354                          * if it is not hugepd pointer, we should already find
 355                          * it cleared.
 356                          */
 357                         WARN_ON(!pmd_none_or_clear_bad(pmd));
 358                         continue;
 359                 }
 360                 /*
 361                  * Increment next by the size of the huge mapping since
 362                  * there may be more than one entry at this level for a
 363                  * single hugepage, but all of them point to
 364                  * the same kmem cache that holds the hugepte.
 365                  */
 366                 more = addr + (1 << hugepd_shift(*(hugepd_t *)pmd));
 367                 if (more > next)
 368                         next = more;
 369
 370                 free_hugepd_range(tlb, (hugepd_t *)pmd, PMD_SHIFT,
 371                                   addr, next, floor, ceiling);
 372         } while (addr = next, addr != end);
 373
 374         start &= PUD_MASK;
 375         if (start < floor)
 376                 return;
 377         if (ceiling) {
 378                 ceiling &= PUD_MASK;
 379                 if (!ceiling)
 380                         return;
 381         }
 382         if (end - 1 > ceiling - 1)
 383                 return;
 384
 385         pmd = pmd_offset(pud, start);
 386         pud_clear(pud);
 387         pmd_free_tlb(tlb, pmd, start);
 388         mm_dec_nr_pmds(tlb->mm);
 389 }
 390
 391 static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
 392                                    unsigned long addr, unsigned long end,
 393                                    unsigned long floor, unsigned long ceiling)
 394 {
 395         pud_t *pud;
 396         unsigned long next;
 397         unsigned long start;
 398
 399         start = addr;
 400         do {
 401                 pud = pud_offset(pgd, addr);
 402                 next = pud_addr_end(addr, end);
 403                 if (!is_hugepd(__hugepd(pud_val(*pud)))) {
 404                         if (pud_none_or_clear_bad(pud))
 405                                 continue;
 406                         hugetlb_free_pmd_range(tlb, pud, addr, next, floor,
 407                                                ceiling);
 408                 } else {
 409                         unsigned long more;
 410                         /*
 411                          * Increment next by the size of the huge mapping since
 412                          * there may be more than one entry at this level for a
 413                          * single hugepage, but all of them point to
 414                          * the same kmem cache that holds the hugepte.
 415                          */
 416                         more = addr + (1 << hugepd_shift(*(hugepd_t *)pud));
 417                         if (more > next)
 418                                 next = more;
 419
 420                         free_hugepd_range(tlb, (hugepd_t *)pud, PUD_SHIFT,
 421                                           addr, next, floor, ceiling);
 422                 }
 423         } while (addr = next, addr != end);
 424
 425         start &= PGDIR_MASK;
 426         if (start < floor)
 427                 return;
 428         if (ceiling) {
 429                 ceiling &= PGDIR_MASK;
 430                 if (!ceiling)
 431                         return;
 432         }
 433         if (end - 1 > ceiling - 1)
 434                 return;
 435
 436         pud = pud_offset(pgd, start);
 437         pgd_clear(pgd);
 438         pud_free_tlb(tlb, pud, start);
 439         mm_dec_nr_puds(tlb->mm);
 440 }
 441
 442 /*
 443  * This function frees user-level page tables of a process.
 444  */
 445 void hugetlb_free_pgd_range(struct mmu_gather *tlb,
 446                             unsigned long addr, unsigned long end,
 447                             unsigned long floor, unsigned long ceiling)
 448 {
 449         pgd_t *pgd;
 450         unsigned long next;
 451
 452         /*
 453          * Because there are a number of different possible pagetable
 454          * layouts for hugepage ranges, we limit knowledge of how
 455          * things should be laid out to the allocation path
 456          * (huge_pte_alloc(), above).  Everything else works out the
 457          * structure as it goes from information in the hugepd
 458          * pointers.  That means that we can't here use the
 459          * optimization used in the normal page free_pgd_range(), of
 460          * checking whether we're actually covering a large enough
 461          * range to have to do anything at the top level of the walk
 462          * instead of at the bottom.
 463          *
 464          * To make sense of this, you should probably go read the big
 465          * block comment at the top of the normal free_pgd_range(),
 466          * too.
 467          */
 468
 469         do {
 470                 next = pgd_addr_end(addr, end);
 471                 pgd = pgd_offset(tlb->mm, addr);
 472                 if (!is_hugepd(__hugepd(pgd_val(*pgd)))) {
 473                         if (pgd_none_or_clear_bad(pgd))
 474                                 continue;
 475                         hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling);
 476                 } else {
 477                         unsigned long more;
 478                         /*
 479                          * Increment next by the size of the huge mapping since
 480                          * there may be more than one entry at the pgd level
 481                          * for a single hugepage, but all of them point to the
 482                          * same kmem cache that holds the hugepte.
 483                          */
 484                         more = addr + (1 << hugepd_shift(*(hugepd_t *)pgd));
 485                         if (more > next)
 486                                 next = more;
 487
 488                         free_hugepd_range(tlb, (hugepd_t *)pgd, PGDIR_SHIFT,
 489                                           addr, next, floor, ceiling);
 490                 }
 491         } while (addr = next, addr != end);
 492 }
 493
 494 struct page *follow_huge_pd(struct vm_area_struct *vma,
 495                             unsigned long address, hugepd_t hpd,
 496                             int flags, int pdshift)
 497 {
 498         pte_t *ptep;
 499         spinlock_t *ptl;
 500         struct page *page = NULL;
 501         unsigned long mask;
 502         int shift = hugepd_shift(hpd);
 503         struct mm_struct *mm = vma->vm_mm;
 504
 505 retry:
 506         /*
 507          * hugepage directory entries are protected by mm->page_table_lock
 508          * Use this instead of huge_pte_lockptr
 509          */
 510         ptl = &mm->page_table_lock;
 511         spin_lock(ptl);
 512
 513         ptep = hugepte_offset(hpd, address, pdshift);
 514         if (pte_present(*ptep)) {
 515                 mask = (1UL << shift) - 1;
 516                 page = pte_page(*ptep);
 517                 page += ((address & mask) >> PAGE_SHIFT);
 518                 if (flags & FOLL_GET)
 519                         get_page(page);
 520         } else {
 521                 if (is_hugetlb_entry_migration(*ptep)) {
 522                         spin_unlock(ptl);
 523                         __migration_entry_wait(mm, ptep, ptl);
 524                         goto retry;
 525                 }
 526         }
 527         spin_unlock(ptl);
 528         return page;
 529 }
 530
 531 static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
 532                                       unsigned long sz)
 533 {
 534         unsigned long __boundary = (addr + sz) & ~(sz-1);
 535         return (__boundary - 1 < end - 1) ? __boundary : end;
 536 }
 537
 538 int gup_huge_pd(hugepd_t hugepd, unsigned long addr, unsigned pdshift,
 539                 unsigned long end, int write, struct page **pages, int *nr)
 540 {
 541         pte_t *ptep;
 542         unsigned long sz = 1UL << hugepd_shift(hugepd);
 543         unsigned long next;
 544
 545         ptep = hugepte_offset(hugepd, addr, pdshift);
 546         do {
 547                 next = hugepte_addr_end(addr, end, sz);
 548                 if (!gup_hugepte(ptep, sz, addr, end, write, pages, nr))
 549                         return 0;
 550         } while (ptep++, addr = next, addr != end);
 551
 552         return 1;
 553 }
 554
 555 #ifdef CONFIG_PPC_MM_SLICES
 556 unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 557                                         unsigned long len, unsigned long pgoff,
 558                                         unsigned long flags)
 559 {
 560         struct hstate *hstate = hstate_file(file);
 561         int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate));
 562
 563 #ifdef CONFIG_PPC_RADIX_MMU
 564         if (radix_enabled())
 565                 return radix__hugetlb_get_unmapped_area(file, addr, len,
 566                                                        pgoff, flags);
 567 #endif
 568         return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1);
 569 }
 570 #endif
 571
 572 unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
 573 {
 574 #ifdef CONFIG_PPC_MM_SLICES
 575         /* With radix we don't use slice, so derive it from vma*/
 576         if (!radix_enabled()) {
 577                 unsigned int psize = get_slice_psize(vma->vm_mm, vma->vm_start);
 578
 579                 return 1UL << mmu_psize_to_shift(psize);
 580         }
 581 #endif
 582         return vma_kernel_pagesize(vma);
 583 }
 584
 585 static inline bool is_power_of_4(unsigned long x)
 586 {
 587         if (is_power_of_2(x))
 588                 return (__ilog2(x) % 2) ? false : true;
 589         return false;
 590 }
 591
 592 static int __init add_huge_page_size(unsigned long long size)
 593 {
 594         int shift = __ffs(size);
 595         int mmu_psize;
 596
 597         /* Check that it is a page size supported by the hardware and
 598          * that it fits within pagetable and slice limits. */
 599         if (size <= PAGE_SIZE)
 600                 return -EINVAL;
 601 #if defined(CONFIG_PPC_FSL_BOOK3E)
 602         if (!is_power_of_4(size))
 603                 return -EINVAL;
 604 #elif !defined(CONFIG_PPC_8xx)
 605         if (!is_power_of_2(size) || (shift > SLICE_HIGH_SHIFT))
 606                 return -EINVAL;
 607 #endif
 608
 609         if ((mmu_psize = shift_to_mmu_psize(shift)) < 0)
 610                 return -EINVAL;
 611
 612 #ifdef CONFIG_PPC_BOOK3S_64
 613         /*
 614          * We need to make sure that for different page sizes reported by
 615          * firmware we only add hugetlb support for page sizes that can be
 616          * supported by linux page table layout.
 617          * For now we have
 618          * Radix: 2M and 1G
 619          * Hash: 16M and 16G
 620          */
 621         if (radix_enabled()) {
 622                 if (mmu_psize != MMU_PAGE_2M && mmu_psize != MMU_PAGE_1G)
 623                         return -EINVAL;
 624         } else {
 625                 if (mmu_psize != MMU_PAGE_16M && mmu_psize != MMU_PAGE_16G)
 626                         return -EINVAL;
 627         }
 628 #endif
 629
 630         BUG_ON(mmu_psize_defs[mmu_psize].shift != shift);
 631
 632         /* Return if huge page size has already been setup */
 633         if (size_to_hstate(size))
 634                 return 0;
 635
 636         hugetlb_add_hstate(shift - PAGE_SHIFT);
 637
 638         return 0;
 639 }
 640
 641 static int __init hugepage_setup_sz(char *str)
 642 {
 643         unsigned long long size;
 644
 645         size = memparse(str, &str);
 646
 647         if (add_huge_page_size(size) != 0) {
 648                 hugetlb_bad_size();
 649                 pr_err("Invalid huge page size specified(%llu)\n", size);
 650         }
 651
 652         return 1;
 653 }
 654 __setup("hugepagesz=", hugepage_setup_sz);
 655
 656 struct kmem_cache *hugepte_cache;
 657 static int __init hugetlbpage_init(void)
 658 {
 659         int psize;
 660
 661         if (hugetlb_disabled) {
 662                 pr_info("HugeTLB support is disabled!\n");
 663                 return 0;
 664         }
 665
 666 #if !defined(CONFIG_PPC_FSL_BOOK3E) && !defined(CONFIG_PPC_8xx)
 667         if (!radix_enabled() && !mmu_has_feature(MMU_FTR_16M_PAGE))
 668                 return -ENODEV;
 669 #endif
 670         for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
 671                 unsigned shift;
 672                 unsigned pdshift;
 673
 674                 if (!mmu_psize_defs[psize].shift)
 675                         continue;
 676
 677                 shift = mmu_psize_to_shift(psize);
 678
 679 #ifdef CONFIG_PPC_BOOK3S_64
 680                 if (shift > PGDIR_SHIFT)
 681                         continue;
 682                 else if (shift > PUD_SHIFT)
 683                         pdshift = PGDIR_SHIFT;
 684                 else if (shift > PMD_SHIFT)
 685                         pdshift = PUD_SHIFT;
 686                 else
 687                         pdshift = PMD_SHIFT;
 688 #else
 689                 if (shift < PUD_SHIFT)
 690                         pdshift = PMD_SHIFT;
 691                 else if (shift < PGDIR_SHIFT)
 692                         pdshift = PUD_SHIFT;
 693                 else
 694                         pdshift = PGDIR_SHIFT;
 695 #endif
 696
 697                 if (add_huge_page_size(1ULL << shift) < 0)
 698                         continue;
 699                 /*
 700                  * if we have pdshift and shift value same, we don't
 701                  * use pgt cache for hugepd.
 702                  */
 703                 if (pdshift > shift)
 704                         pgtable_cache_add(pdshift - shift, NULL);
 705 #if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
 706                 else if (!hugepte_cache) {
 707                         /*
 708                          * Create a kmem cache for hugeptes.  The bottom bits in
 709                          * the pte have size information encoded in them, so
 710                          * align them to allow this
 711                          */
 712                         hugepte_cache = kmem_cache_create("hugepte-cache",
 713                                                           sizeof(pte_t),
 714                                                           HUGEPD_SHIFT_MASK + 1,
 715                                                           0, NULL);
 716                         if (hugepte_cache == NULL)
 717                                 panic("%s: Unable to create kmem cache "
 718                                       "for hugeptes\n", __func__);
 719
 720                 }
 721 #endif
 722         }
 723
 724 #if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
 725         /* Default hpage size = 4M on FSL_BOOK3E and 512k on 8xx */
 726         if (mmu_psize_defs[MMU_PAGE_4M].shift)
 727                 HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_4M].shift;
 728         else if (mmu_psize_defs[MMU_PAGE_512K].shift)
 729                 HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_512K].shift;
 730 #else
 731         /* Set default large page size. Currently, we pick 16M or 1M
 732          * depending on what is available
 733          */
 734         if (mmu_psize_defs[MMU_PAGE_16M].shift)
 735                 HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_16M].shift;
 736         else if (mmu_psize_defs[MMU_PAGE_1M].shift)
 737                 HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_1M].shift;
 738         else if (mmu_psize_defs[MMU_PAGE_2M].shift)
 739                 HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_2M].shift;
 740 #endif
 741         return 0;
 742 }
 743
 744 arch_initcall(hugetlbpage_init);
 745
 746 void flush_dcache_icache_hugepage(struct page *page)
 747 {
 748         int i;
 749         void *start;
 750
 751         BUG_ON(!PageCompound(page));
 752
 753         for (i = 0; i < (1UL << compound_order(page)); i++) {
 754                 if (!PageHighMem(page)) {
 755                         __flush_dcache_icache(page_address(page+i));
 756                 } else {
 757                         start = kmap_atomic(page+i);
 758                         __flush_dcache_icache(start);
 759                         kunmap_atomic(start);
 760                 }
 761         }
 762 }
 763
 764 #endif /* CONFIG_HUGETLB_PAGE */
 765
/*
 * Walk the page tables rooted at @pgdir and return a pointer to the entry
 * that maps @ea.
 *
 * We have 4 cases for pgds and pmds:
 * (1) invalid (all zeroes)
 * (2) pointer to next table, as normal; bottom 6 bits == 0
 * (3) leaf pte for huge page _PAGE_PTE set
 * (4) hugepd pointer, _PAGE_PTE = 0 and bits [2..6] indicate size of table
 *
 * So long as we atomically load page table pointers we are safe against
 * teardown; we can follow the address down to the page and take a ref on
 * it. This function needs to be called with interrupts disabled. We use
 * this variant when we have MSR[EE] = 0 but the
 * paca->irq_soft_mask = IRQS_ENABLED
 *
 * @pgdir:       top-level page directory to walk
 * @ea:          effective address to look up
 * @is_thp:      if not NULL, set to true when the result is a pmd for which
 *               pmd_trans_huge() or pmd_devmap() holds, false otherwise
 * @hpage_shift: if not NULL, set to the shift of the level (or of the
 *               hugepd) for a huge mapping; stays 0 when an ordinary pte
 *               is returned or nothing is found
 *
 * Returns NULL if an entry on the way down is none.
 */
pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea,
                        bool *is_thp, unsigned *hpage_shift)
{
        pgd_t pgd, *pgdp;
        pud_t pud, *pudp;
        pmd_t pmd, *pmdp;
        pte_t *ret_pte;
        hugepd_t *hpdp = NULL;
        unsigned pdshift = PGDIR_SHIFT;

        /* Give both outputs a defined value on every return path. */
        if (hpage_shift)
                *hpage_shift = 0;

        if (is_thp)
                *is_thp = false;

        pgdp = pgdir + pgd_index(ea);
        pgd  = READ_ONCE(*pgdp);
        /*
         * Always operate on the local stack value. This makes sure the
         * value doesn't get updated by a parallel THP split/collapse,
         * page fault or a page unmap. The returned pte_t * is still not
         * stable, so it should be checked by the caller for the above
         * conditions.
         */
        if (pgd_none(pgd))
                return NULL;
        else if (pgd_huge(pgd)) {
                /* Leaf at pgd level: the pgd slot itself is the pte. */
                ret_pte = (pte_t *) pgdp;
                goto out;
        } else if (is_hugepd(__hugepd(pgd_val(pgd))))
                hpdp = (hugepd_t *)&pgd;
        else {
                /*
                 * Even if we end up with an unmap, the pgtable will not
                 * be freed, because we do an rcu free and here we are
                 * irq disabled
                 */
                pdshift = PUD_SHIFT;
                pudp = pud_offset(&pgd, ea);
                pud  = READ_ONCE(*pudp);

                if (pud_none(pud))
                        return NULL;
                else if (pud_huge(pud)) {
                        /* Leaf at pud level. */
                        ret_pte = (pte_t *) pudp;
                        goto out;
                } else if (is_hugepd(__hugepd(pud_val(pud))))
                        hpdp = (hugepd_t *)&pud;
                else {
                        pdshift = PMD_SHIFT;
                        pmdp = pmd_offset(&pud, ea);
                        pmd  = READ_ONCE(*pmdp);
                        /*
                         * A hugepage collapse is captured by pmd_none, because
                         * it marks the pmd none and does a hpte invalidate.
                         */
                        if (pmd_none(pmd))
                                return NULL;

                        if (pmd_trans_huge(pmd) || pmd_devmap(pmd)) {
                                if (is_thp)
                                        *is_thp = true;
                                ret_pte = (pte_t *) pmdp;
                                goto out;
                        }
                        /*
                         * The pmd_large check below will handle the swap pmd
                         * pte. We need to do both checks because they are
                         * config dependent.
                         */
                        if (pmd_huge(pmd) || pmd_large(pmd)) {
                                ret_pte = (pte_t *) pmdp;
                                goto out;
                        } else if (is_hugepd(__hugepd(pmd_val(pmd))))
                                hpdp = (hugepd_t *)&pmd;
                        else
                                /* Ordinary pte: *hpage_shift stays 0. */
                                return pte_offset_kernel(&pmd, ea);
                }
        }
        if (!hpdp)
                return NULL;

        /*
         * hpdp points at one of the local copies read above; index into
         * the hugepte table it refers to and report that table's shift.
         */
        ret_pte = hugepte_offset(*hpdp, ea, pdshift);
        pdshift = hugepd_shift(*hpdp);
out:
        if (hpage_shift)
                *hpage_shift = pdshift;
        return ret_pte;
}
EXPORT_SYMBOL_GPL(__find_linux_pte);
 868
 869 int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
 870                 unsigned long end, int write, struct page **pages, int *nr)
 871 {
 872         unsigned long pte_end;
 873         struct page *head, *page;
 874         pte_t pte;
 875         int refs;
 876
 877         pte_end = (addr + sz) & ~(sz-1);
 878         if (pte_end < end)
 879                 end = pte_end;
 880
 881         pte = READ_ONCE(*ptep);
 882
 883         if (!pte_access_permitted(pte, write))
 884                 return 0;
 885
 886         /* hugepages are never "special" */
 887         VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
 888
 889         refs = 0;
 890         head = pte_page(pte);
 891
 892         page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
 893         do {
 894                 VM_BUG_ON(compound_head(page) != head);
 895                 pages[*nr] = page;
 896                 (*nr)++;
 897                 page++;
 898                 refs++;
 899         } while (addr += PAGE_SIZE, addr != end);
 900
 901         if (!page_cache_add_speculative(head, refs)) {
 902                 *nr -= refs;
 903                 return 0;
 904         }
 905
 906         if (unlikely(pte_val(pte) != pte_val(*ptep))) {
 907                 /* Could be optimized better */
 908                 *nr -= refs;
 909                 while (refs--)
 910                         put_page(head);
 911                 return 0;
 912         }
 913
 914         return 1;
 915 }