mm/huge_memory.c

   1 // SPDX-License-Identifier: GPL-2.0-only
   2 /*
   3  *  Copyright (C) 2009  Red Hat, Inc.
   4  */
   5
   6 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
   7
   8 #include <linux/mm.h>
   9 #include <linux/sched.h>
  10 #include <linux/sched/mm.h>
  11 #include <linux/sched/coredump.h>
  12 #include <linux/sched/numa_balancing.h>
  13 #include <linux/highmem.h>
  14 #include <linux/hugetlb.h>
  15 #include <linux/mmu_notifier.h>
  16 #include <linux/rmap.h>
  17 #include <linux/swap.h>
  18 #include <linux/shrinker.h>
  19 #include <linux/mm_inline.h>
  20 #include <linux/swapops.h>
  21 #include <linux/backing-dev.h>
  22 #include <linux/dax.h>
  23 #include <linux/mm_types.h>
  24 #include <linux/khugepaged.h>
  25 #include <linux/freezer.h>
  26 #include <linux/pfn_t.h>
  27 #include <linux/mman.h>
  28 #include <linux/memremap.h>
  29 #include <linux/pagemap.h>
  30 #include <linux/debugfs.h>
  31 #include <linux/migrate.h>
  32 #include <linux/hashtable.h>
  33 #include <linux/userfaultfd_k.h>
  34 #include <linux/page_idle.h>
  35 #include <linux/shmem_fs.h>
  36 #include <linux/oom.h>
  37 #include <linux/numa.h>
  38 #include <linux/page_owner.h>
  39 #include <linux/sched/sysctl.h>
  40 #include <linux/memory-tiers.h>
  41 #include <linux/compat.h>
  42 #include <linux/pgalloc_tag.h>
  43
  44 #include <asm/tlb.h>
  45 #include <asm/pgalloc.h>
  46 #include "internal.h"
  47 #include "swap.h"
  48
  49 #define CREATE_TRACE_POINTS
  50 #include <trace/events/thp.h>
  51
  52 /*
  53  * By default, transparent hugepage support is disabled in order to avoid
  54  * risking an increased memory footprint for applications that are not
  55  * guaranteed to benefit from it. When transparent hugepage support is
  56  * enabled, it is for all mappings, and khugepaged scans all mappings.
  57  * Defrag is invoked by khugepaged hugepage allocations and by page faults
  58  * for all hugepage allocations.
  59  */
  60 unsigned long transparent_hugepage_flags __read_mostly =
  61 #ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS
  62         (1<<TRANSPARENT_HUGEPAGE_FLAG)|
  63 #endif
  64 #ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE
  65         (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
  66 #endif
  67         (1<<TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG)|
  68         (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
  69         (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
  70
  71 static struct shrinker *deferred_split_shrinker;
  72 static unsigned long deferred_split_count(struct shrinker *shrink,
  73                                           struct shrink_control *sc);
  74 static unsigned long deferred_split_scan(struct shrinker *shrink,
  75                                          struct shrink_control *sc);
  76
  77 static atomic_t huge_zero_refcount;
  78 struct folio *huge_zero_folio __read_mostly;
  79 unsigned long huge_zero_pfn __read_mostly = ~0UL;
  80 unsigned long huge_anon_orders_always __read_mostly;
  81 unsigned long huge_anon_orders_madvise __read_mostly;
  82 unsigned long huge_anon_orders_inherit __read_mostly;
  83
  84 unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
  85                                          unsigned long vm_flags,
  86                                          unsigned long tva_flags,
  87                                          unsigned long orders)
  88 {
  89         bool smaps = tva_flags & TVA_SMAPS;
  90         bool in_pf = tva_flags & TVA_IN_PF;
  91         bool enforce_sysfs = tva_flags & TVA_ENFORCE_SYSFS;
  92         unsigned long supported_orders;
  93
  94         /* Check the intersection of requested and supported orders. */
  95         if (vma_is_anonymous(vma))
  96                 supported_orders = THP_ORDERS_ALL_ANON;
  97         else if (vma_is_dax(vma))
  98                 supported_orders = THP_ORDERS_ALL_FILE_DAX;
  99         else
 100                 supported_orders = THP_ORDERS_ALL_FILE_DEFAULT;
 101
 102         orders &= supported_orders;
 103         if (!orders)
 104                 return 0;
 105
 106         if (!vma->vm_mm)                /* vdso */
 107                 return 0;
 108
 109         /*
 110          * Explicitly disabled through madvise or prctl, or some
 111          * architectures may disable THP for some mappings, for
 112          * example, s390 kvm.
 113          * */
 114         if ((vm_flags & VM_NOHUGEPAGE) ||
 115             test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
 116                 return 0;
 117         /*
 118          * If the hardware/firmware marked hugepage support disabled.
 119          */
 120         if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_UNSUPPORTED))
 121                 return 0;
 122
 123         /* khugepaged doesn't collapse DAX vma, but page fault is fine. */
 124         if (vma_is_dax(vma))
 125                 return in_pf ? orders : 0;
 126
 127         /*
 128          * khugepaged special VMA and hugetlb VMA.
 129          * Must be checked after dax since some dax mappings may have
 130          * VM_MIXEDMAP set.
 131          */
 132         if (!in_pf && !smaps && (vm_flags & VM_NO_KHUGEPAGED))
 133                 return 0;
 134
 135         /*
 136          * Check alignment for file vma and size for both file and anon vma by
 137          * filtering out the unsuitable orders.
 138          *
 139          * Skip the check for page fault. Huge fault does the check in fault
 140          * handlers.
 141          */
 142         if (!in_pf) {
 143                 int order = highest_order(orders);
 144                 unsigned long addr;
 145
 146                 while (orders) {
 147                         addr = vma->vm_end - (PAGE_SIZE << order);
 148                         if (thp_vma_suitable_order(vma, addr, order))
 149                                 break;
 150                         order = next_order(&orders, order);
 151                 }
 152
 153                 if (!orders)
 154                         return 0;
 155         }
 156
 157         /*
 158          * Enabled via shmem mount options or sysfs settings.
 159          * Must be done before hugepage flags check since shmem has its
 160          * own flags.
 161          */
 162         if (!in_pf && shmem_file(vma->vm_file)) {
 163                 bool global_huge = shmem_is_huge(file_inode(vma->vm_file), vma->vm_pgoff,
 164                                                         !enforce_sysfs, vma->vm_mm, vm_flags);
 165
 166                 if (!vma_is_anon_shmem(vma))
 167                         return global_huge ? orders : 0;
 168                 return shmem_allowable_huge_orders(file_inode(vma->vm_file),
 169                                                         vma, vma->vm_pgoff, global_huge);
 170         }
 171
 172         if (!vma_is_anonymous(vma)) {
 173                 /*
 174                  * Enforce sysfs THP requirements as necessary. Anonymous vmas
 175                  * were already handled in thp_vma_allowable_orders().
 176                  */
 177                 if (enforce_sysfs &&
 178                     (!hugepage_global_enabled() || (!(vm_flags & VM_HUGEPAGE) &&
 179                                                     !hugepage_global_always())))
 180                         return 0;
 181
 182                 /*
 183                  * Trust that ->huge_fault() handlers know what they are doing
 184                  * in fault path.
 185                  */
 186                 if (((in_pf || smaps)) && vma->vm_ops->huge_fault)
 187                         return orders;
 188                 /* Only regular file is valid in collapse path */
 189                 if (((!in_pf || smaps)) && file_thp_enabled(vma))
 190                         return orders;
 191                 return 0;
 192         }
 193
 194         if (vma_is_temporary_stack(vma))
 195                 return 0;
 196
 197         /*
 198          * THPeligible bit of smaps should show 1 for proper VMAs even
 199          * though anon_vma is not initialized yet.
 200          *
 201          * Allow page fault since anon_vma may be not initialized until
 202          * the first page fault.
 203          */
 204         if (!vma->anon_vma)
 205                 return (smaps || in_pf) ? orders : 0;
 206
 207         return orders;
 208 }
 209
 210 static bool get_huge_zero_page(void)
 211 {
 212         struct folio *zero_folio;
 213 retry:
 214         if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
 215                 return true;
 216
 217         zero_folio = folio_alloc((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
 218                         HPAGE_PMD_ORDER);
 219         if (!zero_folio) {
 220                 count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
 221                 return false;
 222         }
 223         /* Ensure zero folio won't have large_rmappable flag set. */
 224         folio_clear_large_rmappable(zero_folio);
 225         preempt_disable();
 226         if (cmpxchg(&huge_zero_folio, NULL, zero_folio)) {
 227                 preempt_enable();
 228                 folio_put(zero_folio);
 229                 goto retry;
 230         }
 231         WRITE_ONCE(huge_zero_pfn, folio_pfn(zero_folio));
 232
 233         /* We take additional reference here. It will be put back by shrinker */
 234         atomic_set(&huge_zero_refcount, 2);
 235         preempt_enable();
 236         count_vm_event(THP_ZERO_PAGE_ALLOC);
 237         return true;
 238 }
 239
 240 static void put_huge_zero_page(void)
 241 {
 242         /*
 243          * Counter should never go to zero here. Only shrinker can put
 244          * last reference.
 245          */
 246         BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
 247 }
 248
 249 struct folio *mm_get_huge_zero_folio(struct mm_struct *mm)
 250 {
 251         if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
 252                 return READ_ONCE(huge_zero_folio);
 253
 254         if (!get_huge_zero_page())
 255                 return NULL;
 256
 257         if (test_and_set_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
 258                 put_huge_zero_page();
 259
 260         return READ_ONCE(huge_zero_folio);
 261 }
 262
 263 void mm_put_huge_zero_folio(struct mm_struct *mm)
 264 {
 265         if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
 266                 put_huge_zero_page();
 267 }
 268
 269 static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink,
 270                                         struct shrink_control *sc)
 271 {
 272         /* we can free zero page only if last reference remains */
 273         return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
 274 }
 275
 276 static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
 277                                        struct shrink_control *sc)
 278 {
 279         if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
 280                 struct folio *zero_folio = xchg(&huge_zero_folio, NULL);
 281                 BUG_ON(zero_folio == NULL);
 282                 WRITE_ONCE(huge_zero_pfn, ~0UL);
 283                 folio_put(zero_folio);
 284                 return HPAGE_PMD_NR;
 285         }
 286
 287         return 0;
 288 }
 289
 290 static struct shrinker *huge_zero_page_shrinker;
 291
 292 #ifdef CONFIG_SYSFS
 293 static ssize_t enabled_show(struct kobject *kobj,
 294                             struct kobj_attribute *attr, char *buf)
 295 {
 296         const char *output;
 297
 298         if (test_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags))
 299                 output = "[always] madvise never";
 300         else if (test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
 301                           &transparent_hugepage_flags))
 302                 output = "always [madvise] never";
 303         else
 304                 output = "always madvise [never]";
 305
 306         return sysfs_emit(buf, "%s\n", output);
 307 }
 308
 309 static ssize_t enabled_store(struct kobject *kobj,
 310                              struct kobj_attribute *attr,
 311                              const char *buf, size_t count)
 312 {
 313         ssize_t ret = count;
 314
 315         if (sysfs_streq(buf, "always")) {
 316                 clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
 317                 set_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
 318         } else if (sysfs_streq(buf, "madvise")) {
 319                 clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
 320                 set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
 321         } else if (sysfs_streq(buf, "never")) {
 322                 clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
 323                 clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
 324         } else
 325                 ret = -EINVAL;
 326
 327         if (ret > 0) {
 328                 int err = start_stop_khugepaged();
 329                 if (err)
 330                         ret = err;
 331         }
 332         return ret;
 333 }
 334
 335 static struct kobj_attribute enabled_attr = __ATTR_RW(enabled);
 336
 337 ssize_t single_hugepage_flag_show(struct kobject *kobj,
 338                                   struct kobj_attribute *attr, char *buf,
 339                                   enum transparent_hugepage_flag flag)
 340 {
 341         return sysfs_emit(buf, "%d\n",
 342                           !!test_bit(flag, &transparent_hugepage_flags));
 343 }
 344
 345 ssize_t single_hugepage_flag_store(struct kobject *kobj,
 346                                  struct kobj_attribute *attr,
 347                                  const char *buf, size_t count,
 348                                  enum transparent_hugepage_flag flag)
 349 {
 350         unsigned long value;
 351         int ret;
 352
 353         ret = kstrtoul(buf, 10, &value);
 354         if (ret < 0)
 355                 return ret;
 356         if (value > 1)
 357                 return -EINVAL;
 358
 359         if (value)
 360                 set_bit(flag, &transparent_hugepage_flags);
 361         else
 362                 clear_bit(flag, &transparent_hugepage_flags);
 363
 364         return count;
 365 }
 366
 367 static ssize_t defrag_show(struct kobject *kobj,
 368                            struct kobj_attribute *attr, char *buf)
 369 {
 370         const char *output;
 371
 372         if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
 373                      &transparent_hugepage_flags))
 374                 output = "[always] defer defer+madvise madvise never";
 375         else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG,
 376                           &transparent_hugepage_flags))
 377                 output = "always [defer] defer+madvise madvise never";
 378         else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG,
 379                           &transparent_hugepage_flags))
 380                 output = "always defer [defer+madvise] madvise never";
 381         else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG,
 382                           &transparent_hugepage_flags))
 383                 output = "always defer defer+madvise [madvise] never";
 384         else
 385                 output = "always defer defer+madvise madvise [never]";
 386
 387         return sysfs_emit(buf, "%s\n", output);
 388 }
 389
 390 static ssize_t defrag_store(struct kobject *kobj,
 391                             struct kobj_attribute *attr,
 392                             const char *buf, size_t count)
 393 {
 394         if (sysfs_streq(buf, "always")) {
 395                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
 396                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
 397                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
 398                 set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
 399         } else if (sysfs_streq(buf, "defer+madvise")) {
 400                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
 401                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
 402                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
 403                 set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
 404         } else if (sysfs_streq(buf, "defer")) {
 405                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
 406                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
 407                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
 408                 set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
 409         } else if (sysfs_streq(buf, "madvise")) {
 410                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
 411                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
 412                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
 413                 set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
 414         } else if (sysfs_streq(buf, "never")) {
 415                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
 416                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
 417                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
 418                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
 419         } else
 420                 return -EINVAL;
 421
 422         return count;
 423 }
 424 static struct kobj_attribute defrag_attr = __ATTR_RW(defrag);
 425
 426 static ssize_t use_zero_page_show(struct kobject *kobj,
 427                                   struct kobj_attribute *attr, char *buf)
 428 {
 429         return single_hugepage_flag_show(kobj, attr, buf,
 430                                          TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
 431 }
 432 static ssize_t use_zero_page_store(struct kobject *kobj,
 433                 struct kobj_attribute *attr, const char *buf, size_t count)
 434 {
 435         return single_hugepage_flag_store(kobj, attr, buf, count,
 436                                  TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
 437 }
 438 static struct kobj_attribute use_zero_page_attr = __ATTR_RW(use_zero_page);
 439
 440 static ssize_t hpage_pmd_size_show(struct kobject *kobj,
 441                                    struct kobj_attribute *attr, char *buf)
 442 {
 443         return sysfs_emit(buf, "%lu\n", HPAGE_PMD_SIZE);
 444 }
 445 static struct kobj_attribute hpage_pmd_size_attr =
 446         __ATTR_RO(hpage_pmd_size);
 447
 448 static struct attribute *hugepage_attr[] = {
 449         &enabled_attr.attr,
 450         &defrag_attr.attr,
 451         &use_zero_page_attr.attr,
 452         &hpage_pmd_size_attr.attr,
 453 #ifdef CONFIG_SHMEM
 454         &shmem_enabled_attr.attr,
 455 #endif
 456         NULL,
 457 };
 458
 459 static const struct attribute_group hugepage_attr_group = {
 460         .attrs = hugepage_attr,
 461 };
 462
 463 static void hugepage_exit_sysfs(struct kobject *hugepage_kobj);
 464 static void thpsize_release(struct kobject *kobj);
 465 static DEFINE_SPINLOCK(huge_anon_orders_lock);
 466 static LIST_HEAD(thpsize_list);
 467
 468 static ssize_t thpsize_enabled_show(struct kobject *kobj,
 469                                     struct kobj_attribute *attr, char *buf)
 470 {
 471         int order = to_thpsize(kobj)->order;
 472         const char *output;
 473
 474         if (test_bit(order, &huge_anon_orders_always))
 475                 output = "[always] inherit madvise never";
 476         else if (test_bit(order, &huge_anon_orders_inherit))
 477                 output = "always [inherit] madvise never";
 478         else if (test_bit(order, &huge_anon_orders_madvise))
 479                 output = "always inherit [madvise] never";
 480         else
 481                 output = "always inherit madvise [never]";
 482
 483         return sysfs_emit(buf, "%s\n", output);
 484 }
 485
 486 static ssize_t thpsize_enabled_store(struct kobject *kobj,
 487                                      struct kobj_attribute *attr,
 488                                      const char *buf, size_t count)
 489 {
 490         int order = to_thpsize(kobj)->order;
 491         ssize_t ret = count;
 492
 493         if (sysfs_streq(buf, "always")) {
 494                 spin_lock(&huge_anon_orders_lock);
 495                 clear_bit(order, &huge_anon_orders_inherit);
 496                 clear_bit(order, &huge_anon_orders_madvise);
 497                 set_bit(order, &huge_anon_orders_always);
 498                 spin_unlock(&huge_anon_orders_lock);
 499         } else if (sysfs_streq(buf, "inherit")) {
 500                 spin_lock(&huge_anon_orders_lock);
 501                 clear_bit(order, &huge_anon_orders_always);
 502                 clear_bit(order, &huge_anon_orders_madvise);
 503                 set_bit(order, &huge_anon_orders_inherit);
 504                 spin_unlock(&huge_anon_orders_lock);
 505         } else if (sysfs_streq(buf, "madvise")) {
 506                 spin_lock(&huge_anon_orders_lock);
 507                 clear_bit(order, &huge_anon_orders_always);
 508                 clear_bit(order, &huge_anon_orders_inherit);
 509                 set_bit(order, &huge_anon_orders_madvise);
 510                 spin_unlock(&huge_anon_orders_lock);
 511         } else if (sysfs_streq(buf, "never")) {
 512                 spin_lock(&huge_anon_orders_lock);
 513                 clear_bit(order, &huge_anon_orders_always);
 514                 clear_bit(order, &huge_anon_orders_inherit);
 515                 clear_bit(order, &huge_anon_orders_madvise);
 516                 spin_unlock(&huge_anon_orders_lock);
 517         } else
 518                 ret = -EINVAL;
 519
 520         if (ret > 0) {
 521                 int err;
 522
 523                 err = start_stop_khugepaged();
 524                 if (err)
 525                         ret = err;
 526         }
 527         return ret;
 528 }
 529
 530 static struct kobj_attribute thpsize_enabled_attr =
 531         __ATTR(enabled, 0644, thpsize_enabled_show, thpsize_enabled_store);
 532
 533 static struct attribute *thpsize_attrs[] = {
 534         &thpsize_enabled_attr.attr,
 535 #ifdef CONFIG_SHMEM
 536         &thpsize_shmem_enabled_attr.attr,
 537 #endif
 538         NULL,
 539 };
 540
 541 static const struct attribute_group thpsize_attr_group = {
 542         .attrs = thpsize_attrs,
 543 };
 544
 545 static const struct kobj_type thpsize_ktype = {
 546         .release = &thpsize_release,
 547         .sysfs_ops = &kobj_sysfs_ops,
 548 };
 549
 550 DEFINE_PER_CPU(struct mthp_stat, mthp_stats) = {{{0}}};
 551
 552 static unsigned long sum_mthp_stat(int order, enum mthp_stat_item item)
 553 {
 554         unsigned long sum = 0;
 555         int cpu;
 556
 557         for_each_possible_cpu(cpu) {
 558                 struct mthp_stat *this = &per_cpu(mthp_stats, cpu);
 559
 560                 sum += this->stats[order][item];
 561         }
 562
 563         return sum;
 564 }
 565
 566 #define DEFINE_MTHP_STAT_ATTR(_name, _index)                            \
 567 static ssize_t _name##_show(struct kobject *kobj,                       \
 568                         struct kobj_attribute *attr, char *buf)         \
 569 {                                                                       \
 570         int order = to_thpsize(kobj)->order;                            \
 571                                                                         \
 572         return sysfs_emit(buf, "%lu\n", sum_mthp_stat(order, _index));  \
 573 }                                                                       \
 574 static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
 575
 576 DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_STAT_ANON_FAULT_ALLOC);
 577 DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK);
 578 DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
 579 DEFINE_MTHP_STAT_ATTR(swpout, MTHP_STAT_SWPOUT);
 580 DEFINE_MTHP_STAT_ATTR(swpout_fallback, MTHP_STAT_SWPOUT_FALLBACK);
 581 DEFINE_MTHP_STAT_ATTR(shmem_alloc, MTHP_STAT_SHMEM_ALLOC);
 582 DEFINE_MTHP_STAT_ATTR(shmem_fallback, MTHP_STAT_SHMEM_FALLBACK);
 583 DEFINE_MTHP_STAT_ATTR(shmem_fallback_charge, MTHP_STAT_SHMEM_FALLBACK_CHARGE);
 584 DEFINE_MTHP_STAT_ATTR(split, MTHP_STAT_SPLIT);
 585 DEFINE_MTHP_STAT_ATTR(split_failed, MTHP_STAT_SPLIT_FAILED);
 586 DEFINE_MTHP_STAT_ATTR(split_deferred, MTHP_STAT_SPLIT_DEFERRED);
 587
 588 static struct attribute *stats_attrs[] = {
 589         &anon_fault_alloc_attr.attr,
 590         &anon_fault_fallback_attr.attr,
 591         &anon_fault_fallback_charge_attr.attr,
 592         &swpout_attr.attr,
 593         &swpout_fallback_attr.attr,
 594         &shmem_alloc_attr.attr,
 595         &shmem_fallback_attr.attr,
 596         &shmem_fallback_charge_attr.attr,
 597         &split_attr.attr,
 598         &split_failed_attr.attr,
 599         &split_deferred_attr.attr,
 600         NULL,
 601 };
 602
 603 static struct attribute_group stats_attr_group = {
 604         .name = "stats",
 605         .attrs = stats_attrs,
 606 };
 607
 608 static struct thpsize *thpsize_create(int order, struct kobject *parent)
 609 {
 610         unsigned long size = (PAGE_SIZE << order) / SZ_1K;
 611         struct thpsize *thpsize;
 612         int ret;
 613
 614         thpsize = kzalloc(sizeof(*thpsize), GFP_KERNEL);
 615         if (!thpsize)
 616                 return ERR_PTR(-ENOMEM);
 617
 618         ret = kobject_init_and_add(&thpsize->kobj, &thpsize_ktype, parent,
 619                                    "hugepages-%lukB", size);
 620         if (ret) {
 621                 kfree(thpsize);
 622                 return ERR_PTR(ret);
 623         }
 624
 625         ret = sysfs_create_group(&thpsize->kobj, &thpsize_attr_group);
 626         if (ret) {
 627                 kobject_put(&thpsize->kobj);
 628                 return ERR_PTR(ret);
 629         }
 630
 631         ret = sysfs_create_group(&thpsize->kobj, &stats_attr_group);
 632         if (ret) {
 633                 kobject_put(&thpsize->kobj);
 634                 return ERR_PTR(ret);
 635         }
 636
 637         thpsize->order = order;
 638         return thpsize;
 639 }
 640
 641 static void thpsize_release(struct kobject *kobj)
 642 {
 643         kfree(to_thpsize(kobj));
 644 }
 645
 646 static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
 647 {
 648         int err;
 649         struct thpsize *thpsize;
 650         unsigned long orders;
 651         int order;
 652
 653         /*
 654          * Default to setting PMD-sized THP to inherit the global setting and
 655          * disable all other sizes. powerpc's PMD_ORDER isn't a compile-time
 656          * constant so we have to do this here.
 657          */
 658         huge_anon_orders_inherit = BIT(PMD_ORDER);
 659
 660         *hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
 661         if (unlikely(!*hugepage_kobj)) {
 662                 pr_err("failed to create transparent hugepage kobject\n");
 663                 return -ENOMEM;
 664         }
 665
 666         err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group);
 667         if (err) {
 668                 pr_err("failed to register transparent hugepage group\n");
 669                 goto delete_obj;
 670         }
 671
 672         err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group);
 673         if (err) {
 674                 pr_err("failed to register transparent hugepage group\n");
 675                 goto remove_hp_group;
 676         }
 677
 678         orders = THP_ORDERS_ALL_ANON;
 679         order = highest_order(orders);
 680         while (orders) {
 681                 thpsize = thpsize_create(order, *hugepage_kobj);
 682                 if (IS_ERR(thpsize)) {
 683                         pr_err("failed to create thpsize for order %d\n", order);
 684                         err = PTR_ERR(thpsize);
 685                         goto remove_all;
 686                 }
 687                 list_add(&thpsize->node, &thpsize_list);
 688                 order = next_order(&orders, order);
 689         }
 690
 691         return 0;
 692
 693 remove_all:
 694         hugepage_exit_sysfs(*hugepage_kobj);
 695         return err;
 696 remove_hp_group:
 697         sysfs_remove_group(*hugepage_kobj, &hugepage_attr_group);
 698 delete_obj:
 699         kobject_put(*hugepage_kobj);
 700         return err;
 701 }
 702
 703 static void __init hugepage_exit_sysfs(struct kobject *hugepage_kobj)
 704 {
 705         struct thpsize *thpsize, *tmp;
 706
 707         list_for_each_entry_safe(thpsize, tmp, &thpsize_list, node) {
 708                 list_del(&thpsize->node);
 709                 kobject_put(&thpsize->kobj);
 710         }
 711
 712         sysfs_remove_group(hugepage_kobj, &khugepaged_attr_group);
 713         sysfs_remove_group(hugepage_kobj, &hugepage_attr_group);
 714         kobject_put(hugepage_kobj);
 715 }
 716 #else
 717 static inline int hugepage_init_sysfs(struct kobject **hugepage_kobj)
 718 {
 719         return 0;
 720 }
 721
 722 static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj)
 723 {
 724 }
 725 #endif /* CONFIG_SYSFS */
 726
 727 static int __init thp_shrinker_init(void)
 728 {
 729         huge_zero_page_shrinker = shrinker_alloc(0, "thp-zero");
 730         if (!huge_zero_page_shrinker)
 731                 return -ENOMEM;
 732
 733         deferred_split_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE |
 734                                                  SHRINKER_MEMCG_AWARE |
 735                                                  SHRINKER_NONSLAB,
 736                                                  "thp-deferred_split");
 737         if (!deferred_split_shrinker) {
 738                 shrinker_free(huge_zero_page_shrinker);
 739                 return -ENOMEM;
 740         }
 741
 742         huge_zero_page_shrinker->count_objects = shrink_huge_zero_page_count;
 743         huge_zero_page_shrinker->scan_objects = shrink_huge_zero_page_scan;
 744         shrinker_register(huge_zero_page_shrinker);
 745
 746         deferred_split_shrinker->count_objects = deferred_split_count;
 747         deferred_split_shrinker->scan_objects = deferred_split_scan;
 748         shrinker_register(deferred_split_shrinker);
 749
 750         return 0;
 751 }
 752
 753 static void __init thp_shrinker_exit(void)
 754 {
 755         shrinker_free(huge_zero_page_shrinker);
 756         shrinker_free(deferred_split_shrinker);
 757 }
 758
/*
 * Subsystem init for transparent hugepages.
 *
 * Sets up, in order: the sysfs interface, khugepaged, and the THP shrinkers,
 * then starts/stops khugepaged according to the current flags. Each failure
 * unwinds exactly the steps that already succeeded, in reverse order.
 *
 * Return: 0 on success, -EINVAL if the platform reports no THP support
 * (transparent_hugepage_flags is then set to the UNSUPPORTED bit only), or
 * the error returned by the failing setup step.
 */
static int __init hugepage_init(void)
{
        int err;
        struct kobject *hugepage_kobj;

        if (!has_transparent_hugepage()) {
                transparent_hugepage_flags = 1 << TRANSPARENT_HUGEPAGE_UNSUPPORTED;
                return -EINVAL;
        }

        /*
         * hugepages can't be allocated by the buddy allocator
         */
        MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER > MAX_PAGE_ORDER);

        err = hugepage_init_sysfs(&hugepage_kobj);
        if (err)
                goto err_sysfs;

        err = khugepaged_init();
        if (err)
                goto err_slab;

        err = thp_shrinker_init();
        if (err)
                goto err_shrinker;

        /*
         * By default disable transparent hugepages on smaller systems,
         * where the extra memory used could hurt more than TLB overhead
         * is likely to save.  The admin can still enable it through /sys.
         *
         * The threshold is 512 MiB expressed in pages. Note that this path
         * returns success without calling start_stop_khugepaged().
         */
        if (totalram_pages() < (512 << (20 - PAGE_SHIFT))) {
                transparent_hugepage_flags = 0;
                return 0;
        }

        err = start_stop_khugepaged();
        if (err)
                goto err_khugepaged;

        return 0;
        /* Unwind in reverse order of setup; each label falls through. */
err_khugepaged:
        thp_shrinker_exit();
err_shrinker:
        khugepaged_destroy();
err_slab:
        hugepage_exit_sysfs(hugepage_kobj);
err_sysfs:
        return err;
}
subsys_initcall(hugepage_init);
 811
 812 static int __init setup_transparent_hugepage(char *str)
 813 {
 814         int ret = 0;
 815         if (!str)
 816                 goto out;
 817         if (!strcmp(str, "always")) {
 818                 set_bit(TRANSPARENT_HUGEPAGE_FLAG,
 819                         &transparent_hugepage_flags);
 820                 clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
 821                           &transparent_hugepage_flags);
 822                 ret = 1;
 823         } else if (!strcmp(str, "madvise")) {
 824                 clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
 825                           &transparent_hugepage_flags);
 826                 set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
 827                         &transparent_hugepage_flags);
 828                 ret = 1;
 829         } else if (!strcmp(str, "never")) {
 830                 clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
 831                           &transparent_hugepage_flags);
 832                 clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
 833                           &transparent_hugepage_flags);
 834                 ret = 1;
 835         }
 836 out:
 837         if (!ret)
 838                 pr_warn("transparent_hugepage= cannot parse, ignored\n");
 839         return ret;
 840 }
 841 __setup("transparent_hugepage=", setup_transparent_hugepage);
 842
 843 pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
 844 {
 845         if (likely(vma->vm_flags & VM_WRITE))
 846                 pmd = pmd_mkwrite(pmd, vma);
 847         return pmd;
 848 }
 849
 850 #ifdef CONFIG_MEMCG
 851 static inline
 852 struct deferred_split *get_deferred_split_queue(struct folio *folio)
 853 {
 854         struct mem_cgroup *memcg = folio_memcg(folio);
 855         struct pglist_data *pgdat = NODE_DATA(folio_nid(folio));
 856
 857         if (memcg)
 858                 return &memcg->deferred_split_queue;
 859         else
 860                 return &pgdat->deferred_split_queue;
 861 }
 862 #else
 863 static inline
 864 struct deferred_split *get_deferred_split_queue(struct folio *folio)
 865 {
 866         struct pglist_data *pgdat = NODE_DATA(folio_nid(folio));
 867
 868         return &pgdat->deferred_split_queue;
 869 }
 870 #endif
 871
 872 static inline bool is_transparent_hugepage(const struct folio *folio)
 873 {
 874         if (!folio_test_large(folio))
 875                 return false;
 876
 877         return is_huge_zero_folio(folio) ||
 878                 folio_test_large_rmappable(folio);
 879 }
 880
 881 static unsigned long __thp_get_unmapped_area(struct file *filp,
 882                 unsigned long addr, unsigned long len,
 883                 loff_t off, unsigned long flags, unsigned long size,
 884                 vm_flags_t vm_flags)
 885 {
 886         loff_t off_end = off + len;
 887         loff_t off_align = round_up(off, size);
 888         unsigned long len_pad, ret, off_sub;
 889
 890         if (!IS_ENABLED(CONFIG_64BIT) || in_compat_syscall())
 891                 return 0;
 892
 893         if (off_end <= off_align || (off_end - off_align) < size)
 894                 return 0;
 895
 896         len_pad = len + size;
 897         if (len_pad < len || (off + len_pad) < off)
 898                 return 0;
 899
 900         ret = mm_get_unmapped_area_vmflags(current->mm, filp, addr, len_pad,
 901                                            off >> PAGE_SHIFT, flags, vm_flags);
 902
 903         /*
 904          * The failure might be due to length padding. The caller will retry
 905          * without the padding.
 906          */
 907         if (IS_ERR_VALUE(ret))
 908                 return 0;
 909
 910         /*
 911          * Do not try to align to THP boundary if allocation at the address
 912          * hint succeeds.
 913          */
 914         if (ret == addr)
 915                 return addr;
 916
 917         off_sub = (off - ret) & (size - 1);
 918
 919         if (test_bit(MMF_TOPDOWN, &current->mm->flags) && !off_sub)
 920                 return ret + size;
 921
 922         ret += off_sub;
 923         return ret;
 924 }
 925
 926 unsigned long thp_get_unmapped_area_vmflags(struct file *filp, unsigned long addr,
 927                 unsigned long len, unsigned long pgoff, unsigned long flags,
 928                 vm_flags_t vm_flags)
 929 {
 930         unsigned long ret;
 931         loff_t off = (loff_t)pgoff << PAGE_SHIFT;
 932
 933         ret = __thp_get_unmapped_area(filp, addr, len, off, flags, PMD_SIZE, vm_flags);
 934         if (ret)
 935                 return ret;
 936
 937         return mm_get_unmapped_area_vmflags(current->mm, filp, addr, len, pgoff, flags,
 938                                             vm_flags);
 939 }
 940
/*
 * Convenience wrapper around thp_get_unmapped_area_vmflags() that passes 0
 * for vm_flags.
 */
unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
                unsigned long len, unsigned long pgoff, unsigned long flags)
{
        return thp_get_unmapped_area_vmflags(filp, addr, len, pgoff, flags, 0);
}
EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
 947
/*
 * Map a freshly allocated huge folio (given by its head @page) at the
 * PMD-aligned fault address.
 *
 * The function consumes the caller's folio reference on every path: it is
 * either handed over to the new mapping or dropped with folio_put().
 *
 * Return:
 *   0                  - mapping installed, or the PMD was populated by
 *                        someone else in the meantime (folio released)
 *   VM_FAULT_FALLBACK  - memcg charge failed
 *   VM_FAULT_OOM       - page table allocation failed
 *   otherwise          - the result of check_stable_address_space() or of
 *                        handle_userfault()
 */
static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
                        struct page *page, gfp_t gfp)
{
        struct vm_area_struct *vma = vmf->vma;
        struct folio *folio = page_folio(page);
        pgtable_t pgtable;
        unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
        vm_fault_t ret = 0;

        VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);

        if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
                folio_put(folio);
                count_vm_event(THP_FAULT_FALLBACK);
                count_vm_event(THP_FAULT_FALLBACK_CHARGE);
                count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK);
                count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
                return VM_FAULT_FALLBACK;
        }
        folio_throttle_swaprate(folio, gfp);

        /* Page table to deposit alongside the huge PMD. */
        pgtable = pte_alloc_one(vma->vm_mm);
        if (unlikely(!pgtable)) {
                ret = VM_FAULT_OOM;
                goto release;
        }

        folio_zero_user(folio, vmf->address);
        /*
         * The memory barrier inside __folio_mark_uptodate makes sure that
         * folio_zero_user writes become visible before the set_pmd_at()
         * write.
         */
        __folio_mark_uptodate(folio);

        vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
        if (unlikely(!pmd_none(*vmf->pmd))) {
                /* PMD already populated: back out, ret is still 0. */
                goto unlock_release;
        } else {
                pmd_t entry;

                ret = check_stable_address_space(vma->vm_mm);
                if (ret)
                        goto unlock_release;

                /* Deliver the page fault to userland */
                if (userfaultfd_missing(vma)) {
                        /* Drop everything before handing off the fault. */
                        spin_unlock(vmf->ptl);
                        folio_put(folio);
                        pte_free(vma->vm_mm, pgtable);
                        ret = handle_userfault(vmf, VM_UFFD_MISSING);
                        VM_BUG_ON(ret & VM_FAULT_FALLBACK);
                        return ret;
                }

                /* Build a dirty (and, if the VMA allows, writable) huge PMD. */
                entry = mk_huge_pmd(page, vma->vm_page_prot);
                entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
                folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE);
                folio_add_lru_vma(folio, vma);
                pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
                set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
                update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
                add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
                mm_inc_nr_ptes(vma->vm_mm);
                spin_unlock(vmf->ptl);
                count_vm_event(THP_FAULT_ALLOC);
                count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC);
                count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
        }

        return 0;
unlock_release:
        spin_unlock(vmf->ptl);
release:
        /* pgtable is NULL when we arrive here from the allocation failure. */
        if (pgtable)
                pte_free(vma->vm_mm, pgtable);
        folio_put(folio);
        return ret;

}
1028
1029 /*
1030  * always: directly stall for all thp allocations
1031  * defer: wake kswapd and fail if not immediately available
1032  * defer+madvise: wake kswapd and directly stall for MADV_HUGEPAGE, otherwise
1033  *                fail if not immediately available
1034  * madvise: directly stall for MADV_HUGEPAGE, otherwise fail if not immediately
1035  *          available
1036  * never: never stall for any thp allocation
1037  */
1038 gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma)
1039 {
1040         const bool vma_madvised = vma && (vma->vm_flags & VM_HUGEPAGE);
1041
1042         /* Always do synchronous compaction */
1043         if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
1044                 return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);
1045
1046         /* Kick kcompactd and fail quickly */
1047         if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
1048                 return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;
1049
1050         /* Synchronous compaction if madvised, otherwise kick kcompactd */
1051         if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
1052                 return GFP_TRANSHUGE_LIGHT |
1053                         (vma_madvised ? __GFP_DIRECT_RECLAIM :
1054                                         __GFP_KSWAPD_RECLAIM);
1055
1056         /* Only do synchronous compaction if madvised */
1057         if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
1058                 return GFP_TRANSHUGE_LIGHT |
1059                        (vma_madvised ? __GFP_DIRECT_RECLAIM : 0);
1060
1061         return GFP_TRANSHUGE_LIGHT;
1062 }
1063
1064 /* Caller must hold page table lock. */
1065 static void set_huge_zero_folio(pgtable_t pgtable, struct mm_struct *mm,
1066                 struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
1067                 struct folio *zero_folio)
1068 {
1069         pmd_t entry;
1070         if (!pmd_none(*pmd))
1071                 return;
1072         entry = mk_pmd(&zero_folio->page, vma->vm_page_prot);
1073         entry = pmd_mkhuge(entry);
1074         pgtable_trans_huge_deposit(mm, pmd, pgtable);
1075         set_pmd_at(mm, haddr, pmd, entry);
1076         mm_inc_nr_ptes(mm);
1077 }
1078
1079 vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
1080 {
1081         struct vm_area_struct *vma = vmf->vma;
1082         gfp_t gfp;
1083         struct folio *folio;
1084         unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
1085         vm_fault_t ret;
1086
1087         if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER))
1088                 return VM_FAULT_FALLBACK;
1089         ret = vmf_anon_prepare(vmf);
1090         if (ret)
1091                 return ret;
1092         khugepaged_enter_vma(vma, vma->vm_flags);
1093
1094         if (!(vmf->flags & FAULT_FLAG_WRITE) &&
1095                         !mm_forbids_zeropage(vma->vm_mm) &&
1096                         transparent_hugepage_use_zero_page()) {
1097                 pgtable_t pgtable;
1098                 struct folio *zero_folio;
1099                 vm_fault_t ret;
1100
1101                 pgtable = pte_alloc_one(vma->vm_mm);
1102                 if (unlikely(!pgtable))
1103                         return VM_FAULT_OOM;
1104                 zero_folio = mm_get_huge_zero_folio(vma->vm_mm);
1105                 if (unlikely(!zero_folio)) {
1106                         pte_free(vma->vm_mm, pgtable);
1107                         count_vm_event(THP_FAULT_FALLBACK);
1108                         return VM_FAULT_FALLBACK;
1109                 }
1110                 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
1111                 ret = 0;
1112                 if (pmd_none(*vmf->pmd)) {
1113                         ret = check_stable_address_space(vma->vm_mm);
1114                         if (ret) {
1115                                 spin_unlock(vmf->ptl);
1116                                 pte_free(vma->vm_mm, pgtable);
1117                         } else if (userfaultfd_missing(vma)) {
1118                                 spin_unlock(vmf->ptl);
1119                                 pte_free(vma->vm_mm, pgtable);
1120                                 ret = handle_userfault(vmf, VM_UFFD_MISSING);
1121                                 VM_BUG_ON(ret & VM_FAULT_FALLBACK);
1122                         } else {
1123                                 set_huge_zero_folio(pgtable, vma->vm_mm, vma,
1124                                                    haddr, vmf->pmd, zero_folio);
1125                                 update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
1126                                 spin_unlock(vmf->ptl);
1127                         }
1128                 } else {
1129                         spin_unlock(vmf->ptl);
1130                         pte_free(vma->vm_mm, pgtable);
1131                 }
1132                 return ret;
1133         }
1134         gfp = vma_thp_gfp_mask(vma);
1135         folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, vma, haddr, true);
1136         if (unlikely(!folio)) {
1137                 count_vm_event(THP_FAULT_FALLBACK);
1138                 count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK);
1139                 return VM_FAULT_FALLBACK;
1140         }
1141         return __do_huge_pmd_anonymous_page(vmf, &folio->page, gfp);
1142 }
1143
/*
 * Install a huge PMD mapping of @pfn with protection @prot at @addr, under
 * the PMD lock.
 *
 * If the PMD is already populated, nothing new is installed. For a write
 * fault on an entry that maps the same pfn, the existing entry is upgraded
 * (young, dirty, and writable if the VMA allows); a pfn mismatch is only
 * tolerated silently for the huge zero PMD and otherwise warns once.
 *
 * @pgtable, if non-NULL, is deposited together with a newly installed entry;
 * if it ends up unused it is freed here, so the caller gives up ownership
 * either way.
 */
static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
                pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write,
                pgtable_t pgtable)
{
        struct mm_struct *mm = vma->vm_mm;
        pmd_t entry;
        spinlock_t *ptl;

        ptl = pmd_lock(mm, pmd);
        if (!pmd_none(*pmd)) {
                if (write) {
                        if (pmd_pfn(*pmd) != pfn_t_to_pfn(pfn)) {
                                WARN_ON_ONCE(!is_huge_zero_pmd(*pmd));
                                goto out_unlock;
                        }
                        entry = pmd_mkyoung(*pmd);
                        entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
                        if (pmdp_set_access_flags(vma, addr, pmd, entry, 1))
                                update_mmu_cache_pmd(vma, addr, pmd);
                }

                goto out_unlock;
        }

        entry = pmd_mkhuge(pfn_t_pmd(pfn, prot));
        if (pfn_t_devmap(pfn))
                entry = pmd_mkdevmap(entry);
        if (write) {
                entry = pmd_mkyoung(pmd_mkdirty(entry));
                entry = maybe_pmd_mkwrite(entry, vma);
        }

        if (pgtable) {
                pgtable_trans_huge_deposit(mm, pmd, pgtable);
                mm_inc_nr_ptes(mm);
                /* Ownership moved to the deposit; don't free it below. */
                pgtable = NULL;
        }

        set_pmd_at(mm, addr, pmd, entry);
        update_mmu_cache_pmd(vma, addr, pmd);

out_unlock:
        spin_unlock(ptl);
        if (pgtable)
                pte_free(mm, pgtable);
}
1190
/**
 * vmf_insert_pfn_pmd - insert a pmd size pfn
 * @vmf: Structure describing the fault
 * @pfn: pfn to insert
 * @write: whether it's a write fault
 *
 * Insert a pmd size pfn. See vmf_insert_pfn() for additional info.
 *
 * The VMA must be VM_PFNMAP or VM_MIXEDMAP (but not both) unless @pfn is a
 * devmap pfn, and a VM_PFNMAP VMA must not be a COW mapping; violations
 * trigger BUG_ON().
 *
 * Return: VM_FAULT_SIGBUS if the PMD-aligned fault address lies outside the
 * VMA, VM_FAULT_OOM if a page table needed for deposit cannot be allocated,
 * VM_FAULT_NOPAGE otherwise.
 */
vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write)
{
        unsigned long addr = vmf->address & PMD_MASK;
        struct vm_area_struct *vma = vmf->vma;
        pgprot_t pgprot = vma->vm_page_prot;
        pgtable_t pgtable = NULL;

        /*
         * If we had pmd_special, we could avoid all these restrictions,
         * but we need to be consistent with PTEs and architectures that
         * can't support a 'special' bit.
         */
        BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
                        !pfn_t_devmap(pfn));
        BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
                                                (VM_PFNMAP|VM_MIXEDMAP));
        BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));

        if (addr < vma->vm_start || addr >= vma->vm_end)
                return VM_FAULT_SIGBUS;

        /* Pre-allocate the page table only on architectures that need it. */
        if (arch_needs_pgtable_deposit()) {
                pgtable = pte_alloc_one(vma->vm_mm);
                if (!pgtable)
                        return VM_FAULT_OOM;
        }

        /* pgprot is passed by pointer and the result is used for the entry. */
        track_pfn_insert(vma, &pgprot, pfn);

        /* insert_pfn_pmd() deposits or frees pgtable. */
        insert_pfn_pmd(vma, addr, vmf->pmd, pfn, pgprot, write, pgtable);
        return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd);
1234
1235 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
1236 static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma)
1237 {
1238         if (likely(vma->vm_flags & VM_WRITE))
1239                 pud = pud_mkwrite(pud);
1240         return pud;
1241 }
1242
1243 static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
1244                 pud_t *pud, pfn_t pfn, bool write)
1245 {
1246         struct mm_struct *mm = vma->vm_mm;
1247         pgprot_t prot = vma->vm_page_prot;
1248         pud_t entry;
1249         spinlock_t *ptl;
1250
1251         ptl = pud_lock(mm, pud);
1252         if (!pud_none(*pud)) {
1253                 if (write) {
1254                         if (pud_pfn(*pud) != pfn_t_to_pfn(pfn)) {
1255                                 WARN_ON_ONCE(!is_huge_zero_pud(*pud));
1256                                 goto out_unlock;
1257                         }
1258                         entry = pud_mkyoung(*pud);
1259                         entry = maybe_pud_mkwrite(pud_mkdirty(entry), vma);
1260                         if (pudp_set_access_flags(vma, addr, pud, entry, 1))
1261                                 update_mmu_cache_pud(vma, addr, pud);
1262                 }
1263                 goto out_unlock;
1264         }
1265
1266         entry = pud_mkhuge(pfn_t_pud(pfn, prot));
1267         if (pfn_t_devmap(pfn))
1268                 entry = pud_mkdevmap(entry);
1269         if (write) {
1270                 entry = pud_mkyoung(pud_mkdirty(entry));
1271                 entry = maybe_pud_mkwrite(entry, vma);
1272         }
1273         set_pud_at(mm, addr, pud, entry);
1274         update_mmu_cache_pud(vma, addr, pud);
1275
1276 out_unlock:
1277         spin_unlock(ptl);
1278 }
1279
1280 /**
1281  * vmf_insert_pfn_pud - insert a pud size pfn
1282  * @vmf: Structure describing the fault
1283  * @pfn: pfn to insert
1284  * @write: whether it's a write fault
1285  *
1286  * Insert a pud size pfn. See vmf_insert_pfn() for additional info.
1287  *
1288  * Return: vm_fault_t value.
1289  */
1290 vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write)
1291 {
1292         unsigned long addr = vmf->address & PUD_MASK;
1293         struct vm_area_struct *vma = vmf->vma;
1294         pgprot_t pgprot = vma->vm_page_prot;
1295
1296         /*
1297          * If we had pud_special, we could avoid all these restrictions,
1298          * but we need to be consistent with PTEs and architectures that
1299          * can't support a 'special' bit.
1300          */
1301         BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
1302                         !pfn_t_devmap(pfn));
1303         BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
1304                                                 (VM_PFNMAP|VM_MIXEDMAP));
1305         BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
1306
1307         if (addr < vma->vm_start || addr >= vma->vm_end)
1308                 return VM_FAULT_SIGBUS;
1309
1310         track_pfn_insert(vma, &pgprot, pfn);
1311
1312         insert_pfn_pud(vma, addr, vmf->pud, pfn, write);
1313         return VM_FAULT_NOPAGE;
1314 }
1315 EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud);
1316 #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
1317
1318 void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
1319                pmd_t *pmd, bool write)
1320 {
1321         pmd_t _pmd;
1322
1323         _pmd = pmd_mkyoung(*pmd);
1324         if (write)
1325                 _pmd = pmd_mkdirty(_pmd);
1326         if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
1327                                   pmd, _pmd, write))
1328                 update_mmu_cache_pmd(vma, addr, pmd);
1329 }
1330
1331 struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
1332                 pmd_t *pmd, int flags, struct dev_pagemap **pgmap)
1333 {
1334         unsigned long pfn = pmd_pfn(*pmd);
1335         struct mm_struct *mm = vma->vm_mm;
1336         struct page *page;
1337         int ret;
1338
1339         assert_spin_locked(pmd_lockptr(mm, pmd));
1340
1341         if (flags & FOLL_WRITE && !pmd_write(*pmd))
1342                 return NULL;
1343
1344         if (pmd_present(*pmd) && pmd_devmap(*pmd))
1345                 /* pass */;
1346         else
1347                 return NULL;
1348
1349         if (flags & FOLL_TOUCH)
1350                 touch_pmd(vma, addr, pmd, flags & FOLL_WRITE);
1351
1352         /*
1353          * device mapped pages can only be returned if the
1354          * caller will manage the page reference count.
1355          */
1356         if (!(flags & (FOLL_GET | FOLL_PIN)))
1357                 return ERR_PTR(-EEXIST);
1358
1359         pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT;
1360         *pgmap = get_dev_pagemap(pfn, *pgmap);
1361         if (!*pgmap)
1362                 return ERR_PTR(-EFAULT);
1363         page = pfn_to_page(pfn);
1364         ret = try_grab_folio(page_folio(page), 1, flags);
1365         if (ret)
1366                 page = ERR_PTR(ret);
1367
1368         return page;
1369 }
1370
/*
 * Copy the huge PMD mapping at @addr from @src_mm to @dst_mm, write
 * protecting it in both. NOTE(review): presumably used on the fork path --
 * confirm against callers.
 *
 * Both PMD locks are taken, destination first, then source (nested).
 * A page table is pre-allocated and deposited for the destination entry; on
 * every path where no entry is installed it is freed again.
 *
 * Return:
 *   0        - entry copied, or @dst_vma is not anonymous (nothing to do)
 *   -ENOMEM  - page table allocation failed
 *   -EAGAIN  - the source PMD is no longer a huge PMD, or duplicating the
 *              anon rmap failed (the source PMD is split before returning)
 */
int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                  pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
                  struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
{
        spinlock_t *dst_ptl, *src_ptl;
        struct page *src_page;
        struct folio *src_folio;
        pmd_t pmd;
        pgtable_t pgtable = NULL;
        int ret = -ENOMEM;

        /* Skip if can be re-fill on fault */
        if (!vma_is_anonymous(dst_vma))
                return 0;

        pgtable = pte_alloc_one(dst_mm);
        if (unlikely(!pgtable))
                goto out;

        dst_ptl = pmd_lock(dst_mm, dst_pmd);
        src_ptl = pmd_lockptr(src_mm, src_pmd);
        spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);

        ret = -EAGAIN;
        pmd = *src_pmd;

#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
        if (unlikely(is_swap_pmd(pmd))) {
                swp_entry_t entry = pmd_to_swp_entry(pmd);

                VM_BUG_ON(!is_pmd_migration_entry(pmd));
                /*
                 * Downgrade a non-readable migration entry to a readable
                 * one in the source, carrying over the soft-dirty and
                 * uffd-wp bits.
                 */
                if (!is_readable_migration_entry(entry)) {
                        entry = make_readable_migration_entry(
                                                        swp_offset(entry));
                        pmd = swp_entry_to_pmd(entry);
                        if (pmd_swp_soft_dirty(*src_pmd))
                                pmd = pmd_swp_mksoft_dirty(pmd);
                        if (pmd_swp_uffd_wp(*src_pmd))
                                pmd = pmd_swp_mkuffd_wp(pmd);
                        set_pmd_at(src_mm, addr, src_pmd, pmd);
                }
                add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
                mm_inc_nr_ptes(dst_mm);
                pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
                if (!userfaultfd_wp(dst_vma))
                        pmd = pmd_swp_clear_uffd_wp(pmd);
                set_pmd_at(dst_mm, addr, dst_pmd, pmd);
                ret = 0;
                goto out_unlock;
        }
#endif

        if (unlikely(!pmd_trans_huge(pmd))) {
                /* Not a huge PMD (any more): report -EAGAIN. */
                pte_free(dst_mm, pgtable);
                goto out_unlock;
        }
        /*
         * When page table lock is held, the huge zero pmd should not be
         * under splitting since we don't split the page itself, only pmd to
         * a page table.
         */
        if (is_huge_zero_pmd(pmd)) {
                /*
                 * mm_get_huge_zero_folio() will never allocate a new
                 * folio here, since we already have a zero page to
                 * copy. It just takes a reference.
                 */
                mm_get_huge_zero_folio(dst_mm);
                goto out_zero_page;
        }

        src_page = pmd_page(pmd);
        VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
        src_folio = page_folio(src_page);

        folio_get(src_folio);
        if (unlikely(folio_try_dup_anon_rmap_pmd(src_folio, src_page, src_vma))) {
                /* Page maybe pinned: split and retry the fault on PTEs. */
                folio_put(src_folio);
                pte_free(dst_mm, pgtable);
                spin_unlock(src_ptl);
                spin_unlock(dst_ptl);
                __split_huge_pmd(src_vma, src_pmd, addr, false, NULL);
                return -EAGAIN;
        }
        add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
out_zero_page:
        /* Shared tail: the zero page path skips the anon counter above. */
        mm_inc_nr_ptes(dst_mm);
        pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
        pmdp_set_wrprotect(src_mm, addr, src_pmd);
        if (!userfaultfd_wp(dst_vma))
                pmd = pmd_clear_uffd_wp(pmd);
        pmd = pmd_mkold(pmd_wrprotect(pmd));
        set_pmd_at(dst_mm, addr, dst_pmd, pmd);

        ret = 0;
out_unlock:
        spin_unlock(src_ptl);
        spin_unlock(dst_ptl);
out:
        return ret;
}
1473
1474 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
1475 void touch_pud(struct vm_area_struct *vma, unsigned long addr,
1476                pud_t *pud, bool write)
1477 {
1478         pud_t _pud;
1479
1480         _pud = pud_mkyoung(*pud);
1481         if (write)
1482                 _pud = pud_mkdirty(_pud);
1483         if (pudp_set_access_flags(vma, addr & HPAGE_PUD_MASK,
1484                                   pud, _pud, write))
1485                 update_mmu_cache_pud(vma, addr, pud);
1486 }
1487
/*
 * Copy the huge PUD mapping at @addr from @src_mm to @dst_mm. The source
 * entry is write protected and the destination receives a write-protected,
 * old copy.
 *
 * Both PUD locks are taken, destination first, then source (nested).
 *
 * Return: 0 on success, -EAGAIN if the source PUD is neither a transparent
 * huge PUD nor a devmap PUD.
 */
int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                  pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
                  struct vm_area_struct *vma)
{
        spinlock_t *dst_ptl, *src_ptl;
        pud_t pud;
        int ret;

        dst_ptl = pud_lock(dst_mm, dst_pud);
        src_ptl = pud_lockptr(src_mm, src_pud);
        spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);

        ret = -EAGAIN;
        pud = *src_pud;
        if (unlikely(!pud_trans_huge(pud) && !pud_devmap(pud)))
                goto out_unlock;

        /*
         * When page table lock is held, the huge zero pud should not be
         * under splitting since we don't split the page itself, only pud to
         * a page table.
         */
        if (is_huge_zero_pud(pud)) {
                /* No huge zero pud yet */
        }

        /*
         * TODO: once we support anonymous pages, use
         * folio_try_dup_anon_rmap_*() and split if duplicating fails.
         */
        pudp_set_wrprotect(src_mm, addr, src_pud);
        pud = pud_mkold(pud_wrprotect(pud));
        set_pud_at(dst_mm, addr, dst_pud, pud);

        ret = 0;
out_unlock:
        spin_unlock(src_ptl);
        spin_unlock(dst_ptl);
        return ret;
}
1528
1529 void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud)
1530 {
1531         bool write = vmf->flags & FAULT_FLAG_WRITE;
1532
1533         vmf->ptl = pud_lock(vmf->vma->vm_mm, vmf->pud);
1534         if (unlikely(!pud_same(*vmf->pud, orig_pud)))
1535                 goto unlock;
1536
1537         touch_pud(vmf->vma, vmf->address, vmf->pud, write);
1538 unlock:
1539         spin_unlock(vmf->ptl);
1540 }
1541 #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
1542
1543 void huge_pmd_set_accessed(struct vm_fault *vmf)
1544 {
1545         bool write = vmf->flags & FAULT_FLAG_WRITE;
1546
1547         vmf->ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
1548         if (unlikely(!pmd_same(*vmf->pmd, vmf->orig_pmd)))
1549                 goto unlock;
1550
1551         touch_pmd(vmf->vma, vmf->address, vmf->pmd, write);
1552
1553 unlock:
1554         spin_unlock(vmf->ptl);
1555 }
1556
/*
 * Handle a write or FAULT_FLAG_UNSHARE fault on a huge PMD in a VMA that has
 * an anon_vma.
 *
 * The mapped folio is reused in place when its head page is already marked
 * anon-exclusive, or when the reference count shows that nobody else holds
 * the folio (in which case it is marked exclusive here). On reuse, a write
 * fault makes the PMD young and dirty and lets maybe_pmd_mkwrite() decide
 * about the write bit; an unshare fault leaves the PMD untouched.
 *
 * In every other case, including the huge zero PMD, the PMD is split with
 * __split_huge_pmd().
 *
 * Returns 0 if the PMD was reused or no longer matches vmf->orig_pmd, and
 * VM_FAULT_FALLBACK after the PMD has been split.
 */
vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
{
        const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
        struct vm_area_struct *vma = vmf->vma;
        struct folio *folio;
        struct page *page;
        unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
        pmd_t orig_pmd = vmf->orig_pmd;

        /* Only look the lock up here; it is taken after the zero-PMD check. */
        vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
        VM_BUG_ON_VMA(!vma->anon_vma, vma);

        /* The huge zero PMD is never reused: go and split it, lock not held. */
        if (is_huge_zero_pmd(orig_pmd))
                goto fallback;

        spin_lock(vmf->ptl);

        /* The PMD changed since the fault was taken: nothing to do here. */
        if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
                spin_unlock(vmf->ptl);
                return 0;
        }

        page = pmd_page(orig_pmd);
        folio = page_folio(page);
        VM_BUG_ON_PAGE(!PageHead(page), page);

        /* Early check when only holding the PT lock. */
        if (PageAnonExclusive(page))
                goto reuse;

        if (!folio_trylock(folio)) {
                /*
                 * The PT lock is dropped before waiting for the folio lock;
                 * the extra folio reference is held across that window and
                 * the PMD is revalidated once the PT lock is held again.
                 */
                folio_get(folio);
                spin_unlock(vmf->ptl);
                folio_lock(folio);
                spin_lock(vmf->ptl);
                if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
                        spin_unlock(vmf->ptl);
                        folio_unlock(folio);
                        folio_put(folio);
                        return 0;
                }
                folio_put(folio);
        }

        /* Recheck after temporarily dropping the PT lock. */
        if (PageAnonExclusive(page)) {
                folio_unlock(folio);
                goto reuse;
        }

        /*
         * See do_wp_page(): we can only reuse the folio exclusively if
         * there are no additional references. Note that we always drain
         * the LRU cache immediately after adding a THP.
         */
        if (folio_ref_count(folio) >
                        1 + folio_test_swapcache(folio) * folio_nr_pages(folio))
                goto unlock_fallback;
        if (folio_test_swapcache(folio))
                folio_free_swap(folio);
        if (folio_ref_count(folio) == 1) {
                pmd_t entry;

                folio_move_anon_rmap(folio, vma);
                SetPageAnonExclusive(page);
                folio_unlock(folio);
reuse:
                /*
                 * Also entered by goto from the two PageAnonExclusive()
                 * checks above, with the PT lock held and the folio lock
                 * not held.
                 */
                if (unlikely(unshare)) {
                        /* Unshare request: leave the PMD itself untouched. */
                        spin_unlock(vmf->ptl);
                        return 0;
                }
                entry = pmd_mkyoung(orig_pmd);
                entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
                if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1))
                        update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
                spin_unlock(vmf->ptl);
                return 0;
        }

        /* The folio cannot be reused: drop both locks, then split the PMD. */
unlock_fallback:
        folio_unlock(folio);
        spin_unlock(vmf->ptl);
fallback:
        __split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL);
        return VM_FAULT_FALLBACK;
}
1643
1644 static inline bool can_change_pmd_writable(struct vm_area_struct *vma,
1645                                            unsigned long addr, pmd_t pmd)
1646 {
1647         struct page *page;
1648
1649         if (WARN_ON_ONCE(!(vma->vm_flags & VM_WRITE)))
1650                 return false;
1651
1652         /* Don't touch entries that are not even readable (NUMA hinting). */
1653         if (pmd_protnone(pmd))
1654                 return false;
1655
1656         /* Do we need write faults for softdirty tracking? */
1657         if (pmd_needs_soft_dirty_wp(vma, pmd))
1658                 return false;
1659
1660         /* Do we need write faults for uffd-wp tracking? */
1661         if (userfaultfd_huge_pmd_wp(vma, pmd))
1662                 return false;
1663
1664         if (!(vma->vm_flags & VM_SHARED)) {
1665                 /* See can_change_pte_writable(). */
1666                 page = vm_normal_page_pmd(vma, addr, pmd);
1667                 return page && PageAnon(page) && PageAnonExclusive(page);
1668         }
1669
1670         /* See can_change_pte_writable(). */
1671         return pmd_dirty(pmd);
1672 }
1673
/*
 * NUMA hinting page fault entry point for trans huge pmds.
 *
 * Under the PMD lock, ask numa_migrate_prep() whether the folio behind the
 * faulting PMD should move to another node and, if so, try to migrate it
 * there with the lock dropped. Unless the migration succeeds, the PMD is
 * rebuilt from vma->vm_page_prot, marked young and, when permitted, writable.
 *
 * The fault is reported through task_numa_fault() whenever a folio node id
 * was determined.
 *
 * Always returns 0, including when the PMD no longer matches vmf->orig_pmd.
 */
vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        pmd_t oldpmd = vmf->orig_pmd;
        pmd_t pmd;
        struct folio *folio;
        unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
        int nid = NUMA_NO_NODE;
        int target_nid, last_cpupid = (-1 & LAST_CPUPID_MASK);
        bool writable = false;
        int flags = 0;

        vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
        /* The PMD changed since the fault was taken: nothing to do here. */
        if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) {
                spin_unlock(vmf->ptl);
                return 0;
        }

        pmd = pmd_modify(oldpmd, vma->vm_page_prot);

        /*
         * Detect now whether the PMD could be writable; this information
         * is only valid while holding the PT lock.
         */
        writable = pmd_write(pmd);
        if (!writable && vma_wants_manual_pte_write_upgrade(vma) &&
            can_change_pmd_writable(vma, vmf->address, pmd))
                writable = true;

        /* No normal folio behind the PMD: just restore the mapping. */
        folio = vm_normal_folio_pmd(vma, haddr, pmd);
        if (!folio)
                goto out_map;

        /* See similar comment in do_numa_page for explanation */
        if (!writable)
                flags |= TNF_NO_GROUP;

        nid = folio_nid(folio);
        /*
         * For memory tiering mode, cpupid of slow memory page is used
         * to record page access time.  So use default value.
         */
        if (node_is_toptier(nid))
                last_cpupid = folio_last_cpupid(folio);
        target_nid = numa_migrate_prep(folio, vmf, haddr, nid, &flags);
        if (target_nid == NUMA_NO_NODE)
                goto out_map;
        if (migrate_misplaced_folio_prepare(folio, vma, target_nid)) {
                flags |= TNF_MIGRATE_FAIL;
                goto out_map;
        }
        /* The folio is isolated and isolation code holds a folio reference. */
        spin_unlock(vmf->ptl);
        /* The answer computed above was only valid under the PT lock. */
        writable = false;

        if (!migrate_misplaced_folio(folio, vma, target_nid)) {
                /* Migrated: account the fault against the new node. */
                flags |= TNF_MIGRATED;
                nid = target_nid;
                task_numa_fault(last_cpupid, nid, HPAGE_PMD_NR, flags);
                return 0;
        }

        /* Migration failed: retake the lock and revalidate before restoring. */
        flags |= TNF_MIGRATE_FAIL;
        vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
        if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) {
                spin_unlock(vmf->ptl);
                return 0;
        }
out_map:
        /* Restore the PMD */
        pmd = pmd_modify(oldpmd, vma->vm_page_prot);
        pmd = pmd_mkyoung(pmd);
        if (writable)
                pmd = pmd_mkwrite(pmd, vma);
        set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd);
        update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
        spin_unlock(vmf->ptl);

        /* nid is still NUMA_NO_NODE if no folio was found above. */
        if (nid != NUMA_NO_NODE)
                task_numa_fault(last_cpupid, nid, HPAGE_PMD_NR, flags);
        return 0;
}
1757
1758 /*
1759  * Return true if we do MADV_FREE successfully on entire pmd page.
1760  * Otherwise, return false.
1761  */
1762 bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
1763                 pmd_t *pmd, unsigned long addr, unsigned long next)
1764 {
1765         spinlock_t *ptl;
1766         pmd_t orig_pmd;
1767         struct folio *folio;
1768         struct mm_struct *mm = tlb->mm;
1769         bool ret = false;
1770
1771         tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
1772
1773         ptl = pmd_trans_huge_lock(pmd, vma);
1774         if (!ptl)
1775                 goto out_unlocked;
1776
1777         orig_pmd = *pmd;
1778         if (is_huge_zero_pmd(orig_pmd))
1779                 goto out;
1780
1781         if (unlikely(!pmd_present(orig_pmd))) {
1782                 VM_BUG_ON(thp_migration_supported() &&
1783                                   !is_pmd_migration_entry(orig_pmd));
1784                 goto out;
1785         }
1786
1787         folio = pmd_folio(orig_pmd);
1788         /*
1789          * If other processes are mapping this folio, we couldn't discard
1790          * the folio unless they all do MADV_FREE so let's skip the folio.
1791          */
1792         if (folio_likely_mapped_shared(folio))
1793                 goto out;
1794
1795         if (!folio_trylock(folio))
1796                 goto out;
1797
1798         /*
1799          * If user want to discard part-pages of THP, split it so MADV_FREE
1800          * will deactivate only them.
1801          */
1802         if (next - addr != HPAGE_PMD_SIZE) {
1803                 folio_get(folio);
1804                 spin_unlock(ptl);
1805                 split_folio(folio);
1806                 folio_unlock(folio);
1807                 folio_put(folio);
1808                 goto out_unlocked;
1809         }
1810
1811         if (folio_test_dirty(folio))
1812                 folio_clear_dirty(folio);
1813         folio_unlock(folio);
1814
1815         if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) {
1816                 pmdp_invalidate(vma, addr, pmd);
1817                 orig_pmd = pmd_mkold(orig_pmd);
1818                 orig_pmd = pmd_mkclean(orig_pmd);
1819
1820                 set_pmd_at(mm, addr, pmd, orig_pmd);
1821                 tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
1822         }
1823
1824         folio_mark_lazyfree(folio);
1825         ret = true;
1826 out:
1827         spin_unlock(ptl);
1828 out_unlocked:
1829         return ret;
1830 }
1831
1832 static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
1833 {
1834         pgtable_t pgtable;
1835
1836         pgtable = pgtable_trans_huge_withdraw(mm, pmd);
1837         pte_free(mm, pgtable);
1838         mm_dec_nr_ptes(mm);
1839 }
1840
/*
 * Tear down the huge PMD mapping at @addr on behalf of the mmu_gather @tlb.
 *
 * The entry is cleared under the PMD lock, any deposited page table is
 * released and the mm counters are adjusted. For a present, non-special,
 * non-zero-page mapping the folio is queued on @tlb after the lock has been
 * dropped.
 *
 * Returns 0 if __pmd_trans_huge_lock() did not lock @pmd, 1 otherwise.
 */
int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
                 pmd_t *pmd, unsigned long addr)
{
        pmd_t orig_pmd;
        spinlock_t *ptl;

        tlb_change_page_size(tlb, HPAGE_PMD_SIZE);

        ptl = __pmd_trans_huge_lock(pmd, vma);
        if (!ptl)
                return 0;
        /*
         * For architectures like ppc64 we look at deposited pgtable
         * when calling pmdp_huge_get_and_clear. So do the
         * pgtable_trans_huge_withdraw after finishing pmdp related
         * operations.
         */
        orig_pmd = pmdp_huge_get_and_clear_full(vma, addr, pmd,
                                                tlb->fullmm);
        arch_check_zapped_pmd(vma, orig_pmd);
        tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
        if (vma_is_special_huge(vma)) {
                /* Special mapping: at most a deposited table to release. */
                if (arch_needs_pgtable_deposit())
                        zap_deposited_table(tlb->mm, pmd);
                spin_unlock(ptl);
        } else if (is_huge_zero_pmd(orig_pmd)) {
                /* Huge zero page: release the deposited table, nothing else. */
                zap_deposited_table(tlb->mm, pmd);
                spin_unlock(ptl);
        } else {
                struct folio *folio = NULL;
                int flush_needed = 1;

                if (pmd_present(orig_pmd)) {
                        struct page *page = pmd_page(orig_pmd);

                        folio = page_folio(page);
                        folio_remove_rmap_pmd(folio, page, vma);
                        WARN_ON_ONCE(folio_mapcount(folio) < 0);
                        VM_BUG_ON_PAGE(!PageHead(page), page);
                } else if (thp_migration_supported()) {
                        swp_entry_t entry;

                        /* Non-present entry: only used for the counters below. */
                        VM_BUG_ON(!is_pmd_migration_entry(orig_pmd));
                        entry = pmd_to_swp_entry(orig_pmd);
                        folio = pfn_swap_entry_folio(entry);
                        flush_needed = 0;
                } else
                        WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");

                /*
                 * NOTE(review): on the WARN_ONCE() path above folio is still
                 * NULL when it is passed to folio_test_anon() - confirm that
                 * path really is unreachable.
                 */
                if (folio_test_anon(folio)) {
                        zap_deposited_table(tlb->mm, pmd);
                        add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
                } else {
                        if (arch_needs_pgtable_deposit())
                                zap_deposited_table(tlb->mm, pmd);
                        add_mm_counter(tlb->mm, mm_counter_file(folio),
                                       -HPAGE_PMD_NR);
                }

                spin_unlock(ptl);
                /* Only a present mapping hands its folio to the mmu_gather. */
                if (flush_needed)
                        tlb_remove_page_size(tlb, &folio->page, HPAGE_PMD_SIZE);
        }
        return 1;
}
1906
1907 #ifndef pmd_move_must_withdraw
1908 static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl,
1909                                          spinlock_t *old_pmd_ptl,
1910                                          struct vm_area_struct *vma)
1911 {
1912         /*
1913          * With split pmd lock we also need to move preallocated
1914          * PTE page table if new_pmd is on different PMD page table.
1915          *
1916          * We also don't deposit and withdraw tables for file pages.
1917          */
1918         return (new_pmd_ptl != old_pmd_ptl) && vma_is_anonymous(vma);
1919 }
1920 #endif
1921
1922 static pmd_t move_soft_dirty_pmd(pmd_t pmd)
1923 {
1924 #ifdef CONFIG_MEM_SOFT_DIRTY
1925         if (unlikely(is_pmd_migration_entry(pmd)))
1926                 pmd = pmd_swp_mksoft_dirty(pmd);
1927         else if (pmd_present(pmd))
1928                 pmd = pmd_mksoft_dirty(pmd);
1929 #endif
1930         return pmd;
1931 }
1932
1933 bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
1934                   unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd)
1935 {
1936         spinlock_t *old_ptl, *new_ptl;
1937         pmd_t pmd;
1938         struct mm_struct *mm = vma->vm_mm;
1939         bool force_flush = false;
1940
1941         /*
1942          * The destination pmd shouldn't be established, free_pgtables()
1943          * should have released it; but move_page_tables() might have already
1944          * inserted a page table, if racing against shmem/file collapse.
1945          */
1946         if (!pmd_none(*new_pmd)) {
1947                 VM_BUG_ON(pmd_trans_huge(*new_pmd));
1948                 return false;
1949         }
1950
1951         /*
1952          * We don't have to worry about the ordering of src and dst
1953          * ptlocks because exclusive mmap_lock prevents deadlock.
1954          */
1955         old_ptl = __pmd_trans_huge_lock(old_pmd, vma);
1956         if (old_ptl) {
1957                 new_ptl = pmd_lockptr(mm, new_pmd);
1958                 if (new_ptl != old_ptl)
1959                         spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
1960                 pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd);
1961                 if (pmd_present(pmd))
1962                         force_flush = true;
1963                 VM_BUG_ON(!pmd_none(*new_pmd));
1964
1965                 if (pmd_move_must_withdraw(new_ptl, old_ptl, vma)) {
1966                         pgtable_t pgtable;
1967                         pgtable = pgtable_trans_huge_withdraw(mm, old_pmd);
1968                         pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
1969                 }
1970                 pmd = move_soft_dirty_pmd(pmd);
1971                 set_pmd_at(mm, new_addr, new_pmd, pmd);
1972                 if (force_flush)
1973                         flush_pmd_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
1974                 if (new_ptl != old_ptl)
1975                         spin_unlock(new_ptl);
1976                 spin_unlock(old_ptl);
1977                 return true;
1978         }
1979         return false;
1980 }
1981
/*
 * Change the protection of the huge pmd at @addr to @newprot, honouring the
 * MM_CP_* bits in @cp_flags (NUMA hinting, uffd-wp set/resolve, and the
 * attempt to make the entry writable directly).
 *
 * Returns
 *  - 0 if PMD could not be locked
 *  - 1 if PMD was locked but protections unchanged and TLB flush unnecessary
 *      or if prot_numa but THP migration is not supported
 *  - HPAGE_PMD_NR if protections changed and TLB flush necessary
 */
int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
                    pmd_t *pmd, unsigned long addr, pgprot_t newprot,
                    unsigned long cp_flags)
{
        struct mm_struct *mm = vma->vm_mm;
        spinlock_t *ptl;
        pmd_t oldpmd, entry;
        bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
        bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
        bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
        int ret = 1;

        tlb_change_page_size(tlb, HPAGE_PMD_SIZE);

        if (prot_numa && !thp_migration_supported())
                return 1;

        ptl = __pmd_trans_huge_lock(pmd, vma);
        if (!ptl)
                return 0;

#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
        /*
         * Non-present entry, expected to be a migration entry: adjust the
         * swap-style entry in place and leave with ret still at 1.
         */
        if (is_swap_pmd(*pmd)) {
                swp_entry_t entry = pmd_to_swp_entry(*pmd);
                struct folio *folio = pfn_swap_entry_folio(entry);
                pmd_t newpmd;

                VM_BUG_ON(!is_pmd_migration_entry(*pmd));
                if (is_writable_migration_entry(entry)) {
                        /*
                         * A protection check is difficult so
                         * just be safe and disable write
                         */
                        if (folio_test_anon(folio))
                                entry = make_readable_exclusive_migration_entry(swp_offset(entry));
                        else
                                entry = make_readable_migration_entry(swp_offset(entry));
                        newpmd = swp_entry_to_pmd(entry);
                        /* Carry the soft-dirty bit over to the rebuilt entry. */
                        if (pmd_swp_soft_dirty(*pmd))
                                newpmd = pmd_swp_mksoft_dirty(newpmd);
                } else {
                        newpmd = *pmd;
                }

                if (uffd_wp)
                        newpmd = pmd_swp_mkuffd_wp(newpmd);
                else if (uffd_wp_resolve)
                        newpmd = pmd_swp_clear_uffd_wp(newpmd);
                /* Write the entry back only if something actually changed. */
                if (!pmd_same(*pmd, newpmd))
                        set_pmd_at(mm, addr, pmd, newpmd);
                goto unlock;
        }
#endif

        if (prot_numa) {
                struct folio *folio;
                bool toptier;
                /*
                 * Avoid trapping faults against the zero page. The read-only
                 * data is likely to be read-cached on the local CPU and
                 * local/remote hits to the zero page are not interesting.
                 */
                if (is_huge_zero_pmd(*pmd))
                        goto unlock;

                /* Already a NUMA hinting entry: nothing left to change. */
                if (pmd_protnone(*pmd))
                        goto unlock;

                folio = pmd_folio(*pmd);
                toptier = node_is_toptier(folio_nid(folio));
                /*
                 * Skip scanning top tier node if normal numa
                 * balancing is disabled
                 */
                if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) &&
                    toptier)
                        goto unlock;

                /* Memory tiering: stamp folios outside the top tier with the current time. */
                if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING &&
                    !toptier)
                        folio_xchg_access_time(folio,
                                               jiffies_to_msecs(jiffies));
        }
        /*
         * In case prot_numa, we are under mmap_read_lock(mm). It's critical
         * to not clear pmd intermittently to avoid race with MADV_DONTNEED
         * which is also under mmap_read_lock(mm):
         *
         *      CPU0:                           CPU1:
         *                              change_huge_pmd(prot_numa=1)
         *                               pmdp_huge_get_and_clear_notify()
         * madvise_dontneed()
         *  zap_pmd_range()
         *   pmd_trans_huge(*pmd) == 0 (without ptl)
         *   // skip the pmd
         *                               set_pmd_at();
         *                               // pmd is re-established
         *
         * The race makes MADV_DONTNEED miss the huge pmd and don't clear it
         * which may break userspace.
         *
         * pmdp_invalidate_ad() is required to make sure we don't miss
         * dirty/young flags set by hardware.
         */
        oldpmd = pmdp_invalidate_ad(vma, addr, pmd);

        entry = pmd_modify(oldpmd, newprot);
        if (uffd_wp)
                entry = pmd_mkuffd_wp(entry);
        else if (uffd_wp_resolve)
                /*
                 * Leave the write bit to be handled by PF interrupt
                 * handler, then things like COW could be properly
                 * handled.
                 */
                entry = pmd_clear_uffd_wp(entry);

        /* See change_pte_range(). */
        if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) && !pmd_write(entry) &&
            can_change_pmd_writable(vma, addr, entry))
                entry = pmd_mkwrite(entry, vma);

        /*
         * Reported whenever the entry is rewritten, even if
         * huge_pmd_needs_flush() below decides that no flush is required.
         */
        ret = HPAGE_PMD_NR;
        set_pmd_at(mm, addr, pmd, entry);

        if (huge_pmd_needs_flush(oldpmd, entry))
                tlb_flush_pmd_range(tlb, addr, HPAGE_PMD_SIZE);
unlock:
        spin_unlock(ptl);
        return ret;
}
2120
2121 #ifdef CONFIG_USERFAULTFD
/*
 * Move the huge page mapped by @src_pmd at @src_addr to @dst_pmd at
 * @dst_addr, if possible.
 *
 * The caller holds the PT lock for @src_pmd and has dst_vma/src_vma locked
 * (for reading); this function always releases that PT lock before it
 * returns.
 *
 * @dst_pmdval is the value the caller observed in @dst_pmd; it must be none
 * and is rechecked here once both PT locks are held.
 *
 * Return zero if the page was moved, -EAGAIN if the caller needs to repeat
 * the operation, or another error in case of failure.
 */
int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pmd_t dst_pmdval,
                        struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
                        unsigned long dst_addr, unsigned long src_addr)
{
        pmd_t _dst_pmd, src_pmdval;
        struct page *src_page;
        struct folio *src_folio;
        struct anon_vma *src_anon_vma;
        spinlock_t *src_ptl, *dst_ptl;
        pgtable_t src_pgtable;
        struct mmu_notifier_range range;
        int err = 0;

        src_pmdval = *src_pmd;
        src_ptl = pmd_lockptr(mm, src_pmd);

        lockdep_assert_held(src_ptl);
        vma_assert_locked(src_vma);
        vma_assert_locked(dst_vma);

        /* Sanity checks before the operation */
        if (WARN_ON_ONCE(!pmd_none(dst_pmdval)) || WARN_ON_ONCE(src_addr & ~HPAGE_PMD_MASK) ||
            WARN_ON_ONCE(dst_addr & ~HPAGE_PMD_MASK)) {
                spin_unlock(src_ptl);
                return -EINVAL;
        }

        /* Not a huge mapping: wait out a migration entry, otherwise give up. */
        if (!pmd_trans_huge(src_pmdval)) {
                spin_unlock(src_ptl);
                if (is_pmd_migration_entry(src_pmdval)) {
                        pmd_migration_entry_wait(mm, &src_pmdval);
                        return -EAGAIN;
                }
                return -ENOENT;
        }

        src_page = pmd_page(src_pmdval);

        /*
         * The huge zero page is handled without a folio (src_folio == NULL).
         * Any other page must be anon-exclusive and gets a reference that is
         * held until the end of the function.
         */
        if (!is_huge_zero_pmd(src_pmdval)) {
                if (unlikely(!PageAnonExclusive(src_page))) {
                        spin_unlock(src_ptl);
                        return -EBUSY;
                }

                src_folio = page_folio(src_page);
                folio_get(src_folio);
        } else
                src_folio = NULL;

        /* Caller's PT lock is dropped here; both PT locks are taken below. */
        spin_unlock(src_ptl);

        flush_cache_range(src_vma, src_addr, src_addr + HPAGE_PMD_SIZE);
        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, src_addr,
                                src_addr + HPAGE_PMD_SIZE);
        mmu_notifier_invalidate_range_start(&range);

        if (src_folio) {
                folio_lock(src_folio);

                /*
                 * split_huge_page walks the anon_vma chain without the page
                 * lock. Serialize against it with the anon_vma lock, the page
                 * lock is not enough.
                 */
                src_anon_vma = folio_get_anon_vma(src_folio);
                if (!src_anon_vma) {
                        err = -EAGAIN;
                        goto unlock_folio;
                }
                anon_vma_lock_write(src_anon_vma);
        } else
                src_anon_vma = NULL;

        dst_ptl = pmd_lockptr(mm, dst_pmd);
        double_pt_lock(src_ptl, dst_ptl);
        /* Either entry changed while no PT lock was held: let the caller retry. */
        if (unlikely(!pmd_same(*src_pmd, src_pmdval) ||
                     !pmd_same(*dst_pmd, dst_pmdval))) {
                err = -EAGAIN;
                goto unlock_ptls;
        }
        if (src_folio) {
                if (folio_maybe_dma_pinned(src_folio) ||
                    !PageAnonExclusive(&src_folio->page)) {
                        err = -EBUSY;
                        goto unlock_ptls;
                }

                if (WARN_ON_ONCE(!folio_test_head(src_folio)) ||
                    WARN_ON_ONCE(!folio_test_anon(src_folio))) {
                        err = -EBUSY;
                        goto unlock_ptls;
                }

                src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd);
                /* Folio got pinned from under us. Put it back and fail the move. */
                if (folio_maybe_dma_pinned(src_folio)) {
                        set_pmd_at(mm, src_addr, src_pmd, src_pmdval);
                        err = -EBUSY;
                        goto unlock_ptls;
                }

                /* Re-home the folio: rmap and index now refer to dst_vma/dst_addr. */
                folio_move_anon_rmap(src_folio, dst_vma);
                src_folio->index = linear_page_index(dst_vma, dst_addr);

                _dst_pmd = mk_huge_pmd(&src_folio->page, dst_vma->vm_page_prot);
                /* Follow mremap() behavior and treat the entry dirty after the move */
                _dst_pmd = pmd_mkwrite(pmd_mkdirty(_dst_pmd), dst_vma);
        } else {
                /* Huge zero page: map the same page at the destination. */
                src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd);
                _dst_pmd = mk_huge_pmd(src_page, dst_vma->vm_page_prot);
        }
        set_pmd_at(mm, dst_addr, dst_pmd, _dst_pmd);

        /* The deposited page table follows the mapping to the destination. */
        src_pgtable = pgtable_trans_huge_withdraw(mm, src_pmd);
        pgtable_trans_huge_deposit(mm, dst_pmd, src_pgtable);
unlock_ptls:
        double_pt_unlock(src_ptl, dst_ptl);
        if (src_anon_vma) {
                anon_vma_unlock_write(src_anon_vma);
                put_anon_vma(src_anon_vma);
        }
unlock_folio:
        /* unblock rmap walks */
        if (src_folio)
                folio_unlock(src_folio);
        mmu_notifier_invalidate_range_end(&range);
        /* Drop the reference taken before the caller's PT lock was released. */
        if (src_folio)
                folio_put(src_folio);
        return err;
}
2259 #endif /* CONFIG_USERFAULTFD */
2260
2261 /*
2262  * Returns page table lock pointer if a given pmd maps a thp, NULL otherwise.
2263  *
2264  * Note that if it returns page table lock pointer, this routine returns without
2265  * unlocking page table lock. So callers must unlock it.
2266  */
2267 spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
2268 {
2269         spinlock_t *ptl;
2270         ptl = pmd_lock(vma->vm_mm, pmd);
2271         if (likely(is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) ||
2272                         pmd_devmap(*pmd)))
2273                 return ptl;
2274         spin_unlock(ptl);
2275         return NULL;
2276 }
2277
2278 /*
2279  * Returns page table lock pointer if a given pud maps a thp, NULL otherwise.
2280  *
2281  * Note that if it returns page table lock pointer, this routine returns without
2282  * unlocking page table lock. So callers must unlock it.
2283  */
2284 spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma)
2285 {
2286         spinlock_t *ptl;
2287
2288         ptl = pud_lock(vma->vm_mm, pud);
2289         if (likely(pud_trans_huge(*pud) || pud_devmap(*pud)))
2290                 return ptl;
2291         spin_unlock(ptl);
2292         return NULL;
2293 }
2294
2295 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
2296 int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
2297                  pud_t *pud, unsigned long addr)
2298 {
2299         spinlock_t *ptl;
2300
2301         ptl = __pud_trans_huge_lock(pud, vma);
2302         if (!ptl)
2303                 return 0;
2304
2305         pudp_huge_get_and_clear_full(vma, addr, pud, tlb->fullmm);
2306         tlb_remove_pud_tlb_entry(tlb, pud, addr);
2307         if (vma_is_special_huge(vma)) {
2308                 spin_unlock(ptl);
2309                 /* No zero page support yet */
2310         } else {
2311                 /* No support for anonymous PUD pages yet */
2312                 BUG();
2313         }
2314         return 1;
2315 }
2316
2317 static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud,
2318                 unsigned long haddr)
2319 {
2320         VM_BUG_ON(haddr & ~HPAGE_PUD_MASK);
2321         VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
2322         VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PUD_SIZE, vma);
2323         VM_BUG_ON(!pud_trans_huge(*pud) && !pud_devmap(*pud));
2324
2325         count_vm_event(THP_SPLIT_PUD);
2326
2327         pudp_huge_clear_flush(vma, haddr, pud);
2328 }
2329
2330 void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
2331                 unsigned long address)
2332 {
2333         spinlock_t *ptl;
2334         struct mmu_notifier_range range;
2335
2336         mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
2337                                 address & HPAGE_PUD_MASK,
2338                                 (address & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE);
2339         mmu_notifier_invalidate_range_start(&range);
2340         ptl = pud_lock(vma->vm_mm, pud);
2341         if (unlikely(!pud_trans_huge(*pud) && !pud_devmap(*pud)))
2342                 goto out;
2343         __split_huge_pud_locked(vma, pud, range.start);
2344
2345 out:
2346         spin_unlock(ptl);
2347         mmu_notifier_invalidate_range_end(&range);
2348 }
2349 #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
2350
2351 static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
2352                 unsigned long haddr, pmd_t *pmd)
2353 {
2354         struct mm_struct *mm = vma->vm_mm;
2355         pgtable_t pgtable;
2356         pmd_t _pmd, old_pmd;
2357         unsigned long addr;
2358         pte_t *pte;
2359         int i;
2360
2361         /*
2362          * Leave pmd empty until pte is filled note that it is fine to delay
2363          * notification until mmu_notifier_invalidate_range_end() as we are
2364          * replacing a zero pmd write protected page with a zero pte write
2365          * protected page.
2366          *
2367          * See Documentation/mm/mmu_notifier.rst
2368          */
2369         old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd);
2370
2371         pgtable = pgtable_trans_huge_withdraw(mm, pmd);
2372         pmd_populate(mm, &_pmd, pgtable);
2373
2374         pte = pte_offset_map(&_pmd, haddr);
2375         VM_BUG_ON(!pte);
2376         for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
2377                 pte_t entry;
2378
2379                 entry = pfn_pte(my_zero_pfn(addr), vma->vm_page_prot);
2380                 entry = pte_mkspecial(entry);
2381                 if (pmd_uffd_wp(old_pmd))
2382                         entry = pte_mkuffd_wp(entry);
2383                 VM_BUG_ON(!pte_none(ptep_get(pte)));
2384                 set_pte_at(mm, addr, pte, entry);
2385                 pte++;
2386         }
2387         pte_unmap(pte - 1);
2388         smp_wmb(); /* make pte visible before pmd */
2389         pmd_populate(mm, pmd, pgtable);
2390 }
2391
/*
 * Split the huge PMD entry at @pmd, which covers the HPAGE_PMD_SIZE-aligned
 * address @haddr inside @vma.
 *
 * Three cases are handled:
 *  - Non-anonymous VMA: the PMD is cleared and flushed and, unless it is a
 *    special huge mapping, rmap/reference/counter are dropped. No PTEs are
 *    installed.
 *  - Huge zero PMD: delegated to __split_huge_zero_page_pmd().
 *  - Anonymous THP or PMD migration entry: the deposited page table is
 *    withdrawn and filled with HPAGE_PMD_NR entries carrying over the
 *    write/young/dirty/soft-dirty/uffd-wp/anon-exclusive state of the PMD,
 *    then the PMD is pointed at that table.
 *
 * @freeze asks for migration entries instead of present PTEs. It is
 * silently dropped when folio_try_share_anon_rmap_pmd() reports failure.
 */
2392 static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
2393                 unsigned long haddr, bool freeze)
2394 {
2395         struct mm_struct *mm = vma->vm_mm;
             /*
              * On the anonymous path, folio is assigned only when the PMD is
              * present and is read again only under !pmd_migration.
              */
2396         struct folio *folio;
2397         struct page *page;
2398         pgtable_t pgtable;
2399         pmd_t old_pmd, _pmd;
2400         bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false;
2401         bool anon_exclusive = false, dirty = false;
2402         unsigned long addr;
2403         pte_t *pte;
2404         int i;
2405
2406         VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
2407         VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
2408         VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
2409         VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd)
2410                                 && !pmd_devmap(*pmd));
2411
2412         count_vm_event(THP_SPLIT_PMD);
2413
2414         if (!vma_is_anonymous(vma)) {
2415                 old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd);
2416                 /*
2417                  * We are going to unmap this huge page. So
2418                  * just go ahead and zap it
2419                  */
2420                 if (arch_needs_pgtable_deposit())
2421                         zap_deposited_table(mm, pmd);
                     /* Special huge mappings: no rmap or counter update here. */
2422                 if (vma_is_special_huge(vma))
2423                         return;
2424                 if (unlikely(is_pmd_migration_entry(old_pmd))) {
2425                         swp_entry_t entry;
2426
                             /*
                              * Migration entry: only look up the folio for the
                              * counter update below.
                              */
2427                         entry = pmd_to_swp_entry(old_pmd);
2428                         folio = pfn_swap_entry_folio(entry);
2429                 } else {
2430                         page = pmd_page(old_pmd);
2431                         folio = page_folio(page);
                             /* Carry dirty/accessed state from the PMD to the folio. */
2432                         if (!folio_test_dirty(folio) && pmd_dirty(old_pmd))
2433                                 folio_mark_dirty(folio);
2434                         if (!folio_test_referenced(folio) && pmd_young(old_pmd))
2435                                 folio_set_referenced(folio);
2436                         folio_remove_rmap_pmd(folio, page, vma);
2437                         folio_put(folio);
2438                 }
2439                 add_mm_counter(mm, mm_counter_file(folio), -HPAGE_PMD_NR);
2440                 return;
2441         }
2442
2443         if (is_huge_zero_pmd(*pmd)) {
2444                 /*
2445                  * FIXME: Do we want to invalidate secondary mmu by calling
2446                  * mmu_notifier_arch_invalidate_secondary_tlbs() see comments below
2447                  * inside __split_huge_pmd() ?
2448                  *
2449                  * We are going from a zero huge page write protected to zero
2450                  * small page also write protected so it does not seem useful
2451                  * to invalidate secondary mmu at this time.
2452                  */
2453                 return __split_huge_zero_page_pmd(vma, haddr, pmd);
2454         }
2455
2456         pmd_migration = is_pmd_migration_entry(*pmd);
2457         if (unlikely(pmd_migration)) {
2458                 swp_entry_t entry;
2459
                     /*
                      * Not-present migration entry: recover the per-PMD state
                      * from the swap entry rather than from PMD bits. The PMD
                      * is not invalidated on this path.
                      */
2460                 old_pmd = *pmd;
2461                 entry = pmd_to_swp_entry(old_pmd);
2462                 page = pfn_swap_entry_to_page(entry);
2463                 write = is_writable_migration_entry(entry);
2464                 if (PageAnon(page))
2465                         anon_exclusive = is_readable_exclusive_migration_entry(entry);
2466                 young = is_migration_entry_young(entry);
2467                 dirty = is_migration_entry_dirty(entry);
2468                 soft_dirty = pmd_swp_soft_dirty(old_pmd);
2469                 uffd_wp = pmd_swp_uffd_wp(old_pmd);
2470         } else {
2471                 /*
2472                  * Up to this point the pmd is present and huge and userland has
2473                  * the whole access to the hugepage during the split (which
2474                  * happens in place). If we overwrite the pmd with the not-huge
2475                  * version pointing to the pte here (which of course we could if
2476                  * all CPUs were bug free), userland could trigger a small page
2477                  * size TLB miss on the small sized TLB while the hugepage TLB
2478                  * entry is still established in the huge TLB. Some CPUs don't
2479                  * like that. See
2480                  * http://support.amd.com/TechDocs/41322_10h_Rev_Gd.pdf, Erratum
2481                  * 383 on page 105. Intel should be safe but it also warns that
2482                  * it's only safe if the permission and cache attributes of the
2483                  * two entries loaded in the two TLB is identical (which should
2484                  * be the case here). But it is generally safer to never allow
2485                  * small and huge TLB entries for the same virtual address to be
2486                  * loaded simultaneously. So instead of doing "pmd_populate();
2487                  * flush_pmd_tlb_range();" we first mark the current pmd
2488                  * notpresent (atomically because here the pmd_trans_huge must
2489                  * remain set at all times on the pmd until the split is
2490                  * complete for this pmd), then we flush the SMP TLB and finally
2491                  * we write the non-huge version of the pmd entry with
2492                  * pmd_populate.
2493                  */
2494                 old_pmd = pmdp_invalidate(vma, haddr, pmd);
2495                 page = pmd_page(old_pmd);
2496                 folio = page_folio(page);
2497                 if (pmd_dirty(old_pmd)) {
2498                         dirty = true;
2499                         folio_set_dirty(folio);
2500                 }
2501                 write = pmd_write(old_pmd);
2502                 young = pmd_young(old_pmd);
2503                 soft_dirty = pmd_soft_dirty(old_pmd);
2504                 uffd_wp = pmd_uffd_wp(old_pmd);
2505
2506                 VM_WARN_ON_FOLIO(!folio_ref_count(folio), folio);
2507                 VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
2508
2509                 /*
2510                  * Without "freeze", we'll simply split the PMD, propagating the
2511                  * PageAnonExclusive() flag for each PTE by setting it for
2512                  * each subpage -- no need to (temporarily) clear.
2513                  *
2514                  * With "freeze" we want to replace mapped pages by
2515                  * migration entries right away. This is only possible if we
2516                  * managed to clear PageAnonExclusive() -- see
2517                  * set_pmd_migration_entry().
2518                  *
2519                  * In case we cannot clear PageAnonExclusive(), split the PMD
2520                  * only and let try_to_migrate_one() fail later.
2521                  *
2522                  * See folio_try_share_anon_rmap_pmd(): invalidate PMD first.
2523                  */
2524                 anon_exclusive = PageAnonExclusive(page);
2525                 if (freeze && anon_exclusive &&
2526                     folio_try_share_anon_rmap_pmd(folio, page))
2527                         freeze = false;
2528                 if (!freeze) {
2529                         rmap_t rmap_flags = RMAP_NONE;
2530
                             /*
                              * Present PTEs will be installed: add
                              * HPAGE_PMD_NR - 1 references and PTE-level rmaps.
                              * NOTE(review): presumably the reference held for
                              * the PMD mapping covers the remaining PTE -- confirm.
                              */
2531                         folio_ref_add(folio, HPAGE_PMD_NR - 1);
2532                         if (anon_exclusive)
2533                                 rmap_flags |= RMAP_EXCLUSIVE;
2534                         folio_add_anon_rmap_ptes(folio, page, HPAGE_PMD_NR,
2535                                                  vma, haddr, rmap_flags);
2536                 }
2537         }
2538
2539         /*
2540          * Withdraw the table only after we mark the pmd entry invalid.
2541          * This is critical for some architectures (Power).
2542          */
2543         pgtable = pgtable_trans_huge_withdraw(mm, pmd);
             /* Fill the table through a local pmd; @pmd itself is populated last. */
2544         pmd_populate(mm, &_pmd, pgtable);
2545
2546         pte = pte_offset_map(&_pmd, haddr);
2547         VM_BUG_ON(!pte);
2548
2549         /*
2550          * Note that NUMA hinting access restrictions are not transferred to
2551          * avoid any possibility of altering permissions across VMAs.
2552          */
2553         if (freeze || pmd_migration) {
                     /* One migration entry per subpage, carrying the PMD state. */
2554                 for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
2555                         pte_t entry;
2556                         swp_entry_t swp_entry;
2557
2558                         if (write)
2559                                 swp_entry = make_writable_migration_entry(
2560                                                         page_to_pfn(page + i));
2561                         else if (anon_exclusive)
2562                                 swp_entry = make_readable_exclusive_migration_entry(
2563                                                         page_to_pfn(page + i));
2564                         else
2565                                 swp_entry = make_readable_migration_entry(
2566                                                         page_to_pfn(page + i));
2567                         if (young)
2568                                 swp_entry = make_migration_entry_young(swp_entry);
2569                         if (dirty)
2570                                 swp_entry = make_migration_entry_dirty(swp_entry);
2571                         entry = swp_entry_to_pte(swp_entry);
2572                         if (soft_dirty)
2573                                 entry = pte_swp_mksoft_dirty(entry);
2574                         if (uffd_wp)
2575                                 entry = pte_swp_mkuffd_wp(entry);
2576
2577                         VM_WARN_ON(!pte_none(ptep_get(pte + i)));
2578                         set_pte_at(mm, addr, pte + i, entry);
2579                 }
2580         } else {
2581                 pte_t entry;
2582
                     /* Build one template PTE and install it for all subpages. */
2583                 entry = mk_pte(page, READ_ONCE(vma->vm_page_prot));
2584                 if (write)
2585                         entry = pte_mkwrite(entry, vma);
2586                 if (!young)
2587                         entry = pte_mkold(entry);
2588                 /* NOTE: this may set soft-dirty too on some archs */
2589                 if (dirty)
2590                         entry = pte_mkdirty(entry);
2591                 if (soft_dirty)
2592                         entry = pte_mksoft_dirty(entry);
2593                 if (uffd_wp)
2594                         entry = pte_mkuffd_wp(entry);
2595
2596                 for (i = 0; i < HPAGE_PMD_NR; i++)
2597                         VM_WARN_ON(!pte_none(ptep_get(pte + i)));
2598
2599                 set_ptes(mm, haddr, pte, entry, HPAGE_PMD_NR);
2600         }
2601         pte_unmap(pte);
2602
             /*
              * The PMD-level rmap is removed only when the PMD was present.
              * With freeze, one page reference is dropped as well.
              * NOTE(review): presumably the reference that backed the PMD
              * mapping, since no present PTE maps the page now -- confirm.
              */
2603         if (!pmd_migration)
2604                 folio_remove_rmap_pmd(folio, page, vma);
2605         if (freeze)
2606                 put_page(page);
2607
2608         smp_wmb(); /* make pte visible before pmd */
2609         pmd_populate(mm, pmd, pgtable);
2610 }
2611
2612 void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address,
2613                            pmd_t *pmd, bool freeze, struct folio *folio)
2614 {
2615         VM_WARN_ON_ONCE(folio && !folio_test_pmd_mappable(folio));
2616         VM_WARN_ON_ONCE(!IS_ALIGNED(address, HPAGE_PMD_SIZE));
2617         VM_WARN_ON_ONCE(folio && !folio_test_locked(folio));
2618         VM_BUG_ON(freeze && !folio);
2619
2620         /*
2621          * When the caller requests to set up a migration entry, we
2622          * require a folio to check the PMD against. Otherwise, there
2623          * is a risk of replacing the wrong folio.
2624          */
2625         if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd) ||
2626             is_pmd_migration_entry(*pmd)) {
2627                 if (folio && folio != pmd_folio(*pmd))
2628                         return;
2629                 __split_huge_pmd_locked(vma, pmd, address, freeze);
2630         }
2631 }
2632
2633 void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
2634                 unsigned long address, bool freeze, struct folio *folio)
2635 {
2636         spinlock_t *ptl;
2637         struct mmu_notifier_range range;
2638
2639         mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
2640                                 address & HPAGE_PMD_MASK,
2641                                 (address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE);
2642         mmu_notifier_invalidate_range_start(&range);
2643         ptl = pmd_lock(vma->vm_mm, pmd);
2644         split_huge_pmd_locked(vma, range.start, pmd, freeze, folio);
2645         spin_unlock(ptl);
2646         mmu_notifier_invalidate_range_end(&range);
2647 }
2648
2649 void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
2650                 bool freeze, struct folio *folio)
2651 {
2652         pmd_t *pmd = mm_find_pmd(vma->vm_mm, address);
2653
2654         if (!pmd)
2655                 return;
2656
2657         __split_huge_pmd(vma, pmd, address, freeze, folio);
2658 }
2659
2660 static inline void split_huge_pmd_if_needed(struct vm_area_struct *vma, unsigned long address)
2661 {
2662         /*
2663          * If the new address isn't hpage aligned and it could previously
2664          * contain an hugepage: check if we need to split an huge pmd.
2665          */
2666         if (!IS_ALIGNED(address, HPAGE_PMD_SIZE) &&
2667             range_in_vma(vma, ALIGN_DOWN(address, HPAGE_PMD_SIZE),
2668                          ALIGN(address, HPAGE_PMD_SIZE)))
2669                 split_huge_pmd_address(vma, address, false, NULL);
2670 }
2671
2672 void vma_adjust_trans_huge(struct vm_area_struct *vma,
2673                              unsigned long start,
2674                              unsigned long end,
2675                              long adjust_next)
2676 {
2677         /* Check if we need to split start first. */
2678         split_huge_pmd_if_needed(vma, start);
2679
2680         /* Check if we need to split end next. */
2681         split_huge_pmd_if_needed(vma, end);
2682
2683         /*
2684          * If we're also updating the next vma vm_start,
2685          * check if we need to split it.
2686          */
2687         if (adjust_next > 0) {
2688                 struct vm_area_struct *next = find_vma(vma->vm_mm, vma->vm_end);
2689                 unsigned long nstart = next->vm_start;
2690                 nstart += adjust_next;
2691                 split_huge_pmd_if_needed(next, nstart);
2692         }
2693 }
2694
2695 static void unmap_folio(struct folio *folio)
2696 {
2697         enum ttu_flags ttu_flags = TTU_RMAP_LOCKED | TTU_SYNC |
2698                 TTU_BATCH_FLUSH;
2699
2700         VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
2701
2702         if (folio_test_pmd_mappable(folio))
2703                 ttu_flags |= TTU_SPLIT_HUGE_PMD;
2704
2705         /*
2706          * Anon pages need migration entries to preserve them, but file
2707          * pages can simply be left unmapped, then faulted back on demand.
2708          * If that is ever changed (perhaps for mlock), update remap_page().
2709          */
2710         if (folio_test_anon(folio))
2711                 try_to_migrate(folio, ttu_flags);
2712         else
2713                 try_to_unmap(folio, ttu_flags | TTU_IGNORE_MLOCK);
2714
2715         try_to_unmap_flush();
2716 }
2717
2718 static bool __discard_anon_folio_pmd_locked(struct vm_area_struct *vma,
2719                                             unsigned long addr, pmd_t *pmdp,
2720                                             struct folio *folio)
2721 {
2722         struct mm_struct *mm = vma->vm_mm;
2723         int ref_count, map_count;
2724         pmd_t orig_pmd = *pmdp;
2725
2726         if (folio_test_dirty(folio) || pmd_dirty(orig_pmd))
2727                 return false;
2728
2729         orig_pmd = pmdp_huge_clear_flush(vma, addr, pmdp);
2730
2731         /*
2732          * Syncing against concurrent GUP-fast:
2733          * - clear PMD; barrier; read refcount
2734          * - inc refcount; barrier; read PMD
2735          */
2736         smp_mb();
2737
2738         ref_count = folio_ref_count(folio);
2739         map_count = folio_mapcount(folio);
2740
2741         /*
2742          * Order reads for folio refcount and dirty flag
2743          * (see comments in __remove_mapping()).
2744          */
2745         smp_rmb();
2746
2747         /*
2748          * If the folio or its PMD is redirtied at this point, or if there
2749          * are unexpected references, we will give up to discard this folio
2750          * and remap it.
2751          *
2752          * The only folio refs must be one from isolation plus the rmap(s).
2753          */
2754         if (folio_test_dirty(folio) || pmd_dirty(orig_pmd) ||
2755             ref_count != map_count + 1) {
2756                 set_pmd_at(mm, addr, pmdp, orig_pmd);
2757                 return false;
2758         }
2759
2760         folio_remove_rmap_pmd(folio, pmd_page(orig_pmd), vma);
2761         zap_deposited_table(mm, pmdp);
2762         add_mm_counter(mm, MM_ANONPAGES, -HPAGE_PMD_NR);
2763         if (vma->vm_flags & VM_LOCKED)
2764                 mlock_drain_local();
2765         folio_put(folio);
2766
2767         return true;
2768 }
2769
2770 bool unmap_huge_pmd_locked(struct vm_area_struct *vma, unsigned long addr,
2771                            pmd_t *pmdp, struct folio *folio)
2772 {
2773         VM_WARN_ON_FOLIO(!folio_test_pmd_mappable(folio), folio);
2774         VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
2775         VM_WARN_ON_ONCE(!IS_ALIGNED(addr, HPAGE_PMD_SIZE));
2776
2777         if (folio_test_anon(folio) && !folio_test_swapbacked(folio))
2778                 return __discard_anon_folio_pmd_locked(vma, addr, pmdp, folio);
2779
2780         return false;
2781 }
2782
2783 static void remap_page(struct folio *folio, unsigned long nr)
2784 {
2785         int i = 0;
2786
2787         /* If unmap_folio() uses try_to_migrate() on file, remove this check */
2788         if (!folio_test_anon(folio))
2789                 return;
2790         for (;;) {
2791                 remove_migration_ptes(folio, folio, true);
2792                 i += folio_nr_pages(folio);
2793                 if (i >= nr)
2794                         break;
2795                 folio = folio_next(folio);
2796         }
2797 }
2798
2799 static void lru_add_page_tail(struct page *head, struct page *tail,
2800                 struct lruvec *lruvec, struct list_head *list)
2801 {
2802         VM_BUG_ON_PAGE(!PageHead(head), head);
2803         VM_BUG_ON_PAGE(PageLRU(tail), head);
2804         lockdep_assert_held(&lruvec->lru_lock);
2805
2806         if (list) {
2807                 /* page reclaim is reclaiming a huge page */
2808                 VM_WARN_ON(PageLRU(head));
2809                 get_page(tail);
2810                 list_add_tail(&tail->lru, list);
2811         } else {
2812                 /* head is still on lru (and we have it frozen) */
2813                 VM_WARN_ON(!PageLRU(head));
2814                 if (PageUnevictable(tail))
2815                         tail->mlock_count = 0;
2816                 else
2817                         list_add_tail(&tail->lru, &head->lru);
2818                 SetPageLRU(tail);
2819         }
2820 }
2821
/*
 * Turn the page at index @tail inside @folio into the first page of a new
 * folio of order @new_order (a plain page when @new_order is 0).
 *
 * Selected page flags, ->mapping, ->index and (for swapcache folios) the swap
 * entry are derived from the head page; the compound-head link is cleared;
 * the page refcount is unfrozen; young/idle/last_cpupid are copied; finally
 * the page is queued via lru_add_page_tail() on @list or on the LRU.
 * The order of these steps matters -- see the comments in the body.
 */
2822 static void __split_huge_page_tail(struct folio *folio, int tail,
2823                 struct lruvec *lruvec, struct list_head *list,
2824                 unsigned int new_order)
2825 {
2826         struct page *head = &folio->page;
2827         struct page *page_tail = head + tail;
2828         /*
2829          * Careful: new_folio is not a "real" folio before we cleared PageTail.
2830          * Don't pass it around before clear_compound_head().
2831          */
2832         struct folio *new_folio = (struct folio *)page_tail;
2833
             /* The tail page must not be mapped at this point. */
2834         VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail);
2835
2836         /*
2837          * Clone page flags before unfreezing refcount.
2838          *
2839          * After successful get_page_unless_zero() might follow flags change,
2840          * for example lock_page() which set PG_waiters.
2841          *
2842          * Note that for mapped sub-pages of an anonymous THP,
2843          * PG_anon_exclusive has been cleared in unmap_folio() and is stored in
2844          * the migration entry instead from where remap_page() will restore it.
2845          * We can still have PG_anon_exclusive set on effectively unmapped and
2846          * unreferenced sub-pages of an anonymous THP: we can simply drop
2847          * PG_anon_exclusive (-> PG_mappedtodisk) for these here.
2848          */
2849         page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
2850         page_tail->flags |= (head->flags &
2851                         ((1L << PG_referenced) |
2852                          (1L << PG_swapbacked) |
2853                          (1L << PG_swapcache) |
2854                          (1L << PG_mlocked) |
2855                          (1L << PG_uptodate) |
2856                          (1L << PG_active) |
2857                          (1L << PG_workingset) |
2858                          (1L << PG_locked) |
2859                          (1L << PG_unevictable) |
2860 #ifdef CONFIG_ARCH_USES_PG_ARCH_X
2861                          (1L << PG_arch_2) |
2862                          (1L << PG_arch_3) |
2863 #endif
2864                          (1L << PG_dirty) |
2865                          LRU_GEN_MASK | LRU_REFS_MASK));
2866
2867         /* ->mapping in first and second tail page is replaced by other uses */
2868         VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
2869                         page_tail);
2870         page_tail->mapping = head->mapping;
2871         page_tail->index = head->index + tail;
2872
2873         /*
2874          * page->private should not be set in tail pages. Fix up and warn once
2875          * if private is unexpectedly set.
2876          */
2877         if (unlikely(page_tail->private)) {
2878                 VM_WARN_ON_ONCE_PAGE(true, page_tail);
2879                 page_tail->private = 0;
2880         }
             /* For swapcache folios, the new folio's swap value is the head's plus @tail. */
2881         if (folio_test_swapcache(folio))
2882                 new_folio->swap.val = folio->swap.val + tail;
2883
2884         /* Page flags must be visible before we make the page non-compound. */
2885         smp_wmb();
2886
2887         /*
2888          * Clear PageTail before unfreezing page refcount.
2889          *
2890          * After successful get_page_unless_zero() might follow put_page()
2891          * which needs correct compound_head().
2892          */
2893         clear_compound_head(page_tail);
             /* For a non-zero target order, rebuild a smaller compound page here. */
2894         if (new_order) {
2895                 prep_compound_page(page_tail, new_order);
2896                 folio_set_large_rmappable(new_folio);
2897         }
2898
2899         /* Finally unfreeze refcount. Additional reference from page cache. */
2900         page_ref_unfreeze(page_tail,
2901                 1 + ((!folio_test_anon(folio) || folio_test_swapcache(folio)) ?
2902                              folio_nr_pages(new_folio) : 0));
2903
2904         if (folio_test_young(folio))
2905                 folio_set_young(new_folio);
2906         if (folio_test_idle(folio))
2907                 folio_set_idle(new_folio);
2908
2909         folio_xchg_last_cpupid(new_folio, folio_last_cpupid(folio));
2910
2911         /*
2912          * always add to the tail because some iterators expect new
2913          * pages to show after the currently processed elements - e.g.
2914          * migrate_pages
2915          */
2916         lru_add_page_tail(head, page_tail, lruvec, list);
2917 }
2918
/*
 * Carry out the split of the folio containing @page into folios of order
 * @new_order.
 *
 * @list:      passed to __split_huge_page_tail() for queueing the new folios
 * @end:       page cache index limit; new folios whose index is >= @end are
 *             removed from the page cache instead of being stored
 * @new_order: order of the resulting folios (0 for single pages)
 *
 * Sequence: split memcg state, take the swap cache xarray lock (anon
 * swapcache folios only) and the lruvec lock, split off tails from the back,
 * fix up the head, drop the locks, re-add the head's references, re-enable
 * interrupts, restore mappings with remap_page(), then unlock and release
 * every new folio except the one containing @page.
 */
2919 static void __split_huge_page(struct page *page, struct list_head *list,
2920                 pgoff_t end, unsigned int new_order)
2921 {
2922         struct folio *folio = page_folio(page);
2923         struct page *head = &folio->page;
2924         struct lruvec *lruvec;
2925         struct address_space *swap_cache = NULL;
2926         unsigned long offset = 0;
2927         int i, nr_dropped = 0;
2928         unsigned int new_nr = 1 << new_order;
2929         int order = folio_order(folio);
2930         unsigned int nr = 1 << order;
2931
2932         /* complete memcg works before add pages to LRU */
2933         split_page_memcg(head, order, new_order);
2934
2935         if (folio_test_anon(folio) && folio_test_swapcache(folio)) {
2936                 offset = swap_cache_index(folio->swap);
2937                 swap_cache = swap_address_space(folio->swap);
2938                 xa_lock(&swap_cache->i_pages);
2939         }
2940
2941         /* lock lru list/PageCompound, ref frozen by page_ref_freeze */
2942         lruvec = folio_lruvec_lock(folio);
2943
2944         ClearPageHasHWPoisoned(head);
2945
             /*
              * Walk the new folio heads from the last one down to index
              * new_nr; index 0 (the original head) is fixed up after the loop.
              */
2946         for (i = nr - new_nr; i >= new_nr; i -= new_nr) {
2947                 __split_huge_page_tail(folio, i, lruvec, list, new_order);
2948                 /* Some pages can be beyond EOF: drop them from page cache */
2949                 if (head[i].index >= end) {
2950                         struct folio *tail = page_folio(head + i);
2951
                             /*
                              * shmem: only count here, uncharged after the loop.
                              * Otherwise: account a cleared dirty flag.
                              */
2952                         if (shmem_mapping(folio->mapping))
2953                                 nr_dropped++;
2954                         else if (folio_test_clear_dirty(tail))
2955                                 folio_account_cleaned(tail,
2956                                         inode_to_wb(folio->mapping->host));
2957                         __filemap_remove_folio(tail, NULL);
2958                         folio_put(tail);
2959                 } else if (!PageAnon(page)) {
                             /* File folio: point the page cache slot at the new folio. */
2960                         __xa_store(&folio->mapping->i_pages, head[i].index,
2961                                         head + i, 0);
2962                 } else if (swap_cache) {
                             /* Anon swapcache folio: same for the swap cache slot. */
2963                         __xa_store(&swap_cache->i_pages, offset + i,
2964                                         head + i, 0);
2965                 }
2966         }
2967
             /* The head becomes a plain page or a smaller compound folio. */
2968         if (!new_order)
2969                 ClearPageCompound(head);
2970         else {
2971                 struct folio *new_folio = (struct folio *)head;
2972
2973                 folio_set_order(new_folio, new_order);
2974         }
2975         unlock_page_lruvec(lruvec);
2976         /* Caller disabled irqs, so they are still disabled here */
2977
2978         split_page_owner(head, order, new_order);
2979         pgalloc_tag_split(head, 1 << order);
2980
2981         /* See comment in __split_huge_page_tail() */
2982         if (folio_test_anon(folio)) {
2983                 /* Additional pin to swap cache */
2984                 if (folio_test_swapcache(folio)) {
2985                         folio_ref_add(folio, 1 + new_nr);
2986                         xa_unlock(&swap_cache->i_pages);
2987                 } else {
2988                         folio_ref_inc(folio);
2989                 }
2990         } else {
2991                 /* Additional pin to page cache */
2992                 folio_ref_add(folio, 1 + new_nr);
                     /*
                      * NOTE(review): this lock is not taken in this function;
                      * presumably the caller holds it -- confirm.
                      */
2993                 xa_unlock(&folio->mapping->i_pages);
2994         }
2995         local_irq_enable();
2996
2997         if (nr_dropped)
2998                 shmem_uncharge(folio->mapping->host, nr_dropped);
2999         remap_page(folio, nr);
3000
3001         /*
3002          * set page to its compound_head when split to non order-0 pages, so
3003          * we can skip unlocking it below, since PG_locked is transferred to
3004          * the compound_head of the page and the caller will unlock it.
3005          */
3006         if (new_order)
3007                 page = compound_head(page);
3008
3009         for (i = 0; i < nr; i += new_nr) {
3010                 struct page *subpage = head + i;
3011                 struct folio *new_folio = page_folio(subpage);
                     /* The folio containing @page is left locked and referenced. */
3012                 if (subpage == page)
3013                         continue;
3014                 folio_unlock(new_folio);
3015
3016                 /*
3017                  * Subpages may be freed if there wasn't any mapping
3018                  * like if add_to_swap() is running on a lru page that
3019                  * had its mapping zapped. And freeing these pages
3020                  * requires taking the lru_lock so we do the put_page
3021                  * of the tail pages after the split is complete.
3022                  */
3023                 free_page_and_swap_cache(subpage);
3024         }
3025 }
3026
3027 /* Racy check whether the huge page can be split */
3028 bool can_split_folio(struct folio *folio, int *pextra_pins)
3029 {
3030         int extra_pins;
3031
3032         /* Additional pins from page cache */
3033         if (folio_test_anon(folio))
3034                 extra_pins = folio_test_swapcache(folio) ?
3035                                 folio_nr_pages(folio) : 0;
3036         else
3037                 extra_pins = folio_nr_pages(folio);
3038         if (pextra_pins)
3039                 *pextra_pins = extra_pins;
3040         return folio_mapcount(folio) == folio_ref_count(folio) - extra_pins - 1;
3041 }
3042
3043 /*
3044  * This function splits a large folio into smaller folios of order @new_order.
3045  * @page can point to any page of the large folio to split. The split operation
3046  * does not change the position of @page.
3047  *
3048  * Prerequisites:
3049  *
3050  * 1) The caller must hold a reference on the @page's owning folio, also known
3051  *    as the large folio.
3052  *
3053  * 2) The large folio must be locked.
3054  *
3055  * 3) The folio must not be pinned. Any unexpected folio references, including
3056  *    GUP pins, will result in the folio not getting split; instead, the caller
3057  *    will receive an -EAGAIN.
3058  *
3059  * 4) @new_order > 1, usually. Splitting to order-1 anonymous folios is not
3060  *    supported for non-file-backed folios, because folio->_deferred_list, which
3061  *    is used by partially mapped folios, is stored in subpage 2, but an order-1
3062  *    folio only has subpages 0 and 1. File-backed order-1 folios are supported,
3063  *    since they do not use _deferred_list.
3064  *
3065  * After splitting, the caller's folio reference will be transferred to @page,
3066  * resulting in a raised refcount of @page after this call. The other pages may
3067  * be freed if they are not mapped.
3068  *
3069  * If @list is null, tail pages will be added to LRU list, otherwise, to @list.
3070  *
3071  * Pages in @new_order will inherit the mapping, flags, and so on from the
3072  * huge page.
3073  *
3074  * Returns 0 if the huge page was split successfully.
3075  *
3076  * Returns -EAGAIN if the folio has unexpected reference (e.g., GUP) or if
3077  * the folio was concurrently removed from the page cache.
3078  *
3079  * Returns -EBUSY when trying to split the huge zeropage, if the folio is
3080  * under writeback, if fs-specific folio metadata cannot currently be
3081  * released, or if some unexpected race happened (e.g., anon VMA disappeared,
3082  * truncation).
3083  *
3084  * Callers should ensure that the order respects the address space mapping
3085  * min-order if one is set for non-anonymous folios.
3086  *
3087  * Returns -EINVAL when trying to split to an order that is incompatible
3088  * with the folio. Splitting to order 0 is compatible with all folios.
3089  */
3090 int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
3091                                      unsigned int new_order)
3092 {
3093         struct folio *folio = page_folio(page);
3094         struct deferred_split *ds_queue = get_deferred_split_queue(folio);
3095         /* reset xarray order to new order after split */
3096         XA_STATE_ORDER(xas, &folio->mapping->i_pages, folio->index, new_order);
3097         struct anon_vma *anon_vma = NULL;
3098         struct address_space *mapping = NULL;
             /* Order before the split; the statistics at "out" are keyed on it. */
3099         int order = folio_order(folio);
3100         int extra_pins, ret;
3101         pgoff_t end;
3102         bool is_hzp;
3103
3104         VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
3105         VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
3106
             /* A split must strictly reduce the folio order. */
3107         if (new_order >= folio_order(folio))
3108                 return -EINVAL;
3109
3110         if (folio_test_anon(folio)) {
3111                 /* order-1 is not supported for anonymous THP. */
3112                 if (new_order == 1) {
3113                         VM_WARN_ONCE(1, "Cannot split to order-1 folio");
3114                         return -EINVAL;
3115                 }
3116         } else if (new_order && folio->mapping) {
                     /*
                      * A truncated folio has no ->mapping, and the helpers
                      * below dereference it. Such a folio skips these checks;
                      * it is refused further down, at the latest by the
                      * "Truncated ?" test, which returns -EBUSY.
                      */
3117                 /* Split shmem folio to non-zero order not supported */
3118                 if (shmem_mapping(folio->mapping)) {
3119                         VM_WARN_ONCE(1,
3120                                 "Cannot split shmem folio to non-0 order");
3121                         return -EINVAL;
3122                 }
3123                 /*
3124                  * No split if the file system does not support large folio.
3125                  * Note that we might still have THPs in such mappings due to
3126                  * CONFIG_READ_ONLY_THP_FOR_FS. But in that case, the mapping
3127                  * does not actually support large folios properly.
3128                  */
3129                 if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) &&
3130                     !mapping_large_folio_support(folio->mapping)) {
3131                         VM_WARN_ONCE(1,
3132                                 "Cannot split file folio to non-0 order");
3133                         return -EINVAL;
3134                 }
3135         }
3136
3137         /* Only swapping a whole PMD-mapped folio is supported */
3138         if (folio_test_swapcache(folio) && new_order)
3139                 return -EINVAL;
3140
3141         is_hzp = is_huge_zero_folio(folio);
3142         if (is_hzp) {
3143                 pr_warn_ratelimited("Called split_huge_page for huge zero page\n");
3144                 return -EBUSY;
3145         }
3146
3147         if (folio_test_writeback(folio))
3148                 return -EBUSY;
3149
3150         if (folio_test_anon(folio)) {
3151                 /*
3152                  * The caller does not necessarily hold an mmap_lock that would
3153                  * prevent the anon_vma disappearing so we first take a
3154                  * reference to it and then lock the anon_vma for write. This
3155                  * is similar to folio_lock_anon_vma_read except the write lock
3156                  * is taken to serialise against parallel split or collapse
3157                  * operations.
3158                  */
3159                 anon_vma = folio_get_anon_vma(folio);
3160                 if (!anon_vma) {
3161                         ret = -EBUSY;
3162                         goto out;
3163                 }
3164                 end = -1;
3165                 mapping = NULL;
3166                 anon_vma_lock_write(anon_vma);
3167         } else {
3168                 unsigned int min_order;
3169                 gfp_t gfp;
3170
3171                 mapping = folio->mapping;
3172
3173                 /* Truncated ? */
3174                 if (!mapping) {
3175                         ret = -EBUSY;
3176                         goto out;
3177                 }
3178
3179                 min_order = mapping_min_folio_order(folio->mapping);
3180                 if (new_order < min_order) {
3181                         VM_WARN_ONCE(1, "Cannot split mapped folio below min-order: %u",
3182                                      min_order);
3183                         ret = -EINVAL;
3184                         goto out;
3185                 }
3186
3187                 gfp = current_gfp_context(mapping_gfp_mask(mapping) &
3188                                                         GFP_RECLAIM_MASK);
3189
3190                 if (!filemap_release_folio(folio, gfp)) {
3191                         ret = -EBUSY;
3192                         goto out;
3193                 }
3194
                     /*
                      * Done here, before interrupts are disabled and the xarray
                      * lock is taken further down.
                      */
3195                 xas_split_alloc(&xas, folio, folio_order(folio), gfp);
3196                 if (xas_error(&xas)) {
3197                         ret = xas_error(&xas);
3198                         goto out;
3199                 }
3200
3201                 anon_vma = NULL;
3202                 i_mmap_lock_read(mapping);
3203
3204                 /*
3205                  * __split_huge_page() may need to trim off pages beyond EOF:
3206                  * but on 32-bit, i_size_read() takes an irq-unsafe seqlock,
3207                  * which cannot be nested inside the page tree lock. So note
3208                  * end now: i_size itself may be changed at any moment, but
3209                  * folio lock is good enough to serialize the trimming.
3210                  */
3211                 end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
3212                 if (shmem_mapping(mapping))
3213                         end = shmem_fallocend(mapping->host, end);
3214         }
3215
3216         /*
3217          * Racy check if we can split the page, before unmap_folio() will
3218          * split PMDs
3219          */
3220         if (!can_split_folio(folio, &extra_pins)) {
3221                 ret = -EAGAIN;
3222                 goto out_unlock;
3223         }
3224
3225         unmap_folio(folio);
3226
3227         /* block interrupt reentry in xa_lock and spinlock */
3228         local_irq_disable();
3229         if (mapping) {
3230                 /*
3231                  * Check if the folio is present in page cache.
3232                  * We assume all tail are present too, if folio is there.
3233                  */
3234                 xas_lock(&xas);
3235                 xas_reset(&xas);
3236                 if (xas_load(&xas) != folio)
3237                         goto fail;
3238         }
3239
3240         /* Prevent deferred_split_scan() touching ->_refcount */
3241         spin_lock(&ds_queue->split_queue_lock);
3242         if (folio_ref_freeze(folio, 1 + extra_pins)) {
3243                 if (folio_order(folio) > 1 &&
3244                     !list_empty(&folio->_deferred_list)) {
3245                         ds_queue->split_queue_len--;
3246                         /*
3247                          * Reinitialize page_deferred_list after removing the
3248                          * page from the split_queue, otherwise a subsequent
3249                          * split will see list corruption when checking the
3250                          * page_deferred_list.
3251                          */
3252                         list_del_init(&folio->_deferred_list);
3253                 }
3254                 spin_unlock(&ds_queue->split_queue_lock);
3255                 if (mapping) {
3256                         int nr = folio_nr_pages(folio);
3257
3258                         xas_split(&xas, folio, folio_order(folio));
                             /*
                              * The folio stops being PMD-sized: drop it from the
                              * per-type THP counters.
                              */
3259                         if (folio_test_pmd_mappable(folio) &&
3260                             new_order < HPAGE_PMD_ORDER) {
3261                                 if (folio_test_swapbacked(folio)) {
3262                                         __lruvec_stat_mod_folio(folio,
3263                                                         NR_SHMEM_THPS, -nr);
3264                                 } else {
3265                                         __lruvec_stat_mod_folio(folio,
3266                                                         NR_FILE_THPS, -nr);
3267                                         filemap_nr_thps_dec(mapping);
3268                                 }
3269                         }
3270                 }
3271
                     /*
                      * NOTE(review): this path neither re-enables interrupts nor
                      * drops the xarray lock here; presumably
                      * __split_huge_page() does both - confirm.
                      */
3272                 __split_huge_page(page, list, end, new_order);
3273                 ret = 0;
3274         } else {
3275                 spin_unlock(&ds_queue->split_queue_lock);
                     /*
                      * "fail" is also entered directly from the page cache
                      * lookup above, with the xarray lock held but without
                      * split_queue_lock.
                      */
3276 fail:
3277                 if (mapping)
3278                         xas_unlock(&xas);
3279                 local_irq_enable();
3280                 remap_page(folio, folio_nr_pages(folio));
3281                 ret = -EAGAIN;
3282         }
3283
             /*
              * Exactly one of anon_vma and mapping is set at this point, so only
              * the lock that was actually taken gets released.
              */
3284 out_unlock:
3285         if (anon_vma) {
3286                 anon_vma_unlock_write(anon_vma);
3287                 put_anon_vma(anon_vma);
3288         }
3289         if (mapping)
3290                 i_mmap_unlock_read(mapping);
             /* The direct "return" statements near the top bypass this accounting. */
3291 out:
3292         xas_destroy(&xas);
3293         if (order == HPAGE_PMD_ORDER)
3294                 count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
3295         count_mthp_stat(order, !ret ? MTHP_STAT_SPLIT : MTHP_STAT_SPLIT_FAILED);
3296         return ret;
3297 }
3298
/*
 * Report the lowest order @folio may be split to.
 *
 * Anonymous folios can always go down to order 0. For any other folio the
 * answer is the minimum folio order of its mapping. A folio that is no longer
 * attached to a mapping cannot be split: -EBUSY is returned, and the failure
 * is counted in THP_SPLIT_PAGE_FAILED when the folio is PMD-mappable.
 */
3299 int min_order_for_split(struct folio *folio)
3300 {
3301         struct address_space *mapping;
3302
3303         if (folio_test_anon(folio))
3304                 return 0;
3305         mapping = folio->mapping;
3306         if (mapping)
3307                 return mapping_min_folio_order(mapping);
3308         if (folio_test_pmd_mappable(folio))
3309                 count_vm_event(THP_SPLIT_PAGE_FAILED);
3310         return -EBUSY;
3311 }
3312
/*
 * Split @folio down to the smallest order min_order_for_split() allows,
 * passing @list through to split_huge_page_to_list_to_order().
 *
 * Returns the error from min_order_for_split() if there is one, otherwise
 * the result of the split itself.
 */
3313 int split_folio_to_list(struct folio *folio, struct list_head *list)
3314 {
3315         int min_order = min_order_for_split(folio);
3316
3317         /* A negative value is an errno rather than an order: hand it back. */
3318         return min_order < 0 ? min_order :
3319                 split_huge_page_to_list_to_order(&folio->page, list,
3320                                                  min_order);
3321 }
3322
/*
 * Take @folio off its deferred split queue if it is currently queued,
 * adjusting the queue length to match. The queue's split_queue_lock is held
 * with interrupts saved and disabled for the duration.
 */
3323 void __folio_undo_large_rmappable(struct folio *folio)
3324 {
3325         struct deferred_split *queue = get_deferred_split_queue(folio);
3326         unsigned long irqflags;
3327
3328         spin_lock_irqsave(&queue->split_queue_lock, irqflags);
3329         if (!list_empty(&folio->_deferred_list)) {
3330                 /* Unlink the folio, then account for its removal. */
3331                 list_del_init(&folio->_deferred_list);
3332                 queue->split_queue_len--;
3333         }
3334         spin_unlock_irqrestore(&queue->split_queue_lock, irqflags);
3335 }
3336
/*
 * Queue @folio on its deferred split queue so that the deferred split
 * shrinker can try to split it later.
 *
 * Nothing is done for folios of order <= 1, for folios in the swap cache, or
 * for folios that are already queued. With CONFIG_MEMCG, when the folio has a
 * memcg, the shrinker bit for the folio's node is set as well.
 */
3337 void deferred_split_folio(struct folio *folio)
3338 {
3339         struct deferred_split *ds_queue = get_deferred_split_queue(folio);
3340 #ifdef CONFIG_MEMCG
3341         struct mem_cgroup *memcg = folio_memcg(folio);
3342 #endif
3343         unsigned long flags;
3344
3345         /*
3346          * Order 1 folios have no space for a deferred list, but we also
3347          * won't waste much memory by not adding them to the deferred list.
3348          */
3349         if (folio_order(folio) <= 1)
3350                 return;
3351
3352         /*
3353          * The try_to_unmap() in page reclaim path might reach here too,
3354          * this may cause a race condition to corrupt deferred split queue.
3355          * And, if page reclaim is already handling the same folio, it is
3356          * unnecessary to handle it again in shrinker.
3357          *
3358          * Check the swapcache flag to determine if the folio is being
3359          * handled by page reclaim since THP swap would add the folio into
3360          * swap cache before calling try_to_unmap().
3361          */
3362         if (folio_test_swapcache(folio))
3363                 return;
3364
             /* Unlocked fast path; the test is repeated under the lock below. */
3365         if (!list_empty(&folio->_deferred_list))
3366                 return;
3367
3368         spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
3369         if (list_empty(&folio->_deferred_list)) {
                     /* The THP-wide event is only counted for PMD-mappable folios. */
3370                 if (folio_test_pmd_mappable(folio))
3371                         count_vm_event(THP_DEFERRED_SPLIT_PAGE);
3372                 count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
3373                 list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
3374                 ds_queue->split_queue_len++;
3375 #ifdef CONFIG_MEMCG
3376                 if (memcg)
3377                         set_shrinker_bit(memcg, folio_nid(folio),
3378                                          deferred_split_shrinker->id);
3379 #endif
3380         }
3381         spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
3382 }
3383
/*
 * Shrinker "count" callback: report how many folios sit on the deferred
 * split queue selected by @sc. That is the queue of node sc->nid, or, with
 * CONFIG_MEMCG and sc->memcg set, the queue of that memcg. The length is
 * read without taking the queue lock.
 */
3384 static unsigned long deferred_split_count(struct shrinker *shrink,
3385                                           struct shrink_control *sc)
3386 {
3387         struct deferred_split *queue;
3388
3389         queue = &NODE_DATA(sc->nid)->deferred_split_queue;
3390 #ifdef CONFIG_MEMCG
3391         if (sc->memcg)
3392                 queue = &sc->memcg->deferred_split_queue;
3393 #endif
3394         return READ_ONCE(queue->split_queue_len);
3395 }
3396
/*
 * Shrinker "scan" callback: try to split folios taken from the deferred
 * split queue selected by @sc (node queue, or memcg queue when sc->memcg is
 * set and CONFIG_MEMCG is enabled). sc->nr_to_scan is decremented once per
 * queue entry visited.
 *
 * Returns the number of folios split, or SHRINK_STOP when nothing was split
 * and the queue is empty.
 */
3397 static unsigned long deferred_split_scan(struct shrinker *shrink,
3398                 struct shrink_control *sc)
3399 {
3400         struct pglist_data *pgdata = NODE_DATA(sc->nid);
3401         struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
3402         unsigned long flags;
             /* Private list of the folios this invocation holds a reference on. */
3403         LIST_HEAD(list);
3404         struct folio *folio, *next;
3405         int split = 0;
3406
3407 #ifdef CONFIG_MEMCG
3408         if (sc->memcg)
3409                 ds_queue = &sc->memcg->deferred_split_queue;
3410 #endif
3411
3412         spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
3413         /* Take pin on all head pages to avoid freeing them under us */
3414         list_for_each_entry_safe(folio, next, &ds_queue->split_queue,
3415                                                         _deferred_list) {
3416                 if (folio_try_get(folio)) {
3417                         list_move(&folio->_deferred_list, &list);
3418                 } else {
3419                         /* We lost race with folio_put() */
3420                         list_del_init(&folio->_deferred_list);
3421                         ds_queue->split_queue_len--;
3422                 }
3423                 if (!--sc->nr_to_scan)
3424                         break;
3425         }
3426         spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
3427
             /*
              * Split outside the queue lock. A folio whose lock cannot be taken
              * without blocking is skipped; the reference taken above is
              * dropped either way.
              */
3428         list_for_each_entry_safe(folio, next, &list, _deferred_list) {
3429                 if (!folio_trylock(folio))
3430                         goto next;
3431                 /* split_huge_page() removes page from list on success */
3432                 if (!split_folio(folio))
3433                         split++;
3434                 folio_unlock(folio);
3435 next:
3436                 folio_put(folio);
3437         }
3438
             /* Whatever is still on the private list goes back to the queue tail. */
3439         spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
3440         list_splice_tail(&list, &ds_queue->split_queue);
3441         spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
3442
3443         /*
3444          * Stop shrinker if we didn't split any page, but the queue is empty.
3445          * This can happen if pages were freed under us.
3446          */
3447         if (!split && list_empty(&ds_queue->split_queue))
3448                 return SHRINK_STOP;
3449         return split;
3450 }
3451
3452 #ifdef CONFIG_DEBUG_FS
/*
 * Debugfs helper: walk every PFN of every managed zone and try to split each
 * large folio found that is on the LRU and is not a hugetlb folio. The number
 * of candidates and of successful splits is reported through pr_debug().
 */
3453 static void split_huge_pages_all(void)
3454 {
3455         struct zone *zone;
3456         struct page *page;
3457         struct folio *folio;
3458         unsigned long pfn, max_zone_pfn;
3459         unsigned long total = 0, split = 0;
3460
3461         pr_debug("Split all THPs\n");
3462         for_each_zone(zone) {
3463                 if (!managed_zone(zone))
3464                         continue;
3465                 max_zone_pfn = zone_end_pfn(zone);
3466                 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) {
3467                         int nr_pages;
3468
3469                         page = pfn_to_online_page(pfn);
3470                         if (!page || PageTail(page))
3471                                 continue;
3472                         folio = page_folio(page);
3473                         if (!folio_try_get(folio))
3474                                 continue;
3475
                             /*
                              * Look the folio up again now that a reference is
                              * held, and give up if the page no longer belongs
                              * to the folio found above.
                              */
3476                         if (unlikely(page_folio(page) != folio))
3477                                 goto next;
3478
3479                         if (zone != folio_zone(folio))
3480                                 goto next;
3481
3482                         if (!folio_test_large(folio)
3483                                 || folio_test_hugetlb(folio)
3484                                 || !folio_test_lru(folio))
3485                                 goto next;
3486
3487                         total++;
3488                         folio_lock(folio);
                             /* Sampled before the split attempt, used to advance pfn. */
3489                         nr_pages = folio_nr_pages(folio);
3490                         if (!split_folio(folio))
3491                                 split++;
                             /* Skip the remaining pages the folio covered. */
3492                         pfn += nr_pages - 1;
3493                         folio_unlock(folio);
3494 next:
3495                         folio_put(folio);
3496                         cond_resched();
3497                 }
3498         }
3499
3500         pr_debug("%lu of %lu THP split\n", split, total);
3501 }
3502
/*
 * Tell whether @vma must be skipped by the debugfs split code: true for
 * special huge VMAs, VM_IO VMAs and hugetlb VMAs.
 */
3503 static inline bool vma_not_suitable_for_thp_split(struct vm_area_struct *vma)
3504 {
3505         bool special = vma_is_special_huge(vma) || (vma->vm_flags & VM_IO);
3506         return special || is_vm_hugetlb_page(vma);
3507 }
3508
/*
 * Debugfs helper: try to split the transparent huge pages mapped in
 * [@vaddr_start, @vaddr_end) of the task with PID @pid down to @new_order
 * (raised to the mapping's minimum folio order for file-backed folios).
 *
 * Both addresses are rounded down to a page boundary. The walk stops at the
 * first address that no VMA covers. Individual split failures are not
 * reported; candidates and successes are only counted for pr_debug().
 *
 * Returns 0 once the range has been walked, -ESRCH if no task has PID @pid,
 * or -EINVAL if the task has no mm.
 */
3509 static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
3510                                 unsigned long vaddr_end, unsigned int new_order)
3511 {
3512         int ret = 0;
3513         struct task_struct *task;
3514         struct mm_struct *mm;
3515         unsigned long total = 0, split = 0;
3516         unsigned long addr;
3517
3518         vaddr_start &= PAGE_MASK;
3519         vaddr_end &= PAGE_MASK;
3520
3521         /* Find the task_struct from pid */
3522         rcu_read_lock();
3523         task = find_task_by_vpid(pid);
3524         if (!task) {
3525                 rcu_read_unlock();
3526                 ret = -ESRCH;
3527                 goto out;
3528         }
3529         get_task_struct(task);
3530         rcu_read_unlock();
3531
3532         /* Find the mm_struct */
3533         mm = get_task_mm(task);
3534         put_task_struct(task);
3535
3536         if (!mm) {
3537                 ret = -EINVAL;
3538                 goto out;
3539         }
3540
3541         pr_debug("Split huge pages in pid: %d, vaddr: [0x%lx - 0x%lx]\n",
3542                  pid, vaddr_start, vaddr_end);
3543
3544         mmap_read_lock(mm);
3545         /*
3546          * always increase addr by PAGE_SIZE, since we could have a PTE page
3547          * table filled with PTE-mapped THPs, each of which is distinct.
3548          */
3549         for (addr = vaddr_start; addr < vaddr_end; addr += PAGE_SIZE) {
3550                 struct vm_area_struct *vma = vma_lookup(mm, addr);
3551                 struct page *page;
3552                 struct folio *folio;
3553                 struct address_space *mapping;
3554                 unsigned int target_order = new_order;
3555
3556                 if (!vma)
3557                         break;
3558
3559                 /* skip special VMA and hugetlb VMA */
3560                 if (vma_not_suitable_for_thp_split(vma)) {
                             /*
                              * The loop increment adds PAGE_SIZE, so step to the
                              * last page of this VMA: the next iteration then
                              * starts exactly at vm_end instead of skipping the
                              * first page of an adjacent VMA.
                              */
3561                         addr = vma->vm_end - PAGE_SIZE;
3562                         continue;
3563                 }
3564
3565                 /* FOLL_DUMP to ignore special (like zero) pages */
3566                 page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
3567
3568                 if (IS_ERR_OR_NULL(page))
3569                         continue;
3570
3571                 folio = page_folio(page);
3572                 if (!is_transparent_hugepage(folio))
3573                         goto next;
3574
                     /*
                      * Remember the mapping seen before locking; it is compared
                      * again once the folio lock is held.
                      */
3575                 if (!folio_test_anon(folio)) {
3576                         mapping = folio->mapping;
3577                         target_order = max(new_order,
3578                                            mapping_min_folio_order(mapping));
3579                 }
3580
3581                 if (target_order >= folio_order(folio))
3582                         goto next;
3583
3584                 total++;
3585                 /*
3586                  * For folios with private, split_huge_page_to_list_to_order()
3587                  * will try to drop it before split and then check if the folio
3588                  * can be split or not. So skip the check here.
3589                  */
3590                 if (!folio_test_private(folio) &&
3591                     !can_split_folio(folio, NULL))
3592                         goto next;
3593
3594                 if (!folio_trylock(folio))
3595                         goto next;
3596
3597                 if (!folio_test_anon(folio) && folio->mapping != mapping)
3598                         goto unlock;
3599
3600                 if (!split_folio_to_order(folio, target_order))
3601                         split++;
3602
3603 unlock:
3604
3605                 folio_unlock(folio);
3606 next:
                     /* Drops the reference obtained through FOLL_GET. */
3607                 folio_put(folio);
3608                 cond_resched();
3609         }
3610         mmap_read_unlock(mm);
3611         mmput(mm);
3612
3613         pr_debug("%lu of %lu THP split\n", split, total);
3614
3615 out:
3616         return ret;
3617 }
3618
/*
 * Debugfs helper: try to split the large folios cached for page offsets
 * [@off_start, @off_end) of the file at @file_path down to @new_order, raised
 * to the mapping's minimum folio order when that is larger.
 *
 * Individual split failures are not reported; candidates and successes are
 * only counted for pr_debug().
 *
 * Returns 0 once the range has been walked, or -EINVAL if the path cannot be
 * turned into a filename or the file cannot be opened.
 */
3619 static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start,
3620                                 pgoff_t off_end, unsigned int new_order)
3621 {
3622         struct filename *file;
3623         struct file *candidate;
3624         struct address_space *mapping;
3625         int ret = -EINVAL;
3626         pgoff_t index;
             /* Loop stride: 1, or the size of the large folio just visited. */
3627         int nr_pages = 1;
3628         unsigned long total = 0, split = 0;
3629         unsigned int min_order;
3630         unsigned int target_order;
3631
3632         file = getname_kernel(file_path);
3633         if (IS_ERR(file))
3634                 return ret;
3635
3636         candidate = file_open_name(file, O_RDONLY, 0);
3637         if (IS_ERR(candidate))
3638                 goto out;
3639
3640         pr_debug("split file-backed THPs in file: %s, page offset: [0x%lx - 0x%lx]\n",
3641                  file_path, off_start, off_end);
3642
3643         mapping = candidate->f_mapping;
3644         min_order = mapping_min_folio_order(mapping);
3645         target_order = max(new_order, min_order);
3646
3647         for (index = off_start; index < off_end; index += nr_pages) {
3648                 struct folio *folio = filemap_get_folio(mapping, index);
3649
3650                 nr_pages = 1;
3651                 if (IS_ERR(folio))
3652                         continue;
3653
3654                 if (!folio_test_large(folio))
3655                         goto next;
3656
3657                 total++;
3658                 nr_pages = folio_nr_pages(folio);
3659
3660                 if (target_order >= folio_order(folio))
3661                         goto next;
3662
3663                 if (!folio_trylock(folio))
3664                         goto next;
3665
                     /* Skip the folio if it no longer belongs to this mapping. */
3666                 if (folio->mapping != mapping)
3667                         goto unlock;
3668
3669                 if (!split_folio_to_order(folio, target_order))
3670                         split++;
3671
3672 unlock:
3673                 folio_unlock(folio);
3674 next:
3675                 folio_put(folio);
3676                 cond_resched();
3677         }
3678
3679         filp_close(candidate, NULL);
3680         ret = 0;
3681
3682         pr_debug("%lu of %lu file-backed THP split\n", split, total);
3683 out:
3684         putname(file);
3685         return ret;
3686 }
3687
/*
 * Size of the on-stack buffers split_huge_pages_write() parses its input
 * from. The last byte is always forced to NUL there, so at most
 * MAX_INPUT_BUF_SZ - 1 input characters are looked at.
 */
3688 #define MAX_INPUT_BUF_SZ 255
3689
/*
 * Write handler of the "split_huge_pages" debugfs file.
 *
 * Accepted input, as parsed below:
 *   "1"                                        split every THP in the system
 *   "<pid>,0x<vaddr_start>,0x<vaddr_end>[,<order>]"
 *                                              split THPs mapped by a process
 *   "/<path>,0x<off_start>,0x<off_end>[,<order>]"
 *                                              split THPs cached for a file
 * <order> defaults to 0 when omitted.
 *
 * Calls are serialised by a function-local mutex, taken interruptibly.
 *
 * Returns the length of the (possibly truncated) input string on success,
 * -EFAULT if the user buffer cannot be copied, -EINVAL if the input does not
 * match any of the formats, the error from mutex_lock_interruptible(), or
 * the error returned by the helper that was invoked.
 */
3690 static ssize_t split_huge_pages_write(struct file *file, const char __user *buf,
3691                                 size_t count, loff_t *ppops)
3692 {
3693         static DEFINE_MUTEX(split_debug_mutex);
3694         ssize_t ret;
3695         /*
3696          * hold pid, start_vaddr, end_vaddr, new_order or
3697          * file_path, off_start, off_end, new_order
3698          */
3699         char input_buf[MAX_INPUT_BUF_SZ];
3700         int pid;
3701         unsigned long vaddr_start, vaddr_end;
3702         unsigned int new_order = 0;
3703
3704         ret = mutex_lock_interruptible(&split_debug_mutex);
3705         if (ret)
3706                 return ret;
3707
3708         ret = -EFAULT;
3709
3710         memset(input_buf, 0, MAX_INPUT_BUF_SZ);
3711         if (copy_from_user(input_buf, buf, min_t(size_t, count, MAX_INPUT_BUF_SZ)))
3712                 goto out;
3713
             /* Guarantee termination even when the user filled the whole buffer. */
3714         input_buf[MAX_INPUT_BUF_SZ - 1] = '\0';
3715
             /* A leading '/' selects the file-based format. */
3716         if (input_buf[0] == '/') {
3717                 char *tok;
3718                 char *buf = input_buf;
3719                 char file_path[MAX_INPUT_BUF_SZ];
3720                 pgoff_t off_start = 0, off_end = 0;
                     /* Taken before strsep() writes a NUL over the first ','. */
3721                 size_t input_len = strlen(input_buf);
3722
3723                 tok = strsep(&buf, ",");
                     /*
                      * strsep() leaves buf NULL when no ',' follows the path.
                      * That input carries no offsets, so reject it rather than
                      * hand a NULL string to sscanf() below.
                      *
                      * The strcpy() cannot overflow: tok lies inside the
                      * NUL-terminated input_buf, which has the same size as
                      * file_path.
                      */
3724                 if (tok && buf) {
3725                         strcpy(file_path, tok);
3726                 } else {
3727                         ret = -EINVAL;
3728                         goto out;
3729                 }
3730
3731                 ret = sscanf(buf, "0x%lx,0x%lx,%d", &off_start, &off_end, &new_order);
3732                 if (ret != 2 && ret != 3) {
3733                         ret = -EINVAL;
3734                         goto out;
3735                 }
3736                 ret = split_huge_pages_in_file(file_path, off_start, off_end, new_order);
3737                 if (!ret)
3738                         ret = input_len;
3739
3740                 goto out;
3741         }
3742
3743         ret = sscanf(input_buf, "%d,0x%lx,0x%lx,%d", &pid, &vaddr_start, &vaddr_end, &new_order);
             /* Only a number was parsed and it is 1: split everything. */
3744         if (ret == 1 && pid == 1) {
3745                 split_huge_pages_all();
3746                 ret = strlen(input_buf);
3747                 goto out;
3748         } else if (ret != 3 && ret != 4) {
3749                 ret = -EINVAL;
3750                 goto out;
3751         }
3752
3753         ret = split_huge_pages_pid(pid, vaddr_start, vaddr_end, new_order);
3754         if (!ret)
3755                 ret = strlen(input_buf);
3756 out:
3757         mutex_unlock(&split_debug_mutex);
3758         return ret;
3759
3760 }
3761
/*
 * File operations of the "split_huge_pages" debugfs file: only a write
 * handler is provided, and seeking uses no_llseek.
 */
3762 static const struct file_operations split_huge_pages_fops = {
3763         .owner   = THIS_MODULE,
3764         .write   = split_huge_pages_write,
3765         .llseek  = no_llseek,
3766 };
3767
/*
 * Create the "split_huge_pages" debugfs file with mode 0200 and a NULL
 * parent dentry, backed by split_huge_pages_fops. Registered as a
 * late_initcall; always returns 0, the result of debugfs_create_file() is
 * not checked.
 */
3768 static int __init split_huge_pages_debugfs(void)
3769 {
3770         debugfs_create_file("split_huge_pages", 0200, NULL, NULL,
3771                             &split_huge_pages_fops);
3772         return 0;
3773 }
3774 late_initcall(split_huge_pages_debugfs);
3775 #endif
3776
3777 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
/*
 * Replace the PMD mapping of @page described by @pvmw with a PMD migration
 * entry.
 *
 * The write/exclusive, young, dirty, soft-dirty and uffd-wp state of the old
 * PMD is carried over into the migration entry. The folio is marked dirty if
 * the old PMD was dirty, and the rmap and the folio reference belonging to
 * the mapping are dropped.
 *
 * Returns 0 on success, and also when @pvmw does not describe a PMD-level
 * mapping (nothing is changed then). Returns -EBUSY when the page is
 * anon-exclusive and folio_try_share_anon_rmap_pmd() fails; the original PMD
 * value is put back in that case.
 */
3778 int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
3779                 struct page *page)
3780 {
3781         struct folio *folio = page_folio(page);
3782         struct vm_area_struct *vma = pvmw->vma;
3783         struct mm_struct *mm = vma->vm_mm;
3784         unsigned long address = pvmw->address;
3785         bool anon_exclusive;
3786         pmd_t pmdval;
3787         swp_entry_t entry;
3788         pmd_t pmdswp;
3789
             /* Only act on a walk that found a PMD and no PTE. */
3790         if (!(pvmw->pmd && !pvmw->pte))
3791                 return 0;
3792
3793         flush_cache_range(vma, address, address + HPAGE_PMD_SIZE);
3794         pmdval = pmdp_invalidate(vma, address, pvmw->pmd);
3795
3796         /* See folio_try_share_anon_rmap_pmd(): invalidate PMD first. */
3797         anon_exclusive = folio_test_anon(folio) && PageAnonExclusive(page);
3798         if (anon_exclusive && folio_try_share_anon_rmap_pmd(folio, page)) {
                     /* Undo the invalidation: put the saved PMD value back. */
3799                 set_pmd_at(mm, address, pvmw->pmd, pmdval);
3800                 return -EBUSY;
3801         }
3802
3803         if (pmd_dirty(pmdval))
3804                 folio_mark_dirty(folio);
             /* Pick the entry type: writable, readable-exclusive or readable. */
3805         if (pmd_write(pmdval))
3806                 entry = make_writable_migration_entry(page_to_pfn(page));
3807         else if (anon_exclusive)
3808                 entry = make_readable_exclusive_migration_entry(page_to_pfn(page));
3809         else
3810                 entry = make_readable_migration_entry(page_to_pfn(page));
3811         if (pmd_young(pmdval))
3812                 entry = make_migration_entry_young(entry);
3813         if (pmd_dirty(pmdval))
3814                 entry = make_migration_entry_dirty(entry);
3815         pmdswp = swp_entry_to_pmd(entry);
3816         if (pmd_soft_dirty(pmdval))
3817                 pmdswp = pmd_swp_mksoft_dirty(pmdswp);
3818         if (pmd_uffd_wp(pmdval))
3819                 pmdswp = pmd_swp_mkuffd_wp(pmdswp);
3820         set_pmd_at(mm, address, pvmw->pmd, pmdswp);
3821         folio_remove_rmap_pmd(folio, page, vma);
3822         folio_put(folio);
3823         trace_set_migration_pmd(address, pmd_val(pmdswp));
3824
3825         return 0;
3826 }
3827
/*
 * Replace the PMD migration entry described by @pvmw with a huge PMD that
 * maps @new.
 *
 * Soft-dirty, write permission, uffd-wp, young and dirty state recorded in
 * the migration entry are carried over into the new PMD. A folio reference
 * is taken and the matching anon or file rmap is added. Nothing is done when
 * @pvmw does not describe a PMD-level entry.
 */
3828 void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
3829 {
3830         struct folio *folio = page_folio(new);
3831         struct vm_area_struct *vma = pvmw->vma;
3832         struct mm_struct *mm = vma->vm_mm;
3833         unsigned long address = pvmw->address;
3834         unsigned long haddr = address & HPAGE_PMD_MASK;
3835         pmd_t pmde;
3836         swp_entry_t entry;
3837
             /* Only act on a walk that found a PMD and no PTE. */
3838         if (!(pvmw->pmd && !pvmw->pte))
3839                 return;
3840
3841         entry = pmd_to_swp_entry(*pvmw->pmd);
3842         folio_get(folio);
3843         pmde = mk_huge_pmd(new, READ_ONCE(vma->vm_page_prot));
3844         if (pmd_swp_soft_dirty(*pvmw->pmd))
3845                 pmde = pmd_mksoft_dirty(pmde);
3846         if (is_writable_migration_entry(entry))
3847                 pmde = pmd_mkwrite(pmde, vma);
3848         if (pmd_swp_uffd_wp(*pvmw->pmd))
3849                 pmde = pmd_mkuffd_wp(pmde);
3850         if (!is_migration_entry_young(entry))
3851                 pmde = pmd_mkold(pmde);
3852         /* NOTE: this may contain setting soft-dirty on some archs */
3853         if (folio_test_dirty(folio) && is_migration_entry_dirty(entry))
3854                 pmde = pmd_mkdirty(pmde);
3855
3856         if (folio_test_anon(folio)) {
3857                 rmap_t rmap_flags = RMAP_NONE;
3858
                     /* Any entry type other than plain readable is exclusive. */
3859                 if (!is_readable_migration_entry(entry))
3860                         rmap_flags |= RMAP_EXCLUSIVE;
3861
3862                 folio_add_anon_rmap_pmd(folio, new, vma, haddr, rmap_flags);
3863         } else {
3864                 folio_add_file_rmap_pmd(folio, new, vma);
3865         }
3866         VM_BUG_ON(pmd_write(pmde) && folio_test_anon(folio) && !PageAnonExclusive(new));
3867         set_pmd_at(mm, haddr, pvmw->pmd, pmde);
3868
3869         /* No need to invalidate - it was non-present before */
3870         update_mmu_cache_pmd(vma, address, pvmw->pmd);
3871         trace_remove_migration_pmd(address, pmd_val(pmde));
3872 }
3873 #endif