mm: thp: support allocation of anonymous multi-size THP
author    Ryan Roberts <ryan.roberts@arm.com>
Thu, 7 Dec 2023 16:12:05 +0000 (16:12 +0000)
committer Andrew Morton <akpm@linux-foundation.org>
Wed, 20 Dec 2023 22:48:12 +0000 (14:48 -0800)
Introduce the logic to allow THP to be configured (through the new sysfs
interface we just added) to allocate large folios to back anonymous
memory; these folios are larger than the base page size but smaller than
PMD-size.  We call this new THP extension "multi-size THP" (mTHP).

mTHP continues to be PTE-mapped, but in many cases can still provide
similar benefits to traditional PMD-sized THP: Page faults are
significantly reduced (by a factor of e.g.  4, 8, 16, etc.  depending on
the configured order), but latency spikes are much less prominent because
the size of each page isn't as huge as the PMD-sized variant and there is
less memory to clear in each page fault.  The number of per-page
operations (e.g.  ref counting, rmap management, lru list management) is
also significantly reduced since those ops now become per-folio.

Some architectures also employ TLB compression mechanisms to squeeze more
entries in when a set of PTEs are virtually and physically contiguous and
appropriately aligned.  In this case, TLB misses will occur less often.

The new behaviour is disabled by default, but can be enabled at runtime by
writing to /sys/kernel/mm/transparent_hugepage/hugepage-XXkb/enabled (see
documentation in previous commit).  The long term aim is to change the
default to include suitable lower orders, but there are some risks around
internal fragmentation that need to be better understood first.
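
As a quick illustration (not part of this patch), a per-size policy can be
set from userspace by writing one of the documented keywords to the
corresponding "enabled" file.  The sketch below takes the knob path as a
command-line argument rather than guessing a concrete hugepage-XXkb
directory name, since the available sizes depend on the system's base page
size:

  /*
   * Illustration only: write "always" to the per-size enabled knob whose
   * path is given on the command line, e.g. one of the
   * /sys/kernel/mm/transparent_hugepage/hugepage-XXkb/enabled files
   * mentioned above.
   */
  #include <stdio.h>

  int main(int argc, char **argv)
  {
          FILE *f;

          if (argc != 2) {
                  fprintf(stderr, "usage: %s <enabled-knob-path>\n", argv[0]);
                  return 1;
          }

          f = fopen(argv[1], "w");
          if (!f) {
                  perror("fopen");
                  return 1;
          }
          fputs("always", f);
          return fclose(f) ? 1 : 0;
  }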

[ryan.roberts@arm.com: resolve some multi-size THP review nits]
Link: https://lkml.kernel.org/r/20231214160251.3574571-1-ryan.roberts@arm.com
Link: https://lkml.kernel.org/r/20231207161211.2374093-5-ryan.roberts@arm.com
Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
Tested-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Tested-by: John Hubbard <jhubbard@nvidia.com>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Barry Song <v-songbaohua@oppo.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: David Rientjes <rientjes@google.com>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Itaru Kitayama <itaru.kitayama@gmail.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Luis Chamberlain <mcgrof@kernel.org>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Yin Fengwei <fengwei.yin@intel.com>
Cc: Yu Zhao <yuzhao@google.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
include/linux/huge_mm.h
mm/memory.c

index 609c153bae57d39b3c1fc25b2740ac82a2a15932..fa7a38a30fc6890e81b99f36630e27e320dc1c67 100644 (file)
@@ -68,9 +68,11 @@ extern struct kobj_attribute shmem_enabled_attr;
 #define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER)
 
 /*
- * Mask of all large folio orders supported for anonymous THP.
+ * Mask of all large folio orders supported for anonymous THP; all orders up to
+ * and including PMD_ORDER, except order-0 (which is not "huge") and order-1
+ * (which is a limitation of the THP implementation).
  */
-#define THP_ORDERS_ALL_ANON    BIT(PMD_ORDER)
+#define THP_ORDERS_ALL_ANON    ((BIT(PMD_ORDER + 1) - 1) & ~(BIT(0) | BIT(1)))
 
 /*
  * Mask of all large folio orders supported for file THP.
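
As a concrete illustration of the new mask (not part of the diff): with an
assumed PMD_ORDER of 9 (typical for 4 KiB base pages and a 2 MiB PMD),
THP_ORDERS_ALL_ANON covers orders 2 through 9 and excludes orders 0 and 1:

  /* Illustration only; PMD_ORDER = 9 is an assumption, not from the patch. */
  #include <assert.h>
  #include <stdio.h>

  #define BIT(n)          (1UL << (n))
  #define PMD_ORDER       9
  #define THP_ORDERS_ALL_ANON \
          ((BIT(PMD_ORDER + 1) - 1) & ~(BIT(0) | BIT(1)))

  int main(void)
  {
          int order;

          /* Orders 2..9 set, orders 0 and 1 clear: 0x3fc. */
          assert(THP_ORDERS_ALL_ANON == 0x3fcUL);

          for (order = 0; order <= PMD_ORDER; order++)
                  printf("order %d: %s\n", order,
                         (THP_ORDERS_ALL_ANON & BIT(order)) ?
                         "allowed" : "excluded");
          return 0;
  }
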
index 8ab2d994d997ee4fc2dd5dfd36b4dad59731d2b8..3c530b639559b495a0069035f31051dcb05646fe 100644 (file)
@@ -4125,6 +4125,84 @@ out_release:
        return ret;
 }
 
+static bool pte_range_none(pte_t *pte, int nr_pages)
+{
+       int i;
+
+       for (i = 0; i < nr_pages; i++) {
+               if (!pte_none(ptep_get_lockless(pte + i)))
+                       return false;
+       }
+
+       return true;
+}
+
+static struct folio *alloc_anon_folio(struct vm_fault *vmf)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+       struct vm_area_struct *vma = vmf->vma;
+       unsigned long orders;
+       struct folio *folio;
+       unsigned long addr;
+       pte_t *pte;
+       gfp_t gfp;
+       int order;
+
+       /*
+        * If uffd is active for the vma we need per-page fault fidelity to
+        * maintain the uffd semantics.
+        */
+       if (unlikely(userfaultfd_armed(vma)))
+               goto fallback;
+
+       /*
+        * Get a list of all the (large) orders below PMD_ORDER that are enabled
+        * for this vma. Then filter out the orders that can't be allocated over
+        * the faulting address and still be fully contained in the vma.
+        */
+       orders = thp_vma_allowable_orders(vma, vma->vm_flags, false, true, true,
+                                         BIT(PMD_ORDER) - 1);
+       orders = thp_vma_suitable_orders(vma, vmf->address, orders);
+
+       if (!orders)
+               goto fallback;
+
+       pte = pte_offset_map(vmf->pmd, vmf->address & PMD_MASK);
+       if (!pte)
+               return ERR_PTR(-EAGAIN);
+
+       /*
+        * Find the highest order where the aligned range is completely
+        * pte_none(). Note that all remaining orders will be completely
+        * pte_none().
+        */
+       order = highest_order(orders);
+       while (orders) {
+               addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
+               if (pte_range_none(pte + pte_index(addr), 1 << order))
+                       break;
+               order = next_order(&orders, order);
+       }
+
+       pte_unmap(pte);
+
+       /* Try allocating the highest of the remaining orders. */
+       gfp = vma_thp_gfp_mask(vma);
+       while (orders) {
+               addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
+               folio = vma_alloc_folio(gfp, order, vma, addr, true);
+               if (folio) {
+                       clear_huge_page(&folio->page, vmf->address, 1 << order);
+                       return folio;
+               }
+               order = next_order(&orders, order);
+       }
+
+fallback:
+#endif
+       return vma_alloc_zeroed_movable_folio(vmf->vma, vmf->address);
+}
+
 /*
  * We enter with non-exclusive mmap_lock (to exclude vma changes,
  * but allow concurrent faults), and pte mapped but not yet locked.
@@ -4134,9 +4212,12 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
 {
        bool uffd_wp = vmf_orig_pte_uffd_wp(vmf);
        struct vm_area_struct *vma = vmf->vma;
+       unsigned long addr = vmf->address;
        struct folio *folio;
        vm_fault_t ret = 0;
+       int nr_pages = 1;
        pte_t entry;
+       int i;
 
        /* File mapping without ->vm_ops ? */
        if (vma->vm_flags & VM_SHARED)
@@ -4176,10 +4257,16 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
        /* Allocate our own private page. */
        if (unlikely(anon_vma_prepare(vma)))
                goto oom;
-       folio = vma_alloc_zeroed_movable_folio(vma, vmf->address);
+       /* Returns NULL on OOM or ERR_PTR(-EAGAIN) if we must retry the fault */
+       folio = alloc_anon_folio(vmf);
+       if (IS_ERR(folio))
+               return 0;
        if (!folio)
                goto oom;
 
+       nr_pages = folio_nr_pages(folio);
+       addr = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE);
+
        if (mem_cgroup_charge(folio, vma->vm_mm, GFP_KERNEL))
                goto oom_free_page;
        folio_throttle_swaprate(folio, GFP_KERNEL);
@@ -4196,12 +4283,15 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
        if (vma->vm_flags & VM_WRITE)
                entry = pte_mkwrite(pte_mkdirty(entry), vma);
 
-       vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
-                       &vmf->ptl);
+       vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl);
        if (!vmf->pte)
                goto release;
-       if (vmf_pte_changed(vmf)) {
-               update_mmu_tlb(vma, vmf->address, vmf->pte);
+       if (nr_pages == 1 && vmf_pte_changed(vmf)) {
+               update_mmu_tlb(vma, addr, vmf->pte);
+               goto release;
+       } else if (nr_pages > 1 && !pte_range_none(vmf->pte, nr_pages)) {
+               for (i = 0; i < nr_pages; i++)
+                       update_mmu_tlb(vma, addr + PAGE_SIZE * i, vmf->pte + i);
                goto release;
        }
 
@@ -4216,16 +4306,17 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
                return handle_userfault(vmf, VM_UFFD_MISSING);
        }
 
-       inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
-       folio_add_new_anon_rmap(folio, vma, vmf->address);
+       folio_ref_add(folio, nr_pages - 1);
+       add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages);
+       folio_add_new_anon_rmap(folio, vma, addr);
        folio_add_lru_vma(folio, vma);
 setpte:
        if (uffd_wp)
                entry = pte_mkuffd_wp(entry);
-       set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
+       set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr_pages);
 
        /* No need to invalidate - it was non-present before */
-       update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1);
+       update_mmu_cache_range(vmf, vma, addr, vmf->pte, nr_pages);
 unlock:
        if (vmf->pte)
                pte_unmap_unlock(vmf->pte, vmf->ptl);
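
To make the descending-order scan in alloc_anon_folio() easier to follow,
here is a standalone sketch of the same pattern (illustration only; the
highest_order()/next_order() helpers are re-implemented locally, and the
candidate-order bitmask and the "free up to order 4" predicate are made-up
examples):

  /*
   * Start at the highest enabled order and clear one bit per failed
   * attempt until the mask is empty, then fall back to a single page.
   * The helpers below are local stand-ins for the kernel's
   * highest_order()/next_order(); a 64-bit unsigned long is assumed.
   */
  #include <stdio.h>

  #define BIT(n)  (1UL << (n))

  static int highest_order(unsigned long orders)
  {
          /* Index of the most significant set bit, or -1 if none set. */
          return orders ? 63 - __builtin_clzl(orders) : -1;
  }

  static int next_order(unsigned long *orders, int prev)
  {
          *orders &= ~BIT(prev);          /* give up on the failed order */
          return highest_order(*orders);  /* ...and try the next largest */
  }

  /* Pretend the PTE range is only fully pte_none() up to order 4. */
  static int range_is_free(int order)
  {
          return order <= 4;
  }

  int main(void)
  {
          unsigned long orders = BIT(9) | BIT(6) | BIT(4) | BIT(2);
          int order = highest_order(orders);

          while (orders) {
                  if (range_is_free(order)) {
                          printf("would allocate an order-%d folio\n", order);
                          return 0;
                  }
                  order = next_order(&orders, order);
          }

          printf("no suitable order; fall back to a single page\n");
          return 0;
  }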