mm: mincore: use pte_batch_hint() to batch process large folios
author Baolin Wang <baolin.wang@linux.alibaba.com>
Fri, 9 May 2025 00:45:21 +0000 (08:45 +0800)
committer Andrew Morton <akpm@linux-foundation.org>
Thu, 22 May 2025 21:55:36 +0000 (14:55 -0700)
When I tested the mincore() syscall, I observed that it takes longer with
64K mTHP enabled on my Arm64 server.  The reason is that
mincore_pte_range() still checks each PTE individually, even when the PTEs
are contiguous, which is inefficient.

Thus we can use pte_batch_hint() to get the number of contiguous present
PTEs in a batch, which improves performance.  I tested the mincore()
syscall with 1G of anonymous memory populated with 64K mTHP, and observed
an obvious performance improvement:

	w/o patch	w/ patch	changes
	6022us		549us		+91%

Moreover, I also tested mincore() with mTHP/THP disabled, and did not see
any obvious regression for base pages.
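
For reference, pte_batch_hint() is only a hint: the generic fallback
returns 1, so architectures without contiguous-PTE support keep the old
one-PTE-at-a-time behavior, while arm64 overrides it to report how many
PTEs of a contpte block remain from the current entry.  A rough sketch of
the generic fallback (based on include/linux/pgtable.h; exact details may
vary by kernel version):

	#ifndef pte_batch_hint
	/* No architecture-specific hint: batch size of a single PTE. */
	static inline unsigned int pte_batch_hint(pte_t *ptep, pte_t pte)
	{
		return 1;
	}
	#endif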

Link: https://lkml.kernel.org/r/99cb00ee626ceb6e788102ca36821815cd832237.1746697240.git.baolin.wang@linux.alibaba.com
Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Reviewed-by: Barry Song <baohua@kernel.org>
Reviewed-by: Dev Jain <dev.jain@arm.com>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
mm/mincore.c

index 832f29f46767eed1cecf80ab4018985cd32abf32..42d6c9c8da86725b9c63a7841869f809c0f535c4 100644
@@ -21,6 +21,7 @@
 
 #include <linux/uaccess.h>
 #include "swap.h"
+#include "internal.h"
 
 static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr,
                        unsigned long end, struct mm_walk *walk)
@@ -105,6 +106,7 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
        pte_t *ptep;
        unsigned char *vec = walk->private;
        int nr = (end - addr) >> PAGE_SHIFT;
+       int step, i;
 
        ptl = pmd_trans_huge_lock(pmd, vma);
        if (ptl) {
@@ -118,16 +120,26 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
                walk->action = ACTION_AGAIN;
                return 0;
        }
-       for (; addr != end; ptep++, addr += PAGE_SIZE) {
+       for (; addr != end; ptep += step, addr += step * PAGE_SIZE) {
                pte_t pte = ptep_get(ptep);
 
+               step = 1;
                /* We need to do cache lookup too for pte markers */
                if (pte_none_mostly(pte))
                        __mincore_unmapped_range(addr, addr + PAGE_SIZE,
                                                 vma, vec);
-               else if (pte_present(pte))
-                       *vec = 1;
-               else { /* pte is a swap entry */
+               else if (pte_present(pte)) {
+                       unsigned int batch = pte_batch_hint(ptep, pte);
+
+                       if (batch > 1) {
+                               unsigned int max_nr = (end - addr) >> PAGE_SHIFT;
+
+                               step = min_t(unsigned int, batch, max_nr);
+                       }
+
+                       for (i = 0; i < step; i++)
+                               vec[i] = 1;
+               } else { /* pte is a swap entry */
                        swp_entry_t entry = pte_to_swp_entry(pte);
 
                        if (non_swap_entry(entry)) {
@@ -146,7 +158,7 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 #endif
                        }
                }
-               vec++;
+               vec += step;
        }
        pte_unmap_unlock(ptep - 1, ptl);
 out:
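
For illustration, a minimal userspace microbenchmark along the lines of
the test described above (the region size, population method, and the
sysfs knob path in the comment are assumptions, not part of this patch):

	/* build: gcc -O2 -o mincore_bench mincore_bench.c */
	#define _DEFAULT_SOURCE
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>
	#include <time.h>
	#include <unistd.h>
	#include <sys/mman.h>

	int main(void)
	{
		size_t len = 1UL << 30;		/* 1G anonymous region */
		long psz = sysconf(_SC_PAGESIZE);
		struct timespec t0, t1;
		unsigned char *vec;
		char *buf;

		buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		if (buf == MAP_FAILED) {
			perror("mmap");
			return 1;
		}

		/*
		 * Opt in to THP; 64K mTHP must also be enabled via sysfs,
		 * e.g. /sys/kernel/mm/transparent_hugepage/hugepages-64kB/enabled
		 */
		madvise(buf, len, MADV_HUGEPAGE);
		memset(buf, 1, len);		/* populate the mapping */

		vec = malloc(len / psz);	/* one byte per base page */
		if (!vec) {
			perror("malloc");
			return 1;
		}

		clock_gettime(CLOCK_MONOTONIC, &t0);
		if (mincore(buf, len, vec)) {
			perror("mincore");
			return 1;
		}
		clock_gettime(CLOCK_MONOTONIC, &t1);

		printf("mincore: %ld us\n",
		       (t1.tv_sec - t0.tv_sec) * 1000000L +
		       (t1.tv_nsec - t0.tv_nsec) / 1000L);

		free(vec);
		munmap(buf, len);
		return 0;
	}

The speedup comes entirely from the kernel-side loop advancing by the
batch size for contpte mappings; the same program exercises both the
patched and unpatched paths.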