mm: add NR_SECONDARY_PAGETABLE to count secondary page table uses.
author Yosry Ahmed <yosryahmed@google.com>
Tue, 23 Aug 2022 00:46:36 +0000 (00:46 +0000)
committer Sean Christopherson <seanjc@google.com>
Wed, 24 Aug 2022 20:51:42 +0000 (13:51 -0700)
We keep track of several kernel memory stats (total kernel memory, page
tables, stack, vmalloc, etc) on multiple levels (global, per-node,
per-memcg, etc). These stats give users insight into how much memory
is used by the kernel and for what purposes.

Currently, memory used by KVM mmu is not accounted in any of those
kernel memory stats. This patch series accounts the memory pages
used by KVM for page tables in those stats in a new
NR_SECONDARY_PAGETABLE stat. This stat can be later extended to account
for other types of secondary page tables (e.g. iommu page tables).

KVM has a decent number of large allocations that aren't for page
tables, but for most of them, the number/size of those allocations
scales linearly with either the number of vCPUs or the amount of memory
assigned to the VM. KVM's secondary page table allocations do not scale
linearly, especially when nested virtualization is in use.

From a KVM perspective, NR_SECONDARY_PAGETABLE will scale with KVM's
per-VM pages_{4k,2m,1g} stats unless the guest is doing something
bizarre (e.g. accessing only 4kb chunks of 2mb pages so that KVM is
forced to allocate a large number of page tables even though the guest
isn't accessing that much memory). However, someone would need to either
understand how KVM works to make that connection, or know (or be told) to
go look at KVM's stats if they're running VMs to better decipher the stats.

Furthermore, having NR_PAGETABLE side-by-side with NR_SECONDARY_PAGETABLE
is informative. For example, when backing a VM with THP vs. HugeTLB,
NR_SECONDARY_PAGETABLE is roughly the same, but NR_PAGETABLE is an order
of magnitude higher with THP. So having this stat will at the very least
prove to be useful for understanding tradeoffs between VM backing types,
and likely even steer folks towards potential optimizations.

The original discussion with more details about the rationale:
https://lore.kernel.org/all/87ilqoi77b.wl-maz@kernel.org

This stat will be used by subsequent patches to count KVM mmu
memory usage.

Signed-off-by: Yosry Ahmed <yosryahmed@google.com>
Acked-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20220823004639.2387269-2-yosryahmed@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
Documentation/admin-guide/cgroup-v2.rst
Documentation/filesystems/proc.rst
drivers/base/node.c
fs/proc/meminfo.c
include/linux/mmzone.h
mm/memcontrol.c
mm/page_alloc.c
mm/vmstat.c

index be4a77baf78414c8e0cfeabd6ed8dac78e58d059..7ce8130a89248b03188217ed8ca7709ad97caca8 100644 (file)
@@ -1355,6 +1355,11 @@ PAGE_SIZE multiple when read back.
          pagetables
                 Amount of memory allocated for page tables.
 
+         sec_pagetables
+               Amount of memory allocated for secondary page tables,
+               this currently includes KVM mmu allocations on x86
+               and arm64.
+
          percpu (npn)
                Amount of memory used for storing per-cpu kernel
                data structures.
index e7aafc82be99917748b1cae12cc58f8bdeb07ab6..898c99eae8e44630b459a2adca8b84ebb1423d8f 100644 (file)
@@ -982,6 +982,7 @@ Example output. You may not have all of these fields.
     SUnreclaim:       142336 kB
     KernelStack:       11168 kB
     PageTables:        20540 kB
+    SecPageTables:         0 kB
     NFS_Unstable:          0 kB
     Bounce:                0 kB
     WritebackTmp:          0 kB
@@ -1090,6 +1091,9 @@ KernelStack
               Memory consumed by the kernel stacks of all tasks
 PageTables
               Memory consumed by userspace page tables
+SecPageTables
+              Memory consumed by secondary page tables, this currently
+              includes KVM mmu allocations on x86 and arm64.
 NFS_Unstable
               Always zero. Previous counted pages which had been written to
               the server, but has not been committed to stable storage.
index eb0f43784c2b3a6dc3eb8885723d182dbe846459..432d40a5f910a6b95d8bd24ffefb9d94d6a73db8 100644 (file)
@@ -433,6 +433,7 @@ static ssize_t node_read_meminfo(struct device *dev,
                             "Node %d ShadowCallStack:%8lu kB\n"
 #endif
                             "Node %d PageTables:     %8lu kB\n"
+                            "Node %d SecPageTables:  %8lu kB\n"
                             "Node %d NFS_Unstable:   %8lu kB\n"
                             "Node %d Bounce:         %8lu kB\n"
                             "Node %d WritebackTmp:   %8lu kB\n"
@@ -459,6 +460,7 @@ static ssize_t node_read_meminfo(struct device *dev,
                             nid, node_page_state(pgdat, NR_KERNEL_SCS_KB),
 #endif
                             nid, K(node_page_state(pgdat, NR_PAGETABLE)),
+                            nid, K(node_page_state(pgdat, NR_SECONDARY_PAGETABLE)),
                             nid, 0UL,
                             nid, K(sum_zone_node_page_state(nid, NR_BOUNCE)),
                             nid, K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
index 6e89f0e2fd20f101d84b3c27daf9045ea33af697..208efd4fa52c76f83857f35c792006a360aa12b5 100644 (file)
@@ -115,6 +115,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 #endif
        show_val_kb(m, "PageTables:     ",
                    global_node_page_state(NR_PAGETABLE));
+       show_val_kb(m, "SecPageTables:  ",
+                   global_node_page_state(NR_SECONDARY_PAGETABLE));
 
        show_val_kb(m, "NFS_Unstable:   ", 0);
        show_val_kb(m, "Bounce:         ",
index e24b40c52468a8550eb93c0a0a8c0a95684bea84..355d842d2731850b03df76a9b98d894dd74e2bc0 100644 (file)
@@ -216,6 +216,7 @@ enum node_stat_item {
        NR_KERNEL_SCS_KB,       /* measured in KiB */
 #endif
        NR_PAGETABLE,           /* used for pagetables */
+       NR_SECONDARY_PAGETABLE, /* secondary pagetables, e.g. KVM pagetables */
 #ifdef CONFIG_SWAP
        NR_SWAPCACHE,
 #endif
index b69979c9ced5c26d0bd238e0e090ce3851daa321..9d054e3767ce489b12081b3e63043626002033f0 100644 (file)
@@ -1401,6 +1401,7 @@ static const struct memory_stat memory_stats[] = {
        { "kernel",                     MEMCG_KMEM                      },
        { "kernel_stack",               NR_KERNEL_STACK_KB              },
        { "pagetables",                 NR_PAGETABLE                    },
+       { "sec_pagetables",             NR_SECONDARY_PAGETABLE          },
        { "percpu",                     MEMCG_PERCPU_B                  },
        { "sock",                       MEMCG_SOCK                      },
        { "vmalloc",                    MEMCG_VMALLOC                   },
index e5486d47406e81c3c1f96551c1eded4551e1a38b..90461bd947448ed0e3934435aa912463f9b896c8 100644 (file)
@@ -6039,7 +6039,8 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
                " active_file:%lu inactive_file:%lu isolated_file:%lu\n"
                " unevictable:%lu dirty:%lu writeback:%lu\n"
                " slab_reclaimable:%lu slab_unreclaimable:%lu\n"
-               " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"
+               " mapped:%lu shmem:%lu pagetables:%lu\n"
+               " sec_pagetables:%lu bounce:%lu\n"
                " kernel_misc_reclaimable:%lu\n"
                " free:%lu free_pcp:%lu free_cma:%lu\n",
                global_node_page_state(NR_ACTIVE_ANON),
@@ -6056,6 +6057,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
                global_node_page_state(NR_FILE_MAPPED),
                global_node_page_state(NR_SHMEM),
                global_node_page_state(NR_PAGETABLE),
+               global_node_page_state(NR_SECONDARY_PAGETABLE),
                global_zone_page_state(NR_BOUNCE),
                global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE),
                global_zone_page_state(NR_FREE_PAGES),
@@ -6089,6 +6091,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
                        " shadow_call_stack:%lukB"
 #endif
                        " pagetables:%lukB"
+                       " sec_pagetables:%lukB"
                        " all_unreclaimable? %s"
                        "\n",
                        pgdat->node_id,
@@ -6114,6 +6117,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
                        node_page_state(pgdat, NR_KERNEL_SCS_KB),
 #endif
                        K(node_page_state(pgdat, NR_PAGETABLE)),
+                       K(node_page_state(pgdat, NR_SECONDARY_PAGETABLE)),
                        pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ?
                                "yes" : "no");
        }
index 373d2730fcf2157562f343ec4b3160e8f4f49eac..b937eba681d1533e773e201fd62ad9edcbe38cc2 100644 (file)
@@ -1240,6 +1240,7 @@ const char * const vmstat_text[] = {
        "nr_shadow_call_stack",
 #endif
        "nr_page_table_pages",
+       "nr_sec_page_table_pages",
 #ifdef CONFIG_SWAP
        "nr_swapcached",
 #endif