16 #include <sys/types.h>
18 #include <sys/sysmacros.h>
21 #include "linux/magic.h"
26 #define MADV_PAGEOUT 21
28 #ifndef MADV_POPULATE_READ
29 #define MADV_POPULATE_READ 22
32 #define MADV_COLLAPSE 25
35 #define BASE_ADDR ((void *)(1UL << 30))
36 static unsigned long hpage_pmd_size;
37 static unsigned long page_size;
38 static int hpage_pmd_nr;
40 #define THP_SYSFS "/sys/kernel/mm/transparent_hugepage/"
41 #define PID_SMAPS "/proc/self/smaps"
42 #define TEST_FILE "collapse_test_file"
44 #define MAX_LINE_LENGTH 500
53 void *(*setup_area)(int nr_hpages);
54 void (*cleanup_area)(void *p, unsigned long size);
55 void (*fault)(void *p, unsigned long start, unsigned long end);
56 bool (*check_huge)(void *addr, int nr_hpages);
60 static struct mem_ops *file_ops;
61 static struct mem_ops *anon_ops;
62 static struct mem_ops *shmem_ops;
64 struct collapse_context {
65 void (*collapse)(const char *msg, char *p, int nr_hpages,
66 struct mem_ops *ops, bool expect);
67 bool enforce_pte_scan_limits;
71 static struct collapse_context *khugepaged_context;
72 static struct collapse_context *madvise_context;
79 char dev_queue_read_ahead_path[PATH_MAX];
82 static struct file_info finfo;
90 static const char *thp_enabled_strings[] = {
100 THP_DEFRAG_DEFER_MADVISE,
105 static const char *thp_defrag_strings[] = {
123 static const char *shmem_enabled_strings[] = {
133 struct khugepaged_settings {
135 unsigned int alloc_sleep_millisecs;
136 unsigned int scan_sleep_millisecs;
137 unsigned int max_ptes_none;
138 unsigned int max_ptes_swap;
139 unsigned int max_ptes_shared;
140 unsigned long pages_to_scan;
144 enum thp_enabled thp_enabled;
145 enum thp_defrag thp_defrag;
146 enum shmem_enabled shmem_enabled;
148 struct khugepaged_settings khugepaged;
149 unsigned long read_ahead_kb;
152 static struct settings saved_settings;
153 static bool skip_settings_restore;
155 static int exit_status;
157 static void success(const char *msg)
159 printf(" \e[32m%s\e[0m\n", msg);
162 static void fail(const char *msg)
164 printf(" \e[31m%s\e[0m\n", msg);
168 static void skip(const char *msg)
170 printf(" \e[33m%s\e[0m\n", msg);
173 static int read_file(const char *path, char *buf, size_t buflen)
178 fd = open(path, O_RDONLY);
182 numread = read(fd, buf, buflen - 1);
191 return (unsigned int) numread;
194 static int write_file(const char *path, const char *buf, size_t buflen)
199 fd = open(path, O_WRONLY);
201 printf("open(%s)\n", path);
206 numwritten = write(fd, buf, buflen - 1);
208 if (numwritten < 1) {
209 printf("write(%s)\n", buf);
214 return (unsigned int) numwritten;
217 static int read_string(const char *name, const char *strings[])
224 ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name);
225 if (ret >= PATH_MAX) {
226 printf("%s: Pathname is too long\n", __func__);
230 if (!read_file(path, buf, sizeof(buf))) {
235 c = strchr(buf, '[');
237 printf("%s: Parse failure\n", __func__);
242 memmove(buf, c, sizeof(buf) - (c - buf));
244 c = strchr(buf, ']');
246 printf("%s: Parse failure\n", __func__);
252 while (strings[ret]) {
253 if (!strcmp(strings[ret], buf))
258 printf("Failed to parse %s\n", name);
262 static void write_string(const char *name, const char *val)
267 ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name);
268 if (ret >= PATH_MAX) {
269 printf("%s: Pathname is too long\n", __func__);
273 if (!write_file(path, val, strlen(val) + 1)) {
279 static unsigned long _read_num(const char *path)
283 if (read_file(path, buf, sizeof(buf)) < 0) {
284 perror("read_file(read_num)");
288 return strtoul(buf, NULL, 10);
291 static unsigned long read_num(const char *name)
296 ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name);
297 if (ret >= PATH_MAX) {
298 printf("%s: Pathname is too long\n", __func__);
301 return _read_num(path);
304 static void _write_num(const char *path, unsigned long num)
308 sprintf(buf, "%lu", num);
309 if (!write_file(path, buf, strlen(buf) + 1)) {
315 static void write_num(const char *name, unsigned long num)
320 ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name);
321 if (ret >= PATH_MAX) {
322 printf("%s: Pathname is too long\n", __func__);
325 _write_num(path, num);
328 static void write_settings(struct settings *settings)
330 struct khugepaged_settings *khugepaged = &settings->khugepaged;
332 write_string("enabled", thp_enabled_strings[settings->thp_enabled]);
333 write_string("defrag", thp_defrag_strings[settings->thp_defrag]);
334 write_string("shmem_enabled",
335 shmem_enabled_strings[settings->shmem_enabled]);
336 write_num("use_zero_page", settings->use_zero_page);
338 write_num("khugepaged/defrag", khugepaged->defrag);
339 write_num("khugepaged/alloc_sleep_millisecs",
340 khugepaged->alloc_sleep_millisecs);
341 write_num("khugepaged/scan_sleep_millisecs",
342 khugepaged->scan_sleep_millisecs);
343 write_num("khugepaged/max_ptes_none", khugepaged->max_ptes_none);
344 write_num("khugepaged/max_ptes_swap", khugepaged->max_ptes_swap);
345 write_num("khugepaged/max_ptes_shared", khugepaged->max_ptes_shared);
346 write_num("khugepaged/pages_to_scan", khugepaged->pages_to_scan);
348 if (file_ops && finfo.type == VMA_FILE)
349 _write_num(finfo.dev_queue_read_ahead_path,
350 settings->read_ahead_kb);
353 #define MAX_SETTINGS_DEPTH 4
354 static struct settings settings_stack[MAX_SETTINGS_DEPTH];
355 static int settings_index;
357 static struct settings *current_settings(void)
359 if (!settings_index) {
360 printf("Fail: No settings set");
363 return settings_stack + settings_index - 1;
366 static void push_settings(struct settings *settings)
368 if (settings_index >= MAX_SETTINGS_DEPTH) {
369 printf("Fail: Settings stack exceeded");
372 settings_stack[settings_index++] = *settings;
373 write_settings(current_settings());
376 static void pop_settings(void)
378 if (settings_index <= 0) {
379 printf("Fail: Settings stack empty");
383 write_settings(current_settings());
386 static void restore_settings(int sig)
388 if (skip_settings_restore)
391 printf("Restore THP and khugepaged settings...");
392 write_settings(&saved_settings);
400 static void save_settings(void)
402 printf("Save THP and khugepaged settings...");
403 saved_settings = (struct settings) {
404 .thp_enabled = read_string("enabled", thp_enabled_strings),
405 .thp_defrag = read_string("defrag", thp_defrag_strings),
407 read_string("shmem_enabled", shmem_enabled_strings),
408 .use_zero_page = read_num("use_zero_page"),
410 saved_settings.khugepaged = (struct khugepaged_settings) {
411 .defrag = read_num("khugepaged/defrag"),
412 .alloc_sleep_millisecs =
413 read_num("khugepaged/alloc_sleep_millisecs"),
414 .scan_sleep_millisecs =
415 read_num("khugepaged/scan_sleep_millisecs"),
416 .max_ptes_none = read_num("khugepaged/max_ptes_none"),
417 .max_ptes_swap = read_num("khugepaged/max_ptes_swap"),
418 .max_ptes_shared = read_num("khugepaged/max_ptes_shared"),
419 .pages_to_scan = read_num("khugepaged/pages_to_scan"),
421 if (file_ops && finfo.type == VMA_FILE)
422 saved_settings.read_ahead_kb =
423 _read_num(finfo.dev_queue_read_ahead_path);
427 signal(SIGTERM, restore_settings);
428 signal(SIGINT, restore_settings);
429 signal(SIGHUP, restore_settings);
430 signal(SIGQUIT, restore_settings);
433 static void get_finfo(const char *dir)
435 struct stat path_stat;
442 stat(finfo.dir, &path_stat);
443 if (!S_ISDIR(path_stat.st_mode)) {
444 printf("%s: Not a directory (%s)\n", __func__, finfo.dir);
447 if (snprintf(finfo.path, sizeof(finfo.path), "%s/" TEST_FILE,
448 finfo.dir) >= sizeof(finfo.path)) {
449 printf("%s: Pathname is too long\n", __func__);
452 if (statfs(finfo.dir, &fs)) {
456 finfo.type = fs.f_type == TMPFS_MAGIC ? VMA_SHMEM : VMA_FILE;
457 if (finfo.type == VMA_SHMEM)
460 /* Find owning device's queue/read_ahead_kb control */
461 if (snprintf(path, sizeof(path), "/sys/dev/block/%d:%d/uevent",
462 major(path_stat.st_dev), minor(path_stat.st_dev))
464 printf("%s: Pathname is too long\n", __func__);
467 if (read_file(path, buf, sizeof(buf)) < 0) {
468 perror("read_file(read_num)");
471 if (strstr(buf, "DEVTYPE=disk")) {
473 if (snprintf(finfo.dev_queue_read_ahead_path,
474 sizeof(finfo.dev_queue_read_ahead_path),
475 "/sys/dev/block/%d:%d/queue/read_ahead_kb",
476 major(path_stat.st_dev), minor(path_stat.st_dev))
477 >= sizeof(finfo.dev_queue_read_ahead_path)) {
478 printf("%s: Pathname is too long\n", __func__);
483 if (!strstr(buf, "DEVTYPE=partition")) {
484 printf("%s: Unknown device type: %s\n", __func__, path);
488 * Partition of block device - need to find actual device.
489 * Using naming convention that devnameN is partition of
492 str = strstr(buf, "DEVNAME=");
494 printf("%s: Could not read: %s\n", __func__, path);
502 if (snprintf(finfo.dev_queue_read_ahead_path,
503 sizeof(finfo.dev_queue_read_ahead_path),
504 "/sys/block/%s/queue/read_ahead_kb",
505 str) >= sizeof(finfo.dev_queue_read_ahead_path)) {
506 printf("%s: Pathname is too long\n", __func__);
513 printf("%s: Could not read: %s\n", __func__, path);
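/*
 * Editorial sketch, not part of the original test: roughly the partition ->
 * disk name mapping the comment above describes, i.e. trimming the trailing
 * partition number so "sda1" becomes "sda" (or "vda2" becomes "vda"). The
 * helper name is hypothetical; the elided code in get_finfo() performs this
 * trimming in place on the DEVNAME= value. Assumes <string.h> is included.
 */
static void trim_partition_number(char *devname)
{
	char *end = devname + strlen(devname);

	/* Strip trailing digits: "sda1" -> "sda" */
	while (end > devname && end[-1] >= '0' && end[-1] <= '9')
		*--end = '\0';
}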
517 static bool check_swap(void *addr, unsigned long size)
522 char buffer[MAX_LINE_LENGTH];
523 char addr_pattern[MAX_LINE_LENGTH];
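	/*
	 * Editorial note: an illustrative /proc/self/smaps excerpt (field
	 * widths approximate) showing what the two check_for_pattern()
	 * calls below match: first the line beginning with the mapping's
	 * start address, then the "Swap:" field within that same block.
	 *
	 *   40000000-40200000 rw-p 00000000 00:00 0
	 *   ...
	 *   Swap:                  4 kB
	 */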
525 ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "%08lx-",
526 (unsigned long) addr);
527 if (ret >= MAX_LINE_LENGTH) {
528 printf("%s: Pattern is too long\n", __func__);
533 fp = fopen(PID_SMAPS, "r");
535 printf("%s: Failed to open file %s\n", __func__, PID_SMAPS);
538 if (!check_for_pattern(fp, addr_pattern, buffer, sizeof(buffer)))
541 ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "Swap:%19ld kB",
543 if (ret >= MAX_LINE_LENGTH) {
544 printf("%s: Pattern is too long\n", __func__);
548 * Fetch the "Swap:" field in the same block and check whether it
549 * shows the expected amount of swapped-out memory.
551 if (!check_for_pattern(fp, "Swap:", buffer, sizeof(buffer)))
554 if (strncmp(buffer, addr_pattern, strlen(addr_pattern)))
563 static void *alloc_mapping(int nr)
567 p = mmap(BASE_ADDR, nr * hpage_pmd_size, PROT_READ | PROT_WRITE,
568 MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
569 if (p != BASE_ADDR) {
570 printf("Failed to allocate VMA at %p\n", BASE_ADDR);
577 static void fill_memory(int *p, unsigned long start, unsigned long end)
581 for (i = start / page_size; i < end / page_size; i++)
582 p[i * page_size / sizeof(*p)] = i + 0xdead0000;
586 * MADV_COLLAPSE is a best-effort request and may fail if an internal
587 * resource is temporarily unavailable, in which case it will set errno to
588 * EAGAIN. In such a case, immediately reattempt the operation one more
591 static int madvise_collapse_retry(void *p, unsigned long size)
597 ret = madvise(p, size, MADV_COLLAPSE);
598 if (ret && errno == EAGAIN && retry) {
606 * Returns pmd-mapped hugepage in VMA marked VM_HUGEPAGE, filled with
607 * validate_memory()'able contents.
609 static void *alloc_hpage(struct mem_ops *ops)
611 void *p = ops->setup_area(1);
613 ops->fault(p, 0, hpage_pmd_size);
616 * VMA should be neither VM_HUGEPAGE nor VM_NOHUGEPAGE.
617 * The latter is ineligible for collapse by MADV_COLLAPSE
618 * while the former might cause MADV_COLLAPSE to race with
619 * khugepaged on a low-load system (like a test machine), which
620 * would cause MADV_COLLAPSE to fail with EAGAIN.
622 printf("Allocate huge page...");
623 if (madvise_collapse_retry(p, hpage_pmd_size)) {
624 perror("madvise(MADV_COLLAPSE)");
627 if (!ops->check_huge(p, 1)) {
628 printf("madvise(MADV_COLLAPSE): no hugepage mapped after collapse\n");
631 if (madvise(p, hpage_pmd_size, MADV_HUGEPAGE)) {
632 perror("madvise(MADV_HUGEPAGE)");
639 static void validate_memory(int *p, unsigned long start, unsigned long end)
643 for (i = start / page_size; i < end / page_size; i++) {
644 if (p[i * page_size / sizeof(*p)] != i + 0xdead0000) {
645 printf("Page %d is corrupted: %#x\n",
646 i, p[i * page_size / sizeof(*p)]);
652 static void *anon_setup_area(int nr_hpages)
654 return alloc_mapping(nr_hpages);
657 static void anon_cleanup_area(void *p, unsigned long size)
662 static void anon_fault(void *p, unsigned long start, unsigned long end)
664 fill_memory(p, start, end);
667 static bool anon_check_huge(void *addr, int nr_hpages)
669 return check_huge_anon(addr, nr_hpages, hpage_pmd_size);
672 static void *file_setup_area(int nr_hpages)
678 unlink(finfo.path); /* Cleanup from previous failed tests */
679 printf("Creating %s for collapse%s...", finfo.path,
680 finfo.type == VMA_SHMEM ? " (tmpfs)" : "");
681 fd = open(finfo.path, O_DSYNC | O_CREAT | O_RDWR | O_TRUNC | O_EXCL,
688 size = nr_hpages * hpage_pmd_size;
689 p = alloc_mapping(nr_hpages);
690 fill_memory(p, 0, size);
696 printf("Opening %s read only for collapse...", finfo.path);
697 finfo.fd = open(finfo.path, O_RDONLY, 777);
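	/*
	 * Editorial note: the mapping below is private, read-only and
	 * executable because CONFIG_READ_ONLY_THP_FOR_FS targets read-only
	 * file mappings such as program text; khugepaged does not consider
	 * writable file mappings for collapse.
	 */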
702 p = mmap(BASE_ADDR, size, PROT_READ | PROT_EXEC,
703 MAP_PRIVATE, finfo.fd, 0);
704 if (p == MAP_FAILED || p != BASE_ADDR) {
709 /* Drop the page cache ("3" = page cache plus dentries and inodes) */
710 write_file("/proc/sys/vm/drop_caches", "3", 2);
715 static void file_cleanup_area(void *p, unsigned long size)
722 static void file_fault(void *p, unsigned long start, unsigned long end)
724 if (madvise(((char *)p) + start, end - start, MADV_POPULATE_READ)) {
725 perror("madvise(MADV_POPULATE_READ)");
730 static bool file_check_huge(void *addr, int nr_hpages)
732 switch (finfo.type) {
734 return check_huge_file(addr, nr_hpages, hpage_pmd_size);
736 return check_huge_shmem(addr, nr_hpages, hpage_pmd_size);
743 static void *shmem_setup_area(int nr_hpages)
746 unsigned long size = nr_hpages * hpage_pmd_size;
748 finfo.fd = memfd_create("khugepaged-selftest-collapse-shmem", 0);
750 perror("memfd_create()");
753 if (ftruncate(finfo.fd, size)) {
754 perror("ftruncate()");
757 p = mmap(BASE_ADDR, size, PROT_READ | PROT_WRITE, MAP_SHARED, finfo.fd,
759 if (p != BASE_ADDR) {
766 static void shmem_cleanup_area(void *p, unsigned long size)
772 static bool shmem_check_huge(void *addr, int nr_hpages)
774 return check_huge_shmem(addr, nr_hpages, hpage_pmd_size);
777 static struct mem_ops __anon_ops = {
778 .setup_area = &anon_setup_area,
779 .cleanup_area = &anon_cleanup_area,
780 .fault = &anon_fault,
781 .check_huge = &anon_check_huge,
785 static struct mem_ops __file_ops = {
786 .setup_area = &file_setup_area,
787 .cleanup_area = &file_cleanup_area,
788 .fault = &file_fault,
789 .check_huge = &file_check_huge,
793 static struct mem_ops __shmem_ops = {
794 .setup_area = &shmem_setup_area,
795 .cleanup_area = &shmem_cleanup_area,
796 .fault = &anon_fault,
797 .check_huge = &shmem_check_huge,
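/*
 * Editorial note: shmem_ops deliberately reuses anon_fault() (plain writes
 * via fill_memory()) because its mapping is created PROT_WRITE/MAP_SHARED,
 * whereas file_ops must fault with MADV_POPULATE_READ in file_fault() since
 * its mapping is read-only.
 */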
801 static void __madvise_collapse(const char *msg, char *p, int nr_hpages,
802 struct mem_ops *ops, bool expect)
805 struct settings settings = *current_settings();
807 printf("%s...", msg);
810 * Prevent khugepaged interference and test that MADV_COLLAPSE
811 * ignores /sys/kernel/mm/transparent_hugepage/enabled
813 settings.thp_enabled = THP_NEVER;
814 settings.shmem_enabled = SHMEM_NEVER;
815 push_settings(&settings);
817 /* Clear VM_NOHUGEPAGE */
818 madvise(p, nr_hpages * hpage_pmd_size, MADV_HUGEPAGE);
819 ret = madvise_collapse_retry(p, nr_hpages * hpage_pmd_size);
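	/*
	 * madvise() returns 0 on success, so a nonzero (true) ret means the
	 * collapse failed; the result matches expectations only when
	 * (bool)ret != expect.
	 */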
820 if (((bool)ret) == expect)
821 fail("Fail: Bad return value");
822 else if (!ops->check_huge(p, expect ? nr_hpages : 0))
823 fail("Fail: check_huge()");
830 static void madvise_collapse(const char *msg, char *p, int nr_hpages,
831 struct mem_ops *ops, bool expect)
834 if (!ops->check_huge(p, 0)) {
835 printf("Unexpected huge page\n");
838 __madvise_collapse(msg, p, nr_hpages, ops, expect);
842 static bool wait_for_scan(const char *msg, char *p, int nr_hpages,
846 int timeout = 6; /* 3 seconds */
849 if (!ops->check_huge(p, 0)) {
850 printf("Unexpected huge page\n");
854 madvise(p, nr_hpages * hpage_pmd_size, MADV_HUGEPAGE);
856 /* Wait until two more full scans have completed */
857 full_scans = read_num("khugepaged/full_scans") + 2;
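	/*
	 * The "+ 2" above: a full scan already in progress when
	 * MADV_HUGEPAGE was applied may have skipped this range, so only
	 * after two more completed scans is khugepaged guaranteed to have
	 * made at least one full pass over it.
	 */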
859 printf("%s...", msg);
861 if (ops->check_huge(p, nr_hpages))
863 if (read_num("khugepaged/full_scans") >= full_scans)
869 madvise(p, nr_hpages * hpage_pmd_size, MADV_NOHUGEPAGE);
871 return timeout == -1;
874 static void khugepaged_collapse(const char *msg, char *p, int nr_hpages,
875 struct mem_ops *ops, bool expect)
877 if (wait_for_scan(msg, p, nr_hpages, ops)) {
886 * For file and shmem memory, khugepaged only retracts pte entries after
887 * putting the new hugepage in the page cache. The hugepage must be
888 * subsequently refaulted to install the pmd mapping for the mm.
890 if (ops != &__anon_ops)
891 ops->fault(p, 0, nr_hpages * hpage_pmd_size);
893 if (ops->check_huge(p, expect ? nr_hpages : 0))
899 static struct collapse_context __khugepaged_context = {
900 .collapse = &khugepaged_collapse,
901 .enforce_pte_scan_limits = true,
902 .name = "khugepaged",
905 static struct collapse_context __madvise_context = {
906 .collapse = &madvise_collapse,
907 .enforce_pte_scan_limits = false,
911 static bool is_tmpfs(struct mem_ops *ops)
913 return ops == &__file_ops && finfo.type == VMA_SHMEM;
916 static void alloc_at_fault(void)
918 struct settings settings = *current_settings();
921 settings.thp_enabled = THP_ALWAYS;
922 push_settings(&settings);
924 p = alloc_mapping(1);
926 printf("Allocate huge page on fault...");
927 if (check_huge_anon(p, 1, hpage_pmd_size))
934 madvise(p, page_size, MADV_DONTNEED);
935 printf("Split huge PMD on MADV_DONTNEED...");
936 if (check_huge_anon(p, 0, hpage_pmd_size))
940 munmap(p, hpage_pmd_size);
943 static void collapse_full(struct collapse_context *c, struct mem_ops *ops)
947 unsigned long size = nr_hpages * hpage_pmd_size;
949 p = ops->setup_area(nr_hpages);
950 ops->fault(p, 0, size);
951 c->collapse("Collapse multiple fully populated PTE tables", p, nr_hpages,
953 validate_memory(p, 0, size);
954 ops->cleanup_area(p, size);
957 static void collapse_empty(struct collapse_context *c, struct mem_ops *ops)
961 p = ops->setup_area(1);
962 c->collapse("Do not collapse empty PTE table", p, 1, ops, false);
963 ops->cleanup_area(p, hpage_pmd_size);
966 static void collapse_single_pte_entry(struct collapse_context *c, struct mem_ops *ops)
970 p = ops->setup_area(1);
971 ops->fault(p, 0, page_size);
972 c->collapse("Collapse PTE table with single PTE entry present", p,
974 ops->cleanup_area(p, hpage_pmd_size);
977 static void collapse_max_ptes_none(struct collapse_context *c, struct mem_ops *ops)
979 int max_ptes_none = hpage_pmd_nr / 2;
980 struct settings settings = *current_settings();
983 settings.khugepaged.max_ptes_none = max_ptes_none;
984 push_settings(&settings);
986 p = ops->setup_area(1);
989 /* shmem pages always in the page cache */
995 ops->fault(p, 0, (hpage_pmd_nr - max_ptes_none - 1) * page_size);
996 c->collapse("Maybe collapse with max_ptes_none exceeded", p, 1,
997 ops, !c->enforce_pte_scan_limits);
998 validate_memory(p, 0, (hpage_pmd_nr - max_ptes_none - 1) * page_size);
1000 if (c->enforce_pte_scan_limits) {
1001 ops->fault(p, 0, (hpage_pmd_nr - max_ptes_none) * page_size);
1002 c->collapse("Collapse with max_ptes_none PTEs empty", p, 1, ops,
1004 validate_memory(p, 0,
1005 (hpage_pmd_nr - max_ptes_none) * page_size);
1008 ops->cleanup_area(p, hpage_pmd_size);
1012 static void collapse_swapin_single_pte(struct collapse_context *c, struct mem_ops *ops)
1016 p = ops->setup_area(1);
1017 ops->fault(p, 0, hpage_pmd_size);
1019 printf("Swapout one page...");
1020 if (madvise(p, page_size, MADV_PAGEOUT)) {
1021 perror("madvise(MADV_PAGEOUT)");
1024 if (check_swap(p, page_size)) {
1031 c->collapse("Collapse with swapping in single PTE entry", p, 1, ops,
1033 validate_memory(p, 0, hpage_pmd_size);
1035 ops->cleanup_area(p, hpage_pmd_size);
1038 static void collapse_max_ptes_swap(struct collapse_context *c, struct mem_ops *ops)
1040 int max_ptes_swap = read_num("khugepaged/max_ptes_swap");
1043 p = ops->setup_area(1);
1044 ops->fault(p, 0, hpage_pmd_size);
1046 printf("Swapout %d of %d pages...", max_ptes_swap + 1, hpage_pmd_nr);
1047 if (madvise(p, (max_ptes_swap + 1) * page_size, MADV_PAGEOUT)) {
1048 perror("madvise(MADV_PAGEOUT)");
1051 if (check_swap(p, (max_ptes_swap + 1) * page_size)) {
1058 c->collapse("Maybe collapse with max_ptes_swap exceeded", p, 1, ops,
1059 !c->enforce_pte_scan_limits);
1060 validate_memory(p, 0, hpage_pmd_size);
1062 if (c->enforce_pte_scan_limits) {
1063 ops->fault(p, 0, hpage_pmd_size);
1064 printf("Swapout %d of %d pages...", max_ptes_swap,
1066 if (madvise(p, max_ptes_swap * page_size, MADV_PAGEOUT)) {
1067 perror("madvise(MADV_PAGEOUT)");
1070 if (check_swap(p, max_ptes_swap * page_size)) {
1077 c->collapse("Collapse with max_ptes_swap pages swapped out", p,
1079 validate_memory(p, 0, hpage_pmd_size);
1082 ops->cleanup_area(p, hpage_pmd_size);
1085 static void collapse_single_pte_entry_compound(struct collapse_context *c, struct mem_ops *ops)
1089 p = alloc_hpage(ops);
1091 if (is_tmpfs(ops)) {
1092 /* MADV_DONTNEED won't evict tmpfs pages */
1098 madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE);
1099 printf("Split huge page leaving single PTE mapping compound page...");
1100 madvise(p + page_size, hpage_pmd_size - page_size, MADV_DONTNEED);
1101 if (ops->check_huge(p, 0))
1106 c->collapse("Collapse PTE table with single PTE mapping compound page",
1108 validate_memory(p, 0, page_size);
1110 ops->cleanup_area(p, hpage_pmd_size);
1113 static void collapse_full_of_compound(struct collapse_context *c, struct mem_ops *ops)
1117 p = alloc_hpage(ops);
1118 printf("Split huge page leaving single PTE page table full of compound pages...");
1119 madvise(p, page_size, MADV_NOHUGEPAGE);
1120 madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE);
1121 if (ops->check_huge(p, 0))
1126 c->collapse("Collapse PTE table full of compound pages", p, 1, ops,
1128 validate_memory(p, 0, hpage_pmd_size);
1129 ops->cleanup_area(p, hpage_pmd_size);
1132 static void collapse_compound_extreme(struct collapse_context *c, struct mem_ops *ops)
1137 p = ops->setup_area(1);
1138 for (i = 0; i < hpage_pmd_nr; i++) {
1139 printf("\rConstruct PTE page table full of different PTE-mapped compound pages %3d/%d...",
1140 i + 1, hpage_pmd_nr);
1142 madvise(BASE_ADDR, hpage_pmd_size, MADV_HUGEPAGE);
1143 ops->fault(BASE_ADDR, 0, hpage_pmd_size);
1144 if (!ops->check_huge(BASE_ADDR, 1)) {
1145 printf("Failed to allocate huge page\n");
1148 madvise(BASE_ADDR, hpage_pmd_size, MADV_NOHUGEPAGE);
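		/*
		 * Editorial note on the two mremap() calls below: the first
		 * shrinks the mapping so only the pages accumulated so far
		 * plus the first page of the fresh hugepage survive, parking
		 * them at BASE_ADDR + 2 * hpage_pmd_size; the second moves
		 * them back so they end just below BASE_ADDR and re-extends
		 * the mapping over [BASE_ADDR, BASE_ADDR + hpage_pmd_size)
		 * for the next iteration. After hpage_pmd_nr iterations the
		 * PMD-sized range below BASE_ADDR is PTE-mapped by one page
		 * from each of hpage_pmd_nr different compound pages.
		 */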
1150 p = mremap(BASE_ADDR - i * page_size,
1151 i * page_size + hpage_pmd_size,
1152 (i + 1) * page_size,
1153 MREMAP_MAYMOVE | MREMAP_FIXED,
1154 BASE_ADDR + 2 * hpage_pmd_size);
1155 if (p == MAP_FAILED) {
1156 perror("mremap+unmap");
1160 p = mremap(BASE_ADDR + 2 * hpage_pmd_size,
1161 (i + 1) * page_size,
1162 (i + 1) * page_size + hpage_pmd_size,
1163 MREMAP_MAYMOVE | MREMAP_FIXED,
1164 BASE_ADDR - (i + 1) * page_size);
1165 if (p == MAP_FAILED) {
1166 perror("mremap+alloc");
1171 ops->cleanup_area(BASE_ADDR, hpage_pmd_size);
1172 ops->fault(p, 0, hpage_pmd_size);
1173 if (!ops->check_huge(p, 1))
1178 c->collapse("Collapse PTE table full of different compound pages", p, 1,
1181 validate_memory(p, 0, hpage_pmd_size);
1182 ops->cleanup_area(p, hpage_pmd_size);
1185 static void collapse_fork(struct collapse_context *c, struct mem_ops *ops)
1190 p = ops->setup_area(1);
1192 printf("Allocate small page...");
1193 ops->fault(p, 0, page_size);
1194 if (ops->check_huge(p, 0))
1199 printf("Share small page over fork()...");
1201 /* Do not touch settings on child exit */
1202 skip_settings_restore = true;
1205 if (ops->check_huge(p, 0))
1210 ops->fault(p, page_size, 2 * page_size);
1211 c->collapse("Collapse PTE table with single page shared with parent process",
1214 validate_memory(p, 0, page_size);
1215 ops->cleanup_area(p, hpage_pmd_size);
1220 exit_status += WEXITSTATUS(wstatus);
1222 printf("Check if parent still has small page...");
1223 if (ops->check_huge(p, 0))
1227 validate_memory(p, 0, page_size);
1228 ops->cleanup_area(p, hpage_pmd_size);
1231 static void collapse_fork_compound(struct collapse_context *c, struct mem_ops *ops)
1236 p = alloc_hpage(ops);
1237 printf("Share huge page over fork()...");
1239 /* Do not touch settings on child exit */
1240 skip_settings_restore = true;
1243 if (ops->check_huge(p, 1))
1248 printf("Split huge page PMD in child process...");
1249 madvise(p, page_size, MADV_NOHUGEPAGE);
1250 madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE);
1251 if (ops->check_huge(p, 0))
1255 ops->fault(p, 0, page_size);
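		/*
		 * Only page 0 was CoW'ed above, so hpage_pmd_nr - 1 PTEs
		 * still map pages shared with the parent; temporarily raise
		 * max_ptes_shared so the collapse is allowed despite that
		 * sharing, then restore the saved value afterwards.
		 */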
1257 write_num("khugepaged/max_ptes_shared", hpage_pmd_nr - 1);
1258 c->collapse("Collapse PTE table full of compound pages in child",
1260 write_num("khugepaged/max_ptes_shared",
1261 current_settings()->khugepaged.max_ptes_shared);
1263 validate_memory(p, 0, hpage_pmd_size);
1264 ops->cleanup_area(p, hpage_pmd_size);
1269 exit_status += WEXITSTATUS(wstatus);
1271 printf("Check if parent still has huge page...");
1272 if (ops->check_huge(p, 1))
1276 validate_memory(p, 0, hpage_pmd_size);
1277 ops->cleanup_area(p, hpage_pmd_size);
1280 static void collapse_max_ptes_shared(struct collapse_context *c, struct mem_ops *ops)
1282 int max_ptes_shared = read_num("khugepaged/max_ptes_shared");
1286 p = alloc_hpage(ops);
1287 printf("Share huge page over fork()...");
1289 /* Do not touch settings on child exit */
1290 skip_settings_restore = true;
1293 if (ops->check_huge(p, 1))
1298 printf("Trigger CoW on first %d of %d pages...",
1299 hpage_pmd_nr - max_ptes_shared - 1, hpage_pmd_nr);
1300 ops->fault(p, 0, (hpage_pmd_nr - max_ptes_shared - 1) * page_size);
1301 if (ops->check_huge(p, 0))
1306 c->collapse("Maybe collapse with max_ptes_shared exceeded", p,
1307 1, ops, !c->enforce_pte_scan_limits);
1309 if (c->enforce_pte_scan_limits) {
1310 printf("Trigger CoW on first %d of %d pages...",
1311 hpage_pmd_nr - max_ptes_shared, hpage_pmd_nr);
1312 ops->fault(p, 0, (hpage_pmd_nr - max_ptes_shared) *
1314 if (ops->check_huge(p, 0))
1319 c->collapse("Collapse with max_ptes_shared PTEs shared",
1323 validate_memory(p, 0, hpage_pmd_size);
1324 ops->cleanup_area(p, hpage_pmd_size);
1329 exit_status += WEXITSTATUS(wstatus);
1331 printf("Check if parent still has huge page...");
1332 if (ops->check_huge(p, 1))
1336 validate_memory(p, 0, hpage_pmd_size);
1337 ops->cleanup_area(p, hpage_pmd_size);
1340 static void madvise_collapse_existing_thps(struct collapse_context *c,
1341 struct mem_ops *ops)
1345 p = ops->setup_area(1);
1346 ops->fault(p, 0, hpage_pmd_size);
1347 c->collapse("Collapse fully populated PTE table", p, 1, ops, true);
1348 validate_memory(p, 0, hpage_pmd_size);
1350 /* c->collapse() will find a hugepage and complain - call directly. */
1351 __madvise_collapse("Re-collapse PMD-mapped hugepage", p, 1, ops, true);
1352 validate_memory(p, 0, hpage_pmd_size);
1353 ops->cleanup_area(p, hpage_pmd_size);
1357 * Test race with khugepaged where page tables have been retracted and
1360 static void madvise_retracted_page_tables(struct collapse_context *c,
1361 struct mem_ops *ops)
1365 unsigned long size = nr_hpages * hpage_pmd_size;
1367 p = ops->setup_area(nr_hpages);
1368 ops->fault(p, 0, size);
1370 /* Let khugepaged collapse and leave pmd cleared */
1371 if (wait_for_scan("Collapse and leave PMD cleared", p, nr_hpages,
1377 c->collapse("Install huge PMD from page cache", p, nr_hpages, ops,
1379 validate_memory(p, 0, size);
1380 ops->cleanup_area(p, size);
1383 static void usage(void)
1385 fprintf(stderr, "\nUsage: ./khugepaged <test type> [dir]\n\n");
1386 fprintf(stderr, "\t<test type>\t: <context>:<mem_type>\n");
1387 fprintf(stderr, "\t<context>\t: [all|khugepaged|madvise]\n");
1388 fprintf(stderr, "\t<mem_type>\t: [all|anon|file|shmem]\n");
1389 fprintf(stderr, "\n\tThe \"file\" and \"all\" mem_types require the [dir] argument\n");
1390 fprintf(stderr, "\n\tThe \"file\" and \"all\" mem_types require a kernel built with\n");
1391 fprintf(stderr, "\tCONFIG_READ_ONLY_THP_FOR_FS=y\n");
1392 fprintf(stderr, "\n\tif [dir] is a (sub)directory of a tmpfs mount, tmpfs must be\n");
1393 fprintf(stderr, "\tmounted with huge=madvise option for khugepaged tests to work\n");
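/*
 * Example invocations (editorial, derived from the usage text above; the
 * directory paths are placeholders):
 *
 *   ./khugepaged                          # default: all contexts, anon memory
 *   ./khugepaged all:anon
 *   ./khugepaged khugepaged:shmem
 *   ./khugepaged madvise:file /mnt/somefs/dir
 *   ./khugepaged all:all /mnt/tmpfs/dir   # tmpfs must be mounted huge=madvise
 */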
1397 static void parse_test_type(int argc, const char **argv)
1403 /* Backwards compatibility */
1404 khugepaged_context = &__khugepaged_context;
1405 madvise_context = &__madvise_context;
1406 anon_ops = &__anon_ops;
1410 buf = strdup(argv[1]);
1411 token = strsep(&buf, ":");
1413 if (!strcmp(token, "all")) {
1414 khugepaged_context = &__khugepaged_context;
1415 madvise_context = &__madvise_context;
1416 } else if (!strcmp(token, "khugepaged")) {
1417 khugepaged_context = &__khugepaged_context;
1418 } else if (!strcmp(token, "madvise")) {
1419 madvise_context = &__madvise_context;
1427 if (!strcmp(buf, "all")) {
1428 file_ops = &__file_ops;
1429 anon_ops = &__anon_ops;
1430 shmem_ops = &__shmem_ops;
1431 } else if (!strcmp(buf, "anon")) {
1432 anon_ops = &__anon_ops;
1433 } else if (!strcmp(buf, "file")) {
1434 file_ops = &__file_ops;
1435 } else if (!strcmp(buf, "shmem")) {
1436 shmem_ops = &__shmem_ops;
1448 int main(int argc, const char **argv)
1450 struct settings default_settings = {
1451 .thp_enabled = THP_MADVISE,
1452 .thp_defrag = THP_DEFRAG_ALWAYS,
1453 .shmem_enabled = SHMEM_ADVISE,
1457 .alloc_sleep_millisecs = 10,
1458 .scan_sleep_millisecs = 10,
1461 * When testing file-backed memory, the collapse path
1462 * looks at how many pages are found in the page cache, not
1463 * what pages are mapped. Disable the read-ahead optimization so
1464 * pages don't find their way into the page cache unless we
1465 * explicitly fault them in via mem_ops->fault().
1470 parse_test_type(argc, argv);
1475 setbuf(stdout, NULL);
1477 page_size = getpagesize();
1478 hpage_pmd_size = read_pmd_pagesize();
1479 hpage_pmd_nr = hpage_pmd_size / page_size;
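	/*
	 * Editorial note: the limits below mirror the kernel's built-in
	 * defaults, recomputed from the runtime huge page geometry:
	 * max_ptes_none = hpage_pmd_nr - 1 allows collapse of a range with
	 * only a single present PTE, and pages_to_scan lets each khugepaged
	 * scan batch cover eight PMD-sized ranges.
	 */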
1481 default_settings.khugepaged.max_ptes_none = hpage_pmd_nr - 1;
1482 default_settings.khugepaged.max_ptes_swap = hpage_pmd_nr / 8;
1483 default_settings.khugepaged.max_ptes_shared = hpage_pmd_nr / 2;
1484 default_settings.khugepaged.pages_to_scan = hpage_pmd_nr * 8;
1487 push_settings(&default_settings);
1491 #define TEST(t, c, o) do { \
1493 printf("\nRun test: " #t " (%s:%s)\n", c->name, o->name); \
1498 TEST(collapse_full, khugepaged_context, anon_ops);
1499 TEST(collapse_full, khugepaged_context, file_ops);
1500 TEST(collapse_full, khugepaged_context, shmem_ops);
1501 TEST(collapse_full, madvise_context, anon_ops);
1502 TEST(collapse_full, madvise_context, file_ops);
1503 TEST(collapse_full, madvise_context, shmem_ops);
1505 TEST(collapse_empty, khugepaged_context, anon_ops);
1506 TEST(collapse_empty, madvise_context, anon_ops);
1508 TEST(collapse_single_pte_entry, khugepaged_context, anon_ops);
1509 TEST(collapse_single_pte_entry, khugepaged_context, file_ops);
1510 TEST(collapse_single_pte_entry, khugepaged_context, shmem_ops);
1511 TEST(collapse_single_pte_entry, madvise_context, anon_ops);
1512 TEST(collapse_single_pte_entry, madvise_context, file_ops);
1513 TEST(collapse_single_pte_entry, madvise_context, shmem_ops);
1515 TEST(collapse_max_ptes_none, khugepaged_context, anon_ops);
1516 TEST(collapse_max_ptes_none, khugepaged_context, file_ops);
1517 TEST(collapse_max_ptes_none, madvise_context, anon_ops);
1518 TEST(collapse_max_ptes_none, madvise_context, file_ops);
1520 TEST(collapse_single_pte_entry_compound, khugepaged_context, anon_ops);
1521 TEST(collapse_single_pte_entry_compound, khugepaged_context, file_ops);
1522 TEST(collapse_single_pte_entry_compound, madvise_context, anon_ops);
1523 TEST(collapse_single_pte_entry_compound, madvise_context, file_ops);
1525 TEST(collapse_full_of_compound, khugepaged_context, anon_ops);
1526 TEST(collapse_full_of_compound, khugepaged_context, file_ops);
1527 TEST(collapse_full_of_compound, khugepaged_context, shmem_ops);
1528 TEST(collapse_full_of_compound, madvise_context, anon_ops);
1529 TEST(collapse_full_of_compound, madvise_context, file_ops);
1530 TEST(collapse_full_of_compound, madvise_context, shmem_ops);
1532 TEST(collapse_compound_extreme, khugepaged_context, anon_ops);
1533 TEST(collapse_compound_extreme, madvise_context, anon_ops);
1535 TEST(collapse_swapin_single_pte, khugepaged_context, anon_ops);
1536 TEST(collapse_swapin_single_pte, madvise_context, anon_ops);
1538 TEST(collapse_max_ptes_swap, khugepaged_context, anon_ops);
1539 TEST(collapse_max_ptes_swap, madvise_context, anon_ops);
1541 TEST(collapse_fork, khugepaged_context, anon_ops);
1542 TEST(collapse_fork, madvise_context, anon_ops);
1544 TEST(collapse_fork_compound, khugepaged_context, anon_ops);
1545 TEST(collapse_fork_compound, madvise_context, anon_ops);
1547 TEST(collapse_max_ptes_shared, khugepaged_context, anon_ops);
1548 TEST(collapse_max_ptes_shared, madvise_context, anon_ops);
1550 TEST(madvise_collapse_existing_thps, madvise_context, anon_ops);
1551 TEST(madvise_collapse_existing_thps, madvise_context, file_ops);
1552 TEST(madvise_collapse_existing_thps, madvise_context, shmem_ops);
1554 TEST(madvise_retracted_page_tables, madvise_context, file_ops);
1555 TEST(madvise_retracted_page_tables, madvise_context, shmem_ops);
1557 restore_settings(0);