[linux-block.git] / tools / testing / selftests / kvm / max_guest_memory_test.c

// SPDX-License-Identifier: GPL-2.0
#define _GNU_SOURCE

#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#include <semaphore.h>
#include <sys/types.h>
#include <signal.h>
#include <errno.h>
#include <linux/bitmap.h>
#include <linux/bitops.h>
#include <linux/atomic.h>
#include <linux/sizes.h>

#include "kvm_util.h"
#include "test_util.h"
#include "guest_modes.h"
#include "processor.h"

/*
 * Guest entry point.  Endlessly sweeps [start_gpa, end_gpa) in steps of
 * @stride bytes, storing each address into the 64-bit word located at that
 * address, and reports to the host with GUEST_SYNC(0) after every full sweep.
 * The addresses are dereferenced directly, i.e. the range must be mapped
 * 1:1 in the guest.
 */
static void guest_code(uint64_t start_gpa, uint64_t end_gpa, uint64_t stride)
{
	uint64_t addr;

	while (1) {
		addr = start_gpa;
		while (addr < end_gpa) {
			volatile uint64_t *word = (volatile uint64_t *)addr;

			*word = addr;
			addr += stride;
		}
		GUEST_SYNC(0);
	}
}

/* Per-worker arguments: the vCPU to run and the GPA range it must touch. */
struct vcpu_info {
	/* vCPU driven by this worker thread. */
	struct kvm_vcpu *vcpu;
	/* First guest physical address of this vCPU's range (inclusive). */
	uint64_t start_gpa;
	/* End of this vCPU's range (exclusive). */
	uint64_t end_gpa;
};

static int nr_vcpus;
static atomic_t rendezvous;

/*
 * Worker side of the rendezvous.  The sign of the shared counter selects the
 * direction: while it is positive each worker decrements it and spins until
 * it is no longer positive; otherwise each worker increments it and spins
 * until it is no longer negative.  The boss (rendezvous_with_vcpus()) waits
 * for the magnitude to reach 1 and then flips the counter to the opposite
 * sign, which is what releases the spinning workers.
 */
static void rendezvous_with_boss(void)
{
	int orig = atomic_read(&rendezvous);

	if (orig > 0) {
		/* Only the decrement matters; the "hit zero" result is ignored. */
		atomic_dec_and_test(&rendezvous);
		while (atomic_read(&rendezvous) > 0)
			cpu_relax();
	} else {
		atomic_inc(&rendezvous);
		while (atomic_read(&rendezvous) < 0)
			cpu_relax();
	}
}

/*
 * Enter the guest once and assert that it came back with a UCALL_SYNC, which
 * guest_code() issues via GUEST_SYNC(0) after each full pass over its range.
 */
static void run_vcpu(struct kvm_vcpu *vcpu)
{
	vcpu_run(vcpu);
	TEST_ASSERT_EQ(get_ucall(vcpu, NULL), UCALL_SYNC);
}

/*
 * Per-vCPU worker thread.
 *
 * @data points at this worker's struct vcpu_info.  The worker hands the
 * guest its GPA range and the VM page size (used as the stride), then goes
 * through four phases, rendezvousing with the boss thread after each one:
 * setup, first run, "reset", second run.
 *
 * Always returns NULL.
 */
static void *vcpu_worker(void *data)
{
	struct vcpu_info *info = data;
	struct kvm_vcpu *vcpu = info->vcpu;
	struct kvm_vm *vm = vcpu->vm;
#ifdef __x86_64__
	struct kvm_sregs sregs;
#endif

	vcpu_args_set(vcpu, 3, info->start_gpa, info->end_gpa, vm->page_size);

	rendezvous_with_boss();

	run_vcpu(vcpu);
	rendezvous_with_boss();
#ifdef __x86_64__
	/*
	 * Only touch SREGS on x86: per the KVM API documentation the
	 * KVM_{GET,SET}_SREGS ioctls are not implemented by every
	 * architecture, and the CR0.WP toggle is x86-specific anyway.
	 */
	vcpu_sregs_get(vcpu, &sregs);
	/* Toggle CR0.WP to trigger a MMU context reset. */
	sregs.cr0 ^= X86_CR0_WP;
	vcpu_sregs_set(vcpu, &sregs);
#endif
	/* Still rendezvous on other architectures so the phases stay aligned. */
	rendezvous_with_boss();

	run_vcpu(vcpu);
	rendezvous_with_boss();

	return NULL;
}

/*
 * Split [start_gpa, end_gpa) into nr_vcpus equally sized chunks, each rounded
 * down to a multiple of the VM page size, and start one worker thread per
 * vCPU on its chunk.
 *
 * Returns the malloc()'d array of nr_vcpus thread handles, which the caller
 * uses to join the workers.  The per-vCPU info array is not freed here
 * because every worker is handed a pointer into it.
 */
static pthread_t *spawn_workers(struct kvm_vm *vm, struct kvm_vcpu **vcpus,
				uint64_t start_gpa, uint64_t end_gpa)
{
	struct vcpu_info *info;
	uint64_t gpa, nr_bytes;
	pthread_t *threads;
	int i, r;

	threads = malloc(nr_vcpus * sizeof(*threads));
	TEST_ASSERT(threads, "Failed to allocate vCPU threads");

	info = malloc(nr_vcpus * sizeof(*info));
	TEST_ASSERT(info, "Failed to allocate vCPU gpa ranges");

	nr_bytes = ((end_gpa - start_gpa) / nr_vcpus) &
			~((uint64_t)vm->page_size - 1);
	TEST_ASSERT(nr_bytes, "C'mon, no way you have %d CPUs", nr_vcpus);

	for (i = 0, gpa = start_gpa; i < nr_vcpus; i++, gpa += nr_bytes) {
		info[i].vcpu = vcpus[i];
		info[i].start_gpa = gpa;
		info[i].end_gpa = gpa + nr_bytes;

		/*
		 * A worker that never starts never rendezvouses, which would
		 * leave the boss waiting forever; fail loudly instead.  Note,
		 * pthread_create() returns the error number directly rather
		 * than setting errno.
		 */
		r = pthread_create(&threads[i], NULL, vcpu_worker, &info[i]);
		TEST_ASSERT(!r, "pthread_create() failed for vCPU %d, error = %d (%s)",
			    i, r, strerror(r));
	}
	return threads;
}

static void rendezvous_with_vcpus(struct timespec *time, const char *name)
{
	int i, rendezvoused;

	pr_info("Waiting for vCPUs to finish %s...\n", name);

	rendezvoused = atomic_read(&rendezvous);
	for (i = 0; abs(rendezvoused) != 1; i++) {
		usleep(100);
		if (!(i & 0x3f))
			pr_info("\r%d vCPUs haven't rendezvoused...",
				abs(rendezvoused) - 1);
		rendezvoused = atomic_read(&rendezvous);
	}

	clock_gettime(CLOCK_MONOTONIC, time);

	/* Release the vCPUs after getting the time of the previous action. */
	pr_info("\rAll vCPUs finished %s, releasing...\n", name);
	if (rendezvoused > 0)
		atomic_set(&rendezvous, -nr_vcpus - 1);
	else
		atomic_set(&rendezvous, nr_vcpus + 1);
}

/*
 * Pick the default vCPU count: 3/4 of the CPUs in this task's affinity mask,
 * except that a single-CPU mask yields one vCPU.  Scaling unconditionally
 * would truncate 1 * 3/4 to zero and trip the sanity check on such systems.
 * The result is stored in the global nr_vcpus.
 */
static void calc_default_nr_vcpus(void)
{
	cpu_set_t possible_mask;
	int r;

	r = sched_getaffinity(0, sizeof(possible_mask), &possible_mask);
	TEST_ASSERT(!r, "sched_getaffinity failed, errno = %d (%s)",
		    errno, strerror(errno));

	nr_vcpus = CPU_COUNT(&possible_mask);
	TEST_ASSERT(nr_vcpus > 0, "Uh, no CPUs?");

	/* Only leave headroom when there is more than one CPU to share. */
	if (nr_vcpus >= 2)
		nr_vcpus = nr_vcpus * 3 / 4;
}

/*
 * Test entry point.
 *
 * Options: -c <nr_vcpus>, -m <max guest memory in gb>, -s <memslot size in
 * gb>, -H (back the memory with hugepages), -h (print usage and exit).
 *
 * Creates a VM, aliases a single host allocation into as many memslots as
 * the limits allow, has every vCPU write its share of that memory twice
 * (with a "reset" phase in between), prints the time of each phase, and then
 * tears down part of the memory while the VM still exists.
 */
int main(int argc, char *argv[])
{
	/*
	 * Skip the first 4gb and slot0.  slot0 maps <1gb and is used to back
	 * the guest's code, stack, and page tables.  Because selftests creates
	 * an IRQCHIP, a.k.a. a local APIC, KVM creates an internal memslot
	 * just below the 4gb boundary.  This test could create memory at
	 * 1gb-3gb, but it's simpler to skip straight to 4gb.
	 */
	const uint64_t start_gpa = SZ_4G;
	const int first_slot = 1;

	struct timespec time_start, time_run1, time_reset, time_run2;
	uint64_t max_gpa, gpa, slot_size, max_mem, i;
	int max_slots, slot, opt, fd;
	bool hugepages = false;
	struct kvm_vcpu **vcpus;
	pthread_t *threads;
	struct kvm_vm *vm;
	void *mem;

	/*
	 * Default to 2gb so that maxing out systems with MAXPHYADDR=46, which
	 * are quite common for x86, requires changing only max_mem (KVM allows
	 * 32k memslots, 32k * 2gb == ~64tb of guest memory).
	 */
	slot_size = SZ_2G;

	max_slots = kvm_check_cap(KVM_CAP_NR_MEMSLOTS);
	TEST_ASSERT(max_slots > first_slot, "KVM is broken");

	/* All KVM MMUs should be able to survive a 128gb guest. */
	max_mem = 128ull * SZ_1G;

	calc_default_nr_vcpus();

	/* Note, -h takes no argument, i.e. must not be followed by ':'. */
	while ((opt = getopt(argc, argv, "c:hm:s:H")) != -1) {
		switch (opt) {
		case 'c':
			nr_vcpus = atoi_positive("Number of vCPUs", optarg);
			break;
		case 'm':
			max_mem = 1ull * atoi_positive("Memory size", optarg) * SZ_1G;
			break;
		case 's':
			slot_size = 1ull * atoi_positive("Slot size", optarg) * SZ_1G;
			break;
		case 'H':
			hugepages = true;
			break;
		case 'h':
		default:
			printf("usage: %s [-c nr_vcpus] [-m max_mem_in_gb] [-s slot_size_in_gb] [-H]\n", argv[0]);
			exit(1);
		}
	}

	vcpus = malloc(nr_vcpus * sizeof(*vcpus));
	TEST_ASSERT(vcpus, "Failed to allocate vCPU array");

	vm = vm_create_with_vcpus(nr_vcpus, guest_code, vcpus);

	max_gpa = vm->max_gfn << vm->page_shift;
	TEST_ASSERT(max_gpa > (4 * slot_size), "MAXPHYADDR <4gb ");

	/* A single slot_size chunk of host memory backs every memslot. */
	fd = kvm_memfd_alloc(slot_size, hugepages);
	mem = mmap(NULL, slot_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	TEST_ASSERT(mem != MAP_FAILED, "mmap() failed");

	TEST_ASSERT(!madvise(mem, slot_size, MADV_NOHUGEPAGE), "madvise() failed");

	/* Pre-fault the memory to avoid taking mmap_sem on guest page faults. */
	for (i = 0; i < slot_size; i += vm->page_size)
		((uint8_t *)mem)[i] = 0xaa;

	for (slot = first_slot; slot < max_slots; slot++) {
		gpa = start_gpa + ((slot - first_slot) * slot_size);
		if (gpa + slot_size > max_gpa)
			break;

		if ((gpa - start_gpa) >= max_mem)
			break;

		vm_set_user_memory_region(vm, slot, 0, gpa, slot_size, mem);

#ifdef __x86_64__
		/* Identity map memory in the guest using 1gb pages. */
		for (i = 0; i < slot_size; i += SZ_1G)
			__virt_pg_map(vm, gpa + i, gpa + i, PG_LEVEL_1G);
#else
		for (i = 0; i < slot_size; i += vm->page_size)
			virt_pg_map(vm, gpa + i, gpa + i);
#endif
	}

	/*
	 * Derive the end of the populated range from the number of slots that
	 * were actually created.  The loop's last "gpa" is the end of guest
	 * memory only if the loop bailed early; if it ran out of memslots,
	 * "gpa" is the start of the last populated slot, and using it as the
	 * end would leave that slot untested.
	 */
	gpa = start_gpa + ((slot - first_slot) * slot_size);

	atomic_set(&rendezvous, nr_vcpus + 1);
	threads = spawn_workers(vm, vcpus, start_gpa, gpa);

	free(vcpus);
	vcpus = NULL;

	pr_info("Running with %lugb of guest memory and %u vCPUs\n",
		(gpa - start_gpa) / SZ_1G, nr_vcpus);

	rendezvous_with_vcpus(&time_start, "spawning");
	rendezvous_with_vcpus(&time_run1, "run 1");
	rendezvous_with_vcpus(&time_reset, "reset");
	rendezvous_with_vcpus(&time_run2, "run 2");

	/* Convert the absolute timestamps into per-phase durations. */
	time_run2  = timespec_sub(time_run2,   time_reset);
	time_reset = timespec_sub(time_reset, time_run1);
	time_run1  = timespec_sub(time_run1,   time_start);

	pr_info("run1 = %ld.%.9lds, reset = %ld.%.9lds, run2 =  %ld.%.9lds\n",
		time_run1.tv_sec, time_run1.tv_nsec,
		time_reset.tv_sec, time_reset.tv_nsec,
		time_run2.tv_sec, time_run2.tv_nsec);

	/*
	 * Delete even numbered slots (arbitrary) and unmap the first half of
	 * the backing (also arbitrary) to verify KVM correctly drops all
	 * references to the removed regions.
	 */
	for (slot = (slot - 1) & ~1ull; slot >= first_slot; slot -= 2)
		vm_set_user_memory_region(vm, slot, 0, 0, 0, NULL);

	TEST_ASSERT(!munmap(mem, slot_size / 2), "munmap() failed");

	/* Sanity check that the vCPUs actually ran. */
	for (i = 0; i < nr_vcpus; i++)
		pthread_join(threads[i], NULL);

	/*
	 * Deliberately exit without deleting the remaining memslots or closing
	 * kvm_fd to test cleanup via mmu_notifier.release.
	 */
}
Commit	Line	Data
b58c55d5 SC	1	// SPDX-License-Identifier: GPL-2.0
	2	#define _GNU_SOURCE
	3
	4	#include <stdio.h>
	5	#include <stdlib.h>
	6	#include <pthread.h>
	7	#include <semaphore.h>
	8	#include <sys/types.h>
	9	#include <signal.h>
	10	#include <errno.h>
	11	#include <linux/bitmap.h>
	12	#include <linux/bitops.h>
	13	#include <linux/atomic.h>
69a62e20	14	#include <linux/sizes.h>
b58c55d5 SC	15
	16	#include "kvm_util.h"
	17	#include "test_util.h"
	18	#include "guest_modes.h"
	19	#include "processor.h"
	20
	21	static void guest_code(uint64_t start_gpa, uint64_t end_gpa, uint64_t stride)
	22	{
	23	uint64_t gpa;
	24
0ef2dd1f ML	25	for (;;) {
	26	for (gpa = start_gpa; gpa < end_gpa; gpa += stride)
	27	((volatile uint64_t )gpa) = gpa;
	28	GUEST_SYNC(0);
	29	}
b58c55d5 SC	30	}
	31
	32	struct vcpu_info {
3468fd7d	33	struct kvm_vcpu *vcpu;
b58c55d5 SC	34	uint64_t start_gpa;
	35	uint64_t end_gpa;
	36	};
	37
	38	static int nr_vcpus;
	39	static atomic_t rendezvous;
	40
	41	static void rendezvous_with_boss(void)
	42	{
	43	int orig = atomic_read(&rendezvous);
	44
	45	if (orig > 0) {
	46	atomic_dec_and_test(&rendezvous);
	47	while (atomic_read(&rendezvous) > 0)
	48	cpu_relax();
	49	} else {
	50	atomic_inc(&rendezvous);
	51	while (atomic_read(&rendezvous) < 0)
	52	cpu_relax();
	53	}
	54	}
	55
768e9a61	56	static void run_vcpu(struct kvm_vcpu *vcpu)
b58c55d5	57	{
768e9a61	58	vcpu_run(vcpu);
0ef2dd1f	59	TEST_ASSERT_EQ(get_ucall(vcpu, NULL), UCALL_SYNC);
b58c55d5 SC	60	}
	61
	62	static void vcpu_worker(void data)
	63	{
3468fd7d SC	64	struct vcpu_info *info = data;
3468fd7d SC	65	struct kvm_vcpu *vcpu = info->vcpu;
b58c55d5 SC	66	struct kvm_vm *vm = vcpu->vm;
b58c55d5 SC	67	struct kvm_sregs sregs;
b58c55d5	68
68c1b3e9	69	vcpu_args_set(vcpu, 3, info->start_gpa, info->end_gpa, vm->page_size);
b58c55d5	70
b58c55d5 SC	71	rendezvous_with_boss();
b58c55d5 SC	72
768e9a61	73	run_vcpu(vcpu);
b58c55d5	74	rendezvous_with_boss();
768e9a61	75	vcpu_sregs_get(vcpu, &sregs);
b58c55d5 SC	76	#ifdef __x86_64__
	77	/* Toggle CR0.WP to trigger a MMU context reset. */
	78	sregs.cr0 ^= X86_CR0_WP;
	79	#endif
768e9a61	80	vcpu_sregs_set(vcpu, &sregs);
b58c55d5 SC	81	rendezvous_with_boss();
b58c55d5 SC	82
768e9a61	83	run_vcpu(vcpu);
b58c55d5 SC	84	rendezvous_with_boss();
	85
	86	return NULL;
	87	}
	88
3468fd7d SC	89	static pthread_t spawn_workers(struct kvm_vm vm, struct kvm_vcpu **vcpus,
3468fd7d SC	90	uint64_t start_gpa, uint64_t end_gpa)
b58c55d5 SC	91	{
	92	struct vcpu_info *info;
	93	uint64_t gpa, nr_bytes;
	94	pthread_t *threads;
	95	int i;
	96
	97	threads = malloc(nr_vcpus * sizeof(*threads));
	98	TEST_ASSERT(threads, "Failed to allocate vCPU threads");
	99
	100	info = malloc(nr_vcpus * sizeof(*info));
	101	TEST_ASSERT(info, "Failed to allocate vCPU gpa ranges");
	102
	103	nr_bytes = ((end_gpa - start_gpa) / nr_vcpus) &
68c1b3e9	104	~((uint64_t)vm->page_size - 1);
b58c55d5 SC	105	TEST_ASSERT(nr_bytes, "C'mon, no way you have %d CPUs", nr_vcpus);
	106
	107	for (i = 0, gpa = start_gpa; i < nr_vcpus; i++, gpa += nr_bytes) {
3468fd7d	108	info[i].vcpu = vcpus[i];
b58c55d5 SC	109	info[i].start_gpa = gpa;
	110	info[i].end_gpa = gpa + nr_bytes;
	111	pthread_create(&threads[i], NULL, vcpu_worker, &info[i]);
	112	}
	113	return threads;
	114	}
	115
	116	static void rendezvous_with_vcpus(struct timespec time, const char name)
	117	{
	118	int i, rendezvoused;
	119
	120	pr_info("Waiting for vCPUs to finish %s...\n", name);
	121
	122	rendezvoused = atomic_read(&rendezvous);
	123	for (i = 0; abs(rendezvoused) != 1; i++) {
	124	usleep(100);
	125	if (!(i & 0x3f))
	126	pr_info("\r%d vCPUs haven't rendezvoused...",
	127	abs(rendezvoused) - 1);
	128	rendezvoused = atomic_read(&rendezvous);
	129	}
	130
	131	clock_gettime(CLOCK_MONOTONIC, time);
	132
	133	/* Release the vCPUs after getting the time of the previous action. */
	134	pr_info("\rAll vCPUs finished %s, releasing...\n", name);
	135	if (rendezvoused > 0)
	136	atomic_set(&rendezvous, -nr_vcpus - 1);
	137	else
	138	atomic_set(&rendezvous, nr_vcpus + 1);
	139	}
	140
	141	static void calc_default_nr_vcpus(void)
	142	{
	143	cpu_set_t possible_mask;
	144	int r;
	145
	146	r = sched_getaffinity(0, sizeof(possible_mask), &possible_mask);
	147	TEST_ASSERT(!r, "sched_getaffinity failed, errno = %d (%s)",
	148	errno, strerror(errno));
	149
	150	nr_vcpus = CPU_COUNT(&possible_mask) * 3/4;
	151	TEST_ASSERT(nr_vcpus > 0, "Uh, no CPUs?");
	152	}
	153
	154	int main(int argc, char *argv[])
	155	{
	156	/*
	157	* Skip the first 4gb and slot0. slot0 maps <1gb and is used to back
	158	* the guest's code, stack, and page tables. Because selftests creates
	159	* an IRQCHIP, a.k.a. a local APIC, KVM creates an internal memslot
	160	* just below the 4gb boundary. This test could create memory at
	161	* 1gb-3gb,but it's simpler to skip straight to 4gb.
	162	*/
69a62e20	163	const uint64_t start_gpa = SZ_4G;
b58c55d5 SC	164	const int first_slot = 1;
	165
	166	struct timespec time_start, time_run1, time_reset, time_run2;
	167	uint64_t max_gpa, gpa, slot_size, max_mem, i;
	168	int max_slots, slot, opt, fd;
	169	bool hugepages = false;
3468fd7d	170	struct kvm_vcpu **vcpus;
b58c55d5 SC	171	pthread_t *threads;
	172	struct kvm_vm *vm;
	173	void *mem;
	174
	175	/*
	176	* Default to 2gb so that maxing out systems with MAXPHADDR=46, which
	177	* are quite common for x86, requires changing only max_mem (KVM allows
	178	* 32k memslots, 32k * 2gb == ~64tb of guest memory).
	179	*/
69a62e20	180	slot_size = SZ_2G;
b58c55d5 SC	181
	182	max_slots = kvm_check_cap(KVM_CAP_NR_MEMSLOTS);
	183	TEST_ASSERT(max_slots > first_slot, "KVM is broken");
	184
	185	/* All KVM MMUs should be able to survive a 128gb guest. */
69a62e20	186	max_mem = 128ull * SZ_1G;
b58c55d5 SC	187
	188	calc_default_nr_vcpus();
	189
	190	while ((opt = getopt(argc, argv, "c:h:m:s:H")) != -1) {
	191	switch (opt) {
	192	case 'c':
0001725d	193	nr_vcpus = atoi_positive("Number of vCPUs", optarg);
b58c55d5 SC	194	break;
b58c55d5 SC	195	case 'm':
0001725d	196	max_mem = 1ull * atoi_positive("Memory size", optarg) * SZ_1G;
b58c55d5 SC	197	break;
b58c55d5 SC	198	case 's':
0001725d	199	slot_size = 1ull * atoi_positive("Slot size", optarg) * SZ_1G;
b58c55d5 SC	200	break;
	201	case 'H':
	202	hugepages = true;
	203	break;
	204	case 'h':
	205	default:
	206	printf("usage: %s [-c nr_vcpus] [-m max_mem_in_gb] [-s slot_size_in_gb] [-H]\n", argv[0]);
	207	exit(1);
	208	}
	209	}
	210
3468fd7d SC	211	vcpus = malloc(nr_vcpus * sizeof(*vcpus));
	212	TEST_ASSERT(vcpus, "Failed to allocate vCPU array");
	213
	214	vm = vm_create_with_vcpus(nr_vcpus, guest_code, vcpus);
b58c55d5	215
68c1b3e9	216	max_gpa = vm->max_gfn << vm->page_shift;
b58c55d5 SC	217	TEST_ASSERT(max_gpa > (4 * slot_size), "MAXPHYADDR <4gb ");
	218
	219	fd = kvm_memfd_alloc(slot_size, hugepages);
	220	mem = mmap(NULL, slot_size, PROT_READ \| PROT_WRITE, MAP_SHARED, fd, 0);
	221	TEST_ASSERT(mem != MAP_FAILED, "mmap() failed");
	222
	223	TEST_ASSERT(!madvise(mem, slot_size, MADV_NOHUGEPAGE), "madvise() failed");
	224
	225	/* Pre-fault the memory to avoid taking mmap_sem on guest page faults. */
68c1b3e9	226	for (i = 0; i < slot_size; i += vm->page_size)
b58c55d5 SC	227	((uint8_t *)mem)[i] = 0xaa;
	228
	229	gpa = 0;
	230	for (slot = first_slot; slot < max_slots; slot++) {
	231	gpa = start_gpa + ((slot - first_slot) * slot_size);
	232	if (gpa + slot_size > max_gpa)
	233	break;
	234
	235	if ((gpa - start_gpa) >= max_mem)
	236	break;
	237
	238	vm_set_user_memory_region(vm, slot, 0, gpa, slot_size, mem);
	239
	240	#ifdef __x86_64__
	241	/* Identity map memory in the guest using 1gb pages. */
69a62e20	242	for (i = 0; i < slot_size; i += SZ_1G)
4ee602e7	243	__virt_pg_map(vm, gpa + i, gpa + i, PG_LEVEL_1G);
b58c55d5	244	#else
68c1b3e9	245	for (i = 0; i < slot_size; i += vm->page_size)
b58c55d5 SC	246	virt_pg_map(vm, gpa + i, gpa + i);
	247	#endif
	248	}
	249
	250	atomic_set(&rendezvous, nr_vcpus + 1);
3468fd7d SC	251	threads = spawn_workers(vm, vcpus, start_gpa, gpa);
	252
	253	free(vcpus);
	254	vcpus = NULL;
b58c55d5 SC	255
b58c55d5 SC	256	pr_info("Running with %lugb of guest memory and %u vCPUs\n",
69a62e20	257	(gpa - start_gpa) / SZ_1G, nr_vcpus);
b58c55d5 SC	258
	259	rendezvous_with_vcpus(&time_start, "spawning");
	260	rendezvous_with_vcpus(&time_run1, "run 1");
	261	rendezvous_with_vcpus(&time_reset, "reset");
	262	rendezvous_with_vcpus(&time_run2, "run 2");
	263
	264	time_run2 = timespec_sub(time_run2, time_reset);
	265	time_reset = timespec_sub(time_reset, time_run1);
	266	time_run1 = timespec_sub(time_run1, time_start);
	267
	268	pr_info("run1 = %ld.%.9lds, reset = %ld.%.9lds, run2 = %ld.%.9lds\n",
	269	time_run1.tv_sec, time_run1.tv_nsec,
	270	time_reset.tv_sec, time_reset.tv_nsec,
	271	time_run2.tv_sec, time_run2.tv_nsec);
	272
	273	/*
	274	* Delete even numbered slots (arbitrary) and unmap the first half of
	275	* the backing (also arbitrary) to verify KVM correctly drops all
	276	* references to the removed regions.
	277	*/
	278	for (slot = (slot - 1) & ~1ull; slot >= first_slot; slot -= 2)
	279	vm_set_user_memory_region(vm, slot, 0, 0, 0, NULL);
	280
	281	munmap(mem, slot_size / 2);
	282
	283	/* Sanity check that the vCPUs actually ran. */
	284	for (i = 0; i < nr_vcpus; i++)
	285	pthread_join(threads[i], NULL);
	286
	287	/*
	288	* Deliberately exit without deleting the remaining memslots or closing
	289	* kvm_fd to test cleanup via mmu_notifier.release.
	290	*/
	291	}