// SPDX-License-Identifier: GPL-2.0
/*
 * KVM dirty page logging test
 *
 * Copyright (C) 2018, Red Hat, Inc.
 */

#define _GNU_SOURCE /* for program_invocation_name */

#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#include <semaphore.h>
#include <sys/types.h>
#include <signal.h>
#include <errno.h>
#include <assert.h>
#include <linux/bitmap.h>
#include <linux/bitops.h>
#include <asm/barrier.h>

#include "kvm_util.h"
#include "test_util.h"
#include "guest_modes.h"
#include "processor.h"

#define VCPU_ID				1

/* The memory slot index to track dirty pages */
#define TEST_MEM_SLOT_INDEX		1

/* Default guest test virtual memory offset */
#define DEFAULT_GUEST_TEST_MEM		0xc0000000

/* How many pages to dirty for each guest loop */
#define TEST_PAGES_PER_LOOP		1024

/* How many host loops to run (one KVM_GET_DIRTY_LOG for each loop) */
#define TEST_HOST_LOOP_N		32UL

/* Interval for each host loop (ms) */
#define TEST_HOST_LOOP_INTERVAL		10UL

/* Dirty bitmaps are always little endian, so we need to swap on big endian */
#if defined(__s390x__)
# define BITOP_LE_SWIZZLE	((BITS_PER_LONG-1) & ~0x7)
# define test_bit_le(nr, addr) \
	test_bit((nr) ^ BITOP_LE_SWIZZLE, addr)
# define set_bit_le(nr, addr) \
	set_bit((nr) ^ BITOP_LE_SWIZZLE, addr)
# define clear_bit_le(nr, addr) \
	clear_bit((nr) ^ BITOP_LE_SWIZZLE, addr)
# define test_and_set_bit_le(nr, addr) \
	test_and_set_bit((nr) ^ BITOP_LE_SWIZZLE, addr)
# define test_and_clear_bit_le(nr, addr) \
	test_and_clear_bit((nr) ^ BITOP_LE_SWIZZLE, addr)
#else
# define test_bit_le		test_bit
# define set_bit_le		set_bit
# define clear_bit_le		clear_bit
# define test_and_set_bit_le	test_and_set_bit
# define test_and_clear_bit_le	test_and_clear_bit
#endif
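
/*
 * Added illustration: on a 64-bit big-endian host, BITOP_LE_SWIZZLE is
 * 63 & ~0x7 = 56, so little-endian bit 0 lands on bit 0 ^ 56 = 56 of
 * the first long, and bit 8 lands on bit 8 ^ 56 = 48 -- the byte order
 * is flipped while the bit order within each byte is preserved.
 */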

#define TEST_DIRTY_RING_COUNT		65536

#define SIG_IPI				SIGUSR1

/*
 * Guest/Host shared variables. Ensure addr_gva2hva() and/or
 * sync_global_to/from_guest() are used when accessing from
 * the host. READ/WRITE_ONCE() should also be used with anything
 * that may change.
 */
static uint64_t host_page_size;
static uint64_t guest_page_size;
static uint64_t guest_num_pages;
static uint64_t random_array[TEST_PAGES_PER_LOOP];
static uint64_t iteration;

/*
 * Guest physical memory offset of the testing memory slot.
 * This will be set to the topmost valid physical address minus
 * the test memory size.
 */
static uint64_t guest_test_phys_mem;

/*
 * Guest virtual memory offset of the testing memory slot.
 * Must not conflict with identity mapped test code.
 */
static uint64_t guest_test_virt_mem = DEFAULT_GUEST_TEST_MEM;

/*
 * Continuously write to the first 8 bytes of random pages within
 * the testing memory region.
 */
static void guest_code(void)
{
	uint64_t addr;
	int i;

	/*
	 * On s390x, all pages of a 1M segment are initially marked as dirty
	 * when a page of the segment is written to for the very first time.
	 * To compensate for this specialty in this test, we need to touch
	 * all pages during the first iteration.
	 */
	for (i = 0; i < guest_num_pages; i++) {
		addr = guest_test_virt_mem + i * guest_page_size;
		*(uint64_t *)addr = READ_ONCE(iteration);
	}

	while (true) {
		for (i = 0; i < TEST_PAGES_PER_LOOP; i++) {
			addr = guest_test_virt_mem;
			addr += (READ_ONCE(random_array[i]) % guest_num_pages)
				* guest_page_size;
			addr &= ~(host_page_size - 1);
			*(uint64_t *)addr = READ_ONCE(iteration);
		}

		/* Tell the host that we need more random numbers */
		GUEST_SYNC(1);
	}
}

/* Host variables */
static bool host_quit;

/* Points to the test VM memory region on which we track dirty logs */
static void *host_test_mem;
static uint64_t host_num_pages;

/* For statistics only */
static uint64_t host_dirty_count;
static uint64_t host_clear_count;
static uint64_t host_track_next_count;

/* Whether dirty ring reset is requested, or finished */
static sem_t dirty_ring_vcpu_stop;
static sem_t dirty_ring_vcpu_cont;
/*
 * This is updated by the vcpu thread to tell the host whether it's a
 * ring-full event. It should only be read after a sem_wait() of
 * dirty_ring_vcpu_stop and before the vcpu continues to run.
 */
static bool dirty_ring_vcpu_ring_full;
/*
 * This is only used for verifying the dirty pages. The dirty ring has
 * a very tricky case when the ring just gets full: kvm will exit to
 * userspace with a ring-full event. When that happens, the very last
 * PFN is set in the ring but the data is not actually changed (the
 * guest WRITE is not really applied yet), because we found that the
 * dirty ring is full, refused to continue the vcpu, and recorded the
 * dirty gfn with the old contents.
 *
 * For this specific case, it's safe to skip checking this pfn for this
 * bit, because it's a redundant bit, and when the write happens later
 * the bit will be set again. We use this variable to always keep track
 * of the latest dirty gfn we've collected, so that if a mismatch of
 * data is found later in the verifying process, we let it pass.
 */
static uint64_t dirty_ring_last_page;

enum log_mode_t {
	/* Only use KVM_GET_DIRTY_LOG for logging */
	LOG_MODE_DIRTY_LOG = 0,

	/* Use both KVM_[GET|CLEAR]_DIRTY_LOG for logging */
	LOG_MODE_CLEAR_LOG = 1,

	/* Use dirty ring for logging */
	LOG_MODE_DIRTY_RING = 2,

	LOG_MODE_NUM,

	/* Run all supported modes */
	LOG_MODE_ALL = LOG_MODE_NUM,
};

/* Mode of logging to test. Default is to run all supported modes */
static enum log_mode_t host_log_mode_option = LOG_MODE_ALL;
/* Logging mode for current run */
static enum log_mode_t host_log_mode;
static pthread_t vcpu_thread;
static uint32_t test_dirty_ring_count = TEST_DIRTY_RING_COUNT;

static void vcpu_kick(void)
{
	pthread_kill(vcpu_thread, SIG_IPI);
}

/*
 * In our test we do signal tricks, so use a version of sem_wait()
 * that retries instead of failing when interrupted by a signal.
 */
static void sem_wait_until(sem_t *sem)
{
	int ret;

	do
		ret = sem_wait(sem);
	while (ret == -1 && errno == EINTR);
}

static bool clear_log_supported(void)
{
	return kvm_check_cap(KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2);
}

static void clear_log_create_vm_done(struct kvm_vm *vm)
{
	struct kvm_enable_cap cap = {};
	u64 manual_caps;

	manual_caps = kvm_check_cap(KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2);
	TEST_ASSERT(manual_caps, "MANUAL_CAPS is zero!");
	manual_caps &= (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE |
			KVM_DIRTY_LOG_INITIALLY_SET);
	cap.cap = KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2;
	cap.args[0] = manual_caps;
	vm_enable_cap(vm, &cap);
}

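/*
 * Added note: with KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 enabled above,
 * KVM_GET_DIRTY_LOG only fetches the bitmap and no longer clears it;
 * the separate KVM_CLEAR_DIRTY_LOG ioctl clears the bits and re-arms
 * write protection, which is why the clear-log mode issues both calls.
 */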
static void dirty_log_collect_dirty_pages(struct kvm_vm *vm, int slot,
					  void *bitmap, uint32_t num_pages)
{
	kvm_vm_get_dirty_log(vm, slot, bitmap);
}

static void clear_log_collect_dirty_pages(struct kvm_vm *vm, int slot,
					  void *bitmap, uint32_t num_pages)
{
	kvm_vm_get_dirty_log(vm, slot, bitmap);
	kvm_vm_clear_dirty_log(vm, slot, bitmap, 0, num_pages);
}

static void default_after_vcpu_run(struct kvm_vm *vm, int ret, int err)
{
	struct kvm_run *run = vcpu_state(vm, VCPU_ID);

	TEST_ASSERT(ret == 0 || (ret == -1 && err == EINTR),
		    "vcpu run failed: errno=%d", err);

	TEST_ASSERT(get_ucall(vm, VCPU_ID, NULL) == UCALL_SYNC,
		    "Invalid guest sync status: exit_reason=%s\n",
		    exit_reason_str(run->exit_reason));
}

static bool dirty_ring_supported(void)
{
	return kvm_check_cap(KVM_CAP_DIRTY_LOG_RING);
}

static void dirty_ring_create_vm_done(struct kvm_vm *vm)
{
	/*
	 * Switch to dirty ring mode after VM creation but before any
	 * of the vcpu creation.
	 */
	vm_enable_dirty_ring(vm, test_dirty_ring_count *
			     sizeof(struct kvm_dirty_gfn));
}

static inline bool dirty_gfn_is_dirtied(struct kvm_dirty_gfn *gfn)
{
	return gfn->flags == KVM_DIRTY_GFN_F_DIRTY;
}

static inline void dirty_gfn_set_collected(struct kvm_dirty_gfn *gfn)
{
	gfn->flags = KVM_DIRTY_GFN_F_RESET;
}
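
/*
 * Added note: a ring entry cycles through three states -- initially
 * clean (flags == 0), marked KVM_DIRTY_GFN_F_DIRTY by KVM when the
 * gfn is dirtied, then marked KVM_DIRTY_GFN_F_RESET by userspace once
 * the entry has been harvested; a later KVM_RESET_DIRTY_RING reclaims
 * such entries and write-protects the pages again.
 */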

static uint32_t dirty_ring_collect_one(struct kvm_dirty_gfn *dirty_gfns,
				       int slot, void *bitmap,
				       uint32_t num_pages, uint32_t *fetch_index)
{
	struct kvm_dirty_gfn *cur;
	uint32_t count = 0;

	while (true) {
		cur = &dirty_gfns[*fetch_index % test_dirty_ring_count];
		if (!dirty_gfn_is_dirtied(cur))
			break;
		TEST_ASSERT(cur->slot == slot, "Slot number didn't match: "
			    "%u != %u", cur->slot, slot);
		TEST_ASSERT(cur->offset < num_pages, "Offset overflow: "
			    "0x%llx >= 0x%x", cur->offset, num_pages);
		//pr_info("fetch 0x%x page %llu\n", *fetch_index, cur->offset);
		set_bit_le(cur->offset, bitmap);
		dirty_ring_last_page = cur->offset;
		dirty_gfn_set_collected(cur);
		(*fetch_index)++;
		count++;
	}

	return count;
}
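
/*
 * Added note: *fetch_index increases monotonically and is reduced
 * modulo test_dirty_ring_count on each access, so the walk above keeps
 * working across ring wrap-around; collection stops at the first entry
 * that is not in the dirty state.
 */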

static void dirty_ring_wait_vcpu(void)
{
	/*
	 * Kick the vcpu to force a vmexit; this makes sure that the
	 * hardware PML cache is flushed into the dirty ring before
	 * we collect it.
	 */
	vcpu_kick();
	sem_wait_until(&dirty_ring_vcpu_stop);
}

static void dirty_ring_continue_vcpu(void)
{
	pr_info("Notifying vcpu to continue\n");
	sem_post(&dirty_ring_vcpu_cont);
}

static void dirty_ring_collect_dirty_pages(struct kvm_vm *vm, int slot,
					   void *bitmap, uint32_t num_pages)
{
	/* We only have one vcpu */
	static uint32_t fetch_index = 0;
	uint32_t count = 0, cleared;
	bool continued_vcpu = false;

	dirty_ring_wait_vcpu();

	if (!dirty_ring_vcpu_ring_full) {
		/*
		 * This is not a ring-full event, it's safe to allow
		 * vcpu to continue
		 */
		dirty_ring_continue_vcpu();
		continued_vcpu = true;
	}

	/* Only have one vcpu */
	count = dirty_ring_collect_one(vcpu_map_dirty_ring(vm, VCPU_ID),
				       slot, bitmap, num_pages, &fetch_index);

	cleared = kvm_vm_reset_dirty_ring(vm);

	/* Cleared pages should be the same as collected */
	TEST_ASSERT(cleared == count, "Reset dirty pages (%u) mismatch "
		    "with collected (%u)", cleared, count);

	if (!continued_vcpu) {
		TEST_ASSERT(dirty_ring_vcpu_ring_full,
			    "Didn't continue vcpu even without ring full");
		dirty_ring_continue_vcpu();
	}

	pr_info("Iteration %ld collected %u pages\n", iteration, count);
}

static void dirty_ring_after_vcpu_run(struct kvm_vm *vm, int ret, int err)
{
	struct kvm_run *run = vcpu_state(vm, VCPU_ID);

	/* A ucall-sync or ring-full event is allowed */
	if (get_ucall(vm, VCPU_ID, NULL) == UCALL_SYNC) {
		/* We should allow this to continue */
		;
	} else if (run->exit_reason == KVM_EXIT_DIRTY_RING_FULL ||
		   (ret == -1 && err == EINTR)) {
		/* Update the flag first before pause */
		WRITE_ONCE(dirty_ring_vcpu_ring_full,
			   run->exit_reason == KVM_EXIT_DIRTY_RING_FULL);
		sem_post(&dirty_ring_vcpu_stop);
		pr_info("vcpu stops because %s...\n",
			dirty_ring_vcpu_ring_full ?
			"dirty ring is full" : "vcpu is kicked out");
		sem_wait_until(&dirty_ring_vcpu_cont);
		pr_info("vcpu continues now.\n");
	} else {
		TEST_ASSERT(false, "Invalid guest sync status: "
			    "exit_reason=%s\n",
			    exit_reason_str(run->exit_reason));
	}
}

static void dirty_ring_before_vcpu_join(void)
{
	/* Kick another round of vcpu just to make sure it will quit */
	sem_post(&dirty_ring_vcpu_cont);
}

struct log_mode {
	const char *name;
	/* Return true if this mode is supported, otherwise false */
	bool (*supported)(void);
	/* Hook when the vm creation is done (before vcpu creation) */
	void (*create_vm_done)(struct kvm_vm *vm);
	/* Hook to collect the dirty pages into the bitmap provided */
	void (*collect_dirty_pages)(struct kvm_vm *vm, int slot,
				    void *bitmap, uint32_t num_pages);
	/* Hook to call after each vcpu run */
	void (*after_vcpu_run)(struct kvm_vm *vm, int ret, int err);
	void (*before_vcpu_join)(void);
} log_modes[LOG_MODE_NUM] = {
	{
		.name = "dirty-log",
		.collect_dirty_pages = dirty_log_collect_dirty_pages,
		.after_vcpu_run = default_after_vcpu_run,
	},
	{
		.name = "clear-log",
		.supported = clear_log_supported,
		.create_vm_done = clear_log_create_vm_done,
		.collect_dirty_pages = clear_log_collect_dirty_pages,
		.after_vcpu_run = default_after_vcpu_run,
	},
	{
		.name = "dirty-ring",
		.supported = dirty_ring_supported,
		.create_vm_done = dirty_ring_create_vm_done,
		.collect_dirty_pages = dirty_ring_collect_dirty_pages,
		.before_vcpu_join = dirty_ring_before_vcpu_join,
		.after_vcpu_run = dirty_ring_after_vcpu_run,
	},
};

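/*
 * Added note: a hypothetical extra logging mode would only need a name
 * plus the relevant hooks, e.g.:
 *
 *	{
 *		.name = "my-log",			// hypothetical
 *		.supported = my_log_supported,		// optional
 *		.collect_dirty_pages = my_log_collect_dirty_pages,
 *		.after_vcpu_run = default_after_vcpu_run,
 *	},
 *
 * together with a matching LOG_MODE_* value before LOG_MODE_NUM.
 */
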
/*
 * We use this bitmap to track some pages that should have their dirty
 * bit set in the _next_ iteration. For example, if we detected the
 * page value changed to the current iteration but at the same time the
 * page bit is cleared in the latest bitmap, then the system must
 * report that write in the next get dirty log call.
 */
static unsigned long *host_bmap_track;

static void log_modes_dump(void)
{
	int i;

	printf("all");
	for (i = 0; i < LOG_MODE_NUM; i++)
		printf(", %s", log_modes[i].name);
	printf("\n");
}

static bool log_mode_supported(void)
{
	struct log_mode *mode = &log_modes[host_log_mode];

	if (mode->supported)
		return mode->supported();

	return true;
}

static void log_mode_create_vm_done(struct kvm_vm *vm)
{
	struct log_mode *mode = &log_modes[host_log_mode];

	if (mode->create_vm_done)
		mode->create_vm_done(vm);
}

static void log_mode_collect_dirty_pages(struct kvm_vm *vm, int slot,
					 void *bitmap, uint32_t num_pages)
{
	struct log_mode *mode = &log_modes[host_log_mode];

	TEST_ASSERT(mode->collect_dirty_pages != NULL,
		    "collect_dirty_pages() is required for any log mode!");
	mode->collect_dirty_pages(vm, slot, bitmap, num_pages);
}

static void log_mode_after_vcpu_run(struct kvm_vm *vm, int ret, int err)
{
	struct log_mode *mode = &log_modes[host_log_mode];

	if (mode->after_vcpu_run)
		mode->after_vcpu_run(vm, ret, err);
}

static void log_mode_before_vcpu_join(void)
{
	struct log_mode *mode = &log_modes[host_log_mode];

	if (mode->before_vcpu_join)
		mode->before_vcpu_join();
}

static void generate_random_array(uint64_t *guest_array, uint64_t size)
{
	uint64_t i;

	for (i = 0; i < size; i++)
		guest_array[i] = random();
}

static void *vcpu_worker(void *data)
{
	int ret, vcpu_fd;
	struct kvm_vm *vm = data;
	uint64_t *guest_array;
	uint64_t pages_count = 0;
	struct kvm_signal_mask *sigmask = alloca(offsetof(struct kvm_signal_mask, sigset)
						 + sizeof(sigset_t));
	sigset_t *sigset = (sigset_t *) &sigmask->sigset;

	vcpu_fd = vcpu_get_fd(vm, VCPU_ID);

	/*
	 * SIG_IPI is unblocked atomically while in KVM_RUN. It causes the
	 * ioctl to return with -EINTR, but it is still pending and we need
	 * to accept it with the sigwait.
	 */
	sigmask->len = 8;	/* the in-kernel sigset_t is 8 bytes */
	pthread_sigmask(0, NULL, sigset);
	vcpu_ioctl(vm, VCPU_ID, KVM_SET_SIGNAL_MASK, sigmask);
	sigaddset(sigset, SIG_IPI);
	pthread_sigmask(SIG_BLOCK, sigset, NULL);

	sigemptyset(sigset);
	sigaddset(sigset, SIG_IPI);

	guest_array = addr_gva2hva(vm, (vm_vaddr_t)random_array);

	while (!READ_ONCE(host_quit)) {
		/* Refill the random numbers consumed by the guest */
		generate_random_array(guest_array, TEST_PAGES_PER_LOOP);
		pages_count += TEST_PAGES_PER_LOOP;
		/* Let the guest dirty the random pages */
		ret = ioctl(vcpu_fd, KVM_RUN, NULL);
		if (ret == -1 && errno == EINTR) {
			int sig = -1;
			sigwait(sigset, &sig);
			assert(sig == SIG_IPI);
		}
		log_mode_after_vcpu_run(vm, ret, errno);
	}

	pr_info("Dirtied %"PRIu64" pages\n", pages_count);

	return NULL;
}
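
/*
 * Added note: when KVM_RUN returns -EINTR, the SIG_IPI that caused it
 * is still pending (it is only unblocked while inside KVM_RUN), so the
 * sigwait() in the loop above consumes it before the next KVM_RUN.
 */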

static void vm_dirty_log_verify(enum vm_guest_mode mode, unsigned long *bmap)
{
	uint64_t step = vm_num_host_pages(mode, 1);
	uint64_t page;
	uint64_t *value_ptr;
	uint64_t min_iter = 0;

	for (page = 0; page < host_num_pages; page += step) {
		value_ptr = host_test_mem + page * host_page_size;

		/* If this is a special page that we were tracking... */
		if (test_and_clear_bit_le(page, host_bmap_track)) {
			host_track_next_count++;
			TEST_ASSERT(test_bit_le(page, bmap),
				    "Page %"PRIu64" should have its dirty bit "
				    "set in this iteration but it is missing",
				    page);
		}

		if (test_and_clear_bit_le(page, bmap)) {
			bool matched;

			host_dirty_count++;

			/*
			 * If the bit is set, the value written onto
			 * the corresponding page should be either the
			 * previous iteration number or the current one.
			 */
			matched = (*value_ptr == iteration ||
				   *value_ptr == iteration - 1);

			if (host_log_mode == LOG_MODE_DIRTY_RING && !matched) {
				if (*value_ptr == iteration - 2 && min_iter <= iteration - 2) {
					/*
					 * Short answer: this case is special
					 * only for dirty ring test where the
					 * page is the last page before a kvm
					 * dirty ring full in iteration N-2.
					 *
					 * Long answer: Assuming ring size R,
					 * one possible condition is:
					 *
					 *      main thr       vcpu thr
					 *      --------       --------
					 *    iter=1
					 *                   write 1 to page 0~(R-1)
					 *                   full, vmexit
					 *    collect 0~(R-1)
					 *    kick vcpu
					 *                   write 1 to (R-1)~(2R-2)
					 *                   full, vmexit
					 *    iter=2
					 *    collect (R-1)~(2R-2)
					 *    kick vcpu
					 *                   write 1 to (2R-2)
					 *                   (NOTE!!! "1" cached in cpu reg)
					 *                   write 2 to (2R-1)~(3R-3)
					 *                   full, vmexit
					 *    iter=3
					 *    collect (2R-2)~(3R-3)
					 *    (here if we read value on page
					 *     "2R-2" is 1, while iter=3!!!)
					 *
					 * This however can only happen once per iteration.
					 */
					min_iter = iteration - 1;
					continue;
				} else if (page == dirty_ring_last_page) {
					/*
					 * Please refer to comments in
					 * dirty_ring_last_page.
					 */
					continue;
				}
			}

			TEST_ASSERT(matched,
				    "Set page %"PRIu64" value %"PRIu64
				    " incorrect (iteration=%"PRIu64")",
				    page, *value_ptr, iteration);
		} else {
			host_clear_count++;
			/*
			 * If cleared, the value written can be any
			 * value smaller than or equal to the iteration
			 * number. Note that the value can be exactly
			 * (iteration-1) if that write can happen
			 * like this:
			 *
			 * (1) increase loop count to "iteration-1"
			 * (2) write to page P happens (with value
			 *     "iteration-1")
			 * (3) get dirty log for "iteration-1"; we'll
			 *     see that page P bit is set (dirtied),
			 *     and not set the bit in host_bmap_track
			 * (4) increase loop count to "iteration"
			 *     (which is current iteration)
			 * (5) get dirty log for current iteration,
			 *     we'll see that page P is cleared, with
			 *     value "iteration-1".
			 */
			TEST_ASSERT(*value_ptr <= iteration,
				    "Clear page %"PRIu64" value %"PRIu64
				    " incorrect (iteration=%"PRIu64")",
				    page, *value_ptr, iteration);
			if (*value_ptr == iteration) {
				/*
				 * This page is _just_ modified; it
				 * should report its dirtiness in the
				 * next run
				 */
				set_bit_le(page, host_bmap_track);
			}
		}
	}
}

static struct kvm_vm *create_vm(enum vm_guest_mode mode, uint32_t vcpuid,
				uint64_t extra_mem_pages, void *guest_code)
{
	struct kvm_vm *vm;
	/* One 4K page-table page maps 512 pages; double that for upper levels */
	uint64_t extra_pg_pages = extra_mem_pages / 512 * 2;

	pr_info("Testing guest mode: %s\n", vm_guest_mode_string(mode));

	vm = vm_create(mode, DEFAULT_GUEST_PHY_PAGES + extra_pg_pages, O_RDWR);
	kvm_vm_elf_load(vm, program_invocation_name, 0, 0);
#ifdef __x86_64__
	vm_create_irqchip(vm);
#endif
	log_mode_create_vm_done(vm);
	vm_vcpu_add_default(vm, vcpuid, guest_code);
	return vm;
}

#define DIRTY_MEM_BITS 30 /* 1G */
#define PAGE_SHIFT_4K  12

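/*
 * Added worked example: with DIRTY_MEM_BITS = 30 and PAGE_SHIFT_4K = 12,
 * run_test() below passes 2ul << (30 - 12) = 2^19 extra 4K pages (2G,
 * i.e. twice the 1G test range) to create_vm(), which in turn reserves
 * 2^19 / 512 * 2 = 2048 pages (8M) for the guest page tables.
 */
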
struct test_params {
	unsigned long iterations;
	unsigned long interval;
	uint64_t phys_offset;
};

static void run_test(enum vm_guest_mode mode, void *arg)
{
	struct test_params *p = arg;
	struct kvm_vm *vm;
	unsigned long *bmap;

	if (!log_mode_supported()) {
		print_skip("Log mode '%s' not supported",
			   log_modes[host_log_mode].name);
		return;
	}

	/*
	 * We reserve page table memory for 2 times the extra dirty mem,
	 * which will definitely cover the original (1G+) test range.
	 * Here we do the calculation with 4K page size, which is the
	 * smallest, so the page number will be enough for all archs
	 * (e.g., a 64K page size guest will need even less memory for
	 * page tables).
	 */
	vm = create_vm(mode, VCPU_ID,
		       2ul << (DIRTY_MEM_BITS - PAGE_SHIFT_4K),
		       guest_code);

	guest_page_size = vm_get_page_size(vm);
	/*
	 * A little more than 1G of guest page sized pages. Cover the
	 * case where the size is not aligned to 64 pages.
	 */
	guest_num_pages = (1ul << (DIRTY_MEM_BITS -
				   vm_get_page_shift(vm))) + 3;
	guest_num_pages = vm_adjust_num_guest_pages(mode, guest_num_pages);

	host_page_size = getpagesize();
	host_num_pages = vm_num_host_pages(mode, guest_num_pages);

	if (!p->phys_offset) {
		guest_test_phys_mem = (vm_get_max_gfn(vm) -
				       guest_num_pages) * guest_page_size;
		guest_test_phys_mem &= ~(host_page_size - 1);
	} else {
		guest_test_phys_mem = p->phys_offset;
	}

#ifdef __s390x__
	/* Align to 1M (segment size) */
	guest_test_phys_mem &= ~((1 << 20) - 1);
#endif

	pr_info("guest physical test memory offset: 0x%lx\n", guest_test_phys_mem);

	bmap = bitmap_alloc(host_num_pages);
	host_bmap_track = bitmap_alloc(host_num_pages);

	/* Add an extra memory slot for testing dirty logging */
	vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS,
				    guest_test_phys_mem,
				    TEST_MEM_SLOT_INDEX,
				    guest_num_pages,
				    KVM_MEM_LOG_DIRTY_PAGES);

	/* Do mapping for the dirty track memory slot */
	virt_map(vm, guest_test_virt_mem, guest_test_phys_mem, guest_num_pages, 0);

	/* Cache the HVA pointer of the region */
	host_test_mem = addr_gpa2hva(vm, (vm_paddr_t)guest_test_phys_mem);

	ucall_init(vm, NULL);

	/* Export the shared variables to the guest */
	sync_global_to_guest(vm, host_page_size);
	sync_global_to_guest(vm, guest_page_size);
	sync_global_to_guest(vm, guest_test_virt_mem);
	sync_global_to_guest(vm, guest_num_pages);

	/* Start the iterations */
	iteration = 1;
	sync_global_to_guest(vm, iteration);
	host_quit = false;
	host_dirty_count = 0;
	host_clear_count = 0;
	host_track_next_count = 0;

	pthread_create(&vcpu_thread, NULL, vcpu_worker, vm);

	while (iteration < p->iterations) {
		/* Give the vcpu thread some time to dirty some pages */
		usleep(p->interval * 1000);
		log_mode_collect_dirty_pages(vm, TEST_MEM_SLOT_INDEX,
					     bmap, host_num_pages);
		vm_dirty_log_verify(mode, bmap);
		iteration++;
		sync_global_to_guest(vm, iteration);
	}

	/* Tell the vcpu thread to quit */
	host_quit = true;
	log_mode_before_vcpu_join();
	pthread_join(vcpu_thread, NULL);

	pr_info("Total bits checked: dirty (%"PRIu64"), clear (%"PRIu64"), "
		"track_next (%"PRIu64")\n", host_dirty_count, host_clear_count,
		host_track_next_count);

	free(bmap);
	free(host_bmap_track);
	ucall_uninit(vm);
	kvm_vm_free(vm);
}

static void help(char *name)
{
	puts("");
	printf("usage: %s [-h] [-c count] [-i iterations] [-I interval] "
	       "[-p offset] [-m mode] [-M log-mode]\n", name);
	puts("");
	printf(" -c: specify dirty ring size, in number of entries\n");
	printf("     (only useful for dirty-ring test; default: %"PRIu32")\n",
	       TEST_DIRTY_RING_COUNT);
	printf(" -i: specify iteration counts (default: %"PRIu64")\n",
	       TEST_HOST_LOOP_N);
	printf(" -I: specify interval in ms (default: %"PRIu64" ms)\n",
	       TEST_HOST_LOOP_INTERVAL);
	printf(" -p: specify guest physical test memory offset\n"
	       "     Warning: a low offset can conflict with the loaded test code.\n");
	printf(" -M: specify the host logging mode "
	       "(default: run all log modes).  Supported modes:\n\t");
	log_modes_dump();
	guest_modes_help();
	puts("");
	exit(0);
}

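/*
 * Example invocation (added; the binary name is assumed from the
 * selftest build, flags per help() above):
 *
 *	./dirty_log_test -i 64 -I 10 -M dirty-ring -c 4096
 *
 * runs 64 iterations at a 10ms interval in dirty-ring mode with a
 * 4096-entry ring, across all supported guest modes.
 */
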
int main(int argc, char *argv[])
{
	struct test_params p = {
		.iterations = TEST_HOST_LOOP_N,
		.interval = TEST_HOST_LOOP_INTERVAL,
	};
	int opt, i;

	sem_init(&dirty_ring_vcpu_stop, 0, 0);
	sem_init(&dirty_ring_vcpu_cont, 0, 0);

	guest_modes_append_default();

	while ((opt = getopt(argc, argv, "c:hi:I:p:m:M:")) != -1) {
		switch (opt) {
		case 'c':
			test_dirty_ring_count = strtol(optarg, NULL, 10);
			break;
		case 'i':
			p.iterations = strtol(optarg, NULL, 10);
			break;
		case 'I':
			p.interval = strtol(optarg, NULL, 10);
			break;
		case 'p':
			p.phys_offset = strtoull(optarg, NULL, 0);
			break;
		case 'm':
			guest_modes_cmdline(optarg);
			break;
		case 'M':
			if (!strcmp(optarg, "all")) {
				host_log_mode_option = LOG_MODE_ALL;
				break;
			}
			for (i = 0; i < LOG_MODE_NUM; i++) {
				if (!strcmp(optarg, log_modes[i].name)) {
					pr_info("Setting log mode to: '%s'\n",
						optarg);
					host_log_mode_option = i;
					break;
				}
			}
			if (i == LOG_MODE_NUM) {
				printf("Log mode '%s' invalid. Please choose "
				       "from: ", optarg);
				log_modes_dump();
				exit(1);
			}
			break;
		case 'h':
		default:
			help(argv[0]);
			break;
		}
	}

	TEST_ASSERT(p.iterations > 2, "Iterations must be greater than two");
	TEST_ASSERT(p.interval > 0, "Interval must be greater than zero");

	pr_info("Test iterations: %"PRIu64", interval: %"PRIu64" (ms)\n",
		p.iterations, p.interval);

	srandom(time(0));

	if (host_log_mode_option == LOG_MODE_ALL) {
		/* Run each log mode */
		for (i = 0; i < LOG_MODE_NUM; i++) {
			pr_info("Testing Log Mode '%s'\n", log_modes[i].name);
			host_log_mode = i;
			for_each_guest_mode(run_test, &p);
		}
	} else {
		host_log_mode = host_log_mode_option;
		for_each_guest_mode(run_test, &p);
	}

	return 0;
}