// SPDX-License-Identifier: GPL-2.0-only
#define _GNU_SOURCE /* for program_invocation_short_name */
#include <errno.h>
#include <fcntl.h>
#include <pthread.h>
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <signal.h>
#include <syscall.h>
#include <sys/ioctl.h>
#include <sys/sysinfo.h>
#include <asm/barrier.h>
#include <linux/atomic.h>
#include <linux/rseq.h>
#include <linux/unistd.h>

#include "kvm_util.h"
#include "processor.h"
#include "test_util.h"

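/*
 * Pull in the rseq selftest's implementation directly for its helpers, e.g.
 * rseq_register_current_thread() and rseq_current_cpu_raw(), which are used
 * below to register the main thread with rseq and to read the rseq CPU ID.
 */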
#include "../rseq/rseq.c"

/*
 * Any bug related to task migration is likely to be timing-dependent; perform
 * a large number of migrations to reduce the odds of a false negative.
 */
#define NR_TASK_MIGRATIONS 100000

static pthread_t migration_thread;
static cpu_set_t possible_mask;
static int min_cpu, max_cpu;
static bool done;

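/*
 * seq_cnt is used like a seqcount: the migration thread bumps it to an odd
 * value before changing the target thread's affinity and back to an even
 * value afterwards, so the reader can detect that a migration may have raced
 * with its rseq and sched CPU ID reads and retry.
 */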
static atomic_t seq_cnt;

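/*
 * The guest does nothing except bounce straight back to userspace via
 * GUEST_SYNC, giving the main loop a steady stream of KVM_RUN exits during
 * which to compare the rseq CPU ID against sched's CPU ID.
 */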
static void guest_code(void)
{
	for (;;)
		GUEST_SYNC(0);
}

static int next_cpu(int cpu)
{
	/*
	 * Advance to the next CPU, skipping those that weren't in the original
	 * affinity set.  Sadly, there is no CPU_SET_FOR_EACH, and cpu_set_t's
	 * data storage is considered opaque.  Note, if this task is pinned to
	 * a small set of discontiguous CPUs, e.g. 2 and 1023, this loop will
	 * burn a lot of cycles and the test will take longer than normal to
	 * complete.
	 */
	do {
		cpu++;
		if (cpu > max_cpu) {
			cpu = min_cpu;
			TEST_ASSERT(CPU_ISSET(cpu, &possible_mask),
				    "Min CPU = %d must always be usable", cpu);
			break;
		}
	} while (!CPU_ISSET(cpu, &possible_mask));

	return cpu;
}

static void *migration_worker(void *__rseq_tid)
{
	pid_t rseq_tid = (pid_t)(unsigned long)__rseq_tid;
	cpu_set_t allowed_mask;
	int r, i, cpu;

	CPU_ZERO(&allowed_mask);

	for (i = 0, cpu = min_cpu; i < NR_TASK_MIGRATIONS; i++, cpu = next_cpu(cpu)) {
		CPU_SET(cpu, &allowed_mask);

		/*
		 * Bump the sequence count twice to allow the reader to detect
		 * that a migration may have occurred in between rseq and sched
		 * CPU ID reads.  An odd sequence count indicates a migration
		 * is in-progress, while a completely different count indicates
		 * a migration occurred since the count was last read.
		 */
		atomic_inc(&seq_cnt);

		/*
		 * Ensure the odd count is visible while getcpu() isn't
		 * stable, i.e. while changing affinity is in-progress.
		 */
		smp_wmb();
		r = sched_setaffinity(rseq_tid, sizeof(allowed_mask), &allowed_mask);
		TEST_ASSERT(!r, "sched_setaffinity failed, errno = %d (%s)",
			    errno, strerror(errno));
		smp_wmb();
		atomic_inc(&seq_cnt);

		CPU_CLR(cpu, &allowed_mask);

		/*
		 * Wait 1-10us before proceeding to the next iteration and more
		 * specifically, before bumping seq_cnt again.  A delay is
		 * needed on three fronts:
		 *
		 * 1. To allow sched_setaffinity() to prompt migration before
		 *    ioctl(KVM_RUN) enters the guest so that TIF_NOTIFY_RESUME
		 *    (or TIF_NEED_RESCHED, which indirectly leads to handling
		 *    NOTIFY_RESUME) is handled in KVM context.
		 *
		 *    If NOTIFY_RESUME/NEED_RESCHED is set after KVM enters
		 *    the guest, the guest will trigger an IO/MMIO exit all
		 *    the way to userspace and the TIF flags will be handled by
		 *    the generic "exit to userspace" logic, not by KVM.  The
		 *    exit to userspace is necessary to give the test a chance
		 *    to check the rseq CPU ID (see #2).
		 *
		 *    Alternatively, guest_code() could include an instruction
		 *    to trigger an exit that is handled by KVM, but any such
		 *    exit requires architecture specific code.
		 *
		 * 2. To let ioctl(KVM_RUN) make its way back to the test
		 *    before the next round of migration.  The test's check on
		 *    the rseq CPU ID must wait for migration to complete in
		 *    order to avoid false positives, thus any kernel rseq bug
		 *    will be missed if the next migration starts before the
		 *    check completes.
		 *
		 * 3. To ensure the read-side makes efficient forward progress,
		 *    e.g. if getcpu() involves a syscall.  Stalling the
		 *    read-side means the test will spend more time waiting for
		 *    getcpu() to stabilize and less time trying to hit the
		 *    timing-dependent bug.
		 *
		 * Because any bug in this area is likely to be timing-dependent,
		 * run with a range of delays at 1us intervals from 1us to 10us
		 * as a best effort to avoid tuning the test to the point where
		 * it can hit _only_ the original bug and not detect future
		 * regressions.
		 *
		 * The original bug can reproduce with a delay up to ~500us on
		 * x86-64, but starts to require more iterations to reproduce
		 * as the delay creeps above ~10us, and the average runtime of
		 * each iteration obviously increases as well.  Cap the delay
		 * at 10us to keep test runtime reasonable while minimizing
		 * potential coverage loss.
		 *
		 * The lower bound for reproducing the bug is likely below 1us,
		 * e.g. failures occur on x86-64 with nanosleep(0), but at that
		 * point the overhead of the syscall likely dominates the delay.
		 * Use usleep() for simplicity and to avoid unnecessary kernel
		 * dependencies.
		 */
		usleep((i % 10) + 1);
	}
	done = true;
	return NULL;
}

static void calc_min_max_cpu(void)
{
	int i, cnt, nproc;

	TEST_REQUIRE(CPU_COUNT(&possible_mask) >= 2);

	/*
	 * CPU_SET doesn't provide a FOR_EACH helper; get the min/max CPU that
	 * this task is affined to in order to reduce the time spent querying
	 * unusable CPUs, e.g. if this task is pinned to a small percentage of
	 * total CPUs.
	 */
	nproc = get_nprocs_conf();
	min_cpu = -1;
	max_cpu = -1;
	cnt = 0;

	for (i = 0; i < nproc; i++) {
		if (!CPU_ISSET(i, &possible_mask))
			continue;
		if (min_cpu == -1)
			min_cpu = i;
		max_cpu = i;
		cnt++;
	}

	__TEST_REQUIRE(cnt >= 2,
		       "Only one usable CPU, task migration not possible");
}

int main(int argc, char *argv[])
{
	int r, i, snapshot;
	struct kvm_vm *vm;
	struct kvm_vcpu *vcpu;
	u32 cpu, rseq_cpu;

	r = sched_getaffinity(0, sizeof(possible_mask), &possible_mask);
	TEST_ASSERT(!r, "sched_getaffinity failed, errno = %d (%s)", errno,
		    strerror(errno));

	calc_min_max_cpu();

	r = rseq_register_current_thread();
	TEST_ASSERT(!r, "rseq_register_current_thread failed, errno = %d (%s)",
		    errno, strerror(errno));

	/*
	 * Create and run a dummy VM that immediately exits to userspace via
	 * GUEST_SYNC, while concurrently migrating the process by setting its
	 * CPU affinity.
	 */
	vm = vm_create_with_one_vcpu(&vcpu, guest_code);

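	/*
	 * Pass the current (main) thread's TID to the migration worker so
	 * that sched_setaffinity() migrates the thread doing KVM_RUN and the
	 * rseq checks, not the worker itself.
	 */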
	pthread_create(&migration_thread, NULL, migration_worker,
		       (void *)(unsigned long)syscall(SYS_gettid));

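	/* Run the vCPU until the worker finishes all migrations and sets 'done'. */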
	for (i = 0; !done; i++) {
		vcpu_run(vcpu);
		TEST_ASSERT(get_ucall(vcpu, NULL) == UCALL_SYNC,
			    "Guest failed?");

		/*
		 * Verify rseq's CPU matches sched's CPU.  Ensure migration
		 * doesn't occur between getcpu() and reading the rseq cpu_id
		 * by rereading both if the sequence count changes, or if the
		 * count is odd (migration in-progress).
		 */
		do {
			/*
			 * Drop bit 0 to force a mismatch if the count is odd,
			 * i.e. if a migration is in-progress.
			 */
			snapshot = atomic_read(&seq_cnt) & ~1;

			/*
			 * Ensure calling getcpu() and reading rseq.cpu_id
			 * complete in a single "no migration" window, i.e. are
			 * not reordered across the seq_cnt reads.
			 */
			smp_rmb();
			r = sys_getcpu(&cpu, NULL);
			TEST_ASSERT(!r, "getcpu failed, errno = %d (%s)",
				    errno, strerror(errno));
			rseq_cpu = rseq_current_cpu_raw();
			smp_rmb();
		} while (snapshot != atomic_read(&seq_cnt));

		TEST_ASSERT(rseq_cpu == cpu,
			    "rseq CPU = %d, sched CPU = %d", rseq_cpu, cpu);
	}

	/*
	 * Sanity check that the test was able to enter the guest a reasonable
	 * number of times, e.g. didn't get stalled too often/long waiting for
	 * getcpu() to stabilize.  A 2:1 migration:KVM_RUN ratio is a fairly
	 * conservative ratio on x86-64, which can do _more_ KVM_RUNs than
	 * migrations given the 1us+ delay in the migration task.
	 */
	TEST_ASSERT(i > (NR_TASK_MIGRATIONS / 2),
		    "Only performed %d KVM_RUNs, task stalled too much?", i);

	pthread_join(migration_thread, NULL);

	kvm_vm_free(vm);

	rseq_unregister_current_thread();

	return 0;
}