// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * KVM paravirtual clock driver. A clocksource implementation.
 * Copyright (C) 2008 Glauber de Oliveira Costa, Red Hat Inc.
 */

#include <linux/clocksource.h>
#include <linux/kvm_para.h>
#include <asm/pvclock.h>
#include <asm/msr.h>
#include <asm/apic.h>
#include <linux/percpu.h>
#include <linux/hardirq.h>
#include <linux/cpuhotplug.h>
#include <linux/sched.h>
#include <linux/sched/clock.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/set_memory.h>
#include <linux/cc_platform.h>

#include <asm/hypervisor.h>
#include <asm/x86_init.h>
#include <asm/kvmclock.h>

static int kvmclock __initdata = 1;
static int kvmclock_vsyscall __initdata = 1;
static int msr_kvm_system_time __ro_after_init;
static int msr_kvm_wall_clock __ro_after_init;
static u64 kvm_sched_clock_offset __ro_after_init;

static int __init parse_no_kvmclock(char *arg)
{
        kvmclock = 0;
        return 0;
}
early_param("no-kvmclock", parse_no_kvmclock);

static int __init parse_no_kvmclock_vsyscall(char *arg)
{
        kvmclock_vsyscall = 0;
        return 0;
}
early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall);

/* Aligned to page sizes to match what's mapped via vsyscalls to userspace */
#define HVC_BOOT_ARRAY_SIZE \
        (PAGE_SIZE / sizeof(struct pvclock_vsyscall_time_info))

static struct pvclock_vsyscall_time_info
        hv_clock_boot[HVC_BOOT_ARRAY_SIZE] __bss_decrypted __aligned(PAGE_SIZE);
static struct pvclock_wall_clock wall_clock __bss_decrypted;
static struct pvclock_vsyscall_time_info *hvclock_mem;
DEFINE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu);
EXPORT_PER_CPU_SYMBOL_GPL(hv_clock_per_cpu);

/*
 * The wallclock is the time of day when we booted. Some time may have
 * elapsed since the hypervisor wrote the data, so we try to account
 * for that with system time.
 */
static void kvm_get_wallclock(struct timespec64 *now)
{
        wrmsrq(msr_kvm_wall_clock, slow_virt_to_phys(&wall_clock));
        preempt_disable();
        pvclock_read_wallclock(&wall_clock, this_cpu_pvti(), now);
        preempt_enable();
}

static int kvm_set_wallclock(const struct timespec64 *now)
{
        return -ENODEV;
}

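/*
 * Read this vCPU's pvclock area. Preemption is disabled so the read
 * cannot migrate to another CPU's area mid-way; presumably the notrace
 * variants are used because this can be reached from tracing paths.
 */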
static u64 kvm_clock_read(void)
{
        u64 ret;

        preempt_disable_notrace();
        ret = pvclock_clocksource_read_nowd(this_cpu_pvti());
        preempt_enable_notrace();
        return ret;
}

static u64 kvm_clock_get_cycles(struct clocksource *cs)
{
        return kvm_clock_read();
}

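/*
 * sched_clock() variant: the same pvclock read, offset so the scheduler
 * clock starts near zero at kvm_sched_clock_init() time. Marked noinstr,
 * presumably because sched_clock() may be called from code where
 * instrumentation is not allowed.
 */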
static noinstr u64 kvm_sched_clock_read(void)
{
        return pvclock_clocksource_read_nowd(this_cpu_pvti()) - kvm_sched_clock_offset;
}

static inline void kvm_sched_clock_init(bool stable)
{
        if (!stable)
                clear_sched_clock_stable();
        kvm_sched_clock_offset = kvm_clock_read();
        paravirt_set_sched_clock(kvm_sched_clock_read);

        pr_info("kvm-clock: using sched offset of %llu cycles",
                kvm_sched_clock_offset);

        BUILD_BUG_ON(sizeof(kvm_sched_clock_offset) >
                     sizeof(((struct pvclock_vcpu_time_info *)NULL)->system_time));
}

/*
 * If we don't preset lpj, there is the possibility that the guest will
 * calibrate under heavy load - thus, getting a lower lpj - and execute
 * the delays themselves without load. This is wrong, because no delay
 * loop can finish beforehand.
 * Any heuristic is subject to fail, because ultimately a large pool of
 * guests can be running and trouble each other. So we preset lpj here.
 */
static unsigned long kvm_get_tsc_khz(void)
{
        setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
        return pvclock_tsc_khz(this_cpu_pvti());
}

static void __init kvm_get_preset_lpj(void)
{
        unsigned long khz;
        u64 lpj;

        khz = kvm_get_tsc_khz();

        lpj = ((u64)khz * 1000);
        do_div(lpj, HZ);
        preset_lpj = lpj;
}

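/*
 * The host sets PVCLOCK_GUEST_STOPPED while the VM was paused. Callers
 * such as the lockup watchdogs use this to learn that an apparent stall
 * was host-induced; the flag is cleared so a pause is reported once.
 */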
bool kvm_check_and_clear_guest_paused(void)
{
        struct pvclock_vsyscall_time_info *src = this_cpu_hvclock();
        bool ret = false;

        if (!src)
                return ret;

        if ((src->pvti.flags & PVCLOCK_GUEST_STOPPED) != 0) {
                src->pvti.flags &= ~PVCLOCK_GUEST_STOPPED;
                pvclock_touch_watchdogs();
                ret = true;
        }
        return ret;
}

static int kvm_cs_enable(struct clocksource *cs)
{
        vclocks_set_used(VDSO_CLOCKMODE_PVCLOCK);
        return 0;
}

static struct clocksource kvm_clock = {
        .name   = "kvm-clock",
        .read   = kvm_clock_get_cycles,
        .rating = 400,
        .mask   = CLOCKSOURCE_MASK(64),
        .flags  = CLOCK_SOURCE_IS_CONTINUOUS,
        .id     = CSID_X86_KVM_CLK,
        .enable = kvm_cs_enable,
};

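/*
 * Tell the hypervisor where this CPU's pvclock area lives. Bit 0 of the
 * guest physical address written to the system-time MSR serves as the
 * enable bit.
 */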
static void kvm_register_clock(char *txt)
{
        struct pvclock_vsyscall_time_info *src = this_cpu_hvclock();
        u64 pa;

        if (!src)
                return;

        pa = slow_virt_to_phys(&src->pvti) | 0x01ULL;
        wrmsrq(msr_kvm_system_time, pa);
        pr_debug("kvm-clock: cpu %d, msr %llx, %s", smp_processor_id(), pa, txt);
}

static void kvm_save_sched_clock_state(void)
{
}

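/* On resume, re-register the boot CPU's clock area with the hypervisor. */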
static void kvm_restore_sched_clock_state(void)
{
        kvm_register_clock("primary cpu clock, resume");
}

#ifdef CONFIG_X86_LOCAL_APIC
static void kvm_setup_secondary_clock(void)
{
        kvm_register_clock("secondary cpu clock");
}
#endif

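/*
 * Writing 0 (enable bit clear) stops the hypervisor from updating the
 * time area; used on shutdown and kexec so the host does not keep
 * writing into memory the next kernel may reuse.
 */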
void kvmclock_disable(void)
{
        if (msr_kvm_system_time)
                native_write_msr(msr_kvm_system_time, 0);
}

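/*
 * Allocate pvclock areas for the vCPUs that do not fit into the
 * one-page hv_clock_boot array.
 */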
static void __init kvmclock_init_mem(void)
{
        unsigned long ncpus;
        unsigned int order;
        struct page *p;
        int r;

        if (HVC_BOOT_ARRAY_SIZE >= num_possible_cpus())
                return;

        ncpus = num_possible_cpus() - HVC_BOOT_ARRAY_SIZE;
        order = get_order(ncpus * sizeof(*hvclock_mem));

        p = alloc_pages(GFP_KERNEL, order);
        if (!p) {
                pr_warn("%s: failed to alloc %d pages", __func__, (1U << order));
                return;
        }

        hvclock_mem = page_address(p);

        /*
         * hvclock is shared between the guest and the hypervisor and
         * must be mapped decrypted.
         */
        if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) {
                r = set_memory_decrypted((unsigned long) hvclock_mem,
                                         1UL << order);
                if (r) {
                        __free_pages(p, order);
                        hvclock_mem = NULL;
                        pr_warn("kvmclock: set_memory_decrypted() failed. Disabling\n");
                        return;
                }
        }

        memset(hvclock_mem, 0, PAGE_SIZE << order);
}

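/*
 * early_initcall: allocate the overflow pvclock memory and, on 64-bit,
 * switch the vDSO to pvclock mode when the clock is TSC-stable across
 * vCPUs, so clock_gettime() can be satisfied without entering the kernel.
 */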
static int __init kvm_setup_vsyscall_timeinfo(void)
{
        if (!kvm_para_available() || !kvmclock || nopv)
                return 0;

        kvmclock_init_mem();

#ifdef CONFIG_X86_64
        if (per_cpu(hv_clock_per_cpu, 0) && kvmclock_vsyscall) {
                u8 flags;

                flags = pvclock_read_flags(&hv_clock_boot[0].pvti);
                if (!(flags & PVCLOCK_TSC_STABLE_BIT))
                        return 0;

                kvm_clock.vdso_clock_mode = VDSO_CLOCKMODE_PVCLOCK;
        }
#endif

        return 0;
}
early_initcall(kvm_setup_vsyscall_timeinfo);

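/* CPU hotplug preparation step: assign a pvclock area before the CPU comes up. */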
static int kvmclock_setup_percpu(unsigned int cpu)
{
        struct pvclock_vsyscall_time_info *p = per_cpu(hv_clock_per_cpu, cpu);

        /*
         * The per cpu area setup replicates CPU0 data to all cpu
         * pointers. So carefully check. CPU0 has been set up in init
         * already.
         */
        if (!cpu || (p && p != per_cpu(hv_clock_per_cpu, 0)))
                return 0;

        /* Use the static page for the first CPUs, allocate otherwise */
        if (cpu < HVC_BOOT_ARRAY_SIZE)
                p = &hv_clock_boot[cpu];
        else if (hvclock_mem)
                p = hvclock_mem + cpu - HVC_BOOT_ARRAY_SIZE;
        else
                return -ENOMEM;

        per_cpu(hv_clock_per_cpu, cpu) = p;
        return p ? 0 : -ENOMEM;
}

void __init kvmclock_init(void)
{
        u8 flags;

        if (!kvm_para_available() || !kvmclock)
                return;

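        /*
         * Prefer the MSR pair advertised by KVM_FEATURE_CLOCKSOURCE2,
         * which lives in the KVM-specific MSR range; the original
         * indices are, as far as we know, kept only for compatibility
         * with old hypervisors.
         */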
        if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE2)) {
                msr_kvm_system_time = MSR_KVM_SYSTEM_TIME_NEW;
                msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK_NEW;
        } else if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) {
                msr_kvm_system_time = MSR_KVM_SYSTEM_TIME;
                msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK;
        } else {
                return;
        }

        if (cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "kvmclock:setup_percpu",
                              kvmclock_setup_percpu, NULL) < 0) {
                return;
        }

        pr_info("kvm-clock: Using msrs %x and %x",
                msr_kvm_system_time, msr_kvm_wall_clock);

        this_cpu_write(hv_clock_per_cpu, &hv_clock_boot[0]);
        kvm_register_clock("primary cpu clock");
        pvclock_set_pvti_cpu0_va(hv_clock_boot);

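        /*
         * PVCLOCK_TSC_STABLE_BIT indicates the clock is monotonic and
         * consistent across vCPUs; without it the scheduler clock has
         * to be marked unstable.
         */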
        if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT))
                pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT);

        flags = pvclock_read_flags(&hv_clock_boot[0].pvti);
        kvm_sched_clock_init(flags & PVCLOCK_TSC_STABLE_BIT);

        x86_platform.calibrate_tsc = kvm_get_tsc_khz;
        x86_platform.calibrate_cpu = kvm_get_tsc_khz;
        x86_platform.get_wallclock = kvm_get_wallclock;
        x86_platform.set_wallclock = kvm_set_wallclock;
#ifdef CONFIG_X86_LOCAL_APIC
        x86_cpuinit.early_percpu_clock_init = kvm_setup_secondary_clock;
#endif
        x86_platform.save_sched_clock_state = kvm_save_sched_clock_state;
        x86_platform.restore_sched_clock_state = kvm_restore_sched_clock_state;
        kvm_get_preset_lpj();

        /*
         * X86_FEATURE_NONSTOP_TSC means the TSC runs at a constant rate
         * across P/T states and does not stop in deep C-states.
         *
         * An invariant TSC exposed by the host means kvmclock is not
         * necessary: the guest can use the TSC as its clocksource, so
         * drop kvm-clock's rating below the TSC's.
         */
        if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
            boot_cpu_has(X86_FEATURE_NONSTOP_TSC) &&
            !check_tsc_unstable())
                kvm_clock.rating = 299;

        clocksource_register_hz(&kvm_clock, NSEC_PER_SEC);
        pv_info.name = "KVM";
}