Commit | Line | Data |
---|---|---|
fd1fea68 MK |
1 | // SPDX-License-Identifier: GPL-2.0 |
2 | ||
3 | /* | |
4 | * Clocksource driver for the synthetic counter and timers | |
5 | * provided by the Hyper-V hypervisor to guest VMs, as described | |
6 | * in the Hyper-V Top Level Functional Spec (TLFS). This driver | |
7 | * is instruction set architecture independent. | |
8 | * | |
9 | * Copyright (C) 2019, Microsoft, Inc. | |
10 | * | |
11 | * Author: Michael Kelley <mikelley@microsoft.com> | |
12 | */ | |
13 | ||
14 | #include <linux/percpu.h> | |
15 | #include <linux/cpumask.h> | |
16 | #include <linux/clockchips.h> | |
dd2cb348 MK |
17 | #include <linux/clocksource.h> |
18 | #include <linux/sched_clock.h> | |
fd1fea68 | 19 | #include <linux/mm.h> |
4df4cb9e | 20 | #include <linux/cpuhotplug.h> |
fd1fea68 MK |
21 | #include <clocksource/hyperv_timer.h> |
22 | #include <asm/hyperv-tlfs.h> | |
23 | #include <asm/mshyperv.h> | |
24 | ||
25 | static struct clock_event_device __percpu *hv_clock_event; | |
bd00cd52 | 26 | static u64 hv_sched_clock_offset __ro_after_init; |
fd1fea68 MK |
27 | |
28 | /* | |
29 | * If false, we're using the old mechanism for stimer0 interrupts | |
30 | * where it sends a VMbus message when it expires. The old | |
31 | * mechanism is used when running on older versions of Hyper-V | |
32 | * that don't support Direct Mode. While Hyper-V provides | |
33 | * four stimer's per CPU, Linux uses only stimer0. | |
4df4cb9e MK |
34 | * |
35 | * Because Direct Mode does not require processing a VMbus | |
36 | * message, stimer interrupts can be enabled earlier in the | |
37 | * process of booting a CPU, and consistent with when timer | |
38 | * interrupts are enabled for other clocksource drivers. | |
39 | * However, for legacy versions of Hyper-V when Direct Mode | |
40 | * is not enabled, setting up stimer interrupts must be | |
41 | * delayed until VMbus is initialized and can process the | |
42 | * interrupt message. | |
fd1fea68 MK |
43 | */ |
44 | static bool direct_mode_enabled; | |
45 | ||
46 | static int stimer0_irq; | |
47 | static int stimer0_vector; | |
48 | static int stimer0_message_sint; | |
49 | ||
50 | /* | |
51 | * ISR for when stimer0 is operating in Direct Mode. Direct Mode | |
52 | * does not use VMbus or any VMbus messages, so process here and not | |
53 | * in the VMbus driver code. | |
54 | */ | |
55 | void hv_stimer0_isr(void) | |
56 | { | |
57 | struct clock_event_device *ce; | |
58 | ||
59 | ce = this_cpu_ptr(hv_clock_event); | |
60 | ce->event_handler(ce); | |
61 | } | |
62 | EXPORT_SYMBOL_GPL(hv_stimer0_isr); | |
63 | ||
64 | static int hv_ce_set_next_event(unsigned long delta, | |
65 | struct clock_event_device *evt) | |
66 | { | |
67 | u64 current_tick; | |
68 | ||
0af3e137 | 69 | current_tick = hv_read_reference_counter(); |
fd1fea68 MK |
70 | current_tick += delta; |
71 | hv_init_timer(0, current_tick); | |
72 | return 0; | |
73 | } | |
74 | ||
75 | static int hv_ce_shutdown(struct clock_event_device *evt) | |
76 | { | |
77 | hv_init_timer(0, 0); | |
78 | hv_init_timer_config(0, 0); | |
79 | if (direct_mode_enabled) | |
80 | hv_disable_stimer0_percpu_irq(stimer0_irq); | |
81 | ||
82 | return 0; | |
83 | } | |
84 | ||
85 | static int hv_ce_set_oneshot(struct clock_event_device *evt) | |
86 | { | |
87 | union hv_stimer_config timer_cfg; | |
88 | ||
89 | timer_cfg.as_uint64 = 0; | |
90 | timer_cfg.enable = 1; | |
91 | timer_cfg.auto_enable = 1; | |
92 | if (direct_mode_enabled) { | |
93 | /* | |
94 | * When it expires, the timer will directly interrupt | |
95 | * on the specified hardware vector/IRQ. | |
96 | */ | |
97 | timer_cfg.direct_mode = 1; | |
98 | timer_cfg.apic_vector = stimer0_vector; | |
99 | hv_enable_stimer0_percpu_irq(stimer0_irq); | |
100 | } else { | |
101 | /* | |
102 | * When it expires, the timer will generate a VMbus message, | |
103 | * to be handled by the normal VMbus interrupt handler. | |
104 | */ | |
105 | timer_cfg.direct_mode = 0; | |
106 | timer_cfg.sintx = stimer0_message_sint; | |
107 | } | |
108 | hv_init_timer_config(0, timer_cfg.as_uint64); | |
109 | return 0; | |
110 | } | |
111 | ||
112 | /* | |
113 | * hv_stimer_init - Per-cpu initialization of the clockevent | |
114 | */ | |
4df4cb9e | 115 | static int hv_stimer_init(unsigned int cpu) |
fd1fea68 MK |
116 | { |
117 | struct clock_event_device *ce; | |
118 | ||
4df4cb9e MK |
119 | if (!hv_clock_event) |
120 | return 0; | |
fd1fea68 MK |
121 | |
122 | ce = per_cpu_ptr(hv_clock_event, cpu); | |
123 | ce->name = "Hyper-V clockevent"; | |
124 | ce->features = CLOCK_EVT_FEAT_ONESHOT; | |
125 | ce->cpumask = cpumask_of(cpu); | |
126 | ce->rating = 1000; | |
127 | ce->set_state_shutdown = hv_ce_shutdown; | |
128 | ce->set_state_oneshot = hv_ce_set_oneshot; | |
129 | ce->set_next_event = hv_ce_set_next_event; | |
130 | ||
131 | clockevents_config_and_register(ce, | |
132 | HV_CLOCK_HZ, | |
133 | HV_MIN_DELTA_TICKS, | |
134 | HV_MAX_MAX_DELTA_TICKS); | |
4df4cb9e | 135 | return 0; |
fd1fea68 | 136 | } |
fd1fea68 MK |
137 | |
138 | /* | |
139 | * hv_stimer_cleanup - Per-cpu cleanup of the clockevent | |
140 | */ | |
4df4cb9e | 141 | int hv_stimer_cleanup(unsigned int cpu) |
fd1fea68 MK |
142 | { |
143 | struct clock_event_device *ce; | |
144 | ||
4df4cb9e MK |
145 | if (!hv_clock_event) |
146 | return 0; | |
147 | ||
148 | /* | |
149 | * In the legacy case where Direct Mode is not enabled | |
150 | * (which can only be on x86/64), stimer cleanup happens | |
151 | * relatively early in the CPU offlining process. We | |
152 | * must unbind the stimer-based clockevent device so | |
153 | * that the LAPIC timer can take over until clockevents | |
154 | * are no longer needed in the offlining process. Note | |
155 | * that clockevents_unbind_device() eventually calls | |
156 | * hv_ce_shutdown(). | |
157 | * | |
158 | * The unbind should not be done when Direct Mode is | |
159 | * enabled because we may be on an architecture where | |
160 | * there are no other clockevent devices to fallback to. | |
161 | */ | |
162 | ce = per_cpu_ptr(hv_clock_event, cpu); | |
163 | if (direct_mode_enabled) | |
fd1fea68 | 164 | hv_ce_shutdown(ce); |
4df4cb9e MK |
165 | else |
166 | clockevents_unbind_device(ce, cpu); | |
167 | ||
168 | return 0; | |
fd1fea68 MK |
169 | } |
170 | EXPORT_SYMBOL_GPL(hv_stimer_cleanup); | |
171 | ||
172 | /* hv_stimer_alloc - Global initialization of the clockevent and stimer0 */ | |
4df4cb9e | 173 | int hv_stimer_alloc(void) |
fd1fea68 | 174 | { |
4df4cb9e MK |
175 | int ret = 0; |
176 | ||
177 | /* | |
178 | * Synthetic timers are always available except on old versions of | |
179 | * Hyper-V on x86. In that case, return as error as Linux will use a | |
180 | * clockevent based on emulated LAPIC timer hardware. | |
181 | */ | |
182 | if (!(ms_hyperv.features & HV_MSR_SYNTIMER_AVAILABLE)) | |
183 | return -EINVAL; | |
fd1fea68 MK |
184 | |
185 | hv_clock_event = alloc_percpu(struct clock_event_device); | |
186 | if (!hv_clock_event) | |
187 | return -ENOMEM; | |
188 | ||
189 | direct_mode_enabled = ms_hyperv.misc_features & | |
190 | HV_STIMER_DIRECT_MODE_AVAILABLE; | |
191 | if (direct_mode_enabled) { | |
192 | ret = hv_setup_stimer0_irq(&stimer0_irq, &stimer0_vector, | |
193 | hv_stimer0_isr); | |
4df4cb9e MK |
194 | if (ret) |
195 | goto free_percpu; | |
196 | ||
197 | /* | |
198 | * Since we are in Direct Mode, stimer initialization | |
199 | * can be done now with a CPUHP value in the same range | |
200 | * as other clockevent devices. | |
201 | */ | |
202 | ret = cpuhp_setup_state(CPUHP_AP_HYPERV_TIMER_STARTING, | |
203 | "clockevents/hyperv/stimer:starting", | |
204 | hv_stimer_init, hv_stimer_cleanup); | |
205 | if (ret < 0) | |
206 | goto free_stimer0_irq; | |
fd1fea68 | 207 | } |
4df4cb9e | 208 | return ret; |
fd1fea68 | 209 | |
4df4cb9e MK |
210 | free_stimer0_irq: |
211 | hv_remove_stimer0_irq(stimer0_irq); | |
212 | stimer0_irq = 0; | |
213 | free_percpu: | |
214 | free_percpu(hv_clock_event); | |
215 | hv_clock_event = NULL; | |
216 | return ret; | |
fd1fea68 MK |
217 | } |
218 | EXPORT_SYMBOL_GPL(hv_stimer_alloc); | |
219 | ||
4df4cb9e MK |
220 | /* |
221 | * hv_stimer_legacy_init -- Called from the VMbus driver to handle | |
222 | * the case when Direct Mode is not enabled, and the stimer | |
223 | * must be initialized late in the CPU onlining process. | |
224 | * | |
225 | */ | |
226 | void hv_stimer_legacy_init(unsigned int cpu, int sint) | |
227 | { | |
228 | if (direct_mode_enabled) | |
229 | return; | |
230 | ||
231 | /* | |
232 | * This function gets called by each vCPU, so setting the | |
233 | * global stimer_message_sint value each time is conceptually | |
234 | * not ideal, but the value passed in is always the same and | |
235 | * it avoids introducing yet another interface into this | |
236 | * clocksource driver just to set the sint in the legacy case. | |
237 | */ | |
238 | stimer0_message_sint = sint; | |
239 | (void)hv_stimer_init(cpu); | |
240 | } | |
241 | EXPORT_SYMBOL_GPL(hv_stimer_legacy_init); | |
242 | ||
243 | /* | |
244 | * hv_stimer_legacy_cleanup -- Called from the VMbus driver to | |
245 | * handle the case when Direct Mode is not enabled, and the | |
246 | * stimer must be cleaned up early in the CPU offlining | |
247 | * process. | |
248 | */ | |
249 | void hv_stimer_legacy_cleanup(unsigned int cpu) | |
250 | { | |
251 | if (direct_mode_enabled) | |
252 | return; | |
253 | (void)hv_stimer_cleanup(cpu); | |
254 | } | |
255 | EXPORT_SYMBOL_GPL(hv_stimer_legacy_cleanup); | |
256 | ||
257 | ||
fd1fea68 MK |
258 | /* hv_stimer_free - Free global resources allocated by hv_stimer_alloc() */ |
259 | void hv_stimer_free(void) | |
260 | { | |
4df4cb9e MK |
261 | if (!hv_clock_event) |
262 | return; | |
263 | ||
264 | if (direct_mode_enabled) { | |
265 | cpuhp_remove_state(CPUHP_AP_HYPERV_TIMER_STARTING); | |
fd1fea68 MK |
266 | hv_remove_stimer0_irq(stimer0_irq); |
267 | stimer0_irq = 0; | |
268 | } | |
269 | free_percpu(hv_clock_event); | |
270 | hv_clock_event = NULL; | |
271 | } | |
272 | EXPORT_SYMBOL_GPL(hv_stimer_free); | |
273 | ||
274 | /* | |
275 | * Do a global cleanup of clockevents for the cases of kexec and | |
276 | * vmbus exit | |
277 | */ | |
278 | void hv_stimer_global_cleanup(void) | |
279 | { | |
280 | int cpu; | |
fd1fea68 | 281 | |
4df4cb9e MK |
282 | /* |
283 | * hv_stime_legacy_cleanup() will stop the stimer if Direct | |
284 | * Mode is not enabled, and fallback to the LAPIC timer. | |
285 | */ | |
286 | for_each_present_cpu(cpu) { | |
287 | hv_stimer_legacy_cleanup(cpu); | |
fd1fea68 | 288 | } |
4df4cb9e MK |
289 | |
290 | /* | |
291 | * If Direct Mode is enabled, the cpuhp teardown callback | |
292 | * (hv_stimer_cleanup) will be run on all CPUs to stop the | |
293 | * stimers. | |
294 | */ | |
fd1fea68 MK |
295 | hv_stimer_free(); |
296 | } | |
297 | EXPORT_SYMBOL_GPL(hv_stimer_global_cleanup); | |
dd2cb348 MK |
298 | |
299 | /* | |
300 | * Code and definitions for the Hyper-V clocksources. Two | |
301 | * clocksources are defined: one that reads the Hyper-V defined MSR, and | |
302 | * the other that uses the TSC reference page feature as defined in the | |
303 | * TLFS. The MSR version is for compatibility with old versions of | |
304 | * Hyper-V and 32-bit x86. The TSC reference page version is preferred. | |
9e0333ae AP |
305 | * |
306 | * The Hyper-V clocksource ratings of 250 are chosen to be below the | |
307 | * TSC clocksource rating of 300. In configurations where Hyper-V offers | |
308 | * an InvariantTSC, the TSC is not marked "unstable", so the TSC clocksource | |
309 | * is available and preferred. With the higher rating, it will be the | |
310 | * default. On older hardware and Hyper-V versions, the TSC is marked | |
311 | * "unstable", so no TSC clocksource is created and the selected Hyper-V | |
312 | * clocksource will be the default. | |
dd2cb348 MK |
313 | */ |
314 | ||
0af3e137 AP |
315 | u64 (*hv_read_reference_counter)(void); |
316 | EXPORT_SYMBOL_GPL(hv_read_reference_counter); | |
dd2cb348 | 317 | |
ddc61bbc BF |
318 | static union { |
319 | struct ms_hyperv_tsc_page page; | |
320 | u8 reserved[PAGE_SIZE]; | |
321 | } tsc_pg __aligned(PAGE_SIZE); | |
dd2cb348 MK |
322 | |
323 | struct ms_hyperv_tsc_page *hv_get_tsc_page(void) | |
324 | { | |
ddc61bbc | 325 | return &tsc_pg.page; |
dd2cb348 MK |
326 | } |
327 | EXPORT_SYMBOL_GPL(hv_get_tsc_page); | |
328 | ||
0af3e137 | 329 | static u64 notrace read_hv_clock_tsc(void) |
dd2cb348 | 330 | { |
ddc61bbc | 331 | u64 current_tick = hv_read_tsc_page(hv_get_tsc_page()); |
dd2cb348 MK |
332 | |
333 | if (current_tick == U64_MAX) | |
334 | hv_get_time_ref_count(current_tick); | |
335 | ||
336 | return current_tick; | |
337 | } | |
338 | ||
0af3e137 AP |
339 | static u64 notrace read_hv_clock_tsc_cs(struct clocksource *arg) |
340 | { | |
341 | return read_hv_clock_tsc(); | |
342 | } | |
343 | ||
1f3aed01 | 344 | static u64 notrace read_hv_sched_clock_tsc(void) |
dd2cb348 | 345 | { |
749da8ca YX |
346 | return (read_hv_clock_tsc() - hv_sched_clock_offset) * |
347 | (NSEC_PER_SEC / HV_CLOCK_HZ); | |
dd2cb348 MK |
348 | } |
349 | ||
1349401f DC |
350 | static void suspend_hv_clock_tsc(struct clocksource *arg) |
351 | { | |
352 | u64 tsc_msr; | |
353 | ||
354 | /* Disable the TSC page */ | |
355 | hv_get_reference_tsc(tsc_msr); | |
356 | tsc_msr &= ~BIT_ULL(0); | |
357 | hv_set_reference_tsc(tsc_msr); | |
358 | } | |
359 | ||
360 | ||
361 | static void resume_hv_clock_tsc(struct clocksource *arg) | |
362 | { | |
363 | phys_addr_t phys_addr = virt_to_phys(&tsc_pg); | |
364 | u64 tsc_msr; | |
365 | ||
366 | /* Re-enable the TSC page */ | |
367 | hv_get_reference_tsc(tsc_msr); | |
368 | tsc_msr &= GENMASK_ULL(11, 0); | |
369 | tsc_msr |= BIT_ULL(0) | (u64)phys_addr; | |
370 | hv_set_reference_tsc(tsc_msr); | |
371 | } | |
372 | ||
eec399dd TG |
/*
 * clocksource .enable callback for the TSC page clocksource: calls
 * hv_enable_vdso_clocksource() and reports success. @cs is unused.
 */
static int hv_cs_enable(struct clocksource *cs)
{
	hv_enable_vdso_clocksource();

	return 0;
}
378 | ||
dd2cb348 MK |
379 | static struct clocksource hyperv_cs_tsc = { |
380 | .name = "hyperv_clocksource_tsc_page", | |
9e0333ae | 381 | .rating = 250, |
0af3e137 | 382 | .read = read_hv_clock_tsc_cs, |
dd2cb348 MK |
383 | .mask = CLOCKSOURCE_MASK(64), |
384 | .flags = CLOCK_SOURCE_IS_CONTINUOUS, | |
1349401f DC |
385 | .suspend= suspend_hv_clock_tsc, |
386 | .resume = resume_hv_clock_tsc, | |
eec399dd | 387 | .enable = hv_cs_enable, |
dd2cb348 | 388 | }; |
dd2cb348 | 389 | |
0af3e137 | 390 | static u64 notrace read_hv_clock_msr(void) |
dd2cb348 MK |
391 | { |
392 | u64 current_tick; | |
393 | /* | |
394 | * Read the partition counter to get the current tick count. This count | |
395 | * is set to 0 when the partition is created and is incremented in | |
396 | * 100 nanosecond units. | |
397 | */ | |
398 | hv_get_time_ref_count(current_tick); | |
399 | return current_tick; | |
400 | } | |
401 | ||
0af3e137 AP |
402 | static u64 notrace read_hv_clock_msr_cs(struct clocksource *arg) |
403 | { | |
404 | return read_hv_clock_msr(); | |
405 | } | |
406 | ||
1f3aed01 | 407 | static u64 notrace read_hv_sched_clock_msr(void) |
dd2cb348 | 408 | { |
749da8ca YX |
409 | return (read_hv_clock_msr() - hv_sched_clock_offset) * |
410 | (NSEC_PER_SEC / HV_CLOCK_HZ); | |
dd2cb348 MK |
411 | } |
412 | ||
413 | static struct clocksource hyperv_cs_msr = { | |
414 | .name = "hyperv_clocksource_msr", | |
9e0333ae | 415 | .rating = 250, |
0af3e137 | 416 | .read = read_hv_clock_msr_cs, |
dd2cb348 MK |
417 | .mask = CLOCKSOURCE_MASK(64), |
418 | .flags = CLOCK_SOURCE_IS_CONTINUOUS, | |
419 | }; | |
420 | ||
dd2cb348 MK |
421 | static bool __init hv_init_tsc_clocksource(void) |
422 | { | |
423 | u64 tsc_msr; | |
424 | phys_addr_t phys_addr; | |
425 | ||
426 | if (!(ms_hyperv.features & HV_MSR_REFERENCE_TSC_AVAILABLE)) | |
427 | return false; | |
428 | ||
0af3e137 | 429 | hv_read_reference_counter = read_hv_clock_tsc; |
ddc61bbc | 430 | phys_addr = virt_to_phys(hv_get_tsc_page()); |
dd2cb348 MK |
431 | |
432 | /* | |
433 | * The Hyper-V TLFS specifies to preserve the value of reserved | |
434 | * bits in registers. So read the existing value, preserve the | |
435 | * low order 12 bits, and add in the guest physical address | |
436 | * (which already has at least the low 12 bits set to zero since | |
437 | * it is page aligned). Also set the "enable" bit, which is bit 0. | |
438 | */ | |
439 | hv_get_reference_tsc(tsc_msr); | |
440 | tsc_msr &= GENMASK_ULL(11, 0); | |
441 | tsc_msr = tsc_msr | 0x1 | (u64)phys_addr; | |
442 | hv_set_reference_tsc(tsc_msr); | |
443 | ||
444 | hv_set_clocksource_vdso(hyperv_cs_tsc); | |
445 | clocksource_register_hz(&hyperv_cs_tsc, NSEC_PER_SEC/100); | |
446 | ||
0af3e137 | 447 | hv_sched_clock_offset = hv_read_reference_counter(); |
bd00cd52 TL |
448 | hv_setup_sched_clock(read_hv_sched_clock_tsc); |
449 | ||
dd2cb348 MK |
450 | return true; |
451 | } | |
dd2cb348 MK |
452 | |
453 | void __init hv_init_clocksource(void) | |
454 | { | |
455 | /* | |
456 | * Try to set up the TSC page clocksource. If it succeeds, we're | |
457 | * done. Otherwise, set up the MSR clocksoruce. At least one of | |
458 | * these will always be available except on very old versions of | |
459 | * Hyper-V on x86. In that case we won't have a Hyper-V | |
460 | * clocksource, but Linux will still run with a clocksource based | |
461 | * on the emulated PIT or LAPIC timer. | |
462 | */ | |
463 | if (hv_init_tsc_clocksource()) | |
464 | return; | |
465 | ||
466 | if (!(ms_hyperv.features & HV_MSR_TIME_REF_COUNT_AVAILABLE)) | |
467 | return; | |
468 | ||
0af3e137 | 469 | hv_read_reference_counter = read_hv_clock_msr; |
dd2cb348 MK |
470 | clocksource_register_hz(&hyperv_cs_msr, NSEC_PER_SEC/100); |
471 | ||
0af3e137 | 472 | hv_sched_clock_offset = hv_read_reference_counter(); |
bd00cd52 | 473 | hv_setup_sched_clock(read_hv_sched_clock_msr); |
dd2cb348 MK |
474 | } |
475 | EXPORT_SYMBOL_GPL(hv_init_clocksource); |