Commit | Line | Data |
---|---|---|
fd1fea68 MK |
1 | // SPDX-License-Identifier: GPL-2.0 |
2 | ||
3 | /* | |
4 | * Clocksource driver for the synthetic counter and timers | |
5 | * provided by the Hyper-V hypervisor to guest VMs, as described | |
6 | * in the Hyper-V Top Level Functional Spec (TLFS). This driver | |
7 | * is instruction set architecture independent. | |
8 | * | |
9 | * Copyright (C) 2019, Microsoft, Inc. | |
10 | * | |
11 | * Author: Michael Kelley <mikelley@microsoft.com> | |
12 | */ | |
13 | ||
14 | #include <linux/percpu.h> | |
15 | #include <linux/cpumask.h> | |
16 | #include <linux/clockchips.h> | |
dd2cb348 MK |
17 | #include <linux/clocksource.h> |
18 | #include <linux/sched_clock.h> | |
fd1fea68 | 19 | #include <linux/mm.h> |
4df4cb9e | 20 | #include <linux/cpuhotplug.h> |
ec866be6 MK |
21 | #include <linux/interrupt.h> |
22 | #include <linux/irq.h> | |
23 | #include <linux/acpi.h> | |
4ad1aa57 | 24 | #include <linux/hyperv.h> |
fd1fea68 MK |
25 | #include <clocksource/hyperv_timer.h> |
26 | #include <asm/hyperv-tlfs.h> | |
27 | #include <asm/mshyperv.h> | |
28 | ||
29 | static struct clock_event_device __percpu *hv_clock_event; | |
bd00cd52 | 30 | static u64 hv_sched_clock_offset __ro_after_init; |
fd1fea68 MK |
31 | |
32 | /* | |
33 | * If false, we're using the old mechanism for stimer0 interrupts | |
34 | * where it sends a VMbus message when it expires. The old | |
35 | * mechanism is used when running on older versions of Hyper-V | |
36 | * that don't support Direct Mode. While Hyper-V provides | |
37 | * four stimer's per CPU, Linux uses only stimer0. | |
4df4cb9e MK |
38 | * |
39 | * Because Direct Mode does not require processing a VMbus | |
40 | * message, stimer interrupts can be enabled earlier in the | |
41 | * process of booting a CPU, and consistent with when timer | |
42 | * interrupts are enabled for other clocksource drivers. | |
43 | * However, for legacy versions of Hyper-V when Direct Mode | |
44 | * is not enabled, setting up stimer interrupts must be | |
45 | * delayed until VMbus is initialized and can process the | |
46 | * interrupt message. | |
fd1fea68 MK |
47 | */ |
48 | static bool direct_mode_enabled; | |
49 | ||
ec866be6 | 50 | static int stimer0_irq = -1; |
fd1fea68 | 51 | static int stimer0_message_sint; |
a4fea9b7 | 52 | static __maybe_unused DEFINE_PER_CPU(long, stimer0_evt); |
fd1fea68 MK |
53 | |
54 | /* | |
ec866be6 MK |
55 | * Common code for stimer0 interrupts coming via Direct Mode or |
56 | * as a VMbus message. | |
fd1fea68 MK |
57 | */ |
58 | void hv_stimer0_isr(void) | |
59 | { | |
60 | struct clock_event_device *ce; | |
61 | ||
62 | ce = this_cpu_ptr(hv_clock_event); | |
63 | ce->event_handler(ce); | |
64 | } | |
65 | EXPORT_SYMBOL_GPL(hv_stimer0_isr); | |
66 | ||
ec866be6 MK |
67 | /* |
68 | * stimer0 interrupt handler for architectures that support | |
69 | * per-cpu interrupts, which also implies Direct Mode. | |
70 | */ | |
a4fea9b7 | 71 | static irqreturn_t __maybe_unused hv_stimer0_percpu_isr(int irq, void *dev_id) |
ec866be6 MK |
72 | { |
73 | hv_stimer0_isr(); | |
74 | return IRQ_HANDLED; | |
75 | } | |
76 | ||
fd1fea68 MK |
77 | static int hv_ce_set_next_event(unsigned long delta, |
78 | struct clock_event_device *evt) | |
79 | { | |
80 | u64 current_tick; | |
81 | ||
0af3e137 | 82 | current_tick = hv_read_reference_counter(); |
fd1fea68 | 83 | current_tick += delta; |
f3c5e63c | 84 | hv_set_register(HV_REGISTER_STIMER0_COUNT, current_tick); |
fd1fea68 MK |
85 | return 0; |
86 | } | |
87 | ||
88 | static int hv_ce_shutdown(struct clock_event_device *evt) | |
89 | { | |
f3c5e63c MK |
90 | hv_set_register(HV_REGISTER_STIMER0_COUNT, 0); |
91 | hv_set_register(HV_REGISTER_STIMER0_CONFIG, 0); | |
ec866be6 MK |
92 | if (direct_mode_enabled && stimer0_irq >= 0) |
93 | disable_percpu_irq(stimer0_irq); | |
fd1fea68 MK |
94 | |
95 | return 0; | |
96 | } | |
97 | ||
98 | static int hv_ce_set_oneshot(struct clock_event_device *evt) | |
99 | { | |
100 | union hv_stimer_config timer_cfg; | |
101 | ||
102 | timer_cfg.as_uint64 = 0; | |
103 | timer_cfg.enable = 1; | |
104 | timer_cfg.auto_enable = 1; | |
105 | if (direct_mode_enabled) { | |
106 | /* | |
107 | * When it expires, the timer will directly interrupt | |
108 | * on the specified hardware vector/IRQ. | |
109 | */ | |
110 | timer_cfg.direct_mode = 1; | |
ec866be6 MK |
111 | timer_cfg.apic_vector = HYPERV_STIMER0_VECTOR; |
112 | if (stimer0_irq >= 0) | |
113 | enable_percpu_irq(stimer0_irq, IRQ_TYPE_NONE); | |
fd1fea68 MK |
114 | } else { |
115 | /* | |
116 | * When it expires, the timer will generate a VMbus message, | |
117 | * to be handled by the normal VMbus interrupt handler. | |
118 | */ | |
119 | timer_cfg.direct_mode = 0; | |
120 | timer_cfg.sintx = stimer0_message_sint; | |
121 | } | |
f3c5e63c | 122 | hv_set_register(HV_REGISTER_STIMER0_CONFIG, timer_cfg.as_uint64); |
fd1fea68 MK |
123 | return 0; |
124 | } | |
125 | ||
126 | /* | |
127 | * hv_stimer_init - Per-cpu initialization of the clockevent | |
128 | */ | |
4df4cb9e | 129 | static int hv_stimer_init(unsigned int cpu) |
fd1fea68 MK |
130 | { |
131 | struct clock_event_device *ce; | |
132 | ||
4df4cb9e MK |
133 | if (!hv_clock_event) |
134 | return 0; | |
fd1fea68 MK |
135 | |
136 | ce = per_cpu_ptr(hv_clock_event, cpu); | |
137 | ce->name = "Hyper-V clockevent"; | |
138 | ce->features = CLOCK_EVT_FEAT_ONESHOT; | |
139 | ce->cpumask = cpumask_of(cpu); | |
140 | ce->rating = 1000; | |
141 | ce->set_state_shutdown = hv_ce_shutdown; | |
142 | ce->set_state_oneshot = hv_ce_set_oneshot; | |
143 | ce->set_next_event = hv_ce_set_next_event; | |
144 | ||
145 | clockevents_config_and_register(ce, | |
146 | HV_CLOCK_HZ, | |
147 | HV_MIN_DELTA_TICKS, | |
148 | HV_MAX_MAX_DELTA_TICKS); | |
4df4cb9e | 149 | return 0; |
fd1fea68 | 150 | } |
fd1fea68 MK |
151 | |
152 | /* | |
153 | * hv_stimer_cleanup - Per-cpu cleanup of the clockevent | |
154 | */ | |
4df4cb9e | 155 | int hv_stimer_cleanup(unsigned int cpu) |
fd1fea68 MK |
156 | { |
157 | struct clock_event_device *ce; | |
158 | ||
4df4cb9e MK |
159 | if (!hv_clock_event) |
160 | return 0; | |
161 | ||
162 | /* | |
163 | * In the legacy case where Direct Mode is not enabled | |
164 | * (which can only be on x86/64), stimer cleanup happens | |
165 | * relatively early in the CPU offlining process. We | |
166 | * must unbind the stimer-based clockevent device so | |
167 | * that the LAPIC timer can take over until clockevents | |
168 | * are no longer needed in the offlining process. Note | |
169 | * that clockevents_unbind_device() eventually calls | |
170 | * hv_ce_shutdown(). | |
171 | * | |
172 | * The unbind should not be done when Direct Mode is | |
173 | * enabled because we may be on an architecture where | |
174 | * there are no other clockevent devices to fallback to. | |
175 | */ | |
176 | ce = per_cpu_ptr(hv_clock_event, cpu); | |
177 | if (direct_mode_enabled) | |
fd1fea68 | 178 | hv_ce_shutdown(ce); |
4df4cb9e MK |
179 | else |
180 | clockevents_unbind_device(ce, cpu); | |
181 | ||
182 | return 0; | |
fd1fea68 MK |
183 | } |
184 | EXPORT_SYMBOL_GPL(hv_stimer_cleanup); | |
185 | ||
ec866be6 MK |
186 | /* |
187 | * These placeholders are overridden by arch specific code on | |
188 | * architectures that need special setup of the stimer0 IRQ because | |
189 | * they don't support per-cpu IRQs (such as x86/x64). | |
190 | */ | |
191 | void __weak hv_setup_stimer0_handler(void (*handler)(void)) | |
192 | { | |
193 | }; | |
194 | ||
195 | void __weak hv_remove_stimer0_handler(void) | |
196 | { | |
197 | }; | |
198 | ||
#ifdef CONFIG_ACPI
/*
 * Register the stimer0 GSI with ACPI and request it as a per-cpu IRQ.
 * Called only on architectures with per-cpu IRQs (i.e., not x86/x64).
 *
 * On success stimer0_irq holds the IRQ number and 0 is returned.
 * On failure the error from the failing call is returned and
 * stimer0_irq is -1 again (the GSI is unregistered if it had been
 * registered).
 */
static int hv_setup_stimer0_irq(void)
{
	int ret;

	ret = acpi_register_gsi(NULL, HYPERV_STIMER0_VECTOR,
				ACPI_EDGE_SENSITIVE, ACPI_ACTIVE_HIGH);
	if (ret < 0) {
		pr_err("Can't register Hyper-V stimer0 GSI. Error %d\n", ret);
		return ret;
	}
	stimer0_irq = ret;

	ret = request_percpu_irq(stimer0_irq, hv_stimer0_percpu_isr,
				 "Hyper-V stimer0", &stimer0_evt);
	if (ret) {
		pr_err("Can't request Hyper-V stimer0 IRQ %d. Error %d\n",
		       stimer0_irq, ret);
		/* Undo the GSI registration so nothing is left half set up */
		acpi_unregister_gsi(stimer0_irq);
		stimer0_irq = -1;
	}
	return ret;
}

/*
 * Undo the stimer0 interrupt setup. With no per-cpu IRQ registered
 * (stimer0_irq == -1) the arch handler is removed instead; otherwise
 * the per-cpu IRQ is freed and its GSI unregistered.
 */
static void hv_remove_stimer0_irq(void)
{
	if (stimer0_irq == -1) {
		hv_remove_stimer0_handler();
	} else {
		free_percpu_irq(stimer0_irq, &stimer0_evt);
		acpi_unregister_gsi(stimer0_irq);
		stimer0_irq = -1;
	}
}
#else
/* Without ACPI there is no GSI to register; report success. */
static int hv_setup_stimer0_irq(void)
{
	return 0;
}

/*
 * Without ACPI no per-cpu IRQ is ever registered, so stimer0_irq stays
 * at -1. Mirror the ACPI variant's handling of that case and remove
 * the arch handler, so one installed via hv_setup_stimer0_handler()
 * does not outlive the clockevent devices.
 */
static void hv_remove_stimer0_irq(void)
{
	hv_remove_stimer0_handler();
}
#endif
ec866be6 | 244 | |
fd1fea68 | 245 | /* hv_stimer_alloc - Global initialization of the clockevent and stimer0 */ |
ec866be6 | 246 | int hv_stimer_alloc(bool have_percpu_irqs) |
fd1fea68 | 247 | { |
ec866be6 | 248 | int ret; |
4df4cb9e MK |
249 | |
250 | /* | |
251 | * Synthetic timers are always available except on old versions of | |
252 | * Hyper-V on x86. In that case, return as error as Linux will use a | |
253 | * clockevent based on emulated LAPIC timer hardware. | |
254 | */ | |
255 | if (!(ms_hyperv.features & HV_MSR_SYNTIMER_AVAILABLE)) | |
256 | return -EINVAL; | |
fd1fea68 MK |
257 | |
258 | hv_clock_event = alloc_percpu(struct clock_event_device); | |
259 | if (!hv_clock_event) | |
260 | return -ENOMEM; | |
261 | ||
262 | direct_mode_enabled = ms_hyperv.misc_features & | |
263 | HV_STIMER_DIRECT_MODE_AVAILABLE; | |
ec866be6 MK |
264 | |
265 | /* | |
266 | * If Direct Mode isn't enabled, the remainder of the initialization | |
267 | * is done later by hv_stimer_legacy_init() | |
268 | */ | |
269 | if (!direct_mode_enabled) | |
270 | return 0; | |
271 | ||
272 | if (have_percpu_irqs) { | |
273 | ret = hv_setup_stimer0_irq(); | |
4df4cb9e | 274 | if (ret) |
ec866be6 MK |
275 | goto free_clock_event; |
276 | } else { | |
277 | hv_setup_stimer0_handler(hv_stimer0_isr); | |
278 | } | |
4df4cb9e | 279 | |
ec866be6 MK |
280 | /* |
281 | * Since we are in Direct Mode, stimer initialization | |
282 | * can be done now with a CPUHP value in the same range | |
283 | * as other clockevent devices. | |
284 | */ | |
285 | ret = cpuhp_setup_state(CPUHP_AP_HYPERV_TIMER_STARTING, | |
286 | "clockevents/hyperv/stimer:starting", | |
287 | hv_stimer_init, hv_stimer_cleanup); | |
288 | if (ret < 0) { | |
289 | hv_remove_stimer0_irq(); | |
290 | goto free_clock_event; | |
fd1fea68 | 291 | } |
4df4cb9e | 292 | return ret; |
fd1fea68 | 293 | |
ec866be6 | 294 | free_clock_event: |
4df4cb9e MK |
295 | free_percpu(hv_clock_event); |
296 | hv_clock_event = NULL; | |
297 | return ret; | |
fd1fea68 MK |
298 | } |
299 | EXPORT_SYMBOL_GPL(hv_stimer_alloc); | |
300 | ||
4df4cb9e MK |
301 | /* |
302 | * hv_stimer_legacy_init -- Called from the VMbus driver to handle | |
303 | * the case when Direct Mode is not enabled, and the stimer | |
304 | * must be initialized late in the CPU onlining process. | |
305 | * | |
306 | */ | |
307 | void hv_stimer_legacy_init(unsigned int cpu, int sint) | |
308 | { | |
309 | if (direct_mode_enabled) | |
310 | return; | |
311 | ||
312 | /* | |
313 | * This function gets called by each vCPU, so setting the | |
314 | * global stimer_message_sint value each time is conceptually | |
315 | * not ideal, but the value passed in is always the same and | |
316 | * it avoids introducing yet another interface into this | |
317 | * clocksource driver just to set the sint in the legacy case. | |
318 | */ | |
319 | stimer0_message_sint = sint; | |
320 | (void)hv_stimer_init(cpu); | |
321 | } | |
322 | EXPORT_SYMBOL_GPL(hv_stimer_legacy_init); | |
323 | ||
324 | /* | |
325 | * hv_stimer_legacy_cleanup -- Called from the VMbus driver to | |
326 | * handle the case when Direct Mode is not enabled, and the | |
327 | * stimer must be cleaned up early in the CPU offlining | |
328 | * process. | |
329 | */ | |
330 | void hv_stimer_legacy_cleanup(unsigned int cpu) | |
331 | { | |
332 | if (direct_mode_enabled) | |
333 | return; | |
334 | (void)hv_stimer_cleanup(cpu); | |
335 | } | |
336 | EXPORT_SYMBOL_GPL(hv_stimer_legacy_cleanup); | |
337 | ||
fd1fea68 MK |
338 | /* |
339 | * Do a global cleanup of clockevents for the cases of kexec and | |
340 | * vmbus exit | |
341 | */ | |
342 | void hv_stimer_global_cleanup(void) | |
343 | { | |
344 | int cpu; | |
fd1fea68 | 345 | |
4df4cb9e MK |
346 | /* |
347 | * hv_stime_legacy_cleanup() will stop the stimer if Direct | |
348 | * Mode is not enabled, and fallback to the LAPIC timer. | |
349 | */ | |
350 | for_each_present_cpu(cpu) { | |
351 | hv_stimer_legacy_cleanup(cpu); | |
fd1fea68 | 352 | } |
4df4cb9e | 353 | |
ec866be6 MK |
354 | if (!hv_clock_event) |
355 | return; | |
356 | ||
357 | if (direct_mode_enabled) { | |
358 | cpuhp_remove_state(CPUHP_AP_HYPERV_TIMER_STARTING); | |
359 | hv_remove_stimer0_irq(); | |
360 | stimer0_irq = -1; | |
361 | } | |
362 | free_percpu(hv_clock_event); | |
363 | hv_clock_event = NULL; | |
364 | ||
fd1fea68 MK |
365 | } |
366 | EXPORT_SYMBOL_GPL(hv_stimer_global_cleanup); | |
dd2cb348 | 367 | |
e39acc37 PZ |
368 | static __always_inline u64 read_hv_clock_msr(void) |
369 | { | |
370 | /* | |
371 | * Read the partition counter to get the current tick count. This count | |
372 | * is set to 0 when the partition is created and is incremented in 100 | |
373 | * nanosecond units. | |
374 | * | |
375 | * Use hv_raw_get_register() because this function is used from | |
376 | * noinstr. Notable; while HV_REGISTER_TIME_REF_COUNT is a synthetic | |
377 | * register it doesn't need the GHCB path. | |
378 | */ | |
379 | return hv_raw_get_register(HV_REGISTER_TIME_REF_COUNT); | |
380 | } | |
381 | ||
dd2cb348 MK |
382 | /* |
383 | * Code and definitions for the Hyper-V clocksources. Two | |
384 | * clocksources are defined: one that reads the Hyper-V defined MSR, and | |
385 | * the other that uses the TSC reference page feature as defined in the | |
386 | * TLFS. The MSR version is for compatibility with old versions of | |
387 | * Hyper-V and 32-bit x86. The TSC reference page version is preferred. | |
388 | */ | |
389 | ||
ddc61bbc BF |
390 | static union { |
391 | struct ms_hyperv_tsc_page page; | |
392 | u8 reserved[PAGE_SIZE]; | |
45f46b1a | 393 | } tsc_pg __bss_decrypted __aligned(PAGE_SIZE); |
dd2cb348 | 394 | |
76be6331 | 395 | static struct ms_hyperv_tsc_page *tsc_page = &tsc_pg.page; |
e1f5c66d SK |
396 | static unsigned long tsc_pfn; |
397 | ||
364adc45 | 398 | unsigned long hv_get_tsc_pfn(void) |
e1f5c66d SK |
399 | { |
400 | return tsc_pfn; | |
401 | } | |
364adc45 | 402 | EXPORT_SYMBOL_GPL(hv_get_tsc_pfn); |
76be6331 | 403 | |
dd2cb348 MK |
404 | struct ms_hyperv_tsc_page *hv_get_tsc_page(void) |
405 | { | |
76be6331 | 406 | return tsc_page; |
dd2cb348 MK |
407 | } |
408 | EXPORT_SYMBOL_GPL(hv_get_tsc_page); | |
409 | ||
e39acc37 | 410 | static __always_inline u64 read_hv_clock_tsc(void) |
dd2cb348 | 411 | { |
9397fa2e | 412 | u64 cur_tsc, time; |
dd2cb348 | 413 | |
9397fa2e PZ |
414 | /* |
415 | * The Hyper-V Top-Level Function Spec (TLFS), section Timers, | |
416 | * subsection Refererence Counter, guarantees that the TSC and MSR | |
417 | * times are in sync and monotonic. Therefore we can fall back | |
418 | * to the MSR in case the TSC page indicates unavailability. | |
419 | */ | |
420 | if (!hv_read_tsc_page_tsc(tsc_page, &cur_tsc, &time)) | |
e39acc37 | 421 | time = read_hv_clock_msr(); |
dd2cb348 | 422 | |
9397fa2e | 423 | return time; |
dd2cb348 MK |
424 | } |
425 | ||
0af3e137 AP |
426 | static u64 notrace read_hv_clock_tsc_cs(struct clocksource *arg) |
427 | { | |
428 | return read_hv_clock_tsc(); | |
429 | } | |
430 | ||
e39acc37 | 431 | static u64 noinstr read_hv_sched_clock_tsc(void) |
dd2cb348 | 432 | { |
749da8ca YX |
433 | return (read_hv_clock_tsc() - hv_sched_clock_offset) * |
434 | (NSEC_PER_SEC / HV_CLOCK_HZ); | |
dd2cb348 MK |
435 | } |
436 | ||
1349401f DC |
437 | static void suspend_hv_clock_tsc(struct clocksource *arg) |
438 | { | |
4ad1aa57 | 439 | union hv_reference_tsc_msr tsc_msr; |
1349401f DC |
440 | |
441 | /* Disable the TSC page */ | |
4ad1aa57 AR |
442 | tsc_msr.as_uint64 = hv_get_register(HV_REGISTER_REFERENCE_TSC); |
443 | tsc_msr.enable = 0; | |
444 | hv_set_register(HV_REGISTER_REFERENCE_TSC, tsc_msr.as_uint64); | |
1349401f DC |
445 | } |
446 | ||
447 | ||
448 | static void resume_hv_clock_tsc(struct clocksource *arg) | |
449 | { | |
4ad1aa57 | 450 | union hv_reference_tsc_msr tsc_msr; |
1349401f DC |
451 | |
452 | /* Re-enable the TSC page */ | |
4ad1aa57 AR |
453 | tsc_msr.as_uint64 = hv_get_register(HV_REGISTER_REFERENCE_TSC); |
454 | tsc_msr.enable = 1; | |
e1f5c66d | 455 | tsc_msr.pfn = tsc_pfn; |
4ad1aa57 | 456 | hv_set_register(HV_REGISTER_REFERENCE_TSC, tsc_msr.as_uint64); |
1349401f DC |
457 | } |
458 | ||
#ifdef HAVE_VDSO_CLOCKMODE_HVCLOCK
/*
 * clocksource .enable callback: mark the Hyper-V vDSO clock mode as
 * used. @cs is unused. Always returns 0.
 */
static int hv_cs_enable(struct clocksource *cs)
{
	vclocks_set_used(VDSO_CLOCKMODE_HVCLOCK);

	return 0;
}
#endif
eec399dd | 466 | |
dd2cb348 MK |
467 | static struct clocksource hyperv_cs_tsc = { |
468 | .name = "hyperv_clocksource_tsc_page", | |
4c78738e | 469 | .rating = 500, |
0af3e137 | 470 | .read = read_hv_clock_tsc_cs, |
dd2cb348 MK |
471 | .mask = CLOCKSOURCE_MASK(64), |
472 | .flags = CLOCK_SOURCE_IS_CONTINUOUS, | |
1349401f DC |
473 | .suspend= suspend_hv_clock_tsc, |
474 | .resume = resume_hv_clock_tsc, | |
3486d2c9 | 475 | #ifdef HAVE_VDSO_CLOCKMODE_HVCLOCK |
eec399dd | 476 | .enable = hv_cs_enable, |
e4ab4658 MK |
477 | .vdso_clock_mode = VDSO_CLOCKMODE_HVCLOCK, |
478 | #else | |
479 | .vdso_clock_mode = VDSO_CLOCKMODE_NONE, | |
480 | #endif | |
dd2cb348 | 481 | }; |
dd2cb348 | 482 | |
0af3e137 AP |
483 | static u64 notrace read_hv_clock_msr_cs(struct clocksource *arg) |
484 | { | |
485 | return read_hv_clock_msr(); | |
486 | } | |
487 | ||
dd2cb348 MK |
488 | static struct clocksource hyperv_cs_msr = { |
489 | .name = "hyperv_clocksource_msr", | |
e5313f1c | 490 | .rating = 495, |
0af3e137 | 491 | .read = read_hv_clock_msr_cs, |
dd2cb348 MK |
492 | .mask = CLOCKSOURCE_MASK(64), |
493 | .flags = CLOCK_SOURCE_IS_CONTINUOUS, | |
494 | }; | |
495 | ||
eb3e1d37 MK |
496 | /* |
497 | * Reference to pv_ops must be inline so objtool | |
498 | * detection of noinstr violations can work correctly. | |
499 | */ | |
500 | #ifdef CONFIG_GENERIC_SCHED_CLOCK | |
501 | static __always_inline void hv_setup_sched_clock(void *sched_clock) | |
502 | { | |
503 | /* | |
504 | * We're on an architecture with generic sched clock (not x86/x64). | |
505 | * The Hyper-V sched clock read function returns nanoseconds, not | |
506 | * the normal 100ns units of the Hyper-V synthetic clock. | |
507 | */ | |
508 | sched_clock_register(sched_clock, 64, NSEC_PER_SEC); | |
509 | } | |
510 | #elif defined CONFIG_PARAVIRT | |
511 | static __always_inline void hv_setup_sched_clock(void *sched_clock) | |
512 | { | |
513 | /* We're on x86/x64 *and* using PV ops */ | |
4d480dbf | 514 | paravirt_set_sched_clock(sched_clock); |
eb3e1d37 MK |
515 | } |
516 | #else /* !CONFIG_GENERIC_SCHED_CLOCK && !CONFIG_PARAVIRT */ | |
517 | static __always_inline void hv_setup_sched_clock(void *sched_clock) {} | |
518 | #endif /* CONFIG_GENERIC_SCHED_CLOCK */ | |
519 | ||
e5313f1c | 520 | static void __init hv_init_tsc_clocksource(void) |
dd2cb348 | 521 | { |
4ad1aa57 | 522 | union hv_reference_tsc_msr tsc_msr; |
dd2cb348 | 523 | |
4c78738e MK |
524 | /* |
525 | * If Hyper-V offers TSC_INVARIANT, then the virtualized TSC correctly | |
526 | * handles frequency and offset changes due to live migration, | |
527 | * pause/resume, and other VM management operations. So lower the | |
528 | * Hyper-V Reference TSC rating, causing the generic TSC to be used. | |
529 | * TSC_INVARIANT is not offered on ARM64, so the Hyper-V Reference | |
530 | * TSC will be preferred over the virtualized ARM64 arch counter. | |
531 | */ | |
ec866be6 | 532 | if (ms_hyperv.features & HV_ACCESS_TSC_INVARIANT) { |
4c78738e | 533 | hyperv_cs_tsc.rating = 250; |
e5313f1c | 534 | hyperv_cs_msr.rating = 245; |
ec866be6 | 535 | } |
4c78738e | 536 | |
c0e96acf | 537 | if (!(ms_hyperv.features & HV_MSR_REFERENCE_TSC_AVAILABLE)) |
e5313f1c | 538 | return; |
c0e96acf | 539 | |
0af3e137 | 540 | hv_read_reference_counter = read_hv_clock_tsc; |
dd2cb348 MK |
541 | |
542 | /* | |
0408f16b SK |
543 | * TSC page mapping works differently in root compared to guest. |
544 | * - In guest partition the guest PFN has to be passed to the | |
545 | * hypervisor. | |
546 | * - In root partition it's other way around: it has to map the PFN | |
547 | * provided by the hypervisor. | |
548 | * But it can't be mapped right here as it's too early and MMU isn't | |
549 | * ready yet. So, we only set the enable bit here and will remap the | |
550 | * page later in hv_remap_tsc_clocksource(). | |
551 | * | |
552 | * It worth mentioning, that TSC clocksource read function | |
553 | * (read_hv_clock_tsc) has a MSR-based fallback mechanism, used when | |
554 | * TSC page is zeroed (which is the case until the PFN is remapped) and | |
555 | * thus TSC clocksource will work even without the real TSC page | |
556 | * mapped. | |
dd2cb348 | 557 | */ |
4ad1aa57 | 558 | tsc_msr.as_uint64 = hv_get_register(HV_REGISTER_REFERENCE_TSC); |
0408f16b SK |
559 | if (hv_root_partition) |
560 | tsc_pfn = tsc_msr.pfn; | |
561 | else | |
562 | tsc_pfn = HVPFN_DOWN(virt_to_phys(tsc_page)); | |
4ad1aa57 | 563 | tsc_msr.enable = 1; |
e1f5c66d | 564 | tsc_msr.pfn = tsc_pfn; |
4ad1aa57 | 565 | hv_set_register(HV_REGISTER_REFERENCE_TSC, tsc_msr.as_uint64); |
dd2cb348 | 566 | |
dd2cb348 MK |
567 | clocksource_register_hz(&hyperv_cs_tsc, NSEC_PER_SEC/100); |
568 | ||
e5313f1c MK |
569 | /* |
570 | * If TSC is invariant, then let it stay as the sched clock since it | |
571 | * will be faster than reading the TSC page. But if not invariant, use | |
572 | * the TSC page so that live migrations across hosts with different | |
573 | * frequencies is handled correctly. | |
574 | */ | |
575 | if (!(ms_hyperv.features & HV_ACCESS_TSC_INVARIANT)) { | |
576 | hv_sched_clock_offset = hv_read_reference_counter(); | |
577 | hv_setup_sched_clock(read_hv_sched_clock_tsc); | |
578 | } | |
dd2cb348 | 579 | } |
dd2cb348 MK |
580 | |
581 | void __init hv_init_clocksource(void) | |
582 | { | |
583 | /* | |
e5313f1c MK |
584 | * Try to set up the TSC page clocksource, then the MSR clocksource. |
585 | * At least one of these will always be available except on very old | |
586 | * versions of Hyper-V on x86. In that case we won't have a Hyper-V | |
dd2cb348 MK |
587 | * clocksource, but Linux will still run with a clocksource based |
588 | * on the emulated PIT or LAPIC timer. | |
e5313f1c MK |
589 | * |
590 | * Never use the MSR clocksource as sched clock. It's too slow. | |
591 | * Better to use the native sched clock as the fallback. | |
dd2cb348 | 592 | */ |
e5313f1c | 593 | hv_init_tsc_clocksource(); |
dd2cb348 | 594 | |
e5313f1c MK |
595 | if (ms_hyperv.features & HV_MSR_TIME_REF_COUNT_AVAILABLE) |
596 | clocksource_register_hz(&hyperv_cs_msr, NSEC_PER_SEC/100); | |
dd2cb348 | 597 | } |
0408f16b SK |
598 | |
599 | void __init hv_remap_tsc_clocksource(void) | |
600 | { | |
601 | if (!(ms_hyperv.features & HV_MSR_REFERENCE_TSC_AVAILABLE)) | |
602 | return; | |
603 | ||
604 | if (!hv_root_partition) { | |
605 | WARN(1, "%s: attempt to remap TSC page in guest partition\n", | |
606 | __func__); | |
607 | return; | |
608 | } | |
609 | ||
610 | tsc_page = memremap(tsc_pfn << HV_HYP_PAGE_SHIFT, sizeof(tsc_pg), | |
611 | MEMREMAP_WB); | |
612 | if (!tsc_page) | |
613 | pr_err("Failed to remap Hyper-V TSC page.\n"); | |
614 | } |