Commit | Line | Data |
---|---|---|
790c73f6 GOC |
1 | /* KVM paravirtual clock driver. A clocksource implementation |
2 | Copyright (C) 2008 Glauber de Oliveira Costa, Red Hat Inc. | |
3 | ||
4 | This program is free software; you can redistribute it and/or modify | |
5 | it under the terms of the GNU General Public License as published by | |
6 | the Free Software Foundation; either version 2 of the License, or | |
7 | (at your option) any later version. | |
8 | ||
9 | This program is distributed in the hope that it will be useful, | |
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
12 | GNU General Public License for more details. | |
13 | ||
14 | You should have received a copy of the GNU General Public License | |
15 | along with this program; if not, write to the Free Software | |
16 | Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA | |
17 | */ | |
18 | ||
19 | #include <linux/clocksource.h> | |
20 | #include <linux/kvm_para.h> | |
f6e16d5a | 21 | #include <asm/pvclock.h> |
790c73f6 GOC |
22 | #include <asm/msr.h> |
23 | #include <asm/apic.h> | |
24 | #include <linux/percpu.h> | |
3b5d56b9 | 25 | #include <linux/hardirq.h> |
0ad83caa | 26 | #include <linux/sched.h> |
e6017571 | 27 | #include <linux/sched/clock.h> |
368a540e | 28 | #include <linux/mm.h> |
736decac | 29 | |
819aeee0 | 30 | #include <asm/mem_encrypt.h> |
736decac | 31 | #include <asm/x86_init.h> |
1e977aa1 | 32 | #include <asm/reboot.h> |
f4066c2b | 33 | #include <asm/kvmclock.h> |
790c73f6 | 34 | |
/* Non-zero unless "no-kvmclock" was given on the kernel command line. */
static int kvmclock __ro_after_init = 1;
/*
 * MSR numbers used to register the pvclock areas with the hypervisor.
 * kvmclock_init() switches these to the *_NEW variants when the
 * KVM_FEATURE_CLOCKSOURCE2 interface is available.
 */
static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME;
static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK;
/* kvmclock value sampled in kvm_sched_clock_init(); subtracted from every
 * sched_clock reading so sched_clock starts near zero. */
static u64 kvm_sched_clock_offset;
/* "no-kvmclock" on the command line disables this driver entirely. */
static int __init parse_no_kvmclock(char *arg)
{
	kvmclock = 0;
	return 0;
}
early_param("no-kvmclock", parse_no_kvmclock);
46 | ||
/* Aligned to page sizes to match what's mapped via vsyscalls to userspace */
#define HV_CLOCK_SIZE	(sizeof(struct pvclock_vsyscall_time_info) * NR_CPUS)

/* Static backing store for the per-cpu pvclock data; page-sized and
 * page-aligned so it can be mapped read-only into userspace for the vDSO. */
static u8 hv_clock_mem[PAGE_ALIGN(HV_CLOCK_SIZE)] __aligned(PAGE_SIZE);

/* The hypervisor will put information about time periodically here */
static struct pvclock_vsyscall_time_info *hv_clock;
/* Boot wallclock; the hypervisor fills this in on demand (see
 * kvm_get_wallclock()). */
static struct pvclock_wall_clock wall_clock;
790c73f6 | 55 | |
/*
 * The wallclock is the time of day when we booted. Since then, some time may
 * have elapsed since the hypervisor wrote the data. So we try to account for
 * that with system time
 */
static void kvm_get_wallclock(struct timespec64 *now)
{
	struct pvclock_vcpu_time_info *vcpu_time;
	int cpu;

	/* Ask the hypervisor to (re)fill wall_clock; the MSR takes the
	 * guest-physical address of the structure. */
	wrmsrl(msr_kvm_wall_clock, slow_virt_to_phys(&wall_clock));

	/* Pin to a CPU so we consistently read that CPU's pvclock data. */
	cpu = get_cpu();

	vcpu_time = &hv_clock[cpu].pvti;
	pvclock_read_wallclock(&wall_clock, vcpu_time, now);

	put_cpu();
}
75 | ||
/* The host owns the wallclock; a guest cannot set it. */
static int kvm_set_wallclock(const struct timespec64 *now)
{
	return -ENODEV;
}
80 | ||
/*
 * Read the current kvmclock time from this CPU's pvclock page.
 *
 * Used as a sched_clock implementation, so only the notrace preemption
 * helpers may be used here (plain preempt_disable() would recurse via
 * function tracing).
 */
static u64 kvm_clock_read(void)
{
	struct pvclock_vcpu_time_info *src;
	u64 ret;
	int cpu;

	preempt_disable_notrace();
	cpu = smp_processor_id();
	src = &hv_clock[cpu].pvti;
	ret = pvclock_clocksource_read(src);
	preempt_enable_notrace();
	return ret;
}
f6e16d5a | 94 | |
/* clocksource ->read() callback; @cs is unused. */
static u64 kvm_clock_get_cycles(struct clocksource *cs)
{
	return kvm_clock_read();
}
99 | ||
/* sched_clock callback for the TSC-stable case: rebased so that
 * sched_clock starts near zero at the point kvm_sched_clock_init() ran. */
static u64 kvm_sched_clock_read(void)
{
	return kvm_clock_read() - kvm_sched_clock_offset;
}
104 | ||
/*
 * Wire up sched_clock to kvmclock.
 *
 * @stable: whether the pvclock advertised PVCLOCK_TSC_STABLE_BIT.
 * If unstable, use the raw reading and tell the scheduler the clock is
 * not stable; if stable, record the current reading as an offset so
 * sched_clock starts at (roughly) zero.
 */
static inline void kvm_sched_clock_init(bool stable)
{
	if (!stable) {
		pv_time_ops.sched_clock = kvm_clock_read;
		clear_sched_clock_stable();
		return;
	}

	kvm_sched_clock_offset = kvm_clock_read();
	pv_time_ops.sched_clock = kvm_sched_clock_read;

	pr_info("kvm-clock: using sched offset of %llu cycles",
		kvm_sched_clock_offset);

	/* The offset is subtracted from system_time; it must not be wider. */
	BUILD_BUG_ON(sizeof(kvm_sched_clock_offset) >
		sizeof(((struct pvclock_vcpu_time_info *)NULL)->system_time));
}
122 | ||
/*
 * If we don't do that, there is the possibility that the guest
 * will calibrate under heavy load - thus, getting a lower lpj -
 * and execute the delays themselves without load. This is wrong,
 * because no delay loop can finish beforehand.
 * Any heuristics is subject to fail, because ultimately, a large
 * poll of guests can be running and trouble each other. So we preset
 * lpj here
 */
static unsigned long kvm_get_tsc_khz(void)
{
	/* The frequency comes from the host; skip native recalibration. */
	setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
	return pvclock_tsc_khz(&hv_clock[0].pvti);
}
137 | ||
138 | static void kvm_get_preset_lpj(void) | |
139 | { | |
0293615f GC |
140 | unsigned long khz; |
141 | u64 lpj; | |
142 | ||
e93353c9 | 143 | khz = kvm_get_tsc_khz(); |
0293615f GC |
144 | |
145 | lpj = ((u64)khz * 1000); | |
146 | do_div(lpj, HZ); | |
147 | preset_lpj = lpj; | |
148 | } | |
149 | ||
3b5d56b9 EM |
150 | bool kvm_check_and_clear_guest_paused(void) |
151 | { | |
3b5d56b9 | 152 | struct pvclock_vcpu_time_info *src; |
146c394d | 153 | bool ret = false; |
7069ed67 MT |
154 | |
155 | if (!hv_clock) | |
156 | return ret; | |
3b5d56b9 | 157 | |
146c394d | 158 | src = &hv_clock[smp_processor_id()].pvti; |
3b5d56b9 | 159 | if ((src->flags & PVCLOCK_GUEST_STOPPED) != 0) { |
7069ed67 | 160 | src->flags &= ~PVCLOCK_GUEST_STOPPED; |
d63285e9 | 161 | pvclock_touch_watchdogs(); |
3b5d56b9 EM |
162 | ret = true; |
163 | } | |
3b5d56b9 EM |
164 | return ret; |
165 | } | |
3b5d56b9 | 166 | |
/* kvmclock clocksource: continuous 64-bit counter; rating 400 makes it
 * preferred over the raw TSC when both are available. */
struct clocksource kvm_clock = {
	.name = "kvm-clock",
	.read = kvm_clock_get_cycles,
	.rating = 400,
	.mask = CLOCKSOURCE_MASK(64),
	.flags = CLOCK_SOURCE_IS_CONTINUOUS,
};
EXPORT_SYMBOL_GPL(kvm_clock);
790c73f6 | 175 | |
/*
 * Register the current CPU's pvclock area with the hypervisor by writing
 * its guest-physical address to the system time MSR. @txt labels the call
 * site in the log message.
 */
static void kvm_register_clock(char *txt)
{
	struct pvclock_vcpu_time_info *src;
	int cpu = smp_processor_id();
	u64 pa;

	if (!hv_clock)
		return;

	src = &hv_clock[cpu].pvti;
	pa = slow_virt_to_phys(src) | 0x01ULL;	/* bit 0 = enable */
	wrmsrl(msr_kvm_system_time, pa);
	pr_info("kvm-clock: cpu %d, msr %llx, %s", cpu, pa, txt);
}
190 | ||
/* Nothing to save across suspend: the host keeps the pvclock area current. */
static void kvm_save_sched_clock_state(void)
{
}

/* The MSR registration does not survive suspend; redo it on resume. */
static void kvm_restore_sched_clock_state(void)
{
	kvm_register_clock("primary cpu clock, resume");
}
199 | ||
#ifdef CONFIG_X86_LOCAL_APIC
/* Early per-cpu init hook: register the clock on a secondary CPU. */
static void kvm_setup_secondary_clock(void)
{
	kvm_register_clock("secondary cpu clock");
}
#endif
790c73f6 | 206 | |
/*
 * After the clock is registered, the host will keep writing to the
 * registered memory location. If the guest happens to shutdown, this memory
 * won't be valid. In cases like kexec, in which you install a new kernel, this
 * means a random memory location will be kept being written. So before any
 * kind of shutdown from our side, we unregister the clock by writing anything
 * that does not have the 'enable' bit set in the msr
 */
#ifdef CONFIG_KEXEC_CORE
static void kvm_crash_shutdown(struct pt_regs *regs)
{
	/* Writing 0 clears the enable bit: host stops updating our memory. */
	native_write_msr(msr_kvm_system_time, 0, 0);
	kvm_disable_steal_time();
	native_machine_crash_shutdown(regs);
}
#endif
223 | ||
/* Unregister the clock and steal-time areas before a normal shutdown
 * (see the rationale comment above kvm_crash_shutdown()). */
static void kvm_shutdown(void)
{
	native_write_msr(msr_kvm_system_time, 0, 0);
	kvm_disable_steal_time();
	native_machine_shutdown();
}
230 | ||
/*
 * Probe for kvmclock, register the boot CPU's pvclock area, and install
 * all the platform hooks (TSC calibration, wallclock, sched_clock,
 * suspend/resume and shutdown) that route through kvmclock.
 */
void __init kvmclock_init(void)
{
	u8 flags;

	if (!kvm_para_available() || !kvmclock)
		return;

	/* Prefer the second-generation MSR pair; fall back to the legacy
	 * pair, and bail out if neither interface is offered. */
	if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE2)) {
		msr_kvm_system_time = MSR_KVM_SYSTEM_TIME_NEW;
		msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK_NEW;
	} else if (!kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) {
		return;
	}

	pr_info("kvm-clock: Using msrs %x and %x",
		msr_kvm_system_time, msr_kvm_wall_clock);

	hv_clock = (struct pvclock_vsyscall_time_info *)hv_clock_mem;
	kvm_register_clock("primary cpu clock");
	/* Expose cpu0's pvti for the vDSO fast path. */
	pvclock_set_pvti_cpu0_va(hv_clock);

	if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT))
		pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT);

	/* Must read flags after registration so the host has filled them. */
	flags = pvclock_read_flags(&hv_clock[0].pvti);
	kvm_sched_clock_init(flags & PVCLOCK_TSC_STABLE_BIT);

	x86_platform.calibrate_tsc = kvm_get_tsc_khz;
	x86_platform.calibrate_cpu = kvm_get_tsc_khz;
	x86_platform.get_wallclock = kvm_get_wallclock;
	x86_platform.set_wallclock = kvm_set_wallclock;
#ifdef CONFIG_X86_LOCAL_APIC
	x86_cpuinit.early_percpu_clock_init = kvm_setup_secondary_clock;
#endif
	x86_platform.save_sched_clock_state = kvm_save_sched_clock_state;
	x86_platform.restore_sched_clock_state = kvm_restore_sched_clock_state;
	machine_ops.shutdown = kvm_shutdown;
#ifdef CONFIG_KEXEC_CORE
	machine_ops.crash_shutdown = kvm_crash_shutdown;
#endif
	kvm_get_preset_lpj();
	clocksource_register_hz(&kvm_clock, NSEC_PER_SEC);
	pv_info.name = "KVM";
}
/*
 * Enable the vDSO/vsyscall pvclock fast path on x86-64 when the pvclock
 * is TSC-stable.
 *
 * Returns 0 on success or when kvmclock is not in use; 1 when the clock
 * is present but not stable enough for userspace reads.
 */
int __init kvm_setup_vsyscall_timeinfo(void)
{
#ifdef CONFIG_X86_64
	u8 flags;

	if (!hv_clock)
		return 0;

	flags = pvclock_read_flags(&hv_clock[0].pvti);
	if (!(flags & PVCLOCK_TSC_STABLE_BIT))
		return 1;

	kvm_clock.archdata.vclock_mode = VCLOCK_PVCLOCK;
#endif
	return 0;
}