x86/kvmclock: Cleanup the code
[linux-block.git] arch/x86/kernel/kvmclock.c
/*  KVM paravirtual clock driver. A clocksource implementation
    Copyright (C) 2008 Glauber de Oliveira Costa, Red Hat Inc.

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/

#include <linux/clocksource.h>
#include <linux/kvm_para.h>
#include <asm/pvclock.h>
#include <asm/msr.h>
#include <asm/apic.h>
#include <linux/percpu.h>
#include <linux/hardirq.h>
#include <linux/sched.h>
#include <linux/sched/clock.h>
#include <linux/mm.h>

#include <asm/mem_encrypt.h>
#include <asm/x86_init.h>
#include <asm/reboot.h>
#include <asm/kvmclock.h>

static int kvmclock __ro_after_init = 1;
static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME;
static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK;
static u64 kvm_sched_clock_offset;

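/*
 * Booting with "no-kvmclock" disables registration of the paravirtual
 * clock, leaving the guest on native timekeeping (typically the TSC).
 */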
static int __init parse_no_kvmclock(char *arg)
{
	kvmclock = 0;
	return 0;
}
early_param("no-kvmclock", parse_no_kvmclock);

/* Aligned to page size to match what's mapped via vsyscalls to userspace */
#define HV_CLOCK_SIZE	(sizeof(struct pvclock_vsyscall_time_info) * NR_CPUS)

static u8 hv_clock_mem[PAGE_ALIGN(HV_CLOCK_SIZE)] __aligned(PAGE_SIZE);

/* The hypervisor periodically updates the time information here */
static struct pvclock_vsyscall_time_info *hv_clock;
static struct pvclock_wall_clock wall_clock;

/*
 * The wallclock is the time of day when we booted. Some time may have
 * elapsed since the hypervisor wrote the data, so we account for that
 * with the per-cpu system time.
 */
static void kvm_get_wallclock(struct timespec64 *now)
{
	struct pvclock_vcpu_time_info *vcpu_time;
	int cpu;

	wrmsrl(msr_kvm_wall_clock, slow_virt_to_phys(&wall_clock));

	cpu = get_cpu();

	vcpu_time = &hv_clock[cpu].pvti;
	pvclock_read_wallclock(&wall_clock, vcpu_time, now);

	put_cpu();
}

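/* The guest cannot push a wallclock update back to the host */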
static int kvm_set_wallclock(const struct timespec64 *now)
{
	return -ENODEV;
}

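/*
 * Read this cpu's pvclock area. Preemption is disabled so the per-cpu
 * time info is not switched under us; the notrace variants are used
 * because this path also backs sched_clock() and may be entered from
 * tracing code.
 */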
static u64 kvm_clock_read(void)
{
	struct pvclock_vcpu_time_info *src;
	u64 ret;
	int cpu;

	preempt_disable_notrace();
	cpu = smp_processor_id();
	src = &hv_clock[cpu].pvti;
	ret = pvclock_clocksource_read(src);
	preempt_enable_notrace();
	return ret;
}

static u64 kvm_clock_get_cycles(struct clocksource *cs)
{
	return kvm_clock_read();
}

static u64 kvm_sched_clock_read(void)
{
	return kvm_clock_read() - kvm_sched_clock_offset;
}

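/*
 * If the host marks kvmclock as stable across vcpus, sched_clock can use
 * the kvmclock reading minus the boot-time offset. Otherwise fall back to
 * the raw per-cpu reading and tell the scheduler the clock is unstable.
 */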
static inline void kvm_sched_clock_init(bool stable)
{
	if (!stable) {
		pv_time_ops.sched_clock = kvm_clock_read;
		clear_sched_clock_stable();
		return;
	}

	kvm_sched_clock_offset = kvm_clock_read();
	pv_time_ops.sched_clock = kvm_sched_clock_read;

	pr_info("kvm-clock: using sched offset of %llu cycles",
		kvm_sched_clock_offset);

	BUILD_BUG_ON(sizeof(kvm_sched_clock_offset) >
		sizeof(((struct pvclock_vcpu_time_info *)NULL)->system_time));
}

/*
 * If we don't preset lpj, there is the possibility that the guest will
 * calibrate under heavy load - thus getting a lower lpj - and then execute
 * its delay loops without load. This is wrong, because no delay loop may
 * finish earlier than requested. Any heuristic is bound to fail, because
 * ultimately a large pool of guests can be running and disturbing each
 * other. So we preset lpj here.
 */
static unsigned long kvm_get_tsc_khz(void)
{
	setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
	return pvclock_tsc_khz(&hv_clock[0].pvti);
}

static void kvm_get_preset_lpj(void)
{
	unsigned long khz;
	u64 lpj;

	khz = kvm_get_tsc_khz();

	lpj = ((u64)khz * 1000);
	do_div(lpj, HZ);
	preset_lpj = lpj;
}

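/*
 * Used by the watchdogs: the host sets PVCLOCK_GUEST_STOPPED while the VM
 * is paused, so a long gap in timestamps is not a lockup. The flag is
 * cleared here so each pause is only acted upon once.
 */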
bool kvm_check_and_clear_guest_paused(void)
{
	struct pvclock_vcpu_time_info *src;
	bool ret = false;

	if (!hv_clock)
		return ret;

	src = &hv_clock[smp_processor_id()].pvti;
	if ((src->flags & PVCLOCK_GUEST_STOPPED) != 0) {
		src->flags &= ~PVCLOCK_GUEST_STOPPED;
		pvclock_touch_watchdogs();
		ret = true;
	}
	return ret;
}

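/* Rating 400 makes this the preferred clocksource whenever it is available */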
struct clocksource kvm_clock = {
	.name	= "kvm-clock",
	.read	= kvm_clock_get_cycles,
	.rating	= 400,
	.mask	= CLOCKSOURCE_MASK(64),
	.flags	= CLOCK_SOURCE_IS_CONTINUOUS,
};
EXPORT_SYMBOL_GPL(kvm_clock);

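/*
 * Point the host at this cpu's pvclock area: the MSR takes the guest
 * physical address of the area, with the low bit acting as 'enable'.
 */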
static void kvm_register_clock(char *txt)
{
	struct pvclock_vcpu_time_info *src;
	int cpu = smp_processor_id();
	u64 pa;

	if (!hv_clock)
		return;

	src = &hv_clock[cpu].pvti;
	pa = slow_virt_to_phys(src) | 0x01ULL;
	wrmsrl(msr_kvm_system_time, pa);
	pr_info("kvm-clock: cpu %d, msr %llx, %s", cpu, pa, txt);
}

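/*
 * There is no state to save on suspend; on resume the boot cpu simply
 * re-registers its clock area with the host.
 */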
static void kvm_save_sched_clock_state(void)
{
}

static void kvm_restore_sched_clock_state(void)
{
	kvm_register_clock("primary cpu clock, resume");
}

#ifdef CONFIG_X86_LOCAL_APIC
static void kvm_setup_secondary_clock(void)
{
	kvm_register_clock("secondary cpu clock");
}
#endif

/*
 * After the clock is registered, the host will keep writing to the
 * registered memory location. If the guest happens to shut down, this
 * memory won't be valid. In cases like kexec, in which you install a new
 * kernel, this means a random memory location would keep being written to.
 * So before any kind of shutdown from our side, we unregister the clock by
 * writing a value that does not have the 'enable' bit set in the msr.
 */
#ifdef CONFIG_KEXEC_CORE
static void kvm_crash_shutdown(struct pt_regs *regs)
{
	native_write_msr(msr_kvm_system_time, 0, 0);
	kvm_disable_steal_time();
	native_machine_crash_shutdown(regs);
}
#endif

static void kvm_shutdown(void)
{
	native_write_msr(msr_kvm_system_time, 0, 0);
	kvm_disable_steal_time();
	native_machine_shutdown();
}

void __init kvmclock_init(void)
{
	u8 flags;

	if (!kvm_para_available() || !kvmclock)
		return;

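	/*
	 * Prefer the newer MSR pair when the host advertises it; without
	 * either clocksource feature there is nothing to register.
	 */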
	if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE2)) {
		msr_kvm_system_time = MSR_KVM_SYSTEM_TIME_NEW;
		msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK_NEW;
	} else if (!kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) {
		return;
	}

	pr_info("kvm-clock: Using msrs %x and %x",
		msr_kvm_system_time, msr_kvm_wall_clock);

	hv_clock = (struct pvclock_vsyscall_time_info *)hv_clock_mem;
	kvm_register_clock("primary cpu clock");
	pvclock_set_pvti_cpu0_va(hv_clock);

	if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT))
		pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT);

	flags = pvclock_read_flags(&hv_clock[0].pvti);
	kvm_sched_clock_init(flags & PVCLOCK_TSC_STABLE_BIT);

	x86_platform.calibrate_tsc = kvm_get_tsc_khz;
	x86_platform.calibrate_cpu = kvm_get_tsc_khz;
	x86_platform.get_wallclock = kvm_get_wallclock;
	x86_platform.set_wallclock = kvm_set_wallclock;
#ifdef CONFIG_X86_LOCAL_APIC
	x86_cpuinit.early_percpu_clock_init = kvm_setup_secondary_clock;
#endif
	x86_platform.save_sched_clock_state = kvm_save_sched_clock_state;
	x86_platform.restore_sched_clock_state = kvm_restore_sched_clock_state;
	machine_ops.shutdown = kvm_shutdown;
#ifdef CONFIG_KEXEC_CORE
	machine_ops.crash_shutdown = kvm_crash_shutdown;
#endif
	kvm_get_preset_lpj();
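	/* kvmclock counts in nanoseconds, hence the 1GHz registration */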
	clocksource_register_hz(&kvm_clock, NSEC_PER_SEC);
	pv_info.name = "KVM";
}
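
/*
 * Expose kvmclock to the vDSO only when the TSC stable bit is set;
 * otherwise userspace clock reads must take the syscall path.
 */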
int __init kvm_setup_vsyscall_timeinfo(void)
{
#ifdef CONFIG_X86_64
	u8 flags;

	if (!hv_clock)
		return 0;

	flags = pvclock_read_flags(&hv_clock[0].pvti);
	if (!(flags & PVCLOCK_TSC_STABLE_BIT))
		return 1;

	kvm_clock.archdata.vclock_mode = VCLOCK_PVCLOCK;
#endif
	return 0;
}