/* arch/x86/kernel/tsc.c */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/sched/clock.h>
#include <linux/init.h>
#include <linux/export.h>
#include <linux/timer.h>
#include <linux/acpi_pmtmr.h>
#include <linux/cpufreq.h>
#include <linux/delay.h>
#include <linux/clocksource.h>
#include <linux/percpu.h>
#include <linux/timex.h>
#include <linux/static_key.h>

#include <asm/hpet.h>
#include <asm/timer.h>
#include <asm/vgtod.h>
#include <asm/time.h>
#include <asm/delay.h>
#include <asm/hypervisor.h>
#include <asm/nmi.h>
#include <asm/x86_init.h>
#include <asm/geode.h>
#include <asm/apic.h>
#include <asm/intel-family.h>
#include <asm/i8259.h>

unsigned int __read_mostly cpu_khz;	/* TSC clocks / usec, not used here */
EXPORT_SYMBOL(cpu_khz);

unsigned int __read_mostly tsc_khz;
EXPORT_SYMBOL(tsc_khz);

/*
 * TSC can be unstable due to cpufreq or due to unsynced TSCs
 */
static int __read_mostly tsc_unstable;

static DEFINE_STATIC_KEY_FALSE(__use_tsc);

int tsc_clocksource_reliable;

static u32 art_to_tsc_numerator;
static u32 art_to_tsc_denominator;
static u64 art_to_tsc_offset;
struct clocksource *art_related_clocksource;

struct cyc2ns {
	struct cyc2ns_data data[2];	/*  0 + 2*16 = 32 */
	seqcount_t	   seq;		/* 32 + 4    = 36 */

}; /* fits one cacheline */

static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns);

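/*
 * cyc2ns data is published with a seqcount latch: set_cyc2ns_scale() bumps
 * the sequence count around each of the two data[] slots, and readers pick
 * data[seq & 1], retrying only if the sequence changed underneath them.
 */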
void cyc2ns_read_begin(struct cyc2ns_data *data)
{
	int seq, idx;

	preempt_disable_notrace();

	do {
		seq = this_cpu_read(cyc2ns.seq.sequence);
		idx = seq & 1;

		data->cyc2ns_offset = this_cpu_read(cyc2ns.data[idx].cyc2ns_offset);
		data->cyc2ns_mul    = this_cpu_read(cyc2ns.data[idx].cyc2ns_mul);
		data->cyc2ns_shift  = this_cpu_read(cyc2ns.data[idx].cyc2ns_shift);

	} while (unlikely(seq != this_cpu_read(cyc2ns.seq.sequence)));
}

void cyc2ns_read_end(void)
{
	preempt_enable_notrace();
}

/*
 * Accelerators for sched_clock()
 * convert from cycles(64bits) => nanoseconds (64bits)
 *  basic equation:
 *              ns = cycles / (freq / ns_per_sec)
 *              ns = cycles * (ns_per_sec / freq)
 *              ns = cycles * (10^9 / (cpu_khz * 10^3))
 *              ns = cycles * (10^6 / cpu_khz)
 *
 *      Then we use scaling math (suggested by george@mvista.com) to get:
 *              ns = cycles * (10^6 * SC / cpu_khz) / SC
 *              ns = cycles * cyc2ns_scale / SC
 *
 *      And since SC is a constant power of two, we can convert the div
 *  into a shift. The larger SC is, the more accurate the conversion, but
 *  cyc2ns_scale needs to be a 32-bit value so that 32-bit multiplication
 *  (64-bit result) can be used.
 *
 *  Using a kHz divisor instead of MHz keeps better precision.
 *  (mathieu.desnoyers@polymtl.ca)
 *
 *                      -johnstul@us.ibm.com "math is hard, lets go shopping!"
 */
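/*
 * Worked example (illustrative numbers only): for a 2 GHz TSC
 * (khz == 2,000,000) the mult/shift pair computed below comes out to
 * roughly cyc2ns_mul = 2^30 with cyc2ns_shift = 31, i.e. 0.5 ns per
 * cycle, so 1000 cycles convert to 1000 * 2^30 >> 31 = 500 ns.
 */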

static void cyc2ns_data_init(struct cyc2ns_data *data)
{
	data->cyc2ns_mul = 0;
	data->cyc2ns_shift = 0;
	data->cyc2ns_offset = 0;
}

static void __init cyc2ns_init(int cpu)
{
	struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu);

	cyc2ns_data_init(&c2n->data[0]);
	cyc2ns_data_init(&c2n->data[1]);

	seqcount_init(&c2n->seq);
}

static inline unsigned long long cycles_2_ns(unsigned long long cyc)
{
	struct cyc2ns_data data;
	unsigned long long ns;

	cyc2ns_read_begin(&data);

	ns = data.cyc2ns_offset;
	ns += mul_u64_u32_shr(cyc, data.cyc2ns_mul, data.cyc2ns_shift);

	cyc2ns_read_end();

	return ns;
}

static void set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_now)
{
	unsigned long long ns_now;
	struct cyc2ns_data data;
	struct cyc2ns *c2n;
	unsigned long flags;

	local_irq_save(flags);
	sched_clock_idle_sleep_event();

	if (!khz)
		goto done;

	ns_now = cycles_2_ns(tsc_now);

	/*
	 * Compute a new multiplier as per the above comment and ensure our
	 * time function is continuous; see the comment near struct
	 * cyc2ns_data.
	 */
	clocks_calc_mult_shift(&data.cyc2ns_mul, &data.cyc2ns_shift, khz,
			       NSEC_PER_MSEC, 0);

	/*
	 * cyc2ns_shift is exported via arch_perf_update_userpage() where it is
	 * not expected to be greater than 31 due to the original published
	 * conversion algorithm shifting a 32-bit value (now specifies a 64-bit
	 * value) - see the perf_event_mmap_page documentation in perf_event.h.
	 */
	if (data.cyc2ns_shift == 32) {
		data.cyc2ns_shift = 31;
		data.cyc2ns_mul >>= 1;
	}

	data.cyc2ns_offset = ns_now -
		mul_u64_u32_shr(tsc_now, data.cyc2ns_mul, data.cyc2ns_shift);

	c2n = per_cpu_ptr(&cyc2ns, cpu);

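	/*
	 * Publish the new parameters with the seqcount latch: the first
	 * bump of c2n->seq steers readers to data[1] while data[0] is
	 * rewritten, the second bump steers them back to the updated
	 * data[0] while data[1] is brought in sync.
	 */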
	raw_write_seqcount_latch(&c2n->seq);
	c2n->data[0] = data;
	raw_write_seqcount_latch(&c2n->seq);
	c2n->data[1] = data;

done:
	sched_clock_idle_wakeup_event();
	local_irq_restore(flags);
}

/*
 * Scheduler clock - returns current time in nanosec units.
 */
u64 native_sched_clock(void)
{
	if (static_branch_likely(&__use_tsc)) {
		u64 tsc_now = rdtsc();

		/* return the value in ns */
		return cycles_2_ns(tsc_now);
	}

	/*
	 * Fall back to jiffies if there's no TSC available:
	 * ( But note that we still use it if the TSC is marked
	 *   unstable. We do this because unlike Time Of Day,
	 *   the scheduler clock tolerates small errors and it's
	 *   very important for it to be as fast as the platform
	 *   can achieve it. )
	 */

	/* No locking but a rare wrong value is not a big deal: */
	return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
}

/*
 * Generate a sched_clock if you already have a TSC value.
 */
u64 native_sched_clock_from_tsc(u64 tsc)
{
	return cycles_2_ns(tsc);
}

/* We need to define a real function for sched_clock, to override the
   weak default version */
#ifdef CONFIG_PARAVIRT
unsigned long long sched_clock(void)
{
	return paravirt_sched_clock();
}

bool using_native_sched_clock(void)
{
	return pv_time_ops.sched_clock == native_sched_clock;
}
#else
unsigned long long
sched_clock(void) __attribute__((alias("native_sched_clock")));

bool using_native_sched_clock(void) { return true; }
#endif

int check_tsc_unstable(void)
{
	return tsc_unstable;
}
EXPORT_SYMBOL_GPL(check_tsc_unstable);

#ifdef CONFIG_X86_TSC
int __init notsc_setup(char *str)
{
	mark_tsc_unstable("boot parameter notsc");
	return 1;
}
#else
/*
 * disable flag for tsc. Takes effect by clearing the TSC cpu flag
 * in cpu/common.c
 */
int __init notsc_setup(char *str)
{
	setup_clear_cpu_cap(X86_FEATURE_TSC);
	return 1;
}
#endif

__setup("notsc", notsc_setup);

static int no_sched_irq_time;

static int __init tsc_setup(char *str)
{
	if (!strcmp(str, "reliable"))
		tsc_clocksource_reliable = 1;
	if (!strncmp(str, "noirqtime", 9))
		no_sched_irq_time = 1;
	if (!strcmp(str, "unstable"))
		mark_tsc_unstable("boot parameter");
	return 1;
}

__setup("tsc=", tsc_setup);

#define MAX_RETRIES     5
#define SMI_TRESHOLD    50000

/*
 * Read TSC and the reference counters. Take care of SMI disturbance
 */
static u64 tsc_read_refs(u64 *p, int hpet)
{
	u64 t1, t2;
	int i;

	for (i = 0; i < MAX_RETRIES; i++) {
		t1 = get_cycles();
		if (hpet)
			*p = hpet_readl(HPET_COUNTER) & 0xFFFFFFFF;
		else
			*p = acpi_pm_read_early();
		t2 = get_cycles();
		if ((t2 - t1) < SMI_TRESHOLD)
			return t2;
	}
	return ULLONG_MAX;
}

/*
 * Calculate the TSC frequency from HPET reference
 */
static unsigned long calc_hpet_ref(u64 deltatsc, u64 hpet1, u64 hpet2)
{
	u64 tmp;

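	/*
	 * deltatsc arrives pre-scaled by 10^6 (TSC cycles * 1e6, see the
	 * callers). HPET ticks * HPET_PERIOD yields femtoseconds; dividing
	 * by 10^6 turns that into nanoseconds, so the final division
	 * returns the TSC rate in kHz.
	 */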
	if (hpet2 < hpet1)
		hpet2 += 0x100000000ULL;
	hpet2 -= hpet1;
	tmp = ((u64)hpet2 * hpet_readl(HPET_PERIOD));
	do_div(tmp, 1000000);
	deltatsc = div64_u64(deltatsc, tmp);

	return (unsigned long) deltatsc;
}

/*
 * Calculate the TSC frequency from PMTimer reference
 */
static unsigned long calc_pmtimer_ref(u64 deltatsc, u64 pm1, u64 pm2)
{
	u64 tmp;

	if (!pm1 && !pm2)
		return ULONG_MAX;

	if (pm2 < pm1)
		pm2 += (u64)ACPI_PM_OVRRUN;
	pm2 -= pm1;
	tmp = pm2 * 1000000000LL;
	do_div(tmp, PMTMR_TICKS_PER_SEC);
	do_div(deltatsc, tmp);

	return (unsigned long) deltatsc;
}

#define CAL_MS		10
#define CAL_LATCH	(PIT_TICK_RATE / (1000 / CAL_MS))
#define CAL_PIT_LOOPS	1000

#define CAL2_MS		50
#define CAL2_LATCH	(PIT_TICK_RATE / (1000 / CAL2_MS))
#define CAL2_PIT_LOOPS	5000
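/*
 * With PIT_TICK_RATE = 1193182 Hz this gives CAL_LATCH ~= 11931 PIT ticks
 * for the 10 ms pass and CAL2_LATCH ~= 59659 ticks for the slower 50 ms
 * retry pass used when the first calibration attempts look disturbed.
 */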

/*
 * Try to calibrate the TSC against the Programmable
 * Interrupt Timer and return the frequency of the TSC
 * in kHz.
 *
 * Return ULONG_MAX on failure to calibrate.
 */
static unsigned long pit_calibrate_tsc(u32 latch, unsigned long ms, int loopmin)
{
	u64 tsc, t1, t2, delta;
	unsigned long tscmin, tscmax;
	int pitcnt;

	if (!has_legacy_pic()) {
		/*
		 * Relies on tsc_early_delay_calibrate() to have given us semi
		 * usable udelay(), wait for the same 50ms we would have with
		 * the PIT loop below.
		 */
		udelay(10 * USEC_PER_MSEC);
		udelay(10 * USEC_PER_MSEC);
		udelay(10 * USEC_PER_MSEC);
		udelay(10 * USEC_PER_MSEC);
		udelay(10 * USEC_PER_MSEC);
		return ULONG_MAX;
	}

	/* Set the Gate high, disable speaker */
	outb((inb(0x61) & ~0x02) | 0x01, 0x61);

	/*
	 * Setup CTC channel 2* for mode 0, (interrupt on terminal
	 * count mode), binary count. Set the latch register to the
	 * requested timeout (LSB then MSB) to begin countdown.
	 */
	outb(0xb0, 0x43);
	outb(latch & 0xff, 0x42);
	outb(latch >> 8, 0x42);

	tsc = t1 = t2 = get_cycles();

	pitcnt = 0;
	tscmax = 0;
	tscmin = ULONG_MAX;
	while ((inb(0x61) & 0x20) == 0) {
		t2 = get_cycles();
		delta = t2 - tsc;
		tsc = t2;
		if ((unsigned long) delta < tscmin)
			tscmin = (unsigned int) delta;
		if ((unsigned long) delta > tscmax)
			tscmax = (unsigned int) delta;
		pitcnt++;
	}

	/*
	 * Sanity checks:
	 *
	 * If we were not able to read the PIT more than loopmin
	 * times, then we have been hit by a massive SMI
	 *
	 * If the maximum is 10 times larger than the minimum,
	 * then we got hit by an SMI as well.
	 */
	if (pitcnt < loopmin || tscmax > 10 * tscmin)
		return ULONG_MAX;

	/* Calculate the PIT value */
	delta = t2 - t1;
	do_div(delta, ms);
	return delta;
}

/*
 * This reads the current MSB of the PIT counter, and
 * checks if we are running on sufficiently fast and
 * non-virtualized hardware.
 *
 * Our expectations are:
 *
 *  - the PIT is running at roughly 1.19MHz
 *
 *  - each IO is going to take about 1us on real hardware,
 *    but we allow it to be much faster (by a factor of 10) or
 *    _slightly_ slower (ie we allow up to a 2us read+counter
 *    update - anything else implies an unacceptably slow CPU
 *    or PIT for the fast calibration to work).
 *
 *  - with 256 PIT ticks to read the value, we have 214us to
 *    see the same MSB (and overhead like doing a single TSC
 *    read per MSB value etc).
 *
 *  - We're doing 2 reads per loop (LSB, MSB), and we expect
 *    them each to take about a microsecond on real hardware.
 *    So we expect a count value of around 100. But we'll be
 *    generous, and accept anything over 50.
 *
 *  - if the PIT is stuck, and we see *many* more reads, we
 *    return early (and the next caller of pit_expect_msb()
 *    then considers it a failure when they don't see the
 *    next expected value).
 *
 * These expectations mean that we know that we have seen the
 * transition from one expected value to another with a fairly
 * high accuracy, and we didn't miss any events. We can thus
 * use the TSC value at the transitions to calculate a pretty
 * good value for the TSC frequency.
 */
static inline int pit_verify_msb(unsigned char val)
{
	/* Ignore LSB */
	inb(0x42);
	return inb(0x42) == val;
}

static inline int pit_expect_msb(unsigned char val, u64 *tscp, unsigned long *deltap)
{
	int count;
	u64 tsc = 0, prev_tsc = 0;

	for (count = 0; count < 50000; count++) {
		if (!pit_verify_msb(val))
			break;
		prev_tsc = tsc;
		tsc = get_cycles();
	}
	*deltap = get_cycles() - prev_tsc;
	*tscp = tsc;

	/*
	 * We require _some_ success, but the quality control
	 * will be based on the error terms on the TSC values.
	 */
	return count > 5;
}

/*
 * How many MSB values do we want to see? We aim for
 * a maximum error rate of 500ppm (in practice the
 * real error is much smaller), but refuse to spend
 * more than 50ms on it.
 */
#define MAX_QUICK_PIT_MS 50
#define MAX_QUICK_PIT_ITERATIONS (MAX_QUICK_PIT_MS * PIT_TICK_RATE / 1000 / 256)
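/*
 * With PIT_TICK_RATE = 1193182 Hz this evaluates to 233, i.e. at most
 * 233 observed MSB transitions before the fast calibration gives up.
 */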

static unsigned long quick_pit_calibrate(void)
{
	int i;
	u64 tsc, delta;
	unsigned long d1, d2;

	if (!has_legacy_pic())
		return 0;

	/* Set the Gate high, disable speaker */
	outb((inb(0x61) & ~0x02) | 0x01, 0x61);

	/*
	 * Counter 2, mode 0 (one-shot), binary count
	 *
	 * NOTE! Mode 2 decrements by two (and then the
	 * output is flipped each time, giving the same
	 * final output frequency as a decrement-by-one),
	 * so mode 0 is much better when looking at the
	 * individual counts.
	 */
	outb(0xb0, 0x43);

	/* Start at 0xffff */
	outb(0xff, 0x42);
	outb(0xff, 0x42);

	/*
	 * The PIT starts counting at the next edge, so we
	 * need to delay for a microsecond. The easiest way
	 * to do that is to just read back the 16-bit counter
	 * once from the PIT.
	 */
	pit_verify_msb(0);

	if (pit_expect_msb(0xff, &tsc, &d1)) {
		for (i = 1; i <= MAX_QUICK_PIT_ITERATIONS; i++) {
			if (!pit_expect_msb(0xff-i, &delta, &d2))
				break;

			delta -= tsc;

			/*
			 * Extrapolate the error and fail fast if the error will
			 * never be below 500 ppm.
			 */
			if (i == 1 &&
			    d1 + d2 >= (delta * MAX_QUICK_PIT_ITERATIONS) >> 11)
				return 0;

			/*
			 * Iterate until the error is less than 500 ppm
			 */
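			/* delta >> 11 is delta/2048, i.e. ~488 ppm of the elapsed cycles */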
			if (d1+d2 >= delta >> 11)
				continue;

			/*
			 * Check the PIT one more time to verify that
			 * all TSC reads were stable wrt the PIT.
			 *
			 * This also guarantees serialization of the
			 * last cycle read ('d2') in pit_expect_msb.
			 */
			if (!pit_verify_msb(0xfe - i))
				break;
			goto success;
		}
	}
	pr_info("Fast TSC calibration failed\n");
	return 0;

success:
	/*
	 * Ok, if we get here, then we've seen the
	 * MSB of the PIT decrement 'i' times, and the
	 * error has shrunk to less than 500 ppm.
	 *
	 * As a result, we can depend on there not being
	 * any odd delays anywhere, and the TSC reads are
	 * reliable (within the error).
	 *
	 * kHz = ticks / time-in-seconds / 1000;
	 * kHz = (t2 - t1) / (I * 256 / PIT_TICK_RATE) / 1000
	 * kHz = ((t2 - t1) * PIT_TICK_RATE) / (I * 256 * 1000)
	 */
	delta *= PIT_TICK_RATE;
	do_div(delta, i*256*1000);
	pr_info("Fast TSC calibration using PIT\n");
	return delta;
}

/**
 * native_calibrate_tsc
 * Determine TSC frequency via CPUID, else return 0.
 */
unsigned long native_calibrate_tsc(void)
{
	unsigned int eax_denominator, ebx_numerator, ecx_hz, edx;
	unsigned int crystal_khz;

	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
		return 0;

	if (boot_cpu_data.cpuid_level < 0x15)
		return 0;

	eax_denominator = ebx_numerator = ecx_hz = edx = 0;

	/* CPUID 15H TSC/Crystal ratio, plus optionally Crystal Hz */
	cpuid(0x15, &eax_denominator, &ebx_numerator, &ecx_hz, &edx);

	if (ebx_numerator == 0 || eax_denominator == 0)
		return 0;

	crystal_khz = ecx_hz / 1000;

	if (crystal_khz == 0) {
		switch (boot_cpu_data.x86_model) {
		case INTEL_FAM6_SKYLAKE_MOBILE:
		case INTEL_FAM6_SKYLAKE_DESKTOP:
		case INTEL_FAM6_KABYLAKE_MOBILE:
		case INTEL_FAM6_KABYLAKE_DESKTOP:
			crystal_khz = 24000;	/* 24.0 MHz */
			break;
		case INTEL_FAM6_ATOM_DENVERTON:
			crystal_khz = 25000;	/* 25.0 MHz */
			break;
		case INTEL_FAM6_ATOM_GOLDMONT:
			crystal_khz = 19200;	/* 19.2 MHz */
			break;
		}
	}

	if (crystal_khz == 0)
		return 0;
	/*
	 * TSC frequency determined by CPUID is a "hardware reported"
	 * frequency and is the most accurate one we have so far. This
	 * is considered a known frequency.
	 */
	setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);

	/*
	 * For Atom SoCs TSC is the only reliable clocksource.
	 * Mark TSC reliable so no watchdog on it.
	 */
	if (boot_cpu_data.x86_model == INTEL_FAM6_ATOM_GOLDMONT)
		setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);

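	/*
	 * Example (hypothetical values): a part reporting a 24000 kHz
	 * crystal with EBX:EAX = 216:2 yields 24000 * 216 / 2 =
	 * 2,592,000 kHz, i.e. a 2.592 GHz TSC.
	 */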
	return crystal_khz * ebx_numerator / eax_denominator;
}

static unsigned long cpu_khz_from_cpuid(void)
{
	unsigned int eax_base_mhz, ebx_max_mhz, ecx_bus_mhz, edx;

	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
		return 0;

	if (boot_cpu_data.cpuid_level < 0x16)
		return 0;

	eax_base_mhz = ebx_max_mhz = ecx_bus_mhz = edx = 0;

	cpuid(0x16, &eax_base_mhz, &ebx_max_mhz, &ecx_bus_mhz, &edx);

	return eax_base_mhz * 1000;
}

/**
 * native_calibrate_cpu - calibrate the cpu on boot
 */
unsigned long native_calibrate_cpu(void)
{
	u64 tsc1, tsc2, delta, ref1, ref2;
	unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX;
	unsigned long flags, latch, ms, fast_calibrate;
	int hpet = is_hpet_enabled(), i, loopmin;

	fast_calibrate = cpu_khz_from_cpuid();
	if (fast_calibrate)
		return fast_calibrate;

	fast_calibrate = cpu_khz_from_msr();
	if (fast_calibrate)
		return fast_calibrate;

	local_irq_save(flags);
	fast_calibrate = quick_pit_calibrate();
	local_irq_restore(flags);
	if (fast_calibrate)
		return fast_calibrate;

	/*
	 * Run 5 calibration loops to get the lowest frequency value
	 * (the best estimate). We use two different calibration modes
	 * here:
	 *
	 * 1) PIT loop. We set the PIT Channel 2 to oneshot mode and
	 * load a timeout of 50ms. We read the time right after we
	 * started the timer and wait until the PIT count down reaches
	 * zero. In each wait loop iteration we read the TSC and check
	 * the delta to the previous read. We keep track of the min
	 * and max values of that delta. The delta is mostly defined
	 * by the IO time of the PIT access, so we can detect when an
	 * SMI/SMM disturbance happened between the two reads. If the
	 * maximum time is significantly larger than the minimum time,
	 * then we discard the result and have another try.
	 *
	 * 2) Reference counter. If available we use the HPET or the
	 * PMTIMER as a reference to check the sanity of that value.
	 * We use separate TSC readouts and check inside of the
	 * reference read for an SMI/SMM disturbance. We discard
	 * disturbed values here as well. We do that around the PIT
	 * calibration delay loop as we have to wait for a certain
	 * amount of time anyway.
	 */

	/* Preset PIT loop values */
	latch = CAL_LATCH;
	ms = CAL_MS;
	loopmin = CAL_PIT_LOOPS;

	for (i = 0; i < 3; i++) {
		unsigned long tsc_pit_khz;

		/*
		 * Read the start value and the reference count of
		 * hpet/pmtimer when available. Then do the PIT
		 * calibration, which will take at least 50ms, and
		 * read the end value.
		 */
		local_irq_save(flags);
		tsc1 = tsc_read_refs(&ref1, hpet);
		tsc_pit_khz = pit_calibrate_tsc(latch, ms, loopmin);
		tsc2 = tsc_read_refs(&ref2, hpet);
		local_irq_restore(flags);

		/* Pick the lowest PIT TSC calibration so far */
		tsc_pit_min = min(tsc_pit_min, tsc_pit_khz);

		/* hpet or pmtimer available ? */
		if (ref1 == ref2)
			continue;

		/* Check, whether the sampling was disturbed by an SMI */
		if (tsc1 == ULLONG_MAX || tsc2 == ULLONG_MAX)
			continue;

		tsc2 = (tsc2 - tsc1) * 1000000LL;
		if (hpet)
			tsc2 = calc_hpet_ref(tsc2, ref1, ref2);
		else
			tsc2 = calc_pmtimer_ref(tsc2, ref1, ref2);

		tsc_ref_min = min(tsc_ref_min, (unsigned long) tsc2);

		/* Check the reference deviation */
		delta = ((u64) tsc_pit_min) * 100;
		do_div(delta, tsc_ref_min);

		/*
		 * If both calibration results are inside a 10% window
		 * then we can be sure, that the calibration
		 * succeeded. We break out of the loop right away. We
		 * use the reference value, as it is more precise.
		 */
		if (delta >= 90 && delta <= 110) {
			pr_info("PIT calibration matches %s. %d loops\n",
				hpet ? "HPET" : "PMTIMER", i + 1);
			return tsc_ref_min;
		}

		/*
		 * Check whether PIT failed more than once. This
		 * happens in virtualized environments. We need to
		 * give the virtual PC a slightly longer timeframe for
		 * the HPET/PMTIMER to make the result precise.
		 */
		if (i == 1 && tsc_pit_min == ULONG_MAX) {
			latch = CAL2_LATCH;
			ms = CAL2_MS;
			loopmin = CAL2_PIT_LOOPS;
		}
	}

	/*
	 * Now check the results.
	 */
	if (tsc_pit_min == ULONG_MAX) {
		/* PIT gave no useful value */
		pr_warn("Unable to calibrate against PIT\n");

		/* We don't have an alternative source, disable TSC */
		if (!hpet && !ref1 && !ref2) {
			pr_notice("No reference (HPET/PMTIMER) available\n");
			return 0;
		}

		/* The alternative source failed as well, disable TSC */
		if (tsc_ref_min == ULONG_MAX) {
			pr_warn("HPET/PMTIMER calibration failed\n");
			return 0;
		}

		/* Use the alternative source */
		pr_info("using %s reference calibration\n",
			hpet ? "HPET" : "PMTIMER");

		return tsc_ref_min;
	}

	/* We don't have an alternative source, use the PIT calibration value */
	if (!hpet && !ref1 && !ref2) {
		pr_info("Using PIT calibration value\n");
		return tsc_pit_min;
	}

	/* The alternative source failed, use the PIT calibration value */
	if (tsc_ref_min == ULONG_MAX) {
		pr_warn("HPET/PMTIMER calibration failed. Using PIT calibration.\n");
		return tsc_pit_min;
	}

	/*
	 * The calibration values differ too much. In doubt, we use
	 * the PIT value as we know that there are PMTIMERs around
	 * running at double speed. At least we let the user know:
	 */
	pr_warn("PIT calibration deviates from %s: %lu %lu\n",
		hpet ? "HPET" : "PMTIMER", tsc_pit_min, tsc_ref_min);
	pr_info("Using PIT calibration value\n");
	return tsc_pit_min;
}

void recalibrate_cpu_khz(void)
{
#ifndef CONFIG_SMP
	unsigned long cpu_khz_old = cpu_khz;

	if (!boot_cpu_has(X86_FEATURE_TSC))
		return;

	cpu_khz = x86_platform.calibrate_cpu();
	tsc_khz = x86_platform.calibrate_tsc();
	if (tsc_khz == 0)
		tsc_khz = cpu_khz;
	else if (abs(cpu_khz - tsc_khz) * 10 > tsc_khz)
		cpu_khz = tsc_khz;
	cpu_data(0).loops_per_jiffy = cpufreq_scale(cpu_data(0).loops_per_jiffy,
						    cpu_khz_old, cpu_khz);
#endif
}

EXPORT_SYMBOL(recalibrate_cpu_khz);


static unsigned long long cyc2ns_suspend;

void tsc_save_sched_clock_state(void)
{
	if (!sched_clock_stable())
		return;

	cyc2ns_suspend = sched_clock();
}

/*
 * Even on processors with invariant TSC, TSC gets reset in some of the
 * ACPI system sleep states. And in some systems BIOS seems to reinit TSC to
 * an arbitrary value (still sync'd across cpu's) during resume from such sleep
 * states. To cope with this, recompute the cyc2ns_offset for each cpu so
 * that sched_clock() continues from the point where it was left off during
 * suspend.
 */
void tsc_restore_sched_clock_state(void)
{
	unsigned long long offset;
	unsigned long flags;
	int cpu;

	if (!sched_clock_stable())
		return;

	local_irq_save(flags);

	/*
	 * We're coming out of suspend, there's no concurrency yet; don't
	 * bother being nice about the RCU stuff, just write to both
	 * data fields.
	 */

	this_cpu_write(cyc2ns.data[0].cyc2ns_offset, 0);
	this_cpu_write(cyc2ns.data[1].cyc2ns_offset, 0);

	offset = cyc2ns_suspend - sched_clock();

	for_each_possible_cpu(cpu) {
		per_cpu(cyc2ns.data[0].cyc2ns_offset, cpu) = offset;
		per_cpu(cyc2ns.data[1].cyc2ns_offset, cpu) = offset;
	}

	local_irq_restore(flags);
}

#ifdef CONFIG_CPU_FREQ
/* Frequency scaling support. Adjust the TSC based timer when the cpu frequency
 * changes.
 *
 * RED-PEN: On SMP we assume all CPUs run with the same frequency. It's
 * not that important because current Opteron setups do not support
 * scaling on SMP anyway.
 *
 * Should fix up last_tsc too. Currently gettimeofday in the
 * first tick after the change will be slightly wrong.
 */

static unsigned int  ref_freq;
static unsigned long loops_per_jiffy_ref;
static unsigned long tsc_khz_ref;

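/*
 * cpufreq_scale(val, old, new) rescales val by new/old, so both
 * loops_per_jiffy and tsc_khz below track the frequency ratio of the
 * transition reported by cpufreq.
 */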
static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
				 void *data)
{
	struct cpufreq_freqs *freq = data;
	unsigned long *lpj;

	lpj = &boot_cpu_data.loops_per_jiffy;
#ifdef CONFIG_SMP
	if (!(freq->flags & CPUFREQ_CONST_LOOPS))
		lpj = &cpu_data(freq->cpu).loops_per_jiffy;
#endif

	if (!ref_freq) {
		ref_freq = freq->old;
		loops_per_jiffy_ref = *lpj;
		tsc_khz_ref = tsc_khz;
	}
	if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) ||
	    (val == CPUFREQ_POSTCHANGE && freq->old > freq->new)) {
		*lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);

		tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new);
		if (!(freq->flags & CPUFREQ_CONST_LOOPS))
			mark_tsc_unstable("cpufreq changes");

		set_cyc2ns_scale(tsc_khz, freq->cpu, rdtsc());
	}

	return 0;
}

static struct notifier_block time_cpufreq_notifier_block = {
	.notifier_call  = time_cpufreq_notifier
};

static int __init cpufreq_register_tsc_scaling(void)
{
	if (!boot_cpu_has(X86_FEATURE_TSC))
		return 0;
	if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
		return 0;
	cpufreq_register_notifier(&time_cpufreq_notifier_block,
				CPUFREQ_TRANSITION_NOTIFIER);
	return 0;
}

core_initcall(cpufreq_register_tsc_scaling);

#endif /* CONFIG_CPU_FREQ */

#define ART_CPUID_LEAF (0x15)
#define ART_MIN_DENOMINATOR (1)


/*
 * If ART is present detect the numerator:denominator to convert to TSC
 */
static void __init detect_art(void)
{
	unsigned int unused[2];

	if (boot_cpu_data.cpuid_level < ART_CPUID_LEAF)
		return;

	/*
	 * Don't enable ART in a VM, non-stop TSC and TSC_ADJUST required,
	 * and the TSC counter resets must not occur asynchronously.
	 */
	if (boot_cpu_has(X86_FEATURE_HYPERVISOR) ||
	    !boot_cpu_has(X86_FEATURE_NONSTOP_TSC) ||
	    !boot_cpu_has(X86_FEATURE_TSC_ADJUST) ||
	    tsc_async_resets)
		return;

	cpuid(ART_CPUID_LEAF, &art_to_tsc_denominator,
	      &art_to_tsc_numerator, unused, unused+1);

	if (art_to_tsc_denominator < ART_MIN_DENOMINATOR)
		return;

	rdmsrl(MSR_IA32_TSC_ADJUST, art_to_tsc_offset);

	/* Make this sticky over multiple CPU init calls */
	setup_force_cpu_cap(X86_FEATURE_ART);
}


/* clocksource code */

static void tsc_resume(struct clocksource *cs)
{
	tsc_verify_tsc_adjust(true);
}

/*
 * We used to compare the TSC to the cycle_last value in the clocksource
 * structure to avoid a nasty time-warp. This can be observed in a
 * very small window right after one CPU updated cycle_last under
 * xtime/vsyscall_gtod lock and the other CPU reads a TSC value which
 * is smaller than the cycle_last reference value due to a TSC which
 * is slightly behind. This delta is nowhere else observable, but in
 * that case it results in a forward time jump in the range of hours
 * due to the unsigned delta calculation of the time keeping core
 * code, which is necessary to support wrapping clocksources like pm
 * timer.
 *
 * This sanity check is now done in the core timekeeping code,
 * checking the result of read_tsc() - cycle_last for being negative.
 * That works because CLOCKSOURCE_MASK(64) does not mask out any bit.
 */
static u64 read_tsc(struct clocksource *cs)
{
	return (u64)rdtsc_ordered();
}

static void tsc_cs_mark_unstable(struct clocksource *cs)
{
	if (tsc_unstable)
		return;

	tsc_unstable = 1;
	if (using_native_sched_clock())
		clear_sched_clock_stable();
	disable_sched_clock_irqtime();
	pr_info("Marking TSC unstable due to clocksource watchdog\n");
}

static void tsc_cs_tick_stable(struct clocksource *cs)
{
	if (tsc_unstable)
		return;

	if (using_native_sched_clock())
		sched_clock_tick_stable();
}

/*
 * .mask MUST be CLOCKSOURCE_MASK(64). See comment above read_tsc()
 */
static struct clocksource clocksource_tsc_early = {
	.name			= "tsc-early",
	.rating			= 299,
	.read			= read_tsc,
	.mask			= CLOCKSOURCE_MASK(64),
	.flags			= CLOCK_SOURCE_IS_CONTINUOUS |
				  CLOCK_SOURCE_MUST_VERIFY,
	.archdata		= { .vclock_mode = VCLOCK_TSC },
	.resume			= tsc_resume,
	.mark_unstable		= tsc_cs_mark_unstable,
	.tick_stable		= tsc_cs_tick_stable,
	.list			= LIST_HEAD_INIT(clocksource_tsc_early.list),
};

/*
 * Must mark VALID_FOR_HRES early such that when we unregister tsc_early
 * this one will immediately take over. We will only register if TSC has
 * been found good.
 */
static struct clocksource clocksource_tsc = {
	.name			= "tsc",
	.rating			= 300,
	.read			= read_tsc,
	.mask			= CLOCKSOURCE_MASK(64),
	.flags			= CLOCK_SOURCE_IS_CONTINUOUS |
				  CLOCK_SOURCE_VALID_FOR_HRES |
				  CLOCK_SOURCE_MUST_VERIFY,
	.archdata		= { .vclock_mode = VCLOCK_TSC },
	.resume			= tsc_resume,
	.mark_unstable		= tsc_cs_mark_unstable,
	.tick_stable		= tsc_cs_tick_stable,
	.list			= LIST_HEAD_INIT(clocksource_tsc.list),
};

void mark_tsc_unstable(char *reason)
{
	if (tsc_unstable)
		return;

	tsc_unstable = 1;
	if (using_native_sched_clock())
		clear_sched_clock_stable();
	disable_sched_clock_irqtime();
	pr_info("Marking TSC unstable due to %s\n", reason);

	clocksource_mark_unstable(&clocksource_tsc_early);
	clocksource_mark_unstable(&clocksource_tsc);
}

EXPORT_SYMBOL_GPL(mark_tsc_unstable);

static void __init check_system_tsc_reliable(void)
{
#if defined(CONFIG_MGEODEGX1) || defined(CONFIG_MGEODE_LX) || defined(CONFIG_X86_GENERIC)
	if (is_geode_lx()) {
		/* RTSC counts during suspend */
#define RTSC_SUSP 0x100
		unsigned long res_low, res_high;

		rdmsr_safe(MSR_GEODE_BUSCONT_CONF0, &res_low, &res_high);
		/* Geode_LX - the OLPC CPU has a very reliable TSC */
		if (res_low & RTSC_SUSP)
			tsc_clocksource_reliable = 1;
	}
#endif
	if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE))
		tsc_clocksource_reliable = 1;
}

/*
 * Make an educated guess if the TSC is trustworthy and synchronized
 * over all CPUs.
 */
int unsynchronized_tsc(void)
{
	if (!boot_cpu_has(X86_FEATURE_TSC) || tsc_unstable)
		return 1;

#ifdef CONFIG_SMP
	if (apic_is_clustered_box())
		return 1;
#endif

	if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
		return 0;

	if (tsc_clocksource_reliable)
		return 0;
	/*
	 * Intel systems are normally all synchronized.
	 * Exceptions must mark TSC as unstable:
	 */
	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) {
		/* assume multi socket systems are not synchronized: */
		if (num_possible_cpus() > 1)
			return 1;
	}

	return 0;
}

/*
 * Convert ART to TSC given numerator/denominator found in detect_art()
 */
struct system_counterval_t convert_art_to_tsc(u64 art)
{
	u64 tmp, res, rem;

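	/*
	 * Split art into quotient and remainder by the denominator before
	 * multiplying with the numerator, so the intermediate products stay
	 * within 64 bits: res = (art / d) * n + ((art % d) * n) / d + offset.
	 */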
	rem = do_div(art, art_to_tsc_denominator);

	res = art * art_to_tsc_numerator;
	tmp = rem * art_to_tsc_numerator;

	do_div(tmp, art_to_tsc_denominator);
	res += tmp + art_to_tsc_offset;

	return (struct system_counterval_t) {.cs = art_related_clocksource,
			.cycles = res};
}
EXPORT_SYMBOL(convert_art_to_tsc);

/**
 * convert_art_ns_to_tsc() - Convert ART in nanoseconds to TSC.
 * @art_ns: ART (Always Running Timer) in unit of nanoseconds
 *
 * PTM requires all timestamps to be in units of nanoseconds. When user
 * software requests a cross-timestamp, this function converts system timestamp
 * to TSC.
 *
 * This is valid when CPU feature flag X86_FEATURE_TSC_KNOWN_FREQ is set
 * indicating the tsc_khz is derived from CPUID[15H]. Drivers should check
 * that this flag is set before conversion to TSC is attempted.
 *
 * Return:
 * struct system_counterval_t - system counter value with the pointer to the
 *	corresponding clocksource
 *	@cycles:	System counter value
 *	@cs:		Clocksource corresponding to system counter value. Used
 *			by timekeeping code to verify comparability of two cycle
 *			values.
 */

struct system_counterval_t convert_art_ns_to_tsc(u64 art_ns)
{
	u64 tmp, res, rem;

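	/*
	 * cycles = art_ns * tsc_khz / USEC_PER_SEC, again split into
	 * quotient and remainder so the 64-bit multiplication cannot
	 * overflow for large timestamps.
	 */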
	rem = do_div(art_ns, USEC_PER_SEC);

	res = art_ns * tsc_khz;
	tmp = rem * tsc_khz;

	do_div(tmp, USEC_PER_SEC);
	res += tmp;

	return (struct system_counterval_t) { .cs = art_related_clocksource,
					      .cycles = res};
}
EXPORT_SYMBOL(convert_art_ns_to_tsc);


static void tsc_refine_calibration_work(struct work_struct *work);
static DECLARE_DELAYED_WORK(tsc_irqwork, tsc_refine_calibration_work);
/**
 * tsc_refine_calibration_work - Further refine tsc freq calibration
 * @work - ignored.
 *
 * This function uses delayed work over a period of a
 * second to further refine the TSC freq value. Since this is
 * timer based, instead of loop based, we don't block the boot
 * process while this longer calibration is done.
 *
 * If there are any calibration anomalies (too many SMIs, etc),
 * or the refined calibration is off by more than 1% from the fast
 * early calibration, we throw out the new calibration and use the
 * early calibration.
 */
static void tsc_refine_calibration_work(struct work_struct *work)
{
	static u64 tsc_start = -1, ref_start;
	static int hpet;
	u64 tsc_stop, ref_stop, delta;
	unsigned long freq;
	int cpu;

	/* Don't bother refining TSC on unstable systems */
	if (tsc_unstable)
		goto unreg;

	/*
	 * Since the work is started early in boot, we may be
	 * delayed the first time we expire. So set the workqueue
	 * again once we know timers are working.
	 */
	if (tsc_start == -1) {
		/*
		 * Only set hpet once, to avoid mixing hardware
		 * if the hpet becomes enabled later.
		 */
		hpet = is_hpet_enabled();
		schedule_delayed_work(&tsc_irqwork, HZ);
		tsc_start = tsc_read_refs(&ref_start, hpet);
		return;
	}

	tsc_stop = tsc_read_refs(&ref_stop, hpet);

	/* hpet or pmtimer available ? */
	if (ref_start == ref_stop)
		goto out;

	/* Check, whether the sampling was disturbed by an SMI */
	if (tsc_start == ULLONG_MAX || tsc_stop == ULLONG_MAX)
		goto out;

	delta = tsc_stop - tsc_start;
	delta *= 1000000LL;
	if (hpet)
		freq = calc_hpet_ref(delta, ref_start, ref_stop);
	else
		freq = calc_pmtimer_ref(delta, ref_start, ref_stop);

	/* Make sure we're within 1% */
	if (abs(tsc_khz - freq) > tsc_khz/100)
		goto out;

	tsc_khz = freq;
	pr_info("Refined TSC clocksource calibration: %lu.%03lu MHz\n",
		(unsigned long)tsc_khz / 1000,
		(unsigned long)tsc_khz % 1000);

	/* Inform the TSC deadline clockevent devices about the recalibration */
	lapic_update_tsc_freq();

	/* Update the sched_clock() rate to match the clocksource one */
	for_each_possible_cpu(cpu)
		set_cyc2ns_scale(tsc_khz, cpu, tsc_stop);

out:
	if (tsc_unstable)
		goto unreg;

	if (boot_cpu_has(X86_FEATURE_ART))
		art_related_clocksource = &clocksource_tsc;
	clocksource_register_khz(&clocksource_tsc, tsc_khz);
unreg:
	clocksource_unregister(&clocksource_tsc_early);
}


static int __init init_tsc_clocksource(void)
{
	if (!boot_cpu_has(X86_FEATURE_TSC) || !tsc_khz)
		return 0;

	if (tsc_unstable)
		goto unreg;

	if (tsc_clocksource_reliable)
		clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;

	if (boot_cpu_has(X86_FEATURE_NONSTOP_TSC_S3))
		clocksource_tsc.flags |= CLOCK_SOURCE_SUSPEND_NONSTOP;

	/*
	 * When TSC frequency is known (retrieved via MSR or CPUID), we skip
	 * the refined calibration and directly register it as a clocksource.
	 */
	if (boot_cpu_has(X86_FEATURE_TSC_KNOWN_FREQ)) {
		if (boot_cpu_has(X86_FEATURE_ART))
			art_related_clocksource = &clocksource_tsc;
		clocksource_register_khz(&clocksource_tsc, tsc_khz);
unreg:
		clocksource_unregister(&clocksource_tsc_early);
		return 0;
	}

	schedule_delayed_work(&tsc_irqwork, 0);
	return 0;
}
/*
 * We use device_initcall here, to ensure we run after the hpet
 * is fully initialized, which may occur at fs_initcall time.
 */
device_initcall(init_tsc_clocksource);

void __init tsc_early_delay_calibrate(void)
{
	unsigned long lpj;

	if (!boot_cpu_has(X86_FEATURE_TSC))
		return;

	cpu_khz = x86_platform.calibrate_cpu();
	tsc_khz = x86_platform.calibrate_tsc();

	tsc_khz = tsc_khz ? : cpu_khz;
	if (!tsc_khz)
		return;

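	/* loops_per_jiffy in TSC cycles: e.g. a 2 GHz TSC with HZ=1000 gives lpj = 2,000,000 */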
	lpj = tsc_khz * 1000;
	do_div(lpj, HZ);
	loops_per_jiffy = lpj;
}

void __init tsc_init(void)
{
	u64 lpj, cyc;
	int cpu;

	if (!boot_cpu_has(X86_FEATURE_TSC)) {
		setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER);
		return;
	}

	cpu_khz = x86_platform.calibrate_cpu();
	tsc_khz = x86_platform.calibrate_tsc();

	/*
	 * Trust non-zero tsc_khz as authoritative,
	 * and use it to sanity check cpu_khz,
	 * which will be off if system timer is off.
	 */
	if (tsc_khz == 0)
		tsc_khz = cpu_khz;
	else if (abs(cpu_khz - tsc_khz) * 10 > tsc_khz)
		cpu_khz = tsc_khz;

	if (!tsc_khz) {
		mark_tsc_unstable("could not calculate TSC khz");
		setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER);
		return;
	}

	pr_info("Detected %lu.%03lu MHz processor\n",
		(unsigned long)cpu_khz / 1000,
		(unsigned long)cpu_khz % 1000);

	if (cpu_khz != tsc_khz) {
		pr_info("Detected %lu.%03lu MHz TSC",
			(unsigned long)tsc_khz / 1000,
			(unsigned long)tsc_khz % 1000);
	}

	/* Sanitize TSC ADJUST before cyc2ns gets initialized */
	tsc_store_and_check_tsc_adjust(true);

	/*
	 * Secondary CPUs do not run through tsc_init(), so set up
	 * all the scale factors for all CPUs, assuming the same
	 * speed as the bootup CPU. (cpufreq notifiers will fix this
	 * up if their speed diverges)
	 */
	cyc = rdtsc();
	for_each_possible_cpu(cpu) {
		cyc2ns_init(cpu);
		set_cyc2ns_scale(tsc_khz, cpu, cyc);
	}

	static_branch_enable(&__use_tsc);

	if (!no_sched_irq_time)
		enable_sched_clock_irqtime();

	lpj = ((u64)tsc_khz * 1000);
	do_div(lpj, HZ);
	lpj_fine = lpj;

	use_tsc_delay();

	check_system_tsc_reliable();

	if (unsynchronized_tsc()) {
		mark_tsc_unstable("TSCs unsynchronized");
		return;
	}

	clocksource_register_khz(&clocksource_tsc_early, tsc_khz);
	detect_art();
}

#ifdef CONFIG_SMP
/*
 * If we have a constant TSC and are using the TSC for the delay loop,
 * we can skip clock calibration if another cpu in the same socket has already
 * been calibrated. This assumes that CONSTANT_TSC applies to all
 * cpus in the socket - this should be a safe assumption.
 */
unsigned long calibrate_delay_is_known(void)
{
	int sibling, cpu = smp_processor_id();
	int constant_tsc = cpu_has(&cpu_data(cpu), X86_FEATURE_CONSTANT_TSC);
	const struct cpumask *mask = topology_core_cpumask(cpu);

	if (!constant_tsc || !mask)
		return 0;

	sibling = cpumask_any_but(mask, cpu);
	if (sibling < nr_cpu_ids)
		return cpu_data(sibling).loops_per_jiffy;
	return 0;
}
#endif