kernel/sched/clock.c
/*
 * sched_clock for unstable cpu clocks
 *
 * Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
 *
 * Updates and enhancements:
 * Copyright (C) 2008 Red Hat, Inc. Steven Rostedt <srostedt@redhat.com>
 *
 * Based on code by:
 * Ingo Molnar <mingo@redhat.com>
 * Guillaume Chazarain <guichaz@gmail.com>
 *
 *
 * What:
 *
 * cpu_clock(i) provides a fast (execution time) high resolution
 * clock with bounded drift between CPUs. The value of cpu_clock(i)
 * is monotonic for constant i. The timestamp returned is in nanoseconds.
 *
 * ######################### BIG FAT WARNING ##########################
 * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can #
 * # go backwards !!                                                  #
 * ####################################################################
 *
 * There is no strict promise about the base, although it tends to start
 * at 0 on boot (but people really shouldn't rely on that).
 *
 * cpu_clock(i) -- can be used from any context, including NMI.
 * local_clock() -- is cpu_clock() on the current cpu.
 *
 * sched_clock_cpu(i) -- the per-CPU clock the above are built on.
 *
 * How:
 *
 * The implementation either uses sched_clock() when
 * !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK, which means in that case
 * sched_clock() is assumed to provide these properties (mostly it means
 * the architecture provides a globally synchronized highres time source).
 *
 * Otherwise it tries to create a semi-stable clock from a mixture of other
 * clocks, including:
 *
 *  - GTOD (clock monotonic)
 *  - sched_clock()
 *  - explicit idle events
 *
 * We use GTOD as base and use sched_clock() deltas to improve resolution. The
 * deltas are filtered to provide monotonicity and to keep the result within
 * an expected window.
 *
 * Furthermore, explicit sleep and wakeup hooks allow us to account for time
 * that is otherwise invisible (TSC gets stopped).
 *
 */
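
/*
 * Illustrative usage sketch (do_work() is a made-up placeholder, not a
 * kernel function): because timestamps are only guaranteed to be monotonic
 * when taken on the same CPU, a simple duration measurement pins itself to
 * one CPU around the two local_clock() reads:
 *
 *      u64 t0, t1;
 *
 *      preempt_disable();
 *      t0 = local_clock();
 *      do_work();
 *      t1 = local_clock();
 *      preempt_enable();
 *
 * t1 - t0 is then a nanosecond delta that cannot be negative. Differences
 * between cpu_clock(i) and cpu_clock(j) for i != j are only subject to the
 * bounded-drift promise above and may be negative.
 */
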
#include <linux/spinlock.h>
#include <linux/hardirq.h>
#include <linux/export.h>
#include <linux/percpu.h>
#include <linux/ktime.h>
#include <linux/sched.h>
#include <linux/static_key.h>
#include <linux/workqueue.h>

/*
 * Scheduler clock - returns current time in nanosec units.
 * This is the default implementation.
 * Architectures and sub-architectures can override this.
 */
unsigned long long __attribute__((weak)) sched_clock(void)
{
        return (unsigned long long)(jiffies - INITIAL_JIFFIES)
                                        * (NSEC_PER_SEC / HZ);
}
EXPORT_SYMBOL_GPL(sched_clock);
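
/*
 * Worked example for the fallback above, assuming HZ=250 (a common config
 * value): NSEC_PER_SEC / HZ = 1,000,000,000 / 250 = 4,000,000 ns, so this
 * default sched_clock() only advances in 4 ms steps until the architecture
 * overrides it with something finer grained (e.g. a TSC based clock on x86).
 */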

__read_mostly int sched_clock_running;

#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
static struct static_key __sched_clock_stable = STATIC_KEY_INIT;
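/*
 * Note that the key is used in an inverted sense: it is *enabled* (via
 * static_key_slow_inc() in __clear_sched_clock_stable()) once the clock
 * has been marked unstable, so the common stable case stays on the no-op
 * fast path of static_key_false() below.
 */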

int sched_clock_stable(void)
{
        if (static_key_false(&__sched_clock_stable))
                return false;
        return true;
}

void set_sched_clock_stable(void)
{
        if (!sched_clock_stable())
                static_key_slow_dec(&__sched_clock_stable);
}

static void __clear_sched_clock_stable(struct work_struct *work)
{
        /* XXX worry about clock continuity */
        if (sched_clock_stable())
                static_key_slow_inc(&__sched_clock_stable);
}

static DECLARE_WORK(sched_clock_work, __clear_sched_clock_stable);

void clear_sched_clock_stable(void)
{
        if (keventd_up())
                schedule_work(&sched_clock_work);
        else
                __clear_sched_clock_stable(&sched_clock_work);
}

struct sched_clock_data {
        u64             tick_raw;
        u64             tick_gtod;
        u64             clock;
};

static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data);

static inline struct sched_clock_data *this_scd(void)
{
        return &__get_cpu_var(sched_clock_data);
}

static inline struct sched_clock_data *cpu_sdc(int cpu)
{
        return &per_cpu(sched_clock_data, cpu);
}

void sched_clock_init(void)
{
        u64 ktime_now = ktime_to_ns(ktime_get());
        int cpu;

        for_each_possible_cpu(cpu) {
                struct sched_clock_data *scd = cpu_sdc(cpu);

                scd->tick_raw = 0;
                scd->tick_gtod = ktime_now;
                scd->clock = ktime_now;
        }

        sched_clock_running = 1;
}

/*
 * min, max except they take wrapping into account
 */

static inline u64 wrap_min(u64 x, u64 y)
{
        return (s64)(x - y) < 0 ? x : y;
}

static inline u64 wrap_max(u64 x, u64 y)
{
        return (s64)(x - y) > 0 ? x : y;
}
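
/*
 * Example with made-up values: for x = 2 and y = ULLONG_MAX - 5 the u64
 * subtraction x - y wraps around to 8, which is positive as an s64, so
 * wrap_max() picks x: the value that is 8 ns "later" even though it is
 * numerically smaller. wrap_min() makes the complementary choice.
 */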

/*
 * update the percpu scd from the raw @now value
 *
 *  - filter out backward motion
 *  - use the GTOD tick value to create a window to filter crazy TSC values
 */
static u64 sched_clock_local(struct sched_clock_data *scd)
{
        u64 now, clock, old_clock, min_clock, max_clock;
        s64 delta;

again:
        now = sched_clock();
        delta = now - scd->tick_raw;
        if (unlikely(delta < 0))
                delta = 0;

        old_clock = scd->clock;

        /*
         * scd->clock = clamp(scd->tick_gtod + delta,
         *                    max(scd->tick_gtod, scd->clock),
         *                    scd->tick_gtod + TICK_NSEC);
         */
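        /*
         * Numeric illustration with invented values, assuming HZ=1000 so
         * that TICK_NSEC is roughly 1,000,000 ns: with scd->tick_gtod =
         * 10,000,000, scd->clock still inside the tick window and a
         * sched_clock() delta of 1,500,000 since the last tick, the raw
         * value 10,000,000 + 1,500,000 = 11,500,000 exceeds max_clock =
         * 11,000,000 and is clamped down to it; a value that would move
         * the clock backwards is likewise raised to min_clock.
         */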

        clock = scd->tick_gtod + delta;
        min_clock = wrap_max(scd->tick_gtod, old_clock);
        max_clock = wrap_max(old_clock, scd->tick_gtod + TICK_NSEC);

        clock = wrap_max(clock, min_clock);
        clock = wrap_min(clock, max_clock);

        /* Publish only if scd->clock is still the value we read; else retry. */
        if (cmpxchg64(&scd->clock, old_clock, clock) != old_clock)
                goto again;

        return clock;
}

static u64 sched_clock_remote(struct sched_clock_data *scd)
{
        struct sched_clock_data *my_scd = this_scd();
        u64 this_clock, remote_clock;
        u64 *ptr, old_val, val;

#if BITS_PER_LONG != 64
again:
        /*
         * Careful here: The local and the remote clock values need to
         * be read out atomically as we need to compare the values and
         * then update either the local or the remote side. So the
         * cmpxchg64 below only protects one readout.
         *
         * We must reread via sched_clock_local() in the retry case on
         * 32bit as an NMI could use sched_clock_local() via the
         * tracer and hit between the readout of
         * the low 32bit and the high 32bit portion.
         */
        this_clock = sched_clock_local(my_scd);
        /*
         * We must enforce atomic readout on 32bit, otherwise the
         * update on the remote cpu can hit in between the readout of
         * the low 32bit and the high 32bit portion.
         */
        remote_clock = cmpxchg64(&scd->clock, 0, 0);
#else
        /*
         * On 64bit the read of [my]scd->clock is atomic versus the
         * update, so we can avoid the above 32bit dance.
         */
        sched_clock_local(my_scd);
again:
        this_clock = my_scd->clock;
        remote_clock = scd->clock;
#endif

        /*
         * Use the opportunity that we have both clock values read out to
         * couple the two clocks: we take the larger time as the latest
         * time for both runqueues. (this creates monotonic movement)
         */
        if (likely((s64)(remote_clock - this_clock) < 0)) {
                ptr = &scd->clock;
                old_val = remote_clock;
                val = this_clock;
        } else {
                /*
                 * Should be rare, but possible:
                 */
                ptr = &my_scd->clock;
                old_val = this_clock;
                val = remote_clock;
        }

        if (cmpxchg64(ptr, old_val, val) != old_val)
                goto again;

        return val;
}

/*
 * Similar to cpu_clock(); preemption is disabled internally, so callers
 * do not need to disable interrupts.
 *
 * See cpu_clock().
 */
u64 sched_clock_cpu(int cpu)
{
        struct sched_clock_data *scd;
        u64 clock;

        if (sched_clock_stable())
                return sched_clock();

        if (unlikely(!sched_clock_running))
                return 0ull;

        preempt_disable();
        scd = cpu_sdc(cpu);

        if (cpu != smp_processor_id())
                clock = sched_clock_remote(scd);
        else
                clock = sched_clock_local(scd);
        preempt_enable();

        return clock;
}

void sched_clock_tick(void)
{
        struct sched_clock_data *scd;
        u64 now, now_gtod;

        if (sched_clock_stable())
                return;

        if (unlikely(!sched_clock_running))
                return;

        WARN_ON_ONCE(!irqs_disabled());

        scd = this_scd();
        now_gtod = ktime_to_ns(ktime_get());
        now = sched_clock();

        scd->tick_raw = now;
        scd->tick_gtod = now_gtod;
        sched_clock_local(scd);
}

/*
 * We are going deep-idle (irqs are disabled):
 */
void sched_clock_idle_sleep_event(void)
{
        sched_clock_cpu(smp_processor_id());
}
EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);

/*
 * We just idled delta nanoseconds (called with irqs disabled):
 */
void sched_clock_idle_wakeup_event(u64 delta_ns)
{
        if (timekeeping_suspended)
                return;

        sched_clock_tick();
        touch_softlockup_watchdog();
}
EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);

/*
 * As outlined at the top, provides a fast, high resolution, nanosecond
 * time source that is monotonic per cpu argument and has bounded drift
 * between cpus.
 *
 * ######################### BIG FAT WARNING ##########################
 * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can #
 * # go backwards !!                                                  #
 * ####################################################################
 */
u64 cpu_clock(int cpu)
{
        if (static_key_false(&__sched_clock_stable))
                return sched_clock_cpu(cpu);

        return sched_clock();
}

/*
 * Similar to cpu_clock() for the current cpu. Time will only be observed
 * to be monotonic if care is taken to only compare timestamps taken on the
 * same CPU.
 *
 * See cpu_clock().
 */
u64 local_clock(void)
{
        if (static_key_false(&__sched_clock_stable))
                return sched_clock_cpu(raw_smp_processor_id());

        return sched_clock();
}

#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */

void sched_clock_init(void)
{
        sched_clock_running = 1;
}

u64 sched_clock_cpu(int cpu)
{
        if (unlikely(!sched_clock_running))
                return 0;

        return sched_clock();
}

u64 cpu_clock(int cpu)
{
        return sched_clock();
}

u64 local_clock(void)
{
        return sched_clock();
}

#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */

EXPORT_SYMBOL_GPL(cpu_clock);
EXPORT_SYMBOL_GPL(local_clock);