Commit | Line | Data |
---|---|---|
b26bf6ab RW |
1 | // SPDX-License-Identifier: GPL-2.0 |
2 | /* | |
3 | * Timer events oriented CPU idle governor | |
4 | * | |
5 | * Copyright (C) 2018 Intel Corporation | |
6 | * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com> | |
7 | * | |
8 | * The idea of this governor is based on the observation that on many systems | |
9 | * timer events are two or more orders of magnitude more frequent than any | |
10 | * other interrupts, so they are likely to be the most significant source of CPU | |
11 | * wakeups from idle states. Moreover, information about what happened in the | |
12 | * (relatively recent) past can be used to estimate whether or not the deepest | |
13 | * idle state with target residency within the time to the closest timer is | |
14 | * likely to be suitable for the upcoming idle time of the CPU and, if not, then | |
15 | * which of the shallower idle states to choose. | |
16 | * | |
17 | * Of course, non-timer wakeup sources are more important in some use cases and | |
18 | * they can be covered by taking a few most recent idle time intervals of the | |
19 | * CPU into account. However, even in that case it is not necessary to consider | |
20 | * idle duration values greater than the time till the closest timer, as the | |
21 | * patterns that they may belong to produce average values close enough to | |
22 | * the time till the closest timer (sleep length) anyway. | |
23 | * | |
24 | * Thus this governor estimates whether or not the upcoming idle time of the CPU | |
25 | * is likely to be significantly shorter than the sleep length and selects an | |
26 | * idle state for it in accordance with that, as follows: | |
27 | * | |
28 | * - Find an idle state on the basis of the sleep length and state statistics | |
29 | * collected over time: | |
30 | * | |
31 | * o Find the deepest idle state whose target residency is less than or equal | |
32 | * to the sleep length. | |
33 | * | |
34 | * o Select it if it matched both the sleep length and the observed idle | |
35 | * duration in the past more often than it matched the sleep length alone | |
36 | * (i.e. the observed idle duration was significantly shorter than the sleep | |
37 | * length matched by it). | |
38 | * | |
39 | * o Otherwise, select the shallower state with the greatest matched "early" | |
40 | * wakeups metric. | |
41 | * | |
42 | * - If the majority of the most recent idle duration values are below the | |
43 | * target residency of the idle state selected so far, use those values to | |
44 | * compute the new expected idle duration and find an idle state matching it | |
45 | * (which has to be shallower than the one selected so far). | |
46 | */ | |
47 | ||
48 | #include <linux/cpuidle.h> | |
49 | #include <linux/jiffies.h> | |
50 | #include <linux/kernel.h> | |
51 | #include <linux/sched/clock.h> | |
52 | #include <linux/tick.h> | |
53 | ||
54 | /* | |
55 | * The PULSE value is added to metrics when they grow and the DECAY_SHIFT value | |
56 | * is used for decreasing metrics on a regular basis. | |
57 | */ | |
58 | #define PULSE 1024 | |
59 | #define DECAY_SHIFT 3 | |
60 | ||
61 | /* | |
62 | * Number of the most recent idle duration values to take into consideration for | |
63 | * the detection of wakeup patterns. | |
64 | */ | |
65 | #define INTERVALS 8 | |
66 | ||
67 | /** | |
68 | * struct teo_idle_state - Idle state data used by the TEO cpuidle governor. | |
69 | * @early_hits: "Early" CPU wakeups "matching" this state. | |
70 | * @hits: "On time" CPU wakeups "matching" this state. | |
71 | * @misses: CPU wakeups "missing" this state. | |
72 | * | |
73 | * A CPU wakeup is "matched" by a given idle state if the idle duration measured | |
74 | * after the wakeup is between the target residency of that state and the target | |
75 | * residency of the next one (or if this is the deepest available idle state, it | |
76 | * "matches" a CPU wakeup when the measured idle duration is at least equal to | |
77 | * its target residency). | |
78 | * | |
79 | * Also, from the TEO governor perspective, a CPU wakeup from idle is "early" if | |
80 | * it occurs significantly earlier than the closest expected timer event (that | |
81 | * is, early enough to match an idle state shallower than the one matching the | |
82 | * time till the closest timer event). Otherwise, the wakeup is "on time", or | |
83 | * it is a "hit". | |
84 | * | |
85 | * A "miss" occurs when the given state doesn't match the wakeup, but it matches | |
86 | * the time till the closest timer event used for idle state selection. | |
87 | */ | |
88 | struct teo_idle_state { | |
89 | unsigned int early_hits; | |
90 | unsigned int hits; | |
91 | unsigned int misses; | |
92 | }; | |
93 | ||
94 | /** | |
95 | * struct teo_cpu - CPU data used by the TEO cpuidle governor. | |
96 | * @time_span_ns: Time between idle state selection and post-wakeup update. | |
97 | * @sleep_length_ns: Time till the closest timer event (at the selection time). | |
98 | * @states: Idle states data corresponding to this CPU. | |
b26bf6ab RW |
99 | * @interval_idx: Index of the most recent saved idle interval. |
100 | * @intervals: Saved idle duration values. | |
101 | */ | |
102 | struct teo_cpu { | |
103 | u64 time_span_ns; | |
104 | u64 sleep_length_ns; | |
105 | struct teo_idle_state states[CPUIDLE_STATE_MAX]; | |
b26bf6ab RW |
106 | int interval_idx; |
107 | unsigned int intervals[INTERVALS]; | |
108 | }; | |
109 | ||
110 | static DEFINE_PER_CPU(struct teo_cpu, teo_cpus); | |
111 | ||
112 | /** | |
113 | * teo_update - Update CPU data after wakeup. | |
114 | * @drv: cpuidle driver containing state data. | |
115 | * @dev: Target CPU. | |
116 | */ | |
117 | static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) | |
118 | { | |
119 | struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu); | |
120 | unsigned int sleep_length_us = ktime_to_us(cpu_data->sleep_length_ns); | |
121 | int i, idx_hit = -1, idx_timer = -1; | |
122 | unsigned int measured_us; | |
123 | ||
124 | if (cpu_data->time_span_ns >= cpu_data->sleep_length_ns) { | |
125 | /* | |
b7e7fffd RW |
126 | * One of the safety nets has triggered or the wakeup was close |
127 | * enough to the closest timer event expected at the idle state | |
128 | * selection time to be discarded. | |
b26bf6ab | 129 | */ |
b7e7fffd | 130 | measured_us = UINT_MAX; |
b26bf6ab | 131 | } else { |
7d4daeed MT |
132 | unsigned int lat; |
133 | ||
134 | lat = drv->states[dev->last_state_idx].exit_latency; | |
b26bf6ab RW |
135 | |
136 | measured_us = ktime_to_us(cpu_data->time_span_ns); | |
137 | /* | |
138 | * The delay between the wakeup and the first instruction | |
139 | * executed by the CPU is not likely to be worst-case every | |
140 | * time, so take 1/2 of the exit latency as a very rough | |
141 | * approximation of the average of it. | |
142 | */ | |
143 | if (measured_us >= lat) | |
144 | measured_us -= lat / 2; | |
145 | else | |
146 | measured_us /= 2; | |
147 | } | |
148 | ||
149 | /* | |
150 | * Decay the "early hits" metric for all of the states and find the | |
151 | * states matching the sleep length and the measured idle duration. | |
152 | */ | |
153 | for (i = 0; i < drv->state_count; i++) { | |
154 | unsigned int early_hits = cpu_data->states[i].early_hits; | |
155 | ||
156 | cpu_data->states[i].early_hits -= early_hits >> DECAY_SHIFT; | |
157 | ||
158 | if (drv->states[i].target_residency <= sleep_length_us) { | |
159 | idx_timer = i; | |
160 | if (drv->states[i].target_residency <= measured_us) | |
161 | idx_hit = i; | |
162 | } | |
163 | } | |
164 | ||
165 | /* | |
166 | * Update the "hits" and "misses" data for the state matching the sleep | |
167 | * length. If it matches the measured idle duration too, this is a hit, | |
168 | * so increase the "hits" metric for it then. Otherwise, this is a | |
169 | * miss, so increase the "misses" metric for it. In the latter case | |
170 | * also increase the "early hits" metric for the state that actually | |
171 | * matches the measured idle duration. | |
172 | */ | |
173 | if (idx_timer >= 0) { | |
174 | unsigned int hits = cpu_data->states[idx_timer].hits; | |
175 | unsigned int misses = cpu_data->states[idx_timer].misses; | |
176 | ||
177 | hits -= hits >> DECAY_SHIFT; | |
178 | misses -= misses >> DECAY_SHIFT; | |
179 | ||
180 | if (idx_timer > idx_hit) { | |
181 | misses += PULSE; | |
182 | if (idx_hit >= 0) | |
183 | cpu_data->states[idx_hit].early_hits += PULSE; | |
184 | } else { | |
185 | hits += PULSE; | |
186 | } | |
187 | ||
188 | cpu_data->states[idx_timer].misses = misses; | |
189 | cpu_data->states[idx_timer].hits = hits; | |
190 | } | |
191 | ||
b26bf6ab RW |
192 | /* |
193 | * Save idle duration values corresponding to non-timer wakeups for | |
194 | * pattern detection. | |
195 | */ | |
196 | cpu_data->intervals[cpu_data->interval_idx++] = measured_us; | |
197 | if (cpu_data->interval_idx > INTERVALS) | |
198 | cpu_data->interval_idx = 0; | |
199 | } | |
200 | ||
201 | /** | |
202 | * teo_find_shallower_state - Find shallower idle state matching given duration. | |
203 | * @drv: cpuidle driver containing state data. | |
204 | * @dev: Target CPU. | |
205 | * @state_idx: Index of the capping idle state. | |
206 | * @duration_us: Idle duration value to match. | |
207 | */ | |
208 | static int teo_find_shallower_state(struct cpuidle_driver *drv, | |
209 | struct cpuidle_device *dev, int state_idx, | |
210 | unsigned int duration_us) | |
211 | { | |
212 | int i; | |
213 | ||
214 | for (i = state_idx - 1; i >= 0; i--) { | |
215 | if (drv->states[i].disabled || dev->states_usage[i].disable) | |
216 | continue; | |
217 | ||
218 | state_idx = i; | |
219 | if (drv->states[i].target_residency <= duration_us) | |
220 | break; | |
221 | } | |
222 | return state_idx; | |
223 | } | |
224 | ||
225 | /** | |
226 | * teo_select - Selects the next idle state to enter. | |
227 | * @drv: cpuidle driver containing state data. | |
228 | * @dev: Target CPU. | |
229 | * @stop_tick: Indication on whether or not to stop the scheduler tick. | |
230 | */ | |
231 | static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, | |
232 | bool *stop_tick) | |
233 | { | |
234 | struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu); | |
235 | int latency_req = cpuidle_governor_latency_req(dev->cpu); | |
236 | unsigned int duration_us, count; | |
cab09f3d | 237 | int max_early_idx, constraint_idx, idx, i; |
b26bf6ab RW |
238 | ktime_t delta_tick; |
239 | ||
7d4daeed | 240 | if (dev->last_state_idx >= 0) { |
b26bf6ab | 241 | teo_update(drv, dev); |
7d4daeed | 242 | dev->last_state_idx = -1; |
b26bf6ab RW |
243 | } |
244 | ||
245 | cpu_data->time_span_ns = local_clock(); | |
246 | ||
247 | cpu_data->sleep_length_ns = tick_nohz_get_sleep_length(&delta_tick); | |
248 | duration_us = ktime_to_us(cpu_data->sleep_length_ns); | |
249 | ||
250 | count = 0; | |
251 | max_early_idx = -1; | |
cab09f3d | 252 | constraint_idx = drv->state_count; |
b26bf6ab RW |
253 | idx = -1; |
254 | ||
255 | for (i = 0; i < drv->state_count; i++) { | |
256 | struct cpuidle_state *s = &drv->states[i]; | |
257 | struct cpuidle_state_usage *su = &dev->states_usage[i]; | |
258 | ||
259 | if (s->disabled || su->disable) { | |
069ce2ef RW |
260 | /* |
261 | * Ignore disabled states with target residencies beyond | |
262 | * the anticipated idle duration. | |
263 | */ | |
264 | if (s->target_residency > duration_us) | |
265 | continue; | |
266 | ||
b26bf6ab RW |
267 | /* |
268 | * If the "early hits" metric of a disabled state is | |
269 | * greater than the current maximum, it should be taken | |
270 | * into account, because it would be a mistake to select | |
271 | * a deeper state with lower "early hits" metric. The | |
272 | * index cannot be changed to point to it, however, so | |
273 | * just increase the max count alone and let the index | |
274 | * still point to a shallower idle state. | |
275 | */ | |
276 | if (max_early_idx >= 0 && | |
277 | count < cpu_data->states[i].early_hits) | |
278 | count = cpu_data->states[i].early_hits; | |
279 | ||
280 | continue; | |
281 | } | |
282 | ||
283 | if (idx < 0) | |
284 | idx = i; /* first enabled state */ | |
285 | ||
286 | if (s->target_residency > duration_us) | |
287 | break; | |
288 | ||
cab09f3d RW |
289 | if (s->exit_latency > latency_req && constraint_idx > i) |
290 | constraint_idx = i; | |
b26bf6ab RW |
291 | |
292 | idx = i; | |
293 | ||
294 | if (count < cpu_data->states[i].early_hits && | |
295 | !(tick_nohz_tick_stopped() && | |
296 | drv->states[i].target_residency < TICK_USEC)) { | |
297 | count = cpu_data->states[i].early_hits; | |
298 | max_early_idx = i; | |
299 | } | |
300 | } | |
301 | ||
302 | /* | |
303 | * If the "hits" metric of the idle state matching the sleep length is | |
304 | * greater than its "misses" metric, that is the one to use. Otherwise, | |
305 | * it is more likely that one of the shallower states will match the | |
306 | * idle duration observed after wakeup, so take the one with the maximum | |
307 | * "early hits" metric, but if that cannot be determined, just use the | |
308 | * state selected so far. | |
309 | */ | |
310 | if (cpu_data->states[idx].hits <= cpu_data->states[idx].misses && | |
311 | max_early_idx >= 0) { | |
312 | idx = max_early_idx; | |
313 | duration_us = drv->states[idx].target_residency; | |
314 | } | |
315 | ||
cab09f3d RW |
316 | /* |
317 | * If there is a latency constraint, it may be necessary to use a | |
318 | * shallower idle state than the one selected so far. | |
319 | */ | |
320 | if (constraint_idx < idx) | |
321 | idx = constraint_idx; | |
322 | ||
b26bf6ab RW |
323 | if (idx < 0) { |
324 | idx = 0; /* No states enabled. Must use 0. */ | |
325 | } else if (idx > 0) { | |
326 | u64 sum = 0; | |
327 | ||
328 | count = 0; | |
329 | ||
330 | /* | |
331 | * Count and sum the most recent idle duration values less than | |
cab09f3d | 332 | * the current expected idle duration value. |
b26bf6ab RW |
333 | */ |
334 | for (i = 0; i < INTERVALS; i++) { | |
335 | unsigned int val = cpu_data->intervals[i]; | |
336 | ||
cab09f3d | 337 | if (val >= duration_us) |
b26bf6ab RW |
338 | continue; |
339 | ||
340 | count++; | |
341 | sum += val; | |
342 | } | |
343 | ||
344 | /* | |
345 | * Give up unless the majority of the most recent idle duration | |
346 | * values are in the interesting range. | |
347 | */ | |
348 | if (count > INTERVALS / 2) { | |
349 | unsigned int avg_us = div64_u64(sum, count); | |
350 | ||
351 | /* | |
352 | * Avoid spending too much time in an idle state that | |
353 | * would be too shallow. | |
354 | */ | |
355 | if (!(tick_nohz_tick_stopped() && avg_us < TICK_USEC)) { | |
b26bf6ab | 356 | duration_us = avg_us; |
cab09f3d RW |
357 | if (drv->states[idx].target_residency > avg_us) |
358 | idx = teo_find_shallower_state(drv, dev, | |
359 | idx, avg_us); | |
b26bf6ab RW |
360 | } |
361 | } | |
362 | } | |
363 | ||
364 | /* | |
365 | * Don't stop the tick if the selected state is a polling one or if the | |
366 | * expected idle duration is shorter than the tick period length. | |
367 | */ | |
368 | if (((drv->states[idx].flags & CPUIDLE_FLAG_POLLING) || | |
369 | duration_us < TICK_USEC) && !tick_nohz_tick_stopped()) { | |
370 | unsigned int delta_tick_us = ktime_to_us(delta_tick); | |
371 | ||
372 | *stop_tick = false; | |
373 | ||
374 | /* | |
375 | * The tick is not going to be stopped, so if the target | |
376 | * residency of the state to be returned is not within the time | |
377 | * till the closest timer including the tick, try to correct | |
378 | * that. | |
379 | */ | |
380 | if (idx > 0 && drv->states[idx].target_residency > delta_tick_us) | |
381 | idx = teo_find_shallower_state(drv, dev, idx, delta_tick_us); | |
382 | } | |
383 | ||
384 | return idx; | |
385 | } | |
386 | ||
387 | /** | |
388 | * teo_reflect - Note that governor data for the CPU need to be updated. | |
389 | * @dev: Target CPU. | |
390 | * @state: Entered state. | |
391 | */ | |
392 | static void teo_reflect(struct cpuidle_device *dev, int state) | |
393 | { | |
394 | struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu); | |
395 | ||
7d4daeed | 396 | dev->last_state_idx = state; |
b26bf6ab RW |
397 | /* |
398 | * If the wakeup was not "natural", but triggered by one of the safety | |
399 | * nets, assume that the CPU might have been idle for the entire sleep | |
400 | * length time. | |
401 | */ | |
402 | if (dev->poll_time_limit || | |
403 | (tick_nohz_idle_got_tick() && cpu_data->sleep_length_ns > TICK_NSEC)) { | |
404 | dev->poll_time_limit = false; | |
405 | cpu_data->time_span_ns = cpu_data->sleep_length_ns; | |
406 | } else { | |
407 | cpu_data->time_span_ns = local_clock() - cpu_data->time_span_ns; | |
408 | } | |
409 | } | |
410 | ||
411 | /** | |
412 | * teo_enable_device - Initialize the governor's data for the target CPU. | |
413 | * @drv: cpuidle driver (not used). | |
414 | * @dev: Target CPU. | |
415 | */ | |
416 | static int teo_enable_device(struct cpuidle_driver *drv, | |
417 | struct cpuidle_device *dev) | |
418 | { | |
419 | struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu); | |
420 | int i; | |
421 | ||
422 | memset(cpu_data, 0, sizeof(*cpu_data)); | |
423 | ||
424 | for (i = 0; i < INTERVALS; i++) | |
425 | cpu_data->intervals[i] = UINT_MAX; | |
426 | ||
427 | return 0; | |
428 | } | |
429 | ||
430 | static struct cpuidle_governor teo_governor = { | |
431 | .name = "teo", | |
432 | .rating = 19, | |
433 | .enable = teo_enable_device, | |
434 | .select = teo_select, | |
435 | .reflect = teo_reflect, | |
436 | }; | |
437 | ||
438 | static int __init teo_governor_init(void) | |
439 | { | |
440 | return cpuidle_register_governor(&teo_governor); | |
441 | } | |
442 | ||
443 | postcore_initcall(teo_governor_init); |