Commit | Line | Data |
---|---|---|
b2d3d61a DL |
1 | /* |
2 | * linux/kernel/irq/timings.c | |
3 | * | |
4 | * Copyright (C) 2016, Linaro Ltd - Daniel Lezcano <daniel.lezcano@linaro.org> | |
5 | * | |
6 | * This program is free software; you can redistribute it and/or modify | |
7 | * it under the terms of the GNU General Public License version 2 as | |
8 | * published by the Free Software Foundation. | |
9 | * | |
10 | */ | |
e1c92149 | 11 | #include <linux/kernel.h> |
b2d3d61a | 12 | #include <linux/percpu.h> |
e1c92149 | 13 | #include <linux/slab.h> |
b2d3d61a DL |
14 | #include <linux/static_key.h> |
15 | #include <linux/interrupt.h> | |
e1c92149 | 16 | #include <linux/idr.h> |
b2d3d61a | 17 | #include <linux/irq.h> |
e1c92149 DL |
18 | #include <linux/math64.h> |
19 | ||
20 | #include <trace/events/irq.h> | |
b2d3d61a DL |
21 | |
22 | #include "internals.h" | |
23 | ||
/* Global switch tested by the irq hot path to enable/disable timings tracking. */
DEFINE_STATIC_KEY_FALSE(irq_timing_enabled);

/* Per-CPU circular buffer of encoded (irq, timestamp) values. */
DEFINE_PER_CPU(struct irq_timings, irq_timings);

/*
 * Per-interrupt, per-CPU online statistics of the interval between
 * two interrupt occurrences.
 */
struct irqt_stat {
	u64 next_evt;	/* predicted timestamp of next interrupt (last ts + avg) */
	u64 last_ts;	/* last timestamp injected into the model */
	u64 variance;	/* online variance accumulator, read shifted by IRQ_TIMINGS_SHIFT */
	u32 avg;	/* average interval in ns (< NSEC_PER_SEC, so fits in u32) */
	u32 nr_samples;	/* number of samples since the last reset */
	int anomalies;	/* consecutive intervals falling outside 3 stddev */
	int valid;	/* non-zero when next_evt is usable for prediction */
};

/* Maps an interrupt number to its per-CPU irqt_stat statistics. */
static DEFINE_IDR(irqt_stats);
39 | ||
b2d3d61a DL |
/*
 * irq_timings_enable - turn on the irq timings tracking.
 *
 * Enables the irq_timing_enabled static branch, presumably tested in
 * the interrupt hot path elsewhere (not visible in this file).
 */
void irq_timings_enable(void)
{
	static_branch_enable(&irq_timing_enabled);
}
44 | ||
/*
 * irq_timings_disable - turn off the irq timings tracking.
 *
 * Disables the irq_timing_enabled static branch; counterpart of
 * irq_timings_enable().
 */
void irq_timings_disable(void)
{
	static_branch_disable(&irq_timing_enabled);
}
e1c92149 DL |
49 | |
50 | /** | |
51 | * irqs_update - update the irq timing statistics with a new timestamp | |
52 | * | |
53 | * @irqs: an irqt_stat struct pointer | |
54 | * @ts: the new timestamp | |
55 | * | |
56 | * The statistics are computed online, in other words, the code is | |
57 | * designed to compute the statistics on a stream of values rather | |
58 | * than doing multiple passes on the values to compute the average, | |
59 | * then the variance. The integer division introduces a loss of | |
60 | * precision but with an acceptable error margin regarding the results | |
61 | * we would have with the double floating precision: we are dealing | |
62 | * with nanosec, so big numbers, consequently the mantisse is | |
63 | * negligeable, especially when converting the time in usec | |
64 | * afterwards. | |
65 | * | |
66 | * The computation happens at idle time. When the CPU is not idle, the | |
67 | * interrupts' timestamps are stored in the circular buffer, when the | |
68 | * CPU goes idle and this routine is called, all the buffer's values | |
69 | * are injected in the statistical model continuying to extend the | |
70 | * statistics from the previous busy-idle cycle. | |
71 | * | |
72 | * The observations showed a device will trigger a burst of periodic | |
73 | * interrupts followed by one or two peaks of longer time, for | |
74 | * instance when a SD card device flushes its cache, then the periodic | |
75 | * intervals occur again. A one second inactivity period resets the | |
76 | * stats, that gives us the certitude the statistical values won't | |
77 | * exceed 1x10^9, thus the computation won't overflow. | |
78 | * | |
79 | * Basically, the purpose of the algorithm is to watch the periodic | |
80 | * interrupts and eliminate the peaks. | |
81 | * | |
82 | * An interrupt is considered periodically stable if the interval of | |
83 | * its occurences follow the normal distribution, thus the values | |
84 | * comply with: | |
85 | * | |
86 | * avg - 3 x stddev < value < avg + 3 x stddev | |
87 | * | |
88 | * Which can be simplified to: | |
89 | * | |
90 | * -3 x stddev < value - avg < 3 x stddev | |
91 | * | |
92 | * abs(value - avg) < 3 x stddev | |
93 | * | |
94 | * In order to save a costly square root computation, we use the | |
95 | * variance. For the record, stddev = sqrt(variance). The equation | |
96 | * above becomes: | |
97 | * | |
98 | * abs(value - avg) < 3 x sqrt(variance) | |
99 | * | |
100 | * And finally we square it: | |
101 | * | |
102 | * (value - avg) ^ 2 < (3 x sqrt(variance)) ^ 2 | |
103 | * | |
104 | * (value - avg) x (value - avg) < 9 x variance | |
105 | * | |
106 | * Statistically speaking, any values out of this interval is | |
107 | * considered as an anomaly and is discarded. However, a normal | |
108 | * distribution appears when the number of samples is 30 (it is the | |
109 | * rule of thumb in statistics, cf. "30 samples" on Internet). When | |
110 | * there are three consecutive anomalies, the statistics are resetted. | |
111 | * | |
112 | */ | |
113 | static void irqs_update(struct irqt_stat *irqs, u64 ts) | |
114 | { | |
115 | u64 old_ts = irqs->last_ts; | |
116 | u64 variance = 0; | |
117 | u64 interval; | |
118 | s64 diff; | |
119 | ||
120 | /* | |
121 | * The timestamps are absolute time values, we need to compute | |
122 | * the timing interval between two interrupts. | |
123 | */ | |
124 | irqs->last_ts = ts; | |
125 | ||
126 | /* | |
127 | * The interval type is u64 in order to deal with the same | |
128 | * type in our computation, that prevent mindfuck issues with | |
129 | * overflow, sign and division. | |
130 | */ | |
131 | interval = ts - old_ts; | |
132 | ||
133 | /* | |
134 | * The interrupt triggered more than one second apart, that | |
135 | * ends the sequence as predictible for our purpose. In this | |
136 | * case, assume we have the beginning of a sequence and the | |
137 | * timestamp is the first value. As it is impossible to | |
138 | * predict anything at this point, return. | |
139 | * | |
140 | * Note the first timestamp of the sequence will always fall | |
141 | * in this test because the old_ts is zero. That is what we | |
142 | * want as we need another timestamp to compute an interval. | |
143 | */ | |
144 | if (interval >= NSEC_PER_SEC) { | |
145 | memset(irqs, 0, sizeof(*irqs)); | |
146 | irqs->last_ts = ts; | |
147 | return; | |
148 | } | |
149 | ||
150 | /* | |
151 | * Pre-compute the delta with the average as the result is | |
152 | * used several times in this function. | |
153 | */ | |
154 | diff = interval - irqs->avg; | |
155 | ||
156 | /* | |
157 | * Increment the number of samples. | |
158 | */ | |
159 | irqs->nr_samples++; | |
160 | ||
161 | /* | |
162 | * Online variance divided by the number of elements if there | |
163 | * is more than one sample. Normally the formula is division | |
164 | * by nr_samples - 1 but we assume the number of element will be | |
165 | * more than 32 and dividing by 32 instead of 31 is enough | |
166 | * precise. | |
167 | */ | |
168 | if (likely(irqs->nr_samples > 1)) | |
169 | variance = irqs->variance >> IRQ_TIMINGS_SHIFT; | |
170 | ||
171 | /* | |
172 | * The rule of thumb in statistics for the normal distribution | |
173 | * is having at least 30 samples in order to have the model to | |
174 | * apply. Values outside the interval are considered as an | |
175 | * anomaly. | |
176 | */ | |
177 | if ((irqs->nr_samples >= 30) && ((diff * diff) > (9 * variance))) { | |
178 | /* | |
179 | * After three consecutive anomalies, we reset the | |
180 | * stats as it is no longer stable enough. | |
181 | */ | |
182 | if (irqs->anomalies++ >= 3) { | |
183 | memset(irqs, 0, sizeof(*irqs)); | |
184 | irqs->last_ts = ts; | |
185 | return; | |
186 | } | |
187 | } else { | |
188 | /* | |
189 | * The anomalies must be consecutives, so at this | |
190 | * point, we reset the anomalies counter. | |
191 | */ | |
192 | irqs->anomalies = 0; | |
193 | } | |
194 | ||
195 | /* | |
196 | * The interrupt is considered stable enough to try to predict | |
197 | * the next event on it. | |
198 | */ | |
199 | irqs->valid = 1; | |
200 | ||
201 | /* | |
202 | * Online average algorithm: | |
203 | * | |
204 | * new_average = average + ((value - average) / count) | |
205 | * | |
206 | * The variance computation depends on the new average | |
207 | * to be computed here first. | |
208 | * | |
209 | */ | |
210 | irqs->avg = irqs->avg + (diff >> IRQ_TIMINGS_SHIFT); | |
211 | ||
212 | /* | |
213 | * Online variance algorithm: | |
214 | * | |
215 | * new_variance = variance + (value - average) x (value - new_average) | |
216 | * | |
217 | * Warning: irqs->avg is updated with the line above, hence | |
218 | * 'interval - irqs->avg' is no longer equal to 'diff' | |
219 | */ | |
220 | irqs->variance = irqs->variance + (diff * (interval - irqs->avg)); | |
221 | ||
222 | /* | |
223 | * Update the next event | |
224 | */ | |
225 | irqs->next_evt = ts + irqs->avg; | |
226 | } | |
227 | ||
/**
 * irq_timings_next_event - Return when the next event is supposed to arrive
 * @now: the current timestamp in nanoseconds
 *
 * During the last busy cycle, the number of interrupts is incremented
 * and stored in the irq_timings structure. This information is
 * necessary to:
 *
 * - know if the index in the table wrapped up:
 *
 *      If more than the array size interrupts happened during the
 *      last busy/idle cycle, the index wrapped up and we have to
 *      begin with the next element in the array which is the last one
 *      in the sequence, otherwise it is at the index 0.
 *
 * - have an indication of the interrupts activity on this CPU
 *   (eg. irq/sec)
 *
 * The values are 'consumed' after inserting in the statistical model,
 * thus the count is reinitialized.
 *
 * The array of values **must** be browsed in the time direction, the
 * timestamp must increase between an element and the next one.
 *
 * Returns a nanosec time based estimation of the earliest interrupt,
 * U64_MAX otherwise.
 */
u64 irq_timings_next_event(u64 now)
{
	struct irq_timings *irqts = this_cpu_ptr(&irq_timings);
	struct irqt_stat *irqs;
	struct irqt_stat __percpu *s;
	u64 ts, next_evt = U64_MAX;
	int i, irq = 0;

	/*
	 * This function must be called with the local irq disabled in
	 * order to prevent the timings circular buffer to be updated
	 * while we are reading it.
	 */
	WARN_ON_ONCE(!irqs_disabled());

	/*
	 * Number of elements in the circular buffer: If it happens it
	 * was flushed before, then the number of elements could be
	 * smaller than IRQ_TIMINGS_SIZE, so the count is used,
	 * otherwise the array size is used as we wrapped. The index
	 * begins from zero when we did not wrap. That could be done
	 * in a nicer way with the proper circular array structure
	 * type but with the cost of extra computation in the
	 * interrupt handler hot path. We choose efficiency.
	 *
	 * Inject measured irq/timestamp to the statistical model
	 * while decrementing the counter because we consume the data
	 * from our circular buffer.
	 */
	for (i = irqts->count & IRQ_TIMINGS_MASK,
		     irqts->count = min(IRQ_TIMINGS_SIZE, irqts->count);
	     irqts->count > 0; irqts->count--, i = (i + 1) & IRQ_TIMINGS_MASK) {

		irq = irq_timing_decode(irqts->values[i], &ts);

		/* Only interrupts with allocated statistics are modelled. */
		s = idr_find(&irqt_stats, irq);
		if (s) {
			irqs = this_cpu_ptr(s);
			irqs_update(irqs, ts);
		}
	}

	/*
	 * Look in the list of interrupts' statistics, the earliest
	 * next event.
	 */
	idr_for_each_entry(&irqt_stats, s, i) {

		irqs = this_cpu_ptr(s);

		if (!irqs->valid)
			continue;

		if (irqs->next_evt <= now) {
			irq = i;
			next_evt = now;

			/*
			 * The prediction is already in the past, so no
			 * earlier event can exist: stop searching.
			 * Invalidate this interrupt's stats so they are
			 * not reused until new timestamps update them.
			 */
			irqs->valid = 0;
			break;
		}

		/* Keep the earliest predicted future event. */
		if (irqs->next_evt < next_evt) {
			irq = i;
			next_evt = irqs->next_evt;
		}
	}

	return next_evt;
}
328 | ||
329 | void irq_timings_free(int irq) | |
330 | { | |
331 | struct irqt_stat __percpu *s; | |
332 | ||
333 | s = idr_find(&irqt_stats, irq); | |
334 | if (s) { | |
335 | free_percpu(s); | |
336 | idr_remove(&irqt_stats, irq); | |
337 | } | |
338 | } | |
339 | ||
340 | int irq_timings_alloc(int irq) | |
341 | { | |
342 | struct irqt_stat __percpu *s; | |
343 | int id; | |
344 | ||
345 | /* | |
346 | * Some platforms can have the same private interrupt per cpu, | |
347 | * so this function may be be called several times with the | |
348 | * same interrupt number. Just bail out in case the per cpu | |
349 | * stat structure is already allocated. | |
350 | */ | |
351 | s = idr_find(&irqt_stats, irq); | |
352 | if (s) | |
353 | return 0; | |
354 | ||
355 | s = alloc_percpu(*s); | |
356 | if (!s) | |
357 | return -ENOMEM; | |
358 | ||
359 | idr_preload(GFP_KERNEL); | |
360 | id = idr_alloc(&irqt_stats, s, irq, irq + 1, GFP_NOWAIT); | |
361 | idr_preload_end(); | |
362 | ||
363 | if (id < 0) { | |
364 | free_percpu(s); | |
365 | return id; | |
366 | } | |
367 | ||
368 | return 0; | |
369 | } |