Commit | Line | Data |
---|---|---|
158f0231 JA |
1 | diff --git a/block/Kconfig b/block/Kconfig |
2 | index 377f6dd..27eaed9 100644 | |
3 | --- a/block/Kconfig | |
4 | +++ b/block/Kconfig | |
5 | @@ -11,4 +11,15 @@ config LBD | |
6 | your machine, or if you want to have a raid or loopback device | |
7 | bigger than 2TB. Otherwise say N. | |
8 | ||
9 | +config BLK_DEV_IO_TRACE | |
10 | + bool "Support for tracing block io actions" | |
11 | + select RELAYFS_FS | |
12 | + help | |
13 | + Say Y here, if you want to be able to trace the block layer actions | |
14 | + on a given queue. Tracing allows you to see any traffic happening | |
15 | + on a block device queue. For more information (and the user space | |
16 | + support tools needed), fetch the blktrace app from: | |
17 | + | |
18 | + git://brick.kernel.dk/data/git/blktrace.git | |
19 | + | |
20 | source block/Kconfig.iosched | |
21 | diff --git a/block/Makefile b/block/Makefile | |
22 | index 7e4f93e..c05de0e 100644 | |
23 | --- a/block/Makefile | |
24 | +++ b/block/Makefile | |
25 | @@ -8,3 +8,5 @@ obj-$(CONFIG_IOSCHED_NOOP) += noop-iosch | |
26 | obj-$(CONFIG_IOSCHED_AS) += as-iosched.o | |
27 | obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o | |
28 | obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o | |
29 | + | |
30 | +obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o | |
31 | diff --git a/block/blktrace.c b/block/blktrace.c | |
32 | new file mode 100644 | |
57c9f4e8 | 33 | index 0000000..3b03587 |
158f0231 JA |
34 | --- /dev/null |
35 | +++ b/block/blktrace.c | |
57c9f4e8 | 36 | @@ -0,0 +1,499 @@ |
158f0231 JA |
37 | +#include <linux/config.h> |
38 | +#include <linux/kernel.h> | |
39 | +#include <linux/blkdev.h> | |
40 | +#include <linux/blktrace_api.h> | |
41 | +#include <linux/percpu.h> | |
42 | +#include <linux/init.h> | |
43 | +#include <linux/mutex.h> | |
44 | +#include <asm/uaccess.h> | |
45 | + | |
46 | +static DEFINE_PER_CPU(unsigned long long, blk_trace_cpu_offset) = { 0, }; | |
57c9f4e8 | 47 | +static unsigned int blktrace_seq = 1; |
158f0231 | 48 | + |
62fb68f5 JA |
49 | +static void trace_note_tsk(struct blk_trace *bt, struct task_struct *tsk) |
50 | +{ | |
51 | + struct blk_io_trace *t; | |
62fb68f5 JA |
52 | + |
53 | + t = relay_reserve(bt->rchan, sizeof(*t) + sizeof(tsk->comm)); | |
54 | + if (t) { | |
62fb68f5 JA |
55 | + t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION; |
56 | + t->device = bt->dev; | |
57 | + t->action = BLK_TC_ACT(BLK_TC_NOTIFY); | |
58 | + t->pid = tsk->pid; | |
59 | + t->cpu = smp_processor_id(); | |
60 | + t->pdu_len = sizeof(tsk->comm); | |
61 | + memcpy((void *) t + sizeof(*t), tsk->comm, t->pdu_len); | |
57c9f4e8 | 62 | + tsk->btrace_seq = blktrace_seq; |
62fb68f5 | 63 | + } |
62fb68f5 JA |
64 | +} |
65 | + | |
66 | +static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector, | |
67 | + pid_t pid) | |
68 | +{ | |
69 | + if (((bt->act_mask << BLK_TC_SHIFT) & what) == 0) | |
70 | + return 1; | |
71 | + if (sector < bt->start_lba || sector > bt->end_lba) | |
72 | + return 1; | |
73 | + if (bt->pid && pid != bt->pid) | |
74 | + return 1; | |
75 | + | |
76 | + return 0; | |
77 | +} | |
78 | + | |
79 | +/* | |
80 | + * Data direction bit lookup | |
81 | + */ | |
82 | +static u32 ddir_act[2] __read_mostly = { BLK_TC_ACT(BLK_TC_READ), BLK_TC_ACT(BLK_TC_WRITE) }; | |
83 | + | |
84 | +/* | |
85 | + * Bio action bits of interest | |
86 | + */ | |
87 | +static u32 bio_act[3] __read_mostly = { 0, BLK_TC_ACT(BLK_TC_BARRIER), BLK_TC_ACT(BLK_TC_SYNC) }; | |
88 | + | |
89 | +/* | |
90 | + * More could be added as needed, taking care to increment the decrementer | |
91 | + * to get correct indexing | |
92 | + */ | |
93 | +#define trace_barrier_bit(rw) \ | |
94 | + (((rw) & (1 << BIO_RW_BARRIER)) >> (BIO_RW_BARRIER - 0)) | |
95 | +#define trace_sync_bit(rw) \ | |
96 | + (((rw) & (1 << BIO_RW_SYNC)) >> (BIO_RW_SYNC - 1)) | |
97 | + | |
158f0231 JA |
98 | +/* |
99 | + * The worker for the various blk_add_trace*() types. Fills out a | |
100 | + * blk_io_trace structure and places it in a per-cpu subbuffer. | |
101 | + */ | |
102 | +void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, | |
103 | + int rw, u32 what, int error, int pdu_len, void *pdu_data) | |
104 | +{ | |
62fb68f5 | 105 | + struct task_struct *tsk = current; |
158f0231 JA |
106 | + struct blk_io_trace *t; |
107 | + unsigned long flags; | |
108 | + unsigned long *sequence; | |
2def0d98 JA |
109 | + unsigned long seq; |
110 | + u64 cpu_time; | |
158f0231 JA |
111 | + pid_t pid; |
112 | + int cpu; | |
113 | + | |
62fb68f5 | 114 | + if (unlikely(bt->trace_state != Blktrace_running)) |
e62a6470 JA |
115 | + return; |
116 | + | |
62fb68f5 JA |
117 | + what |= ddir_act[rw & WRITE]; |
118 | + what |= bio_act[trace_barrier_bit(rw)]; | |
119 | + what |= bio_act[trace_sync_bit(rw)]; | |
158f0231 | 120 | + |
62fb68f5 JA |
121 | + pid = tsk->pid; |
122 | + if (unlikely(act_log_check(bt, what, sector, pid))) | |
158f0231 JA |
123 | + return; |
124 | + | |
158f0231 JA |
125 | + /* |
126 | + * A word about the locking here - we disable interrupts to reserve | |
127 | + * some space in the relayfs per-cpu buffer, to prevent an irq | |
128 | + * from coming in and stepping on our toes. Once reserved, it's | |
129 | + * enough to get preemption disabled to prevent read of this data | |
130 | + * before we are through filling it. get_cpu()/put_cpu() does this | |
131 | + * for us | |
132 | + */ | |
133 | + local_irq_save(flags); | |
134 | + | |
57c9f4e8 JA |
135 | + if (unlikely(tsk->btrace_seq != blktrace_seq)) |
136 | + trace_note_tsk(bt, tsk); | |
137 | + | |
158f0231 JA |
138 | + t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len); |
139 | + if (unlikely(!t)) { | |
140 | + local_irq_restore(flags); | |
141 | + return; | |
142 | + } | |
143 | + | |
144 | + cpu = get_cpu(); | |
145 | + | |
146 | + sequence = per_cpu_ptr(bt->sequence, cpu); | |
2def0d98 JA |
147 | + seq = ++(*sequence); |
148 | + cpu_time = sched_clock() - per_cpu(blk_trace_cpu_offset, cpu); | |
158f0231 JA |
149 | + |
150 | + local_irq_restore(flags); | |
151 | + | |
152 | + t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION; | |
2def0d98 JA |
153 | + t->sequence = seq; |
154 | + t->time = cpu_time; | |
158f0231 JA |
155 | + t->sector = sector; |
156 | + t->bytes = bytes; | |
157 | + t->action = what; | |
2def0d98 JA |
158 | + t->pid = pid; |
159 | + t->device = bt->dev; | |
160 | + t->cpu = cpu; | |
158f0231 JA |
161 | + t->error = error; |
162 | + t->pdu_len = pdu_len; | |
158f0231 JA |
163 | + |
164 | + if (pdu_len) | |
165 | + memcpy((void *) t + sizeof(*t), pdu_data, pdu_len); | |
166 | + | |
167 | + put_cpu(); | |
168 | +} | |
169 | + | |
170 | +EXPORT_SYMBOL_GPL(__blk_add_trace); | |
171 | + | |
172 | +static struct dentry *blk_tree_root; | |
173 | +static struct mutex blk_tree_mutex; | |
174 | + | |
175 | +static inline void blk_remove_root(void) | |
176 | +{ | |
177 | + if (relayfs_remove_dir(blk_tree_root) != -ENOTEMPTY) | |
178 | + blk_tree_root = NULL; | |
179 | +} | |
180 | + | |
181 | +static void blk_remove_tree(struct dentry *dir) | |
182 | +{ | |
183 | + mutex_lock(&blk_tree_mutex); | |
184 | + relayfs_remove_dir(dir); | |
185 | + blk_remove_root(); | |
186 | + mutex_unlock(&blk_tree_mutex); | |
187 | +} | |
188 | + | |
189 | +static struct dentry *blk_create_tree(const char *blk_name) | |
190 | +{ | |
191 | + struct dentry *dir = NULL; | |
192 | + | |
193 | + mutex_lock(&blk_tree_mutex); | |
194 | + | |
195 | + if (!blk_tree_root) { | |
196 | + blk_tree_root = relayfs_create_dir("block", NULL); | |
197 | + if (!blk_tree_root) | |
198 | + goto err; | |
199 | + } | |
200 | + | |
201 | + dir = relayfs_create_dir(blk_name, blk_tree_root); | |
202 | + if (!dir) | |
203 | + blk_remove_root(); | |
204 | + | |
205 | +err: | |
206 | + mutex_unlock(&blk_tree_mutex); | |
207 | + return dir; | |
208 | +} | |
209 | + | |
e62a6470 | 210 | +void blk_trace_cleanup(struct blk_trace *bt) |
158f0231 JA |
211 | +{ |
212 | + relay_close(bt->rchan); | |
213 | + relayfs_remove_file(bt->dropped_file); | |
214 | + blk_remove_tree(bt->dir); | |
57c9f4e8 | 215 | + free_percpu(bt->sequence); |
158f0231 JA |
216 | + kfree(bt); |
217 | +} | |
218 | + | |
e62a6470 | 219 | +static int blk_trace_remove(request_queue_t *q) |
158f0231 | 220 | +{ |
43ec3d6a | 221 | + struct blk_trace *bt; |
158f0231 | 222 | + |
43ec3d6a JA |
223 | + bt = xchg(&q->blk_trace, NULL); |
224 | + if (!bt) | |
225 | + return -EINVAL; | |
158f0231 | 226 | + |
43ec3d6a JA |
227 | + if (bt->trace_state == Blktrace_setup || |
228 | + bt->trace_state == Blktrace_stopped) | |
229 | + blk_trace_cleanup(bt); | |
158f0231 | 230 | + |
43ec3d6a | 231 | + return 0; |
158f0231 JA |
232 | +} |
233 | + | |
234 | +static int blk_dropped_open(struct inode *inode, struct file *filp) | |
235 | +{ | |
236 | + filp->private_data = inode->u.generic_ip; | |
237 | + | |
238 | + return 0; | |
239 | +} | |
240 | + | |
241 | +static ssize_t blk_dropped_read(struct file *filp, char __user *buffer, | |
242 | + size_t count, loff_t *ppos) | |
243 | +{ | |
244 | + struct blk_trace *bt = filp->private_data; | |
245 | + char buf[16]; | |
246 | + | |
247 | + snprintf(buf, sizeof(buf), "%u\n", atomic_read(&bt->dropped)); | |
248 | + | |
249 | + return simple_read_from_buffer(buffer, count, ppos, buf, strlen(buf)); | |
250 | +} | |
251 | + | |
252 | +static struct file_operations blk_dropped_fops = { | |
253 | + .owner = THIS_MODULE, | |
254 | + .open = blk_dropped_open, | |
255 | + .read = blk_dropped_read, | |
256 | +}; | |
257 | + | |
258 | +/* | |
259 | + * Keep track of how many times we encountered a full subbuffer, to aid | |
260 | + * the user space app in telling how many lost events there were. | |
261 | + */ | |
262 | +static int blk_subbuf_start_callback(struct rchan_buf *buf, void *subbuf, | |
263 | + void *prev_subbuf, size_t prev_padding) | |
264 | +{ | |
265 | + struct blk_trace *bt; | |
266 | + | |
267 | + if (!relay_buf_full(buf)) | |
268 | + return 1; | |
269 | + | |
270 | + bt = buf->chan->private_data; | |
271 | + atomic_inc(&bt->dropped); | |
272 | + return 0; | |
273 | +} | |
274 | + | |
275 | +static struct rchan_callbacks blk_relay_callbacks = { | |
276 | + .subbuf_start = blk_subbuf_start_callback, | |
277 | +}; | |
278 | + | |
279 | +/* | |
280 | + * Setup everything required to start tracing | |
281 | + */ | |
e62a6470 JA |
282 | +static int blk_trace_setup(request_queue_t *q, struct block_device *bdev, |
283 | + char __user *arg) | |
158f0231 | 284 | +{ |
158f0231 | 285 | + struct blk_user_trace_setup buts; |
2def0d98 | 286 | + struct blk_trace *old_bt, *bt = NULL; |
158f0231 JA |
287 | + struct dentry *dir = NULL; |
288 | + char b[BDEVNAME_SIZE]; | |
289 | + int ret, i; | |
290 | + | |
158f0231 JA |
291 | + if (copy_from_user(&buts, arg, sizeof(buts))) |
292 | + return -EFAULT; | |
293 | + | |
294 | + if (!buts.buf_size || !buts.buf_nr) | |
295 | + return -EINVAL; | |
296 | + | |
297 | + strcpy(buts.name, bdevname(bdev, b)); | |
298 | + | |
299 | + /* | |
300 | + * some device names have larger paths - convert the slashes | |
301 | + * to underscores for this to work as expected | |
302 | + */ | |
303 | + for (i = 0; i < strlen(buts.name); i++) | |
304 | + if (buts.name[i] == '/') | |
305 | + buts.name[i] = '_'; | |
306 | + | |
307 | + if (copy_to_user(arg, &buts, sizeof(buts))) | |
308 | + return -EFAULT; | |
309 | + | |
158f0231 JA |
310 | + ret = -ENOMEM; |
311 | + bt = kzalloc(sizeof(*bt), GFP_KERNEL); | |
312 | + if (!bt) | |
313 | + goto err; | |
314 | + | |
315 | + bt->sequence = alloc_percpu(unsigned long); | |
316 | + if (!bt->sequence) | |
317 | + goto err; | |
318 | + | |
319 | + ret = -ENOENT; | |
320 | + dir = blk_create_tree(buts.name); | |
321 | + if (!dir) | |
322 | + goto err; | |
323 | + | |
324 | + bt->dir = dir; | |
325 | + bt->dev = bdev->bd_dev; | |
326 | + atomic_set(&bt->dropped, 0); | |
327 | + | |
328 | + ret = -EIO; | |
329 | + bt->dropped_file = relayfs_create_file("dropped", dir, 0, &blk_dropped_fops, bt); | |
330 | + if (!bt->dropped_file) | |
331 | + goto err; | |
332 | + | |
333 | + bt->rchan = relay_open("trace", dir, buts.buf_size, buts.buf_nr, &blk_relay_callbacks); | |
334 | + if (!bt->rchan) | |
335 | + goto err; | |
336 | + bt->rchan->private_data = bt; | |
337 | + | |
338 | + bt->act_mask = buts.act_mask; | |
339 | + if (!bt->act_mask) | |
340 | + bt->act_mask = (u16) -1; | |
341 | + | |
342 | + bt->start_lba = buts.start_lba; | |
343 | + bt->end_lba = buts.end_lba; | |
344 | + if (!bt->end_lba) | |
345 | + bt->end_lba = -1ULL; | |
346 | + | |
347 | + bt->pid = buts.pid; | |
e62a6470 | 348 | + bt->trace_state = Blktrace_setup; |
158f0231 | 349 | + |
e62a6470 | 350 | + ret = -EBUSY; |
2def0d98 JA |
351 | + old_bt = xchg(&q->blk_trace, bt); |
352 | + if (old_bt) { | |
353 | + xchg(&q->blk_trace, old_bt); | |
43ec3d6a | 354 | + goto err; |
2def0d98 | 355 | + } |
43ec3d6a JA |
356 | + |
357 | + return 0; | |
158f0231 | 358 | +err: |
158f0231 JA |
359 | + if (bt && bt->dropped_file) |
360 | + relayfs_remove_file(bt->dropped_file); | |
361 | + if (dir) | |
362 | + blk_remove_tree(dir); | |
363 | + if (bt) { | |
364 | + if (bt->sequence) | |
365 | + free_percpu(bt->sequence); | |
366 | + kfree(bt); | |
367 | + } | |
368 | + return ret; | |
369 | +} | |
370 | + | |
e62a6470 JA |
371 | +static int blk_trace_startstop(request_queue_t *q, int start) |
372 | +{ | |
373 | + struct blk_trace *bt; | |
374 | + int ret; | |
375 | + | |
43ec3d6a JA |
376 | + if ((bt = q->blk_trace) == NULL) |
377 | + return -EINVAL; | |
e62a6470 JA |
378 | + |
379 | + /* | |
380 | + * For starting a trace, we can transition from a setup or stopped | |
381 | + * trace. For stopping a trace, the state must be running | |
382 | + */ | |
43ec3d6a JA |
383 | + ret = -EINVAL; |
384 | + if (start) { | |
385 | + if (bt->trace_state == Blktrace_setup || | |
386 | + bt->trace_state == Blktrace_stopped) { | |
57c9f4e8 JA |
387 | + blktrace_seq++; |
388 | + smp_mb(); | |
43ec3d6a JA |
389 | + bt->trace_state = Blktrace_running; |
390 | + ret = 0; | |
391 | + } | |
392 | + } else { | |
393 | + if (bt->trace_state == Blktrace_running) { | |
394 | + bt->trace_state = Blktrace_stopped; | |
395 | + ret = 0; | |
e62a6470 JA |
396 | + } |
397 | + } | |
e62a6470 JA |
398 | + |
399 | + return ret; | |
400 | +} | |
401 | + | |
402 | +/** | |
403 | + * blk_trace_ioctl: - handle the ioctls associated with tracing | |
404 | + * @bdev: the block device | |
405 | + * @cmd: the ioctl cmd | |
406 | + * @arg: the argument data, if any | |
407 | + * | |
408 | + **/ | |
409 | +int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) | |
410 | +{ | |
411 | + request_queue_t *q; | |
412 | + int ret, start = 0; | |
413 | + | |
414 | + q = bdev_get_queue(bdev); | |
415 | + if (!q) | |
416 | + return -ENXIO; | |
417 | + | |
418 | + down(&bdev->bd_sem); | |
419 | + | |
420 | + switch (cmd) { | |
421 | + case BLKTRACESETUP: | |
422 | + ret = blk_trace_setup(q, bdev, arg); | |
423 | + break; | |
424 | + case BLKTRACESTART: | |
425 | + start = 1; | |
426 | + case BLKTRACESTOP: | |
427 | + ret = blk_trace_startstop(q, start); | |
428 | + break; | |
429 | + case BLKTRACETEARDOWN: | |
e62a6470 | 430 | + ret = blk_trace_remove(q); |
e62a6470 JA |
431 | + break; |
432 | + default: | |
433 | + ret = -ENOTTY; | |
434 | + break; | |
435 | + } | |
436 | + | |
437 | + up(&bdev->bd_sem); | |
438 | + return ret; | |
439 | +} | |
440 | + | |
441 | +/** | |
442 | + * blk_trace_shutdown: - stop and cleanup trace structures | |
443 | + * @q: the request queue associated with the device | |
444 | + * | |
445 | + **/ | |
446 | +void blk_trace_shutdown(request_queue_t *q) | |
447 | +{ | |
448 | + blk_trace_startstop(q, 0); | |
449 | + blk_trace_remove(q); | |
450 | +} | |
451 | + | |
158f0231 JA |
452 | +/* |
453 | + * Average offset over two calls to sched_clock() with a gettimeofday() | |
454 | + * in the middle | |
455 | + */ | |
456 | +static void blk_check_time(unsigned long long *t) | |
457 | +{ | |
458 | + unsigned long long a, b; | |
459 | + struct timeval tv; | |
460 | + | |
461 | + a = sched_clock(); | |
462 | + do_gettimeofday(&tv); | |
463 | + b = sched_clock(); | |
464 | + | |
465 | + *t = tv.tv_sec * 1000000000 + tv.tv_usec * 1000; | |
466 | + *t -= (a + b) / 2; | |
467 | +} | |
468 | + | |
469 | +static void blk_trace_check_cpu_time(void *data) | |
470 | +{ | |
471 | + unsigned long long *t; | |
472 | + int cpu = get_cpu(); | |
473 | + | |
474 | + t = &per_cpu(blk_trace_cpu_offset, cpu); | |
475 | + | |
476 | + /* | |
477 | + * Just call it twice, hopefully the second call will be cache hot | |
478 | + * and a little more precise | |
479 | + */ | |
480 | + blk_check_time(t); | |
481 | + blk_check_time(t); | |
482 | + | |
483 | + put_cpu(); | |
484 | +} | |
485 | + | |
486 | +/* | |
487 | + * Call blk_trace_check_cpu_time() on each CPU to calibrate our inter-CPU | |
488 | + * timings | |
489 | + */ | |
490 | +static void blk_trace_calibrate_offsets(void) | |
491 | +{ | |
492 | + unsigned long flags; | |
493 | + | |
494 | + smp_call_function(blk_trace_check_cpu_time, NULL, 1, 1); | |
495 | + local_irq_save(flags); | |
496 | + blk_trace_check_cpu_time(NULL); | |
497 | + local_irq_restore(flags); | |
498 | +} | |
499 | + | |
500 | +static void blk_trace_set_ht_offsets(void) | |
501 | +{ | |
502 | +#if defined(CONFIG_SCHED_SMT) | |
503 | + int cpu, i; | |
504 | + | |
505 | + /* | |
506 | + * now make sure HT siblings have the same time offset | |
507 | + */ | |
508 | + preempt_disable(); | |
509 | + for_each_online_cpu(cpu) { | |
510 | + unsigned long long *cpu_off, *sibling_off; | |
511 | + | |
512 | + for_each_cpu_mask(i, cpu_sibling_map[cpu]) { | |
513 | + if (i == cpu) | |
514 | + continue; | |
515 | + | |
516 | + cpu_off = &per_cpu(blk_trace_cpu_offset, cpu); | |
517 | + sibling_off = &per_cpu(blk_trace_cpu_offset, i); | |
518 | + *sibling_off = *cpu_off; | |
519 | + } | |
520 | + } | |
521 | + preempt_enable(); | |
522 | +#endif | |
523 | +} | |
524 | + | |
525 | +static __init int blk_trace_init(void) | |
526 | +{ | |
527 | + mutex_init(&blk_tree_mutex); | |
528 | + blk_trace_calibrate_offsets(); | |
529 | + blk_trace_set_ht_offsets(); | |
530 | + | |
531 | + return 0; | |
532 | +} | |
533 | + | |
534 | +module_init(blk_trace_init); | |
535 | + | |
536 | diff --git a/block/elevator.c b/block/elevator.c | |
2def0d98 | 537 | index 24b702d..0c9fafe 100644 |
158f0231 JA |
538 | --- a/block/elevator.c |
539 | +++ b/block/elevator.c | |
540 | @@ -33,6 +33,7 @@ | |
541 | #include <linux/init.h> | |
542 | #include <linux/compiler.h> | |
543 | #include <linux/delay.h> | |
544 | +#include <linux/blktrace_api.h> | |
545 | ||
546 | #include <asm/uaccess.h> | |
547 | ||
2def0d98 | 548 | @@ -315,6 +316,8 @@ void elv_insert(request_queue_t *q, stru |
158f0231 JA |
549 | struct list_head *pos; |
550 | unsigned ordseq; | |
551 | ||
552 | + blk_add_trace_rq(q, rq, BLK_TA_INSERT); | |
553 | + | |
2def0d98 | 554 | rq->q = q; |
158f0231 | 555 | |
2def0d98 JA |
556 | switch (where) { |
557 | @@ -481,6 +484,7 @@ struct request *elv_next_request(request | |
158f0231 JA |
558 | * not be passed by new incoming requests |
559 | */ | |
560 | rq->flags |= REQ_STARTED; | |
561 | + blk_add_trace_rq(q, rq, BLK_TA_ISSUE); | |
562 | } | |
563 | ||
564 | if (!q->boundary_rq || q->boundary_rq == rq) { | |
565 | diff --git a/block/ioctl.c b/block/ioctl.c | |
e62a6470 | 566 | index e110949..7acb56c 100644 |
158f0231 JA |
567 | --- a/block/ioctl.c |
568 | +++ b/block/ioctl.c | |
569 | @@ -5,6 +5,7 @@ | |
570 | #include <linux/backing-dev.h> | |
571 | #include <linux/buffer_head.h> | |
572 | #include <linux/smp_lock.h> | |
573 | +#include <linux/blktrace_api.h> | |
574 | #include <asm/uaccess.h> | |
575 | ||
576 | static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user *arg) | |
e62a6470 | 577 | @@ -189,6 +190,11 @@ static int blkdev_locked_ioctl(struct fi |
158f0231 JA |
578 | return put_ulong(arg, bdev->bd_inode->i_size >> 9); |
579 | case BLKGETSIZE64: | |
580 | return put_u64(arg, bdev->bd_inode->i_size); | |
e62a6470 JA |
581 | + case BLKTRACESTART: |
582 | + case BLKTRACESTOP: | |
583 | + case BLKTRACESETUP: | |
584 | + case BLKTRACETEARDOWN: | |
585 | + return blk_trace_ioctl(bdev, cmd, (char __user *) arg); | |
158f0231 JA |
586 | } |
587 | return -ENOIOCTLCMD; | |
588 | } | |
589 | diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c | |
2def0d98 | 590 | index 03d9c82..41387f5 100644 |
158f0231 JA |
591 | --- a/block/ll_rw_blk.c |
592 | +++ b/block/ll_rw_blk.c | |
593 | @@ -28,6 +28,7 @@ | |
594 | #include <linux/writeback.h> | |
595 | #include <linux/interrupt.h> | |
596 | #include <linux/cpu.h> | |
597 | +#include <linux/blktrace_api.h> | |
598 | ||
599 | /* | |
600 | * for max sense size | |
e62a6470 | 601 | @@ -1551,8 +1552,10 @@ void blk_plug_device(request_queue_t *q) |
158f0231 JA |
602 | if (test_bit(QUEUE_FLAG_STOPPED, &q->queue_flags)) |
603 | return; | |
604 | ||
605 | - if (!test_and_set_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags)) | |
606 | + if (!test_and_set_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags)) { | |
607 | mod_timer(&q->unplug_timer, jiffies + q->unplug_delay); | |
608 | + blk_add_trace_generic(q, NULL, 0, BLK_TA_PLUG); | |
609 | + } | |
610 | } | |
611 | ||
612 | EXPORT_SYMBOL(blk_plug_device); | |
e62a6470 | 613 | @@ -1616,14 +1619,21 @@ static void blk_backing_dev_unplug(struc |
158f0231 JA |
614 | /* |
615 | * devices don't necessarily have an ->unplug_fn defined | |
616 | */ | |
617 | - if (q->unplug_fn) | |
618 | + if (q->unplug_fn) { | |
619 | + blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_IO, NULL, | |
620 | + q->rq.count[READ] + q->rq.count[WRITE]); | |
621 | + | |
622 | q->unplug_fn(q); | |
623 | + } | |
624 | } | |
625 | ||
626 | static void blk_unplug_work(void *data) | |
627 | { | |
628 | request_queue_t *q = data; | |
629 | ||
630 | + blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_IO, NULL, | |
631 | + q->rq.count[READ] + q->rq.count[WRITE]); | |
632 | + | |
633 | q->unplug_fn(q); | |
634 | } | |
635 | ||
e62a6470 | 636 | @@ -1631,6 +1641,9 @@ static void blk_unplug_timeout(unsigned |
158f0231 JA |
637 | { |
638 | request_queue_t *q = (request_queue_t *)data; | |
639 | ||
640 | + blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_TIMER, NULL, | |
641 | + q->rq.count[READ] + q->rq.count[WRITE]); | |
642 | + | |
643 | kblockd_schedule_work(&q->unplug_work); | |
644 | } | |
645 | ||
e62a6470 | 646 | @@ -1753,6 +1766,9 @@ void blk_cleanup_queue(request_queue_t * |
158f0231 JA |
647 | if (q->queue_tags) |
648 | __blk_queue_free_tags(q); | |
649 | ||
e62a6470 JA |
650 | + if (q->blk_trace) |
651 | + blk_trace_shutdown(q); | |
158f0231 JA |
652 | + |
653 | kmem_cache_free(requestq_cachep, q); | |
654 | } | |
655 | ||
e62a6470 | 656 | @@ -2104,6 +2120,8 @@ rq_starved: |
158f0231 JA |
657 | |
658 | rq_init(q, rq); | |
659 | rq->rl = rl; | |
660 | + | |
661 | + blk_add_trace_generic(q, bio, rw, BLK_TA_GETRQ); | |
662 | out: | |
663 | return rq; | |
664 | } | |
e62a6470 | 665 | @@ -2132,6 +2150,8 @@ static struct request *get_request_wait( |
158f0231 JA |
666 | if (!rq) { |
667 | struct io_context *ioc; | |
668 | ||
669 | + blk_add_trace_generic(q, bio, rw, BLK_TA_SLEEPRQ); | |
670 | + | |
671 | __generic_unplug_device(q); | |
672 | spin_unlock_irq(q->queue_lock); | |
673 | io_schedule(); | |
e62a6470 | 674 | @@ -2185,6 +2205,8 @@ EXPORT_SYMBOL(blk_get_request); |
158f0231 JA |
675 | */ |
676 | void blk_requeue_request(request_queue_t *q, struct request *rq) | |
677 | { | |
678 | + blk_add_trace_rq(q, rq, BLK_TA_REQUEUE); | |
679 | + | |
680 | if (blk_rq_tagged(rq)) | |
681 | blk_queue_end_tag(q, rq); | |
682 | ||
e62a6470 | 683 | @@ -2819,6 +2841,8 @@ static int __make_request(request_queue_ |
158f0231 JA |
684 | if (!q->back_merge_fn(q, req, bio)) |
685 | break; | |
686 | ||
687 | + blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE); | |
688 | + | |
689 | req->biotail->bi_next = bio; | |
690 | req->biotail = bio; | |
691 | req->nr_sectors = req->hard_nr_sectors += nr_sectors; | |
e62a6470 | 692 | @@ -2834,6 +2858,8 @@ static int __make_request(request_queue_ |
158f0231 JA |
693 | if (!q->front_merge_fn(q, req, bio)) |
694 | break; | |
695 | ||
696 | + blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE); | |
697 | + | |
698 | bio->bi_next = req->bio; | |
699 | req->bio = bio; | |
700 | ||
e62a6470 | 701 | @@ -2951,6 +2977,7 @@ void generic_make_request(struct bio *bi |
158f0231 JA |
702 | request_queue_t *q; |
703 | sector_t maxsector; | |
704 | int ret, nr_sectors = bio_sectors(bio); | |
705 | + dev_t old_dev; | |
706 | ||
707 | might_sleep(); | |
708 | /* Test device or partition size, when known. */ | |
e62a6470 | 709 | @@ -2977,6 +3004,8 @@ void generic_make_request(struct bio *bi |
158f0231 JA |
710 | * NOTE: we don't repeat the blk_size check for each new device. |
711 | * Stacking drivers are expected to know what they are doing. | |
712 | */ | |
713 | + maxsector = -1; | |
714 | + old_dev = 0; | |
715 | do { | |
716 | char b[BDEVNAME_SIZE]; | |
717 | ||
e62a6470 | 718 | @@ -3009,6 +3038,15 @@ end_io: |
158f0231 JA |
719 | */ |
720 | blk_partition_remap(bio); | |
721 | ||
722 | + if (maxsector != -1) | |
723 | + blk_add_trace_remap(q, bio, old_dev, bio->bi_sector, | |
724 | + maxsector); | |
725 | + | |
726 | + blk_add_trace_bio(q, bio, BLK_TA_QUEUE); | |
727 | + | |
728 | + maxsector = bio->bi_sector; | |
729 | + old_dev = bio->bi_bdev->bd_dev; | |
730 | + | |
731 | ret = q->make_request_fn(q, bio); | |
732 | } while (ret); | |
733 | } | |
e62a6470 | 734 | @@ -3128,6 +3166,8 @@ static int __end_that_request_first(stru |
158f0231 JA |
735 | int total_bytes, bio_nbytes, error, next_idx = 0; |
736 | struct bio *bio; | |
737 | ||
738 | + blk_add_trace_rq(req->q, req, BLK_TA_COMPLETE); | |
739 | + | |
740 | /* | |
741 | * extend uptodate bool to allow < 0 value to be direct io error | |
742 | */ | |
743 | diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c | |
e62a6470 | 744 | index 0d65394..234048e 100644 |
158f0231 JA |
745 | --- a/drivers/block/cciss.c |
746 | +++ b/drivers/block/cciss.c | |
747 | @@ -38,6 +38,7 @@ | |
748 | #include <linux/hdreg.h> | |
749 | #include <linux/spinlock.h> | |
750 | #include <linux/compat.h> | |
751 | +#include <linux/blktrace_api.h> | |
752 | #include <asm/uaccess.h> | |
753 | #include <asm/io.h> | |
754 | ||
e62a6470 | 755 | @@ -2331,6 +2332,7 @@ static inline void complete_command( ctl |
158f0231 JA |
756 | |
757 | cmd->rq->completion_data = cmd; | |
758 | cmd->rq->errors = status; | |
759 | + blk_add_trace_rq(cmd->rq->q, cmd->rq, BLK_TA_COMPLETE); | |
760 | blk_complete_request(cmd->rq); | |
761 | } | |
762 | ||
763 | diff --git a/drivers/md/dm.c b/drivers/md/dm.c | |
764 | index e9adeb9..c8f3aa2 100644 | |
765 | --- a/drivers/md/dm.c | |
766 | +++ b/drivers/md/dm.c | |
767 | @@ -17,6 +17,7 @@ | |
768 | #include <linux/mempool.h> | |
769 | #include <linux/slab.h> | |
770 | #include <linux/idr.h> | |
771 | +#include <linux/blktrace_api.h> | |
772 | ||
773 | static const char *_name = DM_NAME; | |
774 | ||
775 | @@ -334,6 +335,8 @@ static void dec_pending(struct dm_io *io | |
776 | /* nudge anyone waiting on suspend queue */ | |
777 | wake_up(&io->md->wait); | |
778 | ||
779 | + blk_add_trace_bio(io->md->queue, io->bio, BLK_TA_COMPLETE); | |
780 | + | |
781 | bio_endio(io->bio, io->bio->bi_size, io->error); | |
782 | free_io(io->md, io); | |
783 | } | |
784 | @@ -392,6 +395,7 @@ static void __map_bio(struct dm_target * | |
785 | struct target_io *tio) | |
786 | { | |
787 | int r; | |
788 | + sector_t sector; | |
789 | ||
790 | /* | |
791 | * Sanity checks. | |
792 | @@ -407,10 +411,17 @@ static void __map_bio(struct dm_target * | |
793 | * this io. | |
794 | */ | |
795 | atomic_inc(&tio->io->io_count); | |
796 | + sector = clone->bi_sector; | |
797 | r = ti->type->map(ti, clone, &tio->info); | |
798 | - if (r > 0) | |
799 | + if (r > 0) { | |
800 | /* the bio has been remapped so dispatch it */ | |
801 | + | |
802 | + blk_add_trace_remap(bdev_get_queue(clone->bi_bdev), clone, | |
803 | + tio->io->bio->bi_bdev->bd_dev, sector, | |
804 | + clone->bi_sector); | |
805 | + | |
806 | generic_make_request(clone); | |
807 | + } | |
808 | ||
809 | else if (r < 0) { | |
810 | /* error the io and bail out */ | |
811 | diff --git a/fs/bio.c b/fs/bio.c | |
812 | index 1f3bb50..0dd0d81 100644 | |
813 | --- a/fs/bio.c | |
814 | +++ b/fs/bio.c | |
815 | @@ -25,6 +25,7 @@ | |
816 | #include <linux/module.h> | |
817 | #include <linux/mempool.h> | |
818 | #include <linux/workqueue.h> | |
819 | +#include <linux/blktrace_api.h> | |
820 | #include <scsi/sg.h> /* for struct sg_iovec */ | |
821 | ||
822 | #define BIO_POOL_SIZE 256 | |
823 | @@ -1095,6 +1096,9 @@ struct bio_pair *bio_split(struct bio *b | |
824 | if (!bp) | |
825 | return bp; | |
826 | ||
827 | + blk_add_trace_pdu_int(bdev_get_queue(bi->bi_bdev), BLK_TA_SPLIT, bi, | |
828 | + bi->bi_sector + first_sectors); | |
829 | + | |
830 | BUG_ON(bi->bi_vcnt != 1); | |
831 | BUG_ON(bi->bi_idx != 0); | |
832 | atomic_set(&bp->cnt, 3); | |
833 | diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c | |
2def0d98 | 834 | index 057e602..893d600 100644 |
158f0231 JA |
835 | --- a/fs/compat_ioctl.c |
836 | +++ b/fs/compat_ioctl.c | |
837 | @@ -72,6 +72,7 @@ | |
838 | #include <linux/i2c-dev.h> | |
839 | #include <linux/wireless.h> | |
840 | #include <linux/atalk.h> | |
841 | +#include <linux/blktrace_api.h> | |
842 | ||
843 | #include <net/sock.h> /* siocdevprivate_ioctl */ | |
844 | #include <net/bluetooth/bluetooth.h> | |
845 | diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h | |
846 | index 860e7a4..266ce9d 100644 | |
847 | --- a/include/linux/blkdev.h | |
848 | +++ b/include/linux/blkdev.h | |
849 | @@ -22,6 +22,7 @@ typedef struct request_queue request_que | |
850 | struct elevator_queue; | |
851 | typedef struct elevator_queue elevator_t; | |
852 | struct request_pm_state; | |
853 | +struct blk_trace; | |
854 | ||
855 | #define BLKDEV_MIN_RQ 4 | |
856 | #define BLKDEV_MAX_RQ 128 /* Default maximum */ | |
857 | @@ -416,6 +417,8 @@ struct request_queue | |
858 | unsigned int sg_reserved_size; | |
859 | int node; | |
860 | ||
861 | + struct blk_trace *blk_trace; | |
862 | + | |
863 | /* | |
864 | * reserved for flush operations | |
865 | */ | |
866 | diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h | |
867 | new file mode 100644 | |
62fb68f5 | 868 | index 0000000..fca40ef |
158f0231 JA |
869 | --- /dev/null |
870 | +++ b/include/linux/blktrace_api.h | |
e62a6470 | 871 | @@ -0,0 +1,277 @@ |
158f0231 JA |
872 | +#ifndef BLKTRACE_H |
873 | +#define BLKTRACE_H | |
874 | + | |
875 | +#include <linux/config.h> | |
876 | +#include <linux/blkdev.h> | |
877 | +#include <linux/relayfs_fs.h> | |
878 | + | |
879 | +/* | |
880 | + * Trace categories | |
881 | + */ | |
882 | +enum blktrace_cat { | |
883 | + BLK_TC_READ = 1 << 0, /* reads */ | |
884 | + BLK_TC_WRITE = 1 << 1, /* writes */ | |
885 | + BLK_TC_BARRIER = 1 << 2, /* barrier */ | |
886 | + BLK_TC_SYNC = 1 << 3, /* sync */ | |
887 | + BLK_TC_QUEUE = 1 << 4, /* queueing/merging */ | |
888 | + BLK_TC_REQUEUE = 1 << 5, /* requeueing */ | |
889 | + BLK_TC_ISSUE = 1 << 6, /* issue */ | |
890 | + BLK_TC_COMPLETE = 1 << 7, /* completions */ | |
891 | + BLK_TC_FS = 1 << 8, /* fs requests */ | |
892 | + BLK_TC_PC = 1 << 9, /* pc requests */ | |
62fb68f5 | 893 | + BLK_TC_NOTIFY = 1 << 10, /* special message */ |
158f0231 JA |
894 | + |
895 | + BLK_TC_END = 1 << 15, /* only 16-bits, reminder */ | |
896 | +}; | |
897 | + | |
898 | +#define BLK_TC_SHIFT (16) | |
899 | +#define BLK_TC_ACT(act) ((act) << BLK_TC_SHIFT) | |
900 | + | |
901 | +/* | |
902 | + * Basic trace actions | |
903 | + */ | |
904 | +enum blktrace_act { | |
905 | + __BLK_TA_QUEUE = 1, /* queued */ | |
906 | + __BLK_TA_BACKMERGE, /* back merged to existing rq */ | |
907 | + __BLK_TA_FRONTMERGE, /* front merge to existing rq */ | |
908 | + __BLK_TA_GETRQ, /* allocated new request */ | |
909 | + __BLK_TA_SLEEPRQ, /* sleeping on rq allocation */ | |
910 | + __BLK_TA_REQUEUE, /* request requeued */ | |
911 | + __BLK_TA_ISSUE, /* sent to driver */ | |
912 | + __BLK_TA_COMPLETE, /* completed by driver */ | |
913 | + __BLK_TA_PLUG, /* queue was plugged */ | |
914 | + __BLK_TA_UNPLUG_IO, /* queue was unplugged by io */ | |
915 | + __BLK_TA_UNPLUG_TIMER, /* queue was unplugged by timer */ | |
916 | + __BLK_TA_INSERT, /* insert request */ | |
917 | + __BLK_TA_SPLIT, /* bio was split */ | |
918 | + __BLK_TA_BOUNCE, /* bio was bounced */ | |
919 | + __BLK_TA_REMAP, /* bio was remapped */ | |
920 | +}; | |
921 | + | |
922 | +/* | |
923 | + * Trace actions in full. Additionally, read or write is masked | |
924 | + */ | |
925 | +#define BLK_TA_QUEUE (__BLK_TA_QUEUE | BLK_TC_ACT(BLK_TC_QUEUE)) | |
926 | +#define BLK_TA_BACKMERGE (__BLK_TA_BACKMERGE | BLK_TC_ACT(BLK_TC_QUEUE)) | |
927 | +#define BLK_TA_FRONTMERGE (__BLK_TA_FRONTMERGE | BLK_TC_ACT(BLK_TC_QUEUE)) | |
928 | +#define BLK_TA_GETRQ (__BLK_TA_GETRQ | BLK_TC_ACT(BLK_TC_QUEUE)) | |
929 | +#define BLK_TA_SLEEPRQ (__BLK_TA_SLEEPRQ | BLK_TC_ACT(BLK_TC_QUEUE)) | |
930 | +#define BLK_TA_REQUEUE (__BLK_TA_REQUEUE | BLK_TC_ACT(BLK_TC_REQUEUE)) | |
931 | +#define BLK_TA_ISSUE (__BLK_TA_ISSUE | BLK_TC_ACT(BLK_TC_ISSUE)) | |
932 | +#define BLK_TA_COMPLETE (__BLK_TA_COMPLETE| BLK_TC_ACT(BLK_TC_COMPLETE)) | |
933 | +#define BLK_TA_PLUG (__BLK_TA_PLUG | BLK_TC_ACT(BLK_TC_QUEUE)) | |
934 | +#define BLK_TA_UNPLUG_IO (__BLK_TA_UNPLUG_IO | BLK_TC_ACT(BLK_TC_QUEUE)) | |
935 | +#define BLK_TA_UNPLUG_TIMER (__BLK_TA_UNPLUG_TIMER | BLK_TC_ACT(BLK_TC_QUEUE)) | |
936 | +#define BLK_TA_INSERT (__BLK_TA_INSERT | BLK_TC_ACT(BLK_TC_QUEUE)) | |
937 | +#define BLK_TA_SPLIT (__BLK_TA_SPLIT) | |
938 | +#define BLK_TA_BOUNCE (__BLK_TA_BOUNCE) | |
939 | +#define BLK_TA_REMAP (__BLK_TA_REMAP | BLK_TC_ACT(BLK_TC_QUEUE)) | |
940 | + | |
941 | +#define BLK_IO_TRACE_MAGIC 0x65617400 | |
62fb68f5 | 942 | +#define BLK_IO_TRACE_VERSION 0x07 |
158f0231 JA |
943 | + |
944 | +/* | |
945 | + * The trace itself | |
946 | + */ | |
947 | +struct blk_io_trace { | |
948 | + u32 magic; /* MAGIC << 8 | version */ | |
949 | + u32 sequence; /* event number */ | |
950 | + u64 time; /* in microseconds */ | |
951 | + u64 sector; /* disk offset */ | |
952 | + u32 bytes; /* transfer length */ | |
953 | + u32 action; /* what happened */ | |
954 | + u32 pid; /* who did it */ | |
62fb68f5 | 955 | + u32 device; /* device number */ |
158f0231 JA |
956 | + u32 cpu; /* on what cpu did it happen */ |
957 | + u16 error; /* completion error */ | |
958 | + u16 pdu_len; /* length of data after this trace */ | |
158f0231 JA |
959 | +}; |
960 | + | |
961 | +/* | |
962 | + * The remap event | |
963 | + */ | |
964 | +struct blk_io_trace_remap { | |
965 | + u32 device; | |
966 | + u32 __pad; | |
967 | + u64 sector; | |
968 | +}; | |
969 | + | |
e62a6470 JA |
970 | +enum { |
971 | + Blktrace_setup = 1, | |
972 | + Blktrace_running, | |
973 | + Blktrace_stopped, | |
974 | +}; | |
975 | + | |
158f0231 | 976 | +struct blk_trace { |
e62a6470 | 977 | + int trace_state; |
158f0231 JA |
978 | + struct dentry *dir; |
979 | + struct rchan *rchan; | |
980 | + struct dentry *dropped_file; | |
981 | + atomic_t dropped; | |
982 | + unsigned long *sequence; | |
983 | + u32 dev; | |
984 | + u16 act_mask; | |
985 | + u64 start_lba; | |
986 | + u64 end_lba; | |
987 | + u32 pid; | |
988 | +}; | |
989 | + | |
990 | +/* | |
e62a6470 | 991 | + * User setup structure passed with BLKTRACESTART |
158f0231 JA |
992 | + */ |
993 | +struct blk_user_trace_setup { | |
994 | + char name[BDEVNAME_SIZE]; /* output */ | |
995 | + u16 act_mask; /* input */ | |
996 | + u32 buf_size; /* input */ | |
997 | + u32 buf_nr; /* input */ | |
998 | + u64 start_lba; | |
999 | + u64 end_lba; | |
1000 | + u32 pid; | |
1001 | +}; | |
1002 | + | |
1003 | +#if defined(CONFIG_BLK_DEV_IO_TRACE) | |
e62a6470 JA |
1004 | +extern int blk_trace_ioctl(struct block_device *, unsigned, char __user *); |
1005 | +extern void blk_trace_shutdown(request_queue_t *); | |
158f0231 JA |
1006 | +extern void __blk_add_trace(struct blk_trace *, sector_t, int, int, u32, int, int, void *); |
1007 | + | |
1008 | +/** | |
1009 | + * blk_add_trace_rq - Add a trace for a request oriented action | |
1010 | + * @q: queue the io is for | |
1011 | + * @rq: the source request | |
1012 | + * @what: the action | |
1013 | + * | |
1014 | + * Description: | |
1015 | + * Records an action against a request. Will log the bio offset + size. | |
1016 | + * | |
1017 | + **/ | |
1018 | +static inline void blk_add_trace_rq(struct request_queue *q, struct request *rq, | |
1019 | + u32 what) | |
1020 | +{ | |
1021 | + struct blk_trace *bt = q->blk_trace; | |
1022 | + int rw = rq->flags & 0x07; | |
1023 | + | |
1024 | + if (likely(!bt)) | |
1025 | + return; | |
1026 | + | |
1027 | + if (blk_pc_request(rq)) { | |
1028 | + what |= BLK_TC_ACT(BLK_TC_PC); | |
1029 | + __blk_add_trace(bt, 0, rq->data_len, rw, what, rq->errors, sizeof(rq->cmd), rq->cmd); | |
1030 | + } else { | |
1031 | + what |= BLK_TC_ACT(BLK_TC_FS); | |
1032 | + __blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9, rw, what, rq->errors, 0, NULL); | |
1033 | + } | |
1034 | +} | |
1035 | + | |
1036 | +/** | |
1037 | + * blk_add_trace_bio - Add a trace for a bio oriented action | |
1038 | + * @q: queue the io is for | |
1039 | + * @bio: the source bio | |
1040 | + * @what: the action | |
1041 | + * | |
1042 | + * Description: | |
1043 | + * Records an action against a bio. Will log the bio offset + size. | |
1044 | + * | |
1045 | + **/ | |
1046 | +static inline void blk_add_trace_bio(struct request_queue *q, struct bio *bio, | |
1047 | + u32 what) | |
1048 | +{ | |
1049 | + struct blk_trace *bt = q->blk_trace; | |
1050 | + | |
1051 | + if (likely(!bt)) | |
1052 | + return; | |
1053 | + | |
1054 | + __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, !bio_flagged(bio, BIO_UPTODATE), 0, NULL); | |
1055 | +} | |
1056 | + | |
1057 | +/** | |
1058 | + * blk_add_trace_generic - Add a trace for a generic action | |
1059 | + * @q: queue the io is for | |
1060 | + * @bio: the source bio | |
1061 | + * @rw: the data direction | |
1062 | + * @what: the action | |
1063 | + * | |
1064 | + * Description: | |
1065 | + * Records a simple trace | |
1066 | + * | |
1067 | + **/ | |
1068 | +static inline void blk_add_trace_generic(struct request_queue *q, | |
1069 | + struct bio *bio, int rw, u32 what) | |
1070 | +{ | |
1071 | + struct blk_trace *bt = q->blk_trace; | |
1072 | + | |
1073 | + if (likely(!bt)) | |
1074 | + return; | |
1075 | + | |
1076 | + if (bio) | |
1077 | + blk_add_trace_bio(q, bio, what); | |
1078 | + else | |
1079 | + __blk_add_trace(bt, 0, 0, rw, what, 0, 0, NULL); | |
1080 | +} | |
1081 | + | |
1082 | +/** | |
1083 | + * blk_add_trace_pdu_int - Add a trace for a bio with an integer payload | |
1084 | + * @q: queue the io is for | |
1085 | + * @what: the action | |
1086 | + * @bio: the source bio | |
1087 | + * @pdu: the integer payload | |
1088 | + * | |
1089 | + * Description: | |
1090 | + * Adds a trace with some integer payload. This might be an unplug | |
1091 | + * option given as the action, with the depth at unplug time given | |
1092 | + * as the payload | |
1093 | + * | |
1094 | + **/ | |
1095 | +static inline void blk_add_trace_pdu_int(struct request_queue *q, u32 what, | |
1096 | + struct bio *bio, unsigned int pdu) | |
1097 | +{ | |
1098 | + struct blk_trace *bt = q->blk_trace; | |
1099 | + u64 rpdu = cpu_to_be64(pdu); | |
1100 | + | |
1101 | + if (likely(!bt)) | |
1102 | + return; | |
1103 | + | |
1104 | + if (bio) | |
1105 | + __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, !bio_flagged(bio, BIO_UPTODATE), sizeof(rpdu), &rpdu); | |
1106 | + else | |
1107 | + __blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu); | |
1108 | +} | |
1109 | + | |
1110 | +/** | |
1111 | + * blk_add_trace_remap - Add a trace for a remap operation | |
1112 | + * @q: queue the io is for | |
1113 | + * @bio: the source bio | |
1114 | + * @dev: target device | |
1115 | + * @from: source sector | |
1116 | + * @to: target sector | |
1117 | + * | |
1118 | + * Description: | |
1119 | + * Device mapper or raid target sometimes need to split a bio because | |
1120 | + * it spans a stripe (or similar). Add a trace for that action. | |
1121 | + * | |
1122 | + **/ | |
1123 | +static inline void blk_add_trace_remap(struct request_queue *q, struct bio *bio, | |
1124 | + dev_t dev, sector_t from, sector_t to) | |
1125 | +{ | |
1126 | + struct blk_trace *bt = q->blk_trace; | |
1127 | + struct blk_io_trace_remap r; | |
1128 | + | |
1129 | + if (likely(!bt)) | |
1130 | + return; | |
1131 | + | |
1132 | + r.device = cpu_to_be32(dev); | |
1133 | + r.sector = cpu_to_be64(to); | |
1134 | + | |
1135 | + __blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP, !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r); | |
1136 | +} | |
1137 | + | |
1138 | +#else /* !CONFIG_BLK_DEV_IO_TRACE */ | |
e62a6470 JA |
1139 | +#define blk_trace_ioctl(bdev, cmd, arg) (-ENOTTY) |
1140 | +#define blk_trace_shutdown(q) do { } while (0) | |
158f0231 JA |
1141 | +#define blk_add_trace_rq(q, rq, what) do { } while (0) |
1142 | +#define blk_add_trace_bio(q, rq, what) do { } while (0) | |
1143 | +#define blk_add_trace_generic(q, rq, rw, what) do { } while (0) | |
1144 | +#define blk_add_trace_pdu_int(q, what, bio, pdu) do { } while (0) | |
1145 | +#define blk_add_trace_remap(q, bio, dev, f, t) do {} while (0) | |
1146 | +#endif /* CONFIG_BLK_DEV_IO_TRACE */ | |
1147 | + | |
1148 | +#endif | |
1149 | diff --git a/include/linux/compat_ioctl.h b/include/linux/compat_ioctl.h | |
e62a6470 | 1150 | index 8fad50f..a3ed64b 100644 |
158f0231 JA |
1151 | --- a/include/linux/compat_ioctl.h |
1152 | +++ b/include/linux/compat_ioctl.h | |
e62a6470 | 1153 | @@ -97,6 +97,10 @@ COMPATIBLE_IOCTL(BLKRRPART) |
158f0231 JA |
1154 | COMPATIBLE_IOCTL(BLKFLSBUF) |
1155 | COMPATIBLE_IOCTL(BLKSECTSET) | |
1156 | COMPATIBLE_IOCTL(BLKSSZGET) | |
e62a6470 JA |
1157 | +COMPATIBLE_IOCTL(BLKTRACESTART) |
1158 | +COMPATIBLE_IOCTL(BLKTRACESTOP) | |
1159 | +COMPATIBLE_IOCTL(BLKTRACESETUP) | |
1160 | +COMPATIBLE_IOCTL(BLKTRACETEARDOWN) | |
158f0231 JA |
1161 | ULONG_IOCTL(BLKRASET) |
1162 | ULONG_IOCTL(BLKFRASET) | |
1163 | /* RAID */ | |
1164 | diff --git a/include/linux/fs.h b/include/linux/fs.h | |
e62a6470 | 1165 | index e059da9..c7a63cd 100644 |
158f0231 JA |
1166 | --- a/include/linux/fs.h |
1167 | +++ b/include/linux/fs.h | |
e62a6470 | 1168 | @@ -196,6 +196,10 @@ extern int dir_notify_enable; |
158f0231 JA |
1169 | #define BLKBSZGET _IOR(0x12,112,size_t) |
1170 | #define BLKBSZSET _IOW(0x12,113,size_t) | |
1171 | #define BLKGETSIZE64 _IOR(0x12,114,size_t) /* return device size in bytes (u64 *arg) */ | |
e62a6470 JA |
1172 | +#define BLKTRACESETUP _IOWR(0x12,115,struct blk_user_trace_setup) |
1173 | +#define BLKTRACESTART _IO(0x12,116) | |
1174 | +#define BLKTRACESTOP _IO(0x12,117) | |
1175 | +#define BLKTRACETEARDOWN _IO(0x12,118) | |
158f0231 JA |
1176 | |
1177 | #define BMAP_IOCTL 1 /* obsolete - kept for compatibility */ | |
1178 | #define FIBMAP _IO(0x00,1) /* bmap access */ | |
62fb68f5 | 1179 | diff --git a/include/linux/sched.h b/include/linux/sched.h |
57c9f4e8 | 1180 | index 0cfcd1c..69cd119 100644 |
62fb68f5 JA |
1181 | --- a/include/linux/sched.h |
1182 | +++ b/include/linux/sched.h | |
57c9f4e8 JA |
1183 | @@ -708,6 +708,7 @@ struct task_struct { |
1184 | prio_array_t *array; | |
62fb68f5 | 1185 | |
57c9f4e8 JA |
1186 | unsigned short ioprio; |
1187 | + unsigned int btrace_seq; | |
1188 | ||
1189 | unsigned long sleep_avg; | |
1190 | unsigned long long timestamp, last_ran; | |
1191 | diff --git a/kernel/fork.c b/kernel/fork.c | |
1192 | index 8e88b37..60f838f 100644 | |
1193 | --- a/kernel/fork.c | |
1194 | +++ b/kernel/fork.c | |
1195 | @@ -179,6 +179,7 @@ static struct task_struct *dup_task_stru | |
1196 | /* One for us, one for whoever does the "release_task()" (usually parent) */ | |
1197 | atomic_set(&tsk->usage,2); | |
1198 | atomic_set(&tsk->fs_excl, 0); | |
1199 | + tsk->btrace_seq = 0; | |
1200 | return tsk; | |
1201 | } | |
62fb68f5 | 1202 | |
158f0231 JA |
1203 | diff --git a/mm/highmem.c b/mm/highmem.c |
1204 | index ce2e7e8..d0ea1ee 100644 | |
1205 | --- a/mm/highmem.c | |
1206 | +++ b/mm/highmem.c | |
1207 | @@ -26,6 +26,7 @@ | |
1208 | #include <linux/init.h> | |
1209 | #include <linux/hash.h> | |
1210 | #include <linux/highmem.h> | |
1211 | +#include <linux/blktrace_api.h> | |
1212 | #include <asm/tlbflush.h> | |
1213 | ||
1214 | static mempool_t *page_pool, *isa_page_pool; | |
1215 | @@ -483,6 +484,8 @@ void blk_queue_bounce(request_queue_t *q | |
1216 | pool = isa_page_pool; | |
1217 | } | |
1218 | ||
1219 | + blk_add_trace_bio(q, *bio_orig, BLK_TA_BOUNCE); | |
1220 | + | |
1221 | /* | |
1222 | * slow path | |
1223 | */ |