[PATCH] blktrace: add warning about increasing buffer size with dropped events
[blktrace.git] / kernel / blk-trace-2.6.15-git-P0

diff --git a/block/Kconfig b/block/Kconfig
index 377f6dd..27eaed9 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -11,4 +11,15 @@ config LBD
 	  your machine, or if you want to have a raid or loopback device
 	  bigger than 2TB. Otherwise say N.
 
+config BLK_DEV_IO_TRACE
+	bool "Support for tracing block io actions"
+	select RELAYFS_FS
+	help
+	  Say Y here, if you want to be able to trace the block layer actions
+	  on a given queue. Tracing allows you to see any traffic happening
+	  on a block device queue. For more information (and the user space
+	  support tools needed), fetch the blktrace app from:
+
+	  git://brick.kernel.dk/data/git/blktrace.git
+
 source block/Kconfig.iosched
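
With the option in place, enabling tracing needs nothing beyond the usual configuration step; a minimal sketch of the .config fragment a tracing-enabled build ends up with (RELAYFS_FS comes from the select above):

	CONFIG_RELAYFS_FS=y
	CONFIG_BLK_DEV_IO_TRACE=y
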
diff --git a/block/Makefile b/block/Makefile
index 7e4f93e..c05de0e 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -8,3 +8,5 @@ obj-$(CONFIG_IOSCHED_NOOP) += noop-iosch
 obj-$(CONFIG_IOSCHED_AS)	+= as-iosched.o
 obj-$(CONFIG_IOSCHED_DEADLINE)	+= deadline-iosched.o
 obj-$(CONFIG_IOSCHED_CFQ)	+= cfq-iosched.o
+
+obj-$(CONFIG_BLK_DEV_IO_TRACE)	+= blktrace.o
diff --git a/block/blktrace.c b/block/blktrace.c
new file mode 100644
index 0000000..c66d074
--- /dev/null
+++ b/block/blktrace.c
@@ -0,0 +1,257 @@
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/blkdev.h>
+#include <linux/blktrace_api.h>
+#include <linux/percpu.h>
+#include <linux/init.h>
+#include <linux/mutex.h>
+#include <asm/uaccess.h>
+
+static DEFINE_PER_CPU(unsigned long long, blk_trace_cpu_offset) = { 0, };
+
+void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
+		     int rw, u32 what, int error, int pdu_len, void *pdu_data)
+{
+	struct blk_io_trace t;
+	unsigned long flags;
+	pid_t pid;
+	int cpu;
+
+	if (rw & (1 << BIO_RW_BARRIER))
+		what |= BLK_TC_ACT(BLK_TC_BARRIER);
+	if (rw & (1 << BIO_RW_SYNC))
+		what |= BLK_TC_ACT(BLK_TC_SYNC);
+
+	if (rw & WRITE)
+		what |= BLK_TC_ACT(BLK_TC_WRITE);
+	else
+		what |= BLK_TC_ACT(BLK_TC_READ);
+
+	if (((bt->act_mask << BLK_TC_SHIFT) & what) == 0)
+		return;
+	if (sector < bt->start_lba || sector > bt->end_lba)
+		return;
+
+	pid = current->pid;
+	if (bt->pid && pid != bt->pid)
+		return;
+
+	t.magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
+
+	t.device = bt->dev;
+	t.sector = sector;
+	t.bytes = bytes;
+	t.action = what;
+	t.error = error;
+	t.pdu_len = pdu_len;
+
+	t.pid = pid;
+	memcpy(t.comm, current->comm, sizeof(t.comm));
+
+	/*
+	 * need to serialize this part on the local processor to prevent
+	 * interrupts from messing with the sequence <-> time relation
+	 */
+	local_irq_save(flags);
+
+	t.sequence = atomic_add_return(1, &bt->sequence);
+
+	cpu = smp_processor_id();
+	t.cpu = cpu;
+	t.time = sched_clock() - per_cpu(blk_trace_cpu_offset, cpu);
+
+	__relay_write(bt->rchan, &t, sizeof(t));
+	if (pdu_len)
+		__relay_write(bt->rchan, pdu_data, pdu_len);
+
+	local_irq_restore(flags);
+}
+
+EXPORT_SYMBOL_GPL(__blk_add_trace);
+
+static struct dentry *blk_tree_root;
+static struct mutex blk_tree_mutex;
+
+static inline void blk_remove_root(void)
+{
+	if (relayfs_remove_dir(blk_tree_root) != -ENOTEMPTY)
+		blk_tree_root = NULL;
+}
+
+static void blk_remove_tree(struct dentry *dir)
+{
+	mutex_lock(&blk_tree_mutex);
+	relayfs_remove_dir(dir);
+	blk_remove_root();
+	mutex_unlock(&blk_tree_mutex);
+}
+
+static struct dentry *blk_create_tree(const char *blk_name)
+{
+	struct dentry *dir = NULL;
+
+	mutex_lock(&blk_tree_mutex);
+
+	if (!blk_tree_root) {
+		blk_tree_root = relayfs_create_dir("block", NULL);
+		if (!blk_tree_root)
+			goto err;
+	}
+
+	dir = relayfs_create_dir(blk_name, blk_tree_root);
+	if (!dir)
+		blk_remove_root();
+
+err:
+	mutex_unlock(&blk_tree_mutex);
+	return dir;
+}
+
+void blk_cleanup_trace(struct blk_trace *bt)
+{
+	relay_close(bt->rchan);
+	blk_remove_tree(bt->dir);
+	kfree(bt);
+}
+
+int blk_stop_trace(struct block_device *bdev)
+{
+	request_queue_t *q = bdev_get_queue(bdev);
+	struct blk_trace *bt = NULL;
+	int ret = -EINVAL;
+
+	if (!q)
+		return -ENXIO;
+
+	down(&bdev->bd_sem);
+
+	if (q->blk_trace) {
+		bt = q->blk_trace;
+		q->blk_trace = NULL;
+		ret = 0;
+	}
+
+	up(&bdev->bd_sem);
+
+	if (bt)
+		blk_cleanup_trace(bt);
+
+	return ret;
+}
+
+int blk_start_trace(struct block_device *bdev, char __user *arg)
+{
+	request_queue_t *q = bdev_get_queue(bdev);
+	struct blk_user_trace_setup buts;
+	struct blk_trace *bt = NULL;
+	struct dentry *dir = NULL;
+	char b[BDEVNAME_SIZE];
+	int ret, i;
+
+	if (!q)
+		return -ENXIO;
+
+	if (copy_from_user(&buts, arg, sizeof(buts)))
+		return -EFAULT;
+
+	if (!buts.buf_size || !buts.buf_nr)
+		return -EINVAL;
+
+	strcpy(buts.name, bdevname(bdev, b));
+
+	/*
+	 * some device names have larger paths - convert the slashes
+	 * to underscores for this to work as expected
+	 */
+	for (i = 0; i < strlen(buts.name); i++)
+		if (buts.name[i] == '/')
+			buts.name[i] = '_';
+
+	if (copy_to_user(arg, &buts, sizeof(buts)))
+		return -EFAULT;
+
+	down(&bdev->bd_sem);
+	ret = -EBUSY;
+	if (q->blk_trace)
+		goto err;
+
+	ret = -ENOMEM;
+	bt = kmalloc(sizeof(*bt), GFP_KERNEL);
+	if (!bt)
+		goto err;
+
+	ret = -ENOENT;
+	dir = blk_create_tree(buts.name);
+	if (!dir)
+		goto err;
+
+	bt->dir = dir;
+	bt->dev = bdev->bd_dev;
+	atomic_set(&bt->sequence, 0);
+
+	ret = -EIO;
+	bt->rchan = relay_open("trace", dir, buts.buf_size, buts.buf_nr, NULL);
+	if (!bt->rchan)
+		goto err;
+
+	bt->act_mask = buts.act_mask;
+	if (!bt->act_mask)
+		bt->act_mask = (u16) -1;
+
+	bt->start_lba = buts.start_lba;
+	bt->end_lba = buts.end_lba;
+	if (!bt->end_lba)
+		bt->end_lba = -1ULL;
+
+	bt->pid = buts.pid;
+
+	q->blk_trace = bt;
+	up(&bdev->bd_sem);
+	return 0;
+err:
+	up(&bdev->bd_sem);
+	if (dir)
+		blk_remove_tree(dir);
+	if (bt)
+		kfree(bt);
+	return ret;
+}
+
+static void blk_trace_check_cpu_time(void *data)
+{
+	unsigned long long a, b, *t;
+	struct timeval tv;
+	int cpu = get_cpu();
+
+	t = &per_cpu(blk_trace_cpu_offset, cpu);
+
+	a = sched_clock();
+	do_gettimeofday(&tv);
+	b = sched_clock();
+
+	/* cast avoids 32-bit overflow of the seconds-to-nanoseconds multiply */
+	*t = (unsigned long long) tv.tv_sec * 1000000000 + tv.tv_usec * 1000;
+	*t -= (a + b) / 2;
+	put_cpu();
+}
+
+static int blk_trace_calibrate_offsets(void)
+{
+	unsigned long flags;
+
+	smp_call_function(blk_trace_check_cpu_time, NULL, 1, 1);
+	local_irq_save(flags);
+	blk_trace_check_cpu_time(NULL);
+	local_irq_restore(flags);
+
+	return 0;
+}
+
+static __init int blk_trace_init(void)
+{
+	mutex_init(&blk_tree_mutex);
+
+	return blk_trace_calibrate_offsets();
+}
+
+module_init(blk_trace_init);
+
diff --git a/block/elevator.c b/block/elevator.c
index 99a4d7b..e0b8179 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -33,6 +33,7 @@
 #include <linux/init.h>
 #include <linux/compiler.h>
 #include <linux/delay.h>
+#include <linux/blktrace_api.h>
 
 #include <asm/uaccess.h>
 
@@ -326,6 +327,8 @@ void __elv_add_request(request_queue_t *
 	struct list_head *pos;
 	unsigned ordseq;
 
+	blk_add_trace_rq(q, rq, BLK_TA_INSERT);
+
 	if (q->ordcolor)
 		rq->flags |= REQ_ORDERED_COLOR;
 
@@ -486,6 +489,8 @@ struct request *elv_next_request(request
 			rq->flags |= REQ_STARTED;
 		}
 
+		blk_add_trace_rq(q, rq, BLK_TA_ISSUE);
+
 		if (!q->boundary_rq || q->boundary_rq == rq) {
 			q->end_sector = rq_end_sector(rq);
 			q->boundary_rq = NULL;
diff --git a/block/ioctl.c b/block/ioctl.c
index e110949..63e67a2 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -5,6 +5,7 @@
 #include <linux/backing-dev.h>
 #include <linux/buffer_head.h>
 #include <linux/smp_lock.h>
+#include <linux/blktrace_api.h>
 #include <asm/uaccess.h>
 
 static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user *arg)
@@ -189,6 +190,10 @@ static int blkdev_locked_ioctl(struct fi
 		return put_ulong(arg, bdev->bd_inode->i_size >> 9);
 	case BLKGETSIZE64:
 		return put_u64(arg, bdev->bd_inode->i_size);
+	case BLKSTARTTRACE:
+		return blk_start_trace(bdev, (char __user *) arg);
+	case BLKSTOPTRACE:
+		return blk_stop_trace(bdev);
 	}
 	return -ENOIOCTLCMD;
 }
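
With the two commands wired into blkdev_locked_ioctl(), a user space tool can start and stop tracing on an open block device. The sketch below is a minimal, hypothetical caller, not the real tooling (that lives in the blktrace.git repository named in the Kconfig help): it assumes the blk_user_trace_setup layout and the 0x12,115/116 ioctl numbers have been copied out of the kernel headers added by this patch, since nothing here exports them to user space, and it picks arbitrary relay buffer sizes.

/*
 * Hypothetical BLKSTARTTRACE/BLKSTOPTRACE caller. Struct layout and
 * ioctl numbers are copied from the kernel headers added by this patch.
 */
#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/types.h>

#define BDEVNAME_SIZE	32		/* matches the kernel's value */

struct blk_user_trace_setup {
	char name[BDEVNAME_SIZE];	/* output: relayfs directory name */
	__u16 act_mask;			/* input: 0 means "all actions" */
	__u32 buf_size;			/* input: bytes per relay sub-buffer */
	__u32 buf_nr;			/* input: number of sub-buffers */
	__u64 start_lba;
	__u64 end_lba;
	__u32 pid;
};

#define BLKSTARTTRACE	_IOWR(0x12, 115, struct blk_user_trace_setup)
#define BLKSTOPTRACE	_IO(0x12, 116)

int main(int argc, char *argv[])
{
	struct blk_user_trace_setup buts;
	int fd;

	if (argc < 2 || (fd = open(argv[1], O_RDONLY)) < 0)	/* e.g. /dev/hda */
		return 1;

	memset(&buts, 0, sizeof(buts));
	buts.buf_size = 128 * 1024;	/* arbitrary demo sizing */
	buts.buf_nr = 4;

	if (ioctl(fd, BLKSTARTTRACE, &buts) < 0)
		return 1;

	/* per-cpu data appears under the relayfs mount: block/<name>/trace<cpu> */
	printf("tracing %s, relay directory block/%s\n", argv[1], buts.name);
	sleep(5);

	ioctl(fd, BLKSTOPTRACE);
	close(fd);
	return 0;
}
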
diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c
index 8e27d0a..bfcde0f 100644
--- a/block/ll_rw_blk.c
+++ b/block/ll_rw_blk.c
@@ -28,6 +28,7 @@
 #include <linux/writeback.h>
 #include <linux/interrupt.h>
 #include <linux/cpu.h>
+#include <linux/blktrace_api.h>
 
 /*
  * for max sense size
@@ -1555,8 +1556,10 @@ void blk_plug_device(request_queue_t *q)
 	if (test_bit(QUEUE_FLAG_STOPPED, &q->queue_flags))
 		return;
 
-	if (!test_and_set_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags))
+	if (!test_and_set_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags)) {
 		mod_timer(&q->unplug_timer, jiffies + q->unplug_delay);
+		blk_add_trace_generic(q, NULL, 0, BLK_TA_PLUG);
+	}
 }
 
 EXPORT_SYMBOL(blk_plug_device);
@@ -1620,14 +1623,21 @@ static void blk_backing_dev_unplug(struc
 	/*
 	 * devices don't necessarily have an ->unplug_fn defined
 	 */
-	if (q->unplug_fn)
+	if (q->unplug_fn) {
+		blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_IO, NULL,
+				      q->rq.count[READ] + q->rq.count[WRITE]);
+
 		q->unplug_fn(q);
+	}
 }
 
 static void blk_unplug_work(void *data)
 {
 	request_queue_t *q = data;
 
+	blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_IO, NULL,
+			      q->rq.count[READ] + q->rq.count[WRITE]);
+
 	q->unplug_fn(q);
 }
 
@@ -1635,6 +1645,9 @@ static void blk_unplug_timeout(unsigned
 {
 	request_queue_t *q = (request_queue_t *)data;
 
+	blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_TIMER, NULL,
+			      q->rq.count[READ] + q->rq.count[WRITE]);
+
 	kblockd_schedule_work(&q->unplug_work);
 }
 
@@ -1757,6 +1770,11 @@ void blk_cleanup_queue(request_queue_t *
 	if (q->queue_tags)
 		__blk_queue_free_tags(q);
 
+	if (q->blk_trace) {
+		blk_cleanup_trace(q->blk_trace);
+		q->blk_trace = NULL;
+	}
+
 	kmem_cache_free(requestq_cachep, q);
 }
 
@@ -2108,6 +2126,8 @@ rq_starved:
 
 	rq_init(q, rq);
 	rq->rl = rl;
+
+	blk_add_trace_generic(q, bio, rw, BLK_TA_GETRQ);
 out:
 	return rq;
 }
@@ -2136,6 +2156,8 @@ static struct request *get_request_wait(
 		if (!rq) {
 			struct io_context *ioc;
 
+			blk_add_trace_generic(q, bio, rw, BLK_TA_SLEEPRQ);
+
 			__generic_unplug_device(q);
 			spin_unlock_irq(q->queue_lock);
 			io_schedule();
@@ -2189,6 +2211,8 @@ EXPORT_SYMBOL(blk_get_request);
  */
 void blk_requeue_request(request_queue_t *q, struct request *rq)
 {
+	blk_add_trace_rq(q, rq, BLK_TA_REQUEUE);
+
 	if (blk_rq_tagged(rq))
 		blk_queue_end_tag(q, rq);
 
@@ -2820,6 +2844,8 @@ static int __make_request(request_queue_
 			if (!q->back_merge_fn(q, req, bio))
 				break;
 
+			blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE);
+
 			req->biotail->bi_next = bio;
 			req->biotail = bio;
 			req->nr_sectors = req->hard_nr_sectors += nr_sectors;
@@ -2835,6 +2861,8 @@ static int __make_request(request_queue_
 			if (!q->front_merge_fn(q, req, bio))
 				break;
 
+			blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE);
+
 			bio->bi_next = req->bio;
 			req->bio = bio;
 
@@ -2952,6 +2980,7 @@ void generic_make_request(struct bio *bi
 	request_queue_t *q;
 	sector_t maxsector;
 	int ret, nr_sectors = bio_sectors(bio);
+	dev_t old_dev;
 
 	might_sleep();
 	/* Test device or partition size, when known. */
@@ -2978,6 +3007,8 @@ void generic_make_request(struct bio *bi
 	 * NOTE: we don't repeat the blk_size check for each new device.
 	 * Stacking drivers are expected to know what they are doing.
 	 */
+	maxsector = -1;
+	old_dev = 0;
 	do {
 		char b[BDEVNAME_SIZE];
 
@@ -3010,6 +3041,15 @@ end_io:
 		 */
 		blk_partition_remap(bio);
 
+		if (maxsector != -1)
+			blk_add_trace_remap(q, bio, old_dev, bio->bi_sector,
+					    maxsector);
+
+		blk_add_trace_bio(q, bio, BLK_TA_QUEUE);
+
+		maxsector = bio->bi_sector;
+		old_dev = bio->bi_bdev->bd_dev;
+
 		ret = q->make_request_fn(q, bio);
 	} while (ret);
 }
@@ -3129,6 +3169,8 @@ static int __end_that_request_first(stru
 	int total_bytes, bio_nbytes, error, next_idx = 0;
 	struct bio *bio;
 
+	blk_add_trace_rq(req->q, req, BLK_TA_COMPLETE);
+
 	/*
 	 * extend uptodate bool to allow < 0 value to be direct io error
 	 */
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 097d1e5..0411900 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -17,6 +17,7 @@
 #include <linux/mempool.h>
 #include <linux/slab.h>
 #include <linux/idr.h>
+#include <linux/blktrace_api.h>
 
 static const char *_name = DM_NAME;
 
@@ -303,6 +304,8 @@ static inline void dec_pending(struct dm
 		/* nudge anyone waiting on suspend queue */
 		wake_up(&io->md->wait);
 
+		blk_add_trace_bio(io->md->queue, io->bio, BLK_TA_COMPLETE);
+
 		bio_endio(io->bio, io->bio->bi_size, io->error);
 		free_io(io->md, io);
 	}
@@ -361,6 +364,7 @@ static void __map_bio(struct dm_target *
 		      struct target_io *tio)
 {
 	int r;
+	sector_t sector;
 
 	/*
 	 * Sanity checks.
@@ -376,10 +380,17 @@ static void __map_bio(struct dm_target *
 	 * this io.
 	 */
 	atomic_inc(&tio->io->io_count);
+	sector = clone->bi_sector;
 	r = ti->type->map(ti, clone, &tio->info);
-	if (r > 0)
+	if (r > 0) {
 		/* the bio has been remapped so dispatch it */
+
+		blk_add_trace_remap(bdev_get_queue(clone->bi_bdev), clone,
+				    tio->io->bio->bi_bdev->bd_dev, sector,
+				    clone->bi_sector);
+
 		generic_make_request(clone);
+	}
 
 	else if (r < 0) {
 		/* error the io and bail out */
diff --git a/fs/bio.c b/fs/bio.c
index 7b30695..c0cbbd4 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -25,6 +25,7 @@
 #include <linux/module.h>
 #include <linux/mempool.h>
 #include <linux/workqueue.h>
+#include <linux/blktrace_api.h>
 #include <scsi/sg.h>		/* for struct sg_iovec */
 
 #define BIO_POOL_SIZE 256
@@ -1094,6 +1095,9 @@ struct bio_pair *bio_split(struct bio *b
 	if (!bp)
 		return bp;
 
+	blk_add_trace_pdu_int(bdev_get_queue(bi->bi_bdev), BLK_TA_SPLIT, bi,
+			      bi->bi_sector + first_sectors);
+
 	BUG_ON(bi->bi_vcnt != 1);
 	BUG_ON(bi->bi_idx != 0);
 	atomic_set(&bp->cnt, 3);
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index f0b7256..67e901a 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -72,6 +72,7 @@
 #include <linux/i2c-dev.h>
 #include <linux/wireless.h>
 #include <linux/atalk.h>
+#include <linux/blktrace_api.h>
 
 #include <net/sock.h>		/* siocdevprivate_ioctl */
 #include <net/bluetooth/bluetooth.h>
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 02a585f..195c3b9 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -22,6 +22,7 @@ typedef struct request_queue request_que
 struct elevator_queue;
 typedef struct elevator_queue elevator_t;
 struct request_pm_state;
+struct blk_trace;
 
 #define BLKDEV_MIN_RQ	4
 #define BLKDEV_MAX_RQ	128	/* Default maximum */
@@ -416,6 +417,8 @@ struct request_queue
 	unsigned int		sg_reserved_size;
 	int			node;
 
+	struct blk_trace	*blk_trace;
+
 	/*
 	 * reserved for flush operations
 	 */
diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h
new file mode 100644
index 0000000..a37367f
--- /dev/null
+++ b/include/linux/blktrace_api.h
@@ -0,0 +1,213 @@
+#ifndef BLKTRACE_H
+#define BLKTRACE_H
+
+#include <linux/config.h>
+#include <linux/blkdev.h>
+#include <linux/relayfs_fs.h>
+
+/*
+ * Trace categories
+ */
+enum blktrace_cat {
+	BLK_TC_READ	= 1 << 0,	/* reads */
+	BLK_TC_WRITE	= 1 << 1,	/* writes */
+	BLK_TC_BARRIER	= 1 << 2,	/* barrier */
+	BLK_TC_SYNC	= 1 << 3,	/* sync */
+	BLK_TC_QUEUE	= 1 << 4,	/* queueing/merging */
+	BLK_TC_REQUEUE	= 1 << 5,	/* requeueing */
+	BLK_TC_ISSUE	= 1 << 6,	/* issue */
+	BLK_TC_COMPLETE	= 1 << 7,	/* completions */
+	BLK_TC_FS	= 1 << 8,	/* fs requests */
+	BLK_TC_PC	= 1 << 9,	/* pc requests */
+
+	BLK_TC_END	= 1 << 15,	/* only 16-bits, reminder */
+};
+
+#define BLK_TC_SHIFT		(16)
+#define BLK_TC_ACT(act)		((act) << BLK_TC_SHIFT)
+
+/*
+ * Basic trace actions
+ */
+enum blktrace_act {
+	__BLK_TA_QUEUE = 1,		/* queued */
+	__BLK_TA_BACKMERGE,		/* back merged to existing rq */
+	__BLK_TA_FRONTMERGE,		/* front merge to existing rq */
+	__BLK_TA_GETRQ,			/* allocated new request */
+	__BLK_TA_SLEEPRQ,		/* sleeping on rq allocation */
+	__BLK_TA_REQUEUE,		/* request requeued */
+	__BLK_TA_ISSUE,			/* sent to driver */
+	__BLK_TA_COMPLETE,		/* completed by driver */
+	__BLK_TA_PLUG,			/* queue was plugged */
+	__BLK_TA_UNPLUG_IO,		/* queue was unplugged by io */
+	__BLK_TA_UNPLUG_TIMER,		/* queue was unplugged by timer */
+	__BLK_TA_INSERT,		/* insert request */
+	__BLK_TA_SPLIT,			/* bio was split */
+	__BLK_TA_BOUNCE,		/* bio was bounced */
+	__BLK_TA_REMAP,			/* bio was remapped */
+};
+
+/*
+ * Trace actions in full. Additionally, read or write is masked
+ */
+#define BLK_TA_QUEUE		(__BLK_TA_QUEUE | BLK_TC_ACT(BLK_TC_QUEUE))
+#define BLK_TA_BACKMERGE	(__BLK_TA_BACKMERGE | BLK_TC_ACT(BLK_TC_QUEUE))
+#define BLK_TA_FRONTMERGE	(__BLK_TA_FRONTMERGE | BLK_TC_ACT(BLK_TC_QUEUE))
+#define BLK_TA_GETRQ		(__BLK_TA_GETRQ | BLK_TC_ACT(BLK_TC_QUEUE))
+#define BLK_TA_SLEEPRQ		(__BLK_TA_SLEEPRQ | BLK_TC_ACT(BLK_TC_QUEUE))
+#define BLK_TA_REQUEUE		(__BLK_TA_REQUEUE | BLK_TC_ACT(BLK_TC_REQUEUE))
+#define BLK_TA_ISSUE		(__BLK_TA_ISSUE | BLK_TC_ACT(BLK_TC_ISSUE))
+#define BLK_TA_COMPLETE		(__BLK_TA_COMPLETE | BLK_TC_ACT(BLK_TC_COMPLETE))
+#define BLK_TA_PLUG		(__BLK_TA_PLUG | BLK_TC_ACT(BLK_TC_QUEUE))
+#define BLK_TA_UNPLUG_IO	(__BLK_TA_UNPLUG_IO | BLK_TC_ACT(BLK_TC_QUEUE))
+#define BLK_TA_UNPLUG_TIMER	(__BLK_TA_UNPLUG_TIMER | BLK_TC_ACT(BLK_TC_QUEUE))
+#define BLK_TA_INSERT		(__BLK_TA_INSERT | BLK_TC_ACT(BLK_TC_QUEUE))
+#define BLK_TA_SPLIT		(__BLK_TA_SPLIT)
+#define BLK_TA_BOUNCE		(__BLK_TA_BOUNCE)
+#define BLK_TA_REMAP		(__BLK_TA_REMAP | BLK_TC_ACT(BLK_TC_QUEUE))
+
+#define BLK_IO_TRACE_MAGIC	0x65617400
+#define BLK_IO_TRACE_VERSION	0x05
+
+/*
+ * The trace itself
+ */
+struct blk_io_trace {
+	u32 magic;		/* MAGIC << 8 | version */
+	u32 sequence;		/* event number */
+	u64 time;		/* in nanoseconds */
+	u64 sector;		/* disk offset */
+	u32 bytes;		/* transfer length */
+	u32 action;		/* what happened */
+	u32 pid;		/* who did it */
+	u32 cpu;		/* on what cpu did it happen */
+	u16 error;		/* completion error */
+	u16 pdu_len;		/* length of data after this trace */
+	u32 device;		/* device number */
+	char comm[16];		/* task command name (TASK_COMM_LEN) */
+};
+
+/*
+ * The remap event
+ */
+struct blk_io_trace_remap {
+	u32 device;
+	u32 __pad;
+	u64 sector;
+};
+
+struct blk_trace {
+	struct dentry *dir;
+	struct rchan *rchan;
+	atomic_t sequence;
+	u32 dev;
+	u16 act_mask;
+	u64 start_lba;
+	u64 end_lba;
+	u32 pid;
+};
+
+/*
+ * User setup structure passed with BLKSTARTTRACE
+ */
+struct blk_user_trace_setup {
+	char name[BDEVNAME_SIZE];	/* output */
+	u16 act_mask;			/* input */
+	u32 buf_size;			/* input */
+	u32 buf_nr;			/* input */
+	u64 start_lba;
+	u64 end_lba;
+	u32 pid;
+};
+
+#if defined(CONFIG_BLK_DEV_IO_TRACE)
+extern int blk_start_trace(struct block_device *, char __user *);
+extern int blk_stop_trace(struct block_device *);
+extern void blk_cleanup_trace(struct blk_trace *);
+extern void __blk_add_trace(struct blk_trace *, sector_t, int, int, u32, int, int, void *);
+
+static inline void blk_add_trace_rq(struct request_queue *q, struct request *rq,
+				    u32 what)
+{
+	struct blk_trace *bt = q->blk_trace;
+	int rw = rq->flags & 0x07;
+
+	if (likely(!bt))
+		return;
+
+	if (blk_pc_request(rq)) {
+		what |= BLK_TC_ACT(BLK_TC_PC);
+		__blk_add_trace(bt, 0, rq->data_len, rw, what, rq->errors, sizeof(rq->cmd), rq->cmd);
+	} else {
+		what |= BLK_TC_ACT(BLK_TC_FS);
+		__blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9, rw, what, rq->errors, 0, NULL);
+	}
+}
+
+static inline void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
+				     u32 what)
+{
+	struct blk_trace *bt = q->blk_trace;
+
+	if (likely(!bt))
+		return;
+
+	__blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, !bio_flagged(bio, BIO_UPTODATE), 0, NULL);
+}
+
+static inline void blk_add_trace_generic(struct request_queue *q,
+					 struct bio *bio, int rw, u32 what)
+{
+	struct blk_trace *bt = q->blk_trace;
+
+	if (likely(!bt))
+		return;
+
+	if (bio)
+		blk_add_trace_bio(q, bio, what);
+	else
+		__blk_add_trace(bt, 0, 0, rw, what, 0, 0, NULL);
+}
+
+static inline void blk_add_trace_pdu_int(struct request_queue *q, u32 what,
+					 struct bio *bio, unsigned int pdu)
+{
+	struct blk_trace *bt = q->blk_trace;
+	u64 rpdu = cpu_to_be64(pdu);
+
+	if (likely(!bt))
+		return;
+
+	if (bio)
+		__blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, !bio_flagged(bio, BIO_UPTODATE), sizeof(rpdu), &rpdu);
+	else
+		__blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu);
+}
+
+static inline void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
+				       dev_t dev, sector_t from, sector_t to)
+{
+	struct blk_trace *bt = q->blk_trace;
+	struct blk_io_trace_remap r;
+
+	if (likely(!bt))
+		return;
+
+	r.device = cpu_to_be32(dev);
+	r.sector = cpu_to_be64(to);
+
+	__blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP, !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r);
+}
+
+#else /* !CONFIG_BLK_DEV_IO_TRACE */
+#define blk_start_trace(bdev, arg)		(-EINVAL)
+#define blk_stop_trace(bdev)			(-EINVAL)
+#define blk_cleanup_trace(bt)			do { } while (0)
+#define blk_add_trace_rq(q, rq, what)		do { } while (0)
+#define blk_add_trace_bio(q, rq, what)		do { } while (0)
+#define blk_add_trace_generic(q, rq, rw, what)	do { } while (0)
+#define blk_add_trace_pdu_int(q, what, bio, pdu)	do { } while (0)
+#define blk_add_trace_remap(q, bio, dev, f, t)	do { } while (0)
+#endif /* CONFIG_BLK_DEV_IO_TRACE */
+
+#endif
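
On the read side, each event that __blk_add_trace() emits is one fixed-size struct blk_io_trace immediately followed by pdu_len bytes of optional payload, so a decoder is a straight record loop. A minimal sketch follows, again with the structure copied out of this header (stdint types standing in for u32/u64) and assuming the reader runs on the machine that produced the data, so endianness and padding match:

#include <stdint.h>
#include <stdio.h>

/* copy of struct blk_io_trace from blktrace_api.h */
struct blk_io_trace {
	uint32_t magic;
	uint32_t sequence;
	uint64_t time;
	uint64_t sector;
	uint32_t bytes;
	uint32_t action;
	uint32_t pid;
	uint32_t cpu;
	uint16_t error;
	uint16_t pdu_len;
	uint32_t device;
	char comm[16];
};

#define BLK_IO_TRACE_MAGIC	0x65617400
#define BLK_TC_SHIFT		16

int main(int argc, char *argv[])
{
	struct blk_io_trace t;
	FILE *f;

	/* argv[1]: a per-cpu relay file, e.g. <relayfs mount>/block/hda/trace0 */
	if (argc < 2 || !(f = fopen(argv[1], "r")))
		return 1;

	while (fread(&t, sizeof(t), 1, f) == 1) {
		/* the version lives in the low byte of the magic */
		if ((t.magic & 0xffffff00) != BLK_IO_TRACE_MAGIC) {
			fprintf(stderr, "bad magic, stream out of sync\n");
			break;
		}
		printf("seq %u cpu %u act 0x%x sector %llu bytes %u %s\n",
		       t.sequence, t.cpu,
		       t.action & ((1 << BLK_TC_SHIFT) - 1),
		       (unsigned long long) t.sector, t.bytes, t.comm);
		/* skip the optional payload trailing this record */
		if (t.pdu_len && fseek(f, t.pdu_len, SEEK_CUR) < 0)
			break;
	}

	fclose(f);
	return 0;
}
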
diff --git a/include/linux/compat_ioctl.h b/include/linux/compat_ioctl.h
index 8fad50f..5bed09a 100644
--- a/include/linux/compat_ioctl.h
+++ b/include/linux/compat_ioctl.h
@@ -97,6 +97,8 @@ COMPATIBLE_IOCTL(BLKRRPART)
 COMPATIBLE_IOCTL(BLKFLSBUF)
 COMPATIBLE_IOCTL(BLKSECTSET)
 COMPATIBLE_IOCTL(BLKSSZGET)
+COMPATIBLE_IOCTL(BLKSTARTTRACE)
+COMPATIBLE_IOCTL(BLKSTOPTRACE)
 ULONG_IOCTL(BLKRASET)
 ULONG_IOCTL(BLKFRASET)
 /* RAID */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index d1e370d..98b381e 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -196,6 +196,8 @@ extern int dir_notify_enable;
 #define BLKBSZGET  _IOR(0x12,112,size_t)
 #define BLKBSZSET  _IOW(0x12,113,size_t)
 #define BLKGETSIZE64 _IOR(0x12,114,size_t)	/* return device size in bytes (u64 *arg) */
+#define BLKSTARTTRACE _IOWR(0x12,115,struct blk_user_trace_setup)
+#define BLKSTOPTRACE _IO(0x12,116)
 
 #define BMAP_IOCTL 1		/* obsolete - kept for compatibility */
 #define FIBMAP	   _IO(0x00,1)	/* bmap access */
diff --git a/mm/highmem.c b/mm/highmem.c
index ce2e7e8..d0ea1ee 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -26,6 +26,7 @@
 #include <linux/init.h>
 #include <linux/hash.h>
 #include <linux/highmem.h>
+#include <linux/blktrace_api.h>
 #include <asm/tlbflush.h>
 
 static mempool_t *page_pool, *isa_page_pool;
@@ -483,6 +484,8 @@ void blk_queue_bounce(request_queue_t *q
 		pool = isa_page_pool;
 	}
 
+	blk_add_trace_bio(q, *bio_orig, BLK_TA_BOUNCE);
+
 	/*
 	 * slow path
 	 */