blktrace: don't stop tracer if not setup trace successfully
[blktrace.git] / blkparse.c
index 7da67a609a7891e3118db33e0ae91509e8954b54..227cc444c280660eee4441c0d8e9f7ae80f36b17 100644 (file)
@@ -2,6 +2,7 @@
  * block queue tracing parse application
  *
  * Copyright (C) 2005 Jens Axboe <axboe@suse.de>
+ * Copyright (C) 2006 Jens Axboe <axboe@kernel.dk>
  *
  *  This program is free software; you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License as published by
@@ -35,7 +36,7 @@
 #include "rbtree.h"
 #include "jhash.h"
 
-static char blkparse_version[] = "0.99";
+static char blkparse_version[] = "1.2.0";
 
 struct skip_info {
        unsigned long start, end;
@@ -52,8 +53,8 @@ struct per_dev_info {
        unsigned long long last_reported_time;
        unsigned long long last_read_time;
        struct io_stats io_stats;
-       unsigned long skips, nskips;
-       unsigned long long seq_skips, seq_nskips;
+       unsigned long skips;
+       unsigned long long seq_skips;
        unsigned int max_depth[2];
        unsigned int cur_depth[2];
 
@@ -68,9 +69,22 @@ struct per_dev_info {
        struct per_cpu_info *cpus;
 };
 
+/*
+ * some duplicated effort here, we can unify this hash and the ppi hash later
+ */
+struct process_pid_map {
+       pid_t pid;
+       char comm[16];
+       struct process_pid_map *hash_next, *list_next;
+};
+
+#define PPM_HASH_SHIFT (8)
+#define PPM_HASH_SIZE  (1 << PPM_HASH_SHIFT)
+#define PPM_HASH_MASK  (PPM_HASH_SIZE - 1)
+static struct process_pid_map *ppm_hash_table[PPM_HASH_SIZE];
+
 struct per_process_info {
-       char name[16];
-       __u32 pid;
+       struct process_pid_map *ppm;
        struct io_stats io_stats;
        struct per_process_info *hash_next, *list_next;
        int more_than_one;
@@ -90,7 +104,6 @@ static struct per_process_info *ppi_hash_table[PPI_HASH_SIZE];
 static struct per_process_info *ppi_list;
 static int ppi_list_entries;
 
-#define S_OPTS "a:A:i:o:b:stqw:f:F:vVhD:"
 static struct option l_opts[] = {
        {
                .name = "act-mask",
@@ -105,82 +118,100 @@ static struct option l_opts[] = {
                .val = 'A'
        },
        {
-               .name = "input",
+               .name = "batch",
                .has_arg = required_argument,
                .flag = NULL,
-               .val = 'i'
+               .val = 'b'
        },
        {
-               .name = "output",
+               .name = "input-directory",
                .has_arg = required_argument,
                .flag = NULL,
-               .val = 'o'
+               .val = 'D'
        },
        {
-               .name = "batch",
+               .name = "dump-binary",
                .has_arg = required_argument,
                .flag = NULL,
-               .val = 'b'
+               .val = 'd'
        },
        {
-               .name = "per-program-stats",
-               .has_arg = no_argument,
+               .name = "format",
+               .has_arg = required_argument,
                .flag = NULL,
-               .val = 's'
+               .val = 'f'
        },
        {
-               .name = "track-ios",
-               .has_arg = no_argument,
+               .name = "format-spec",
+               .has_arg = required_argument,
                .flag = NULL,
-               .val = 't'
+               .val = 'F'
        },
        {
-               .name = "quiet",
+               .name = "hash-by-name",
                .has_arg = no_argument,
                .flag = NULL,
-               .val = 'q'
+               .val = 'h'
        },
        {
-               .name = "stopwatch",
+               .name = "input",
                .has_arg = required_argument,
                .flag = NULL,
-               .val = 'w'
+               .val = 'i'
        },
        {
-               .name = "format",
-               .has_arg = required_argument,
+               .name = "no-msgs",
+               .has_arg = no_argument,
                .flag = NULL,
-               .val = 'f'
+               .val = 'M'
        },
        {
-               .name = "format-spec",
+               .name = "output",
                .has_arg = required_argument,
                .flag = NULL,
-               .val = 'F'
+               .val = 'o'
        },
        {
-               .name = "hash-by-name",
+               .name = "no-text-output",
                .has_arg = no_argument,
                .flag = NULL,
-               .val = 'h'
+               .val = 'O'
        },
        {
-               .name = "verbose",
+               .name = "quiet",
                .has_arg = no_argument,
                .flag = NULL,
-               .val = 'v'
+               .val = 'q'
        },
        {
-               .name = "version",
+               .name = "per-program-stats",
                .has_arg = no_argument,
                .flag = NULL,
-               .val = 'V'
+               .val = 's'
        },
        {
-               .name = "input-directory",
+               .name = "track-ios",
+               .has_arg = no_argument,
+               .flag = NULL,
+               .val = 't'
+       },
+       {
+               .name = "stopwatch",
                .has_arg = required_argument,
                .flag = NULL,
-               .val = 'D'
+               .val = 'w'
+       },
+       {
+               .name = "verbose",
+               .has_arg = no_argument,
+               .flag = NULL,
+               .val = 'v'
+       },
+       {
+               .name = "version",
+               .has_arg = no_argument,
+               .flag = NULL,
+               .val = 'V'
        },
        {
                .name = NULL,
@@ -214,9 +245,8 @@ static struct trace *t_alloc_list;
 struct io_track {
        struct rb_node rb_node;
 
+       struct process_pid_map *ppm;
        __u64 sector;
-       __u32 pid;
-       char comm[16];
        unsigned long long allocation_time;
        unsigned long long queue_time;
        unsigned long long dispatch_time;
@@ -245,6 +275,11 @@ static int ppi_hash_by_pid = 1;
 static int verbose;
 static unsigned int act_mask = -1U;
 static int stats_printed;
+static int bin_output_msgs = 1;
+int data_is_native = -1;
+
+static FILE *dump_fp;
+static char *dump_binary;
 
 static unsigned int t_alloc_cache;
 static unsigned int bit_alloc_cache;
@@ -253,16 +288,36 @@ static unsigned int bit_alloc_cache;
 static unsigned int rb_batch = RB_BATCH_DEFAULT;
 
 static int pipeline;
+static char *pipename;
+
+static int text_output = 1;
 
 #define is_done()      (*(volatile int *)(&done))
 static volatile int done;
 
+struct timespec                abs_start_time;
+static unsigned long long start_timestamp;
+
+static int have_drv_data = 0;
+
 #define JHASH_RANDOM   (0x3af5f2ee)
 
 #define CPUS_PER_LONG  (8 * sizeof(unsigned long))
 #define CPU_IDX(cpu)   ((cpu) / CPUS_PER_LONG)
 #define CPU_BIT(cpu)   ((cpu) & (CPUS_PER_LONG - 1))
 
+static void output_binary(void *buf, int len)
+{
+       if (dump_binary) {
+               size_t n = fwrite(buf, len, 1, dump_fp);
+               if (n != 1) {
+                       perror(dump_binary);
+                       fclose(dump_fp);
+                       dump_binary = NULL;
+               }
+       }
+}
+
 static void resize_cpu_info(struct per_dev_info *pdi, int cpu)
 {
        struct per_cpu_info *cpus = pdi->cpus;
@@ -476,7 +531,108 @@ static inline int cpu_is_online(struct per_dev_info *pdi, int cpu)
        return (pdi->cpu_map[CPU_IDX(cpu)] & (1UL << CPU_BIT(cpu))) != 0;
 }
 
-static inline int ppi_hash_pid(__u32 pid)
+static inline int ppm_hash_pid(pid_t pid)
+{
+       return jhash_1word(pid, JHASH_RANDOM) & PPM_HASH_MASK;
+}
+
+static struct process_pid_map *find_ppm(pid_t pid)
+{
+       const int hash_idx = ppm_hash_pid(pid);
+       struct process_pid_map *ppm;
+
+       ppm = ppm_hash_table[hash_idx];
+       while (ppm) {
+               if (ppm->pid == pid)
+                       return ppm;
+
+               ppm = ppm->hash_next;
+       }
+
+       return NULL;
+}
+
+static struct process_pid_map *add_ppm_hash(pid_t pid, const char *name)
+{
+       const int hash_idx = ppm_hash_pid(pid);
+       struct process_pid_map *ppm;
+
+       ppm = find_ppm(pid);
+       if (!ppm) {
+               ppm = malloc(sizeof(*ppm));
+               memset(ppm, 0, sizeof(*ppm));
+               ppm->pid = pid;
+               memset(ppm->comm, 0, sizeof(ppm->comm));
+               strncpy(ppm->comm, name, sizeof(ppm->comm));
+               ppm->comm[sizeof(ppm->comm) - 1] = '\0';
+               ppm->hash_next = ppm_hash_table[hash_idx];
+               ppm_hash_table[hash_idx] = ppm;
+       }
+
+       return ppm;
+}
+
+static void handle_notify(struct blk_io_trace *bit)
+{
+       void    *payload = (caddr_t) bit + sizeof(*bit);
+       __u32   two32[2];
+
+       switch (bit->action) {
+       case BLK_TN_PROCESS:
+               add_ppm_hash(bit->pid, payload);
+               break;
+
+       case BLK_TN_TIMESTAMP:
+               if (bit->pdu_len != sizeof(two32))
+                       return;
+               memcpy(two32, payload, sizeof(two32));
+               if (!data_is_native) {
+                       two32[0] = be32_to_cpu(two32[0]);
+                       two32[1] = be32_to_cpu(two32[1]);
+               }
+               start_timestamp = bit->time;
+               abs_start_time.tv_sec  = two32[0];
+               abs_start_time.tv_nsec = two32[1];
+               if (abs_start_time.tv_nsec < 0) {
+                       abs_start_time.tv_sec--;
+                       abs_start_time.tv_nsec += 1000000000;
+               }
+
+               break;
+
+       case BLK_TN_MESSAGE:
+               if (bit->pdu_len > 0) {
+                       char msg[bit->pdu_len+1];
+
+                       memcpy(msg, (char *)payload, bit->pdu_len);
+                       msg[bit->pdu_len] = '\0';
+
+                       fprintf(ofp,
+                               "%3d,%-3d %2d %8s %5d.%09lu %5u %2s %3s %s\n",
+                               MAJOR(bit->device), MINOR(bit->device),
+                               bit->cpu, "0", (int) SECONDS(bit->time),
+                               (unsigned long) NANO_SECONDS(bit->time),
+                               0, "m", "N", msg);
+               }
+               break;
+
+       default:
+               /* Ignore unknown notify events */
+               ;
+       }
+}
+
+char *find_process_name(pid_t pid)
+{
+       struct process_pid_map *ppm = find_ppm(pid);
+
+       if (ppm)
+               return ppm->comm;
+
+       return NULL;
+}
+
+static inline int ppi_hash_pid(pid_t pid)
 {
        return jhash_1word(pid, JHASH_RANDOM) & PPI_HASH_MASK;
 }
@@ -488,13 +644,15 @@ static inline int ppi_hash_name(const char *name)
 
 static inline int ppi_hash(struct per_process_info *ppi)
 {
+       struct process_pid_map *ppm = ppi->ppm;
+
        if (ppi_hash_by_pid)
-               return ppi_hash_pid(ppi->pid);
+               return ppi_hash_pid(ppm->pid);
 
-       return ppi_hash_name(ppi->name);
+       return ppi_hash_name(ppm->comm);
 }
 
-static inline void add_process_to_hash(struct per_process_info *ppi)
+static inline void add_ppi_to_hash(struct per_process_info *ppi)
 {
        const int hash_idx = ppi_hash(ppi);
 
@@ -502,21 +660,23 @@ static inline void add_process_to_hash(struct per_process_info *ppi)
        ppi_hash_table[hash_idx] = ppi;
 }
 
-static inline void add_process_to_list(struct per_process_info *ppi)
+static inline void add_ppi_to_list(struct per_process_info *ppi)
 {
        ppi->list_next = ppi_list;
        ppi_list = ppi;
        ppi_list_entries++;
 }
 
-static struct per_process_info *find_process_by_name(char *name)
+static struct per_process_info *find_ppi_by_name(char *name)
 {
        const int hash_idx = ppi_hash_name(name);
        struct per_process_info *ppi;
 
        ppi = ppi_hash_table[hash_idx];
        while (ppi) {
-               if (!strcmp(ppi->name, name))
+               struct process_pid_map *ppm = ppi->ppm;
+
+               if (!strcmp(ppm->comm, name))
                        return ppi;
 
                ppi = ppi->hash_next;
@@ -525,14 +685,16 @@ static struct per_process_info *find_process_by_name(char *name)
        return NULL;
 }
 
-static struct per_process_info *find_process_by_pid(__u32 pid)
+static struct per_process_info *find_ppi_by_pid(pid_t pid)
 {
        const int hash_idx = ppi_hash_pid(pid);
        struct per_process_info *ppi;
 
        ppi = ppi_hash_table[hash_idx];
        while (ppi) {
-               if (ppi->pid == pid)
+               struct process_pid_map *ppm = ppi->ppm;
+
+               if (ppm->pid == pid)
                        return ppi;
 
                ppi = ppi->hash_next;
@@ -541,15 +703,20 @@ static struct per_process_info *find_process_by_pid(__u32 pid)
        return NULL;
 }
 
-static struct per_process_info *find_process(__u32 pid, char *name)
+static struct per_process_info *find_ppi(pid_t pid)
 {
        struct per_process_info *ppi;
+       char *name;
 
        if (ppi_hash_by_pid)
-               return find_process_by_pid(pid);
+               return find_ppi_by_pid(pid);
 
-       ppi = find_process_by_name(name);
-       if (ppi && ppi->pid != pid)
+       name = find_process_name(pid);
+       if (!name)
+               return NULL;
+
+       ppi = find_ppi_by_name(name);
+       if (ppi && ppi->ppm->pid != pid)
                ppi->more_than_one = 1;
 
        return ppi;
@@ -789,16 +956,17 @@ static struct io_track *__find_track(struct per_dev_info *pdi, __u64 sector)
        return NULL;
 }
 
-static struct io_track *find_track(struct per_dev_info *pdi, __u32 pid,
-                                  char *comm, __u64 sector)
+static struct io_track *find_track(struct per_dev_info *pdi, pid_t pid,
+                                  __u64 sector)
 {
        struct io_track *iot;
 
        iot = __find_track(pdi, sector);
        if (!iot) {
                iot = malloc(sizeof(*iot));
-               iot->pid = pid;
-               memcpy(iot->comm, comm, sizeof(iot->comm));
+               iot->ppm = find_ppm(pid);
+               if (!iot->ppm)
+                       iot->ppm = add_ppm_hash(pid, "unknown");
                iot->sector = sector;
                track_rb_insert(pdi, iot);
        }
@@ -835,10 +1003,34 @@ static void log_track_getrq(struct per_dev_info *pdi, struct blk_io_trace *t)
        if (!track_ios)
                return;
 
-       iot = find_track(pdi, t->pid, t->comm, t->sector);
+       iot = find_track(pdi, t->pid, t->sector);
        iot->allocation_time = t->time;
 }
 
+static inline int is_remapper(struct per_dev_info *pdi)
+{
+       int major = MAJOR(pdi->dev);
+
+       return (major == 253 || major == 9);
+}
+
+/*
+ * for md/dm setups, the interesting cycle is Q -> C. So track queueing
+ * time here, as dispatch time
+ */
+static void log_track_queue(struct per_dev_info *pdi, struct blk_io_trace *t)
+{
+       struct io_track *iot;
+
+       if (!track_ios)
+               return;
+       if (!is_remapper(pdi))
+               return;
+
+       iot = find_track(pdi, t->pid, t->sector);
+       iot->dispatch_time = t->time;
+}
+
 /*
  * return time between rq allocation and insertion
  */
@@ -851,7 +1043,7 @@ static unsigned long long log_track_insert(struct per_dev_info *pdi,
        if (!track_ios)
                return -1;
 
-       iot = find_track(pdi, t->pid, t->comm, t->sector);
+       iot = find_track(pdi, t->pid, t->sector);
        iot->queue_time = t->time;
 
        if (!iot->allocation_time)
@@ -860,7 +1052,7 @@ static unsigned long long log_track_insert(struct per_dev_info *pdi,
        elapsed = iot->queue_time - iot->allocation_time;
 
        if (per_process_stats) {
-               struct per_process_info *ppi = find_process(iot->pid,iot->comm);
+               struct per_process_info *ppi = find_ppi(iot->ppm->pid);
                int w = (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0;
 
                if (ppi && elapsed > ppi->longest_allocation_wait[w])
@@ -897,7 +1089,7 @@ static unsigned long long log_track_issue(struct per_dev_info *pdi,
        elapsed = iot->dispatch_time - iot->queue_time;
 
        if (per_process_stats) {
-               struct per_process_info *ppi = find_process(iot->pid,iot->comm);
+               struct per_process_info *ppi = find_ppi(iot->ppm->pid);
                int w = (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0;
 
                if (ppi && elapsed > ppi->longest_dispatch_wait[w])
@@ -918,8 +1110,6 @@ static unsigned long long log_track_complete(struct per_dev_info *pdi,
 
        if (!track_ios)
                return -1;
-       if ((t->action & BLK_TC_ACT(BLK_TC_FS)) == 0)
-               return -1;
 
        iot = __find_track(pdi, t->sector);
        if (!iot) {
@@ -934,7 +1124,7 @@ static unsigned long long log_track_complete(struct per_dev_info *pdi,
        elapsed = iot->completion_time - iot->dispatch_time;
 
        if (per_process_stats) {
-               struct per_process_info *ppi = find_process(iot->pid,iot->comm);
+               struct per_process_info *ppi = find_ppi(iot->ppm->pid);
                int w = (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0;
 
                if (ppi && elapsed > ppi->longest_completion_wait[w])
@@ -951,17 +1141,18 @@ static unsigned long long log_track_complete(struct per_dev_info *pdi,
 }
 
 
-static struct io_stats *find_process_io_stats(__u32 pid, char *name)
+static struct io_stats *find_process_io_stats(pid_t pid)
 {
-       struct per_process_info *ppi = find_process(pid, name);
+       struct per_process_info *ppi = find_ppi(pid);
 
        if (!ppi) {
                ppi = malloc(sizeof(*ppi));
                memset(ppi, 0, sizeof(*ppi));
-               memcpy(ppi->name, name, 16);
-               ppi->pid = pid;
-               add_process_to_hash(ppi);
-               add_process_to_list(ppi);
+               ppi->ppm = find_ppm(pid);
+               if (!ppi->ppm)
+                       ppi->ppm = add_ppm_hash(pid, "unknown");
+               add_ppi_to_hash(ppi);
+               add_ppi_to_list(ppi);
        }
 
        return &ppi->io_stats;
@@ -990,10 +1181,12 @@ static inline void __account_m(struct io_stats *ios, struct blk_io_trace *t,
 {
        if (rw) {
                ios->mwrites++;
-               ios->qwrite_kb += t_kb(t);
+               ios->mwrite_kb += t_kb(t);
+               ios->mwrite_b += t_b(t);
        } else {
                ios->mreads++;
-               ios->qread_kb += t_kb(t);
+               ios->mread_kb += t_kb(t);
+               ios->mread_b += t_b(t);
        }
 }
 
@@ -1003,21 +1196,121 @@ static inline void account_m(struct blk_io_trace *t, struct per_cpu_info *pci,
        __account_m(&pci->io_stats, t, rw);
 
        if (per_process_stats) {
-               struct io_stats *ios = find_process_io_stats(t->pid, t->comm);
+               struct io_stats *ios = find_process_io_stats(t->pid);
 
                __account_m(ios, t, rw);
        }
 }
 
+static inline void __account_pc_queue(struct io_stats *ios,
+                                     struct blk_io_trace *t, int rw)
+{
+       if (rw) {
+               ios->qwrites_pc++;
+               ios->qwrite_kb_pc += t_kb(t);
+               ios->qwrite_b_pc += t_b(t);
+       } else {
+               ios->qreads_pc++;
+               ios->qread_kb += t_kb(t);
+               ios->qread_b_pc += t_b(t);
+       }
+}
+
+static inline void account_pc_queue(struct blk_io_trace *t,
+                                   struct per_cpu_info *pci, int rw)
+{
+       __account_pc_queue(&pci->io_stats, t, rw);
+
+       if (per_process_stats) {
+               struct io_stats *ios = find_process_io_stats(t->pid);
+
+               __account_pc_queue(ios, t, rw);
+       }
+}
+
+static inline void __account_pc_issue(struct io_stats *ios, int rw,
+                                     unsigned int bytes)
+{
+       if (rw) {
+               ios->iwrites_pc++;
+               ios->iwrite_kb_pc += bytes >> 10;
+               ios->iwrite_b_pc += bytes & 1023;
+       } else {
+               ios->ireads_pc++;
+               ios->iread_kb_pc += bytes >> 10;
+               ios->iread_b_pc += bytes & 1023;
+       }
+}
+
+static inline void account_pc_issue(struct blk_io_trace *t,
+                                   struct per_cpu_info *pci, int rw)
+{
+       __account_pc_issue(&pci->io_stats, rw, t->bytes);
+
+       if (per_process_stats) {
+               struct io_stats *ios = find_process_io_stats(t->pid);
+
+               __account_pc_issue(ios, rw, t->bytes);
+       }
+}
+
+static inline void __account_pc_requeue(struct io_stats *ios,
+                                       struct blk_io_trace *t, int rw)
+{
+       if (rw) {
+               ios->wrqueue_pc++;
+               ios->iwrite_kb_pc -= t_kb(t);
+               ios->iwrite_b_pc -= t_b(t);
+       } else {
+               ios->rrqueue_pc++;
+               ios->iread_kb_pc -= t_kb(t);
+               ios->iread_b_pc -= t_b(t);
+       }
+}
+
+static inline void account_pc_requeue(struct blk_io_trace *t,
+                                     struct per_cpu_info *pci, int rw)
+{
+       __account_pc_requeue(&pci->io_stats, t, rw);
+
+       if (per_process_stats) {
+               struct io_stats *ios = find_process_io_stats(t->pid);
+
+               __account_pc_requeue(ios, t, rw);
+       }
+}
+
+static inline void __account_pc_c(struct io_stats *ios, int rw)
+{
+       if (rw)
+               ios->cwrites_pc++;
+       else
+               ios->creads_pc++;
+}
+
+static inline void account_pc_c(struct blk_io_trace *t,
+                               struct per_cpu_info *pci, int rw)
+{
+       __account_pc_c(&pci->io_stats, rw);
+
+       if (per_process_stats) {
+               struct io_stats *ios = find_process_io_stats(t->pid);
+
+               __account_pc_c(ios, rw);
+       }
+}
+
 static inline void __account_queue(struct io_stats *ios, struct blk_io_trace *t,
                                   int rw)
 {
        if (rw) {
                ios->qwrites++;
                ios->qwrite_kb += t_kb(t);
+               ios->qwrite_b += t_b(t);
        } else {
                ios->qreads++;
                ios->qread_kb += t_kb(t);
+               ios->qread_b += t_b(t);
        }
 }
 
@@ -1027,7 +1320,7 @@ static inline void account_queue(struct blk_io_trace *t,
        __account_queue(&pci->io_stats, t, rw);
 
        if (per_process_stats) {
-               struct io_stats *ios = find_process_io_stats(t->pid, t->comm);
+               struct io_stats *ios = find_process_io_stats(t->pid);
 
                __account_queue(ios, t, rw);
        }
@@ -1038,9 +1331,11 @@ static inline void __account_c(struct io_stats *ios, int rw, int bytes)
        if (rw) {
                ios->cwrites++;
                ios->cwrite_kb += bytes >> 10;
+               ios->cwrite_b += bytes & 1023;
        } else {
                ios->creads++;
                ios->cread_kb += bytes >> 10;
+               ios->cread_b += bytes & 1023;
        }
 }
 
@@ -1050,7 +1345,7 @@ static inline void account_c(struct blk_io_trace *t, struct per_cpu_info *pci,
        __account_c(&pci->io_stats, rw, bytes);
 
        if (per_process_stats) {
-               struct io_stats *ios = find_process_io_stats(t->pid, t->comm);
+               struct io_stats *ios = find_process_io_stats(t->pid);
 
                __account_c(ios, rw, bytes);
        }
@@ -1062,9 +1357,11 @@ static inline void __account_issue(struct io_stats *ios, int rw,
        if (rw) {
                ios->iwrites++;
                ios->iwrite_kb += bytes >> 10;
+               ios->iwrite_b  += bytes & 1023;
        } else {
                ios->ireads++;
                ios->iread_kb += bytes >> 10;
+               ios->iread_b  += bytes & 1023;
        }
 }
 
@@ -1074,7 +1371,7 @@ static inline void account_issue(struct blk_io_trace *t,
        __account_issue(&pci->io_stats, rw, t->bytes);
 
        if (per_process_stats) {
-               struct io_stats *ios = find_process_io_stats(t->pid, t->comm);
+               struct io_stats *ios = find_process_io_stats(t->pid);
 
                __account_issue(ios, rw, t->bytes);
        }
@@ -1094,7 +1391,7 @@ static inline void account_unplug(struct blk_io_trace *t,
        __account_unplug(&pci->io_stats, timer);
 
        if (per_process_stats) {
-               struct io_stats *ios = find_process_io_stats(t->pid, t->comm);
+               struct io_stats *ios = find_process_io_stats(t->pid);
 
                __account_unplug(ios, timer);
        }
@@ -1106,9 +1403,11 @@ static inline void __account_requeue(struct io_stats *ios,
        if (rw) {
                ios->wrqueue++;
                ios->iwrite_kb -= t_kb(t);
+               ios->iwrite_b -= t_b(t);
        } else {
                ios->rrqueue++;
                ios->iread_kb -= t_kb(t);
+               ios->iread_b -= t_b(t);
        }
 }
 
@@ -1118,7 +1417,7 @@ static inline void account_requeue(struct blk_io_trace *t,
        __account_requeue(&pci->io_stats, t, rw);
 
        if (per_process_stats) {
-               struct io_stats *ios = find_process_io_stats(t->pid, t->comm);
+               struct io_stats *ios = find_process_io_stats(t->pid);
 
                __account_requeue(ios, t, rw);
        }
@@ -1188,13 +1487,16 @@ static void log_pc(struct per_cpu_info *pci, struct blk_io_trace *t, char *act)
        process_fmt(act, pci, t, -1ULL, t->pdu_len, buf);
 }
 
-static void dump_trace_pc(struct blk_io_trace *t, struct per_cpu_info *pci)
+static void dump_trace_pc(struct blk_io_trace *t, struct per_dev_info *pdi,
+                         struct per_cpu_info *pci)
 {
+       int w = (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0;
        int act = t->action & 0xffff;
 
        switch (act) {
                case __BLK_TA_QUEUE:
                        log_generic(pci, t, "Q");
+                       account_pc_queue(t, pci, w);
                        break;
                case __BLK_TA_GETRQ:
                        log_generic(pci, t, "G");
@@ -1203,13 +1505,27 @@ static void dump_trace_pc(struct blk_io_trace *t, struct per_cpu_info *pci)
                        log_generic(pci, t, "S");
                        break;
                case __BLK_TA_REQUEUE:
+                       /*
+                        * can happen if we miss traces, don't let it go
+                        * below zero
+                        */
+                       if (pdi->cur_depth[w])
+                               pdi->cur_depth[w]--;
+                       account_pc_requeue(t, pci, w);
                        log_generic(pci, t, "R");
                        break;
                case __BLK_TA_ISSUE:
+                       account_pc_issue(t, pci, w);
+                       pdi->cur_depth[w]++;
+                       if (pdi->cur_depth[w] > pdi->max_depth[w])
+                               pdi->max_depth[w] = pdi->cur_depth[w];
                        log_pc(pci, t, "D");
                        break;
                case __BLK_TA_COMPLETE:
+                       if (pdi->cur_depth[w])
+                               pdi->cur_depth[w]--;
                        log_pc(pci, t, "C");
+                       account_pc_c(t, pci, w);
                        break;
                case __BLK_TA_INSERT:
                        log_pc(pci, t, "I");
@@ -1228,6 +1544,7 @@ static void dump_trace_fs(struct blk_io_trace *t, struct per_dev_info *pdi,
 
        switch (act) {
                case __BLK_TA_QUEUE:
+                       log_track_queue(pdi, t);
                        account_queue(t, pci, w);
                        log_queue(pci, t, "Q");
                        break;
@@ -1292,6 +1609,10 @@ static void dump_trace_fs(struct blk_io_trace *t, struct per_dev_info *pdi,
                case __BLK_TA_REMAP:
                        log_generic(pci, t, "A");
                        break;
+               case __BLK_TA_DRV_DATA:
+                       have_drv_data = 1;
+                       /* dump to binary file only */
+                       break;
                default:
                        fprintf(stderr, "Bad fs action %x\n", t->action);
                        break;
@@ -1301,15 +1622,24 @@ static void dump_trace_fs(struct blk_io_trace *t, struct per_dev_info *pdi,
 static void dump_trace(struct blk_io_trace *t, struct per_cpu_info *pci,
                       struct per_dev_info *pdi)
 {
-       if (t->action & BLK_TC_ACT(BLK_TC_PC))
-               dump_trace_pc(t, pci);
-       else
-               dump_trace_fs(t, pdi, pci);
+       if (text_output) {
+               if (t->action == BLK_TN_MESSAGE)
+                       handle_notify(t);
+               else if (t->action & BLK_TC_ACT(BLK_TC_PC))
+                       dump_trace_pc(t, pdi, pci);
+               else
+                       dump_trace_fs(t, pdi, pci);
+       }
 
        if (!pdi->events)
                pdi->first_reported_time = t->time;
 
        pdi->events++;
+
+       if (bin_output_msgs ||
+                           !(t->action & BLK_TC_ACT(BLK_TC_NOTIFY) &&
+                             t->action == BLK_TN_MESSAGE))
+               output_binary(t, sizeof(*t) + t->pdu_len);
 }
 
 /*
@@ -1340,21 +1670,60 @@ static void dump_io_stats(struct per_dev_info *pdi, struct io_stats *ios,
 
        fprintf(ofp, "%s\n", msg);
 
-       fprintf(ofp, " Reads Queued:    %s, %siB\t", size_cnv(x, ios->qreads, 0), size_cnv(y, ios->qread_kb, 1));
-       fprintf(ofp, " Writes Queued:    %s, %siB\n", size_cnv(x, ios->qwrites, 0), size_cnv(y, ios->qwrite_kb, 1));
-
-       fprintf(ofp, " Read Dispatches: %s, %siB\t", size_cnv(x, ios->ireads, 0), size_cnv(y, ios->iread_kb, 1));
-       fprintf(ofp, " Write Dispatches: %s, %siB\n", size_cnv(x, ios->iwrites, 0), size_cnv(y, ios->iwrite_kb, 1));
+       fprintf(ofp, " Reads Queued:    %s, %siB\t",
+                       size_cnv(x, ios->qreads, 0),
+                       size_cnv(y, ios->qread_kb + (ios->qread_b>>10), 1));
+       fprintf(ofp, " Writes Queued:    %s, %siB\n",
+                       size_cnv(x, ios->qwrites, 0),
+                       size_cnv(y, ios->qwrite_kb + (ios->qwrite_b>>10), 1));
+       fprintf(ofp, " Read Dispatches: %s, %siB\t",
+                       size_cnv(x, ios->ireads, 0),
+                       size_cnv(y, ios->iread_kb + (ios->iread_b>>10), 1));
+       fprintf(ofp, " Write Dispatches: %s, %siB\n",
+                       size_cnv(x, ios->iwrites, 0),
+                       size_cnv(y, ios->iwrite_kb + (ios->iwrite_b>>10), 1));
        fprintf(ofp, " Reads Requeued:  %s\t\t", size_cnv(x, ios->rrqueue, 0));
        fprintf(ofp, " Writes Requeued:  %s\n", size_cnv(x, ios->wrqueue, 0));
-       fprintf(ofp, " Reads Completed: %s, %siB\t", size_cnv(x, ios->creads, 0), size_cnv(y, ios->cread_kb, 1));
-       fprintf(ofp, " Writes Completed: %s, %siB\n", size_cnv(x, ios->cwrites, 0), size_cnv(y, ios->cwrite_kb, 1));
-       fprintf(ofp, " Read Merges:     %'8lu%8c\t", ios->mreads, ' ');
-       fprintf(ofp, " Write Merges:     %'8lu\n", ios->mwrites);
+       fprintf(ofp, " Reads Completed: %s, %siB\t",
+                       size_cnv(x, ios->creads, 0),
+                       size_cnv(y, ios->cread_kb + (ios->cread_b>>10), 1));
+       fprintf(ofp, " Writes Completed: %s, %siB\n",
+                       size_cnv(x, ios->cwrites, 0),
+                       size_cnv(y, ios->cwrite_kb + (ios->cwrite_b>>10), 1));
+       fprintf(ofp, " Read Merges:     %s, %siB\t",
+                       size_cnv(x, ios->mreads, 0),
+                       size_cnv(y, ios->mread_kb + (ios->mread_b>>10), 1));
+       fprintf(ofp, " Write Merges:     %s, %siB\n",
+                       size_cnv(x, ios->mwrites, 0),
+                       size_cnv(y, ios->mwrite_kb + (ios->mwrite_b>>10), 1));
        if (pdi) {
                fprintf(ofp, " Read depth:      %'8u%8c\t", pdi->max_depth[0], ' ');
                fprintf(ofp, " Write depth:      %'8u\n", pdi->max_depth[1]);
        }
+       if (ios->qreads_pc || ios->qwrites_pc || ios->ireads_pc || ios->iwrites_pc ||
+           ios->rrqueue_pc || ios->wrqueue_pc || ios->creads_pc || ios->cwrites_pc) {
+               fprintf(ofp, " PC Reads Queued: %s, %siB\t",
+                       size_cnv(x, ios->qreads_pc, 0),
+                       size_cnv(y,
+                               ios->qread_kb_pc + (ios->qread_b_pc>>10), 1));
+               fprintf(ofp, " PC Writes Queued: %s, %siB\n",
+                       size_cnv(x, ios->qwrites_pc, 0),
+                       size_cnv(y,
+                               ios->qwrite_kb_pc + (ios->qwrite_b_pc>>10), 1));
+               fprintf(ofp, " PC Read Disp.:   %s, %siB\t",
+                       size_cnv(x, ios->ireads_pc, 0),
+                       size_cnv(y,
+                               ios->iread_kb_pc + (ios->iread_b_pc>>10), 1));
+               fprintf(ofp, " PC Write Disp.:   %s, %siB\n",
+                       size_cnv(x, ios->iwrites_pc, 0),
+                       size_cnv(y,
+                               ios->iwrite_kb_pc + (ios->iwrite_b_pc>>10),
+                               1));
+               fprintf(ofp, " PC Reads Req.:   %s\t\t", size_cnv(x, ios->rrqueue_pc, 0));
+               fprintf(ofp, " PC Writes Req.:   %s\n", size_cnv(x, ios->wrqueue_pc, 0));
+               fprintf(ofp, " PC Reads Compl.: %s\t\t", size_cnv(x, ios->creads_pc, 0));
+               fprintf(ofp, " PC Writes Compl.: %s\n", size_cnv(x, ios->cwrites_pc, 0));
+       }
        fprintf(ofp, " IO unplugs:      %'8lu%8c\t", ios->io_unplugs, ' ');
        fprintf(ofp, " Timer unplugs:    %'8lu\n", ios->timer_unplugs);
 }
@@ -1382,9 +1751,9 @@ static int ppi_name_compare(const void *p1, const void *p2)
        struct per_process_info *ppi2 = *((struct per_process_info **) p2);
        int res;
 
-       res = strverscmp(ppi1->name, ppi2->name);
+       res = strverscmp(ppi1->ppm->comm, ppi2->ppm->comm);
        if (!res)
-               res = ppi1->pid > ppi2->pid;
+               res = ppi1->ppm->pid > ppi2->ppm->pid;
 
        return res;
 }
@@ -1426,12 +1795,13 @@ static void show_process_stats(void)
 
        ppi = ppi_list;
        while (ppi) {
+               struct process_pid_map *ppm = ppi->ppm;
                char name[64];
 
                if (ppi->more_than_one)
-                       sprintf(name, "%s (%u, ...)", ppi->name, ppi->pid);
+                       sprintf(name, "%s (%u, ...)", ppm->comm, ppm->pid);
                else
-                       sprintf(name, "%s (%u)", ppi->name, ppi->pid);
+                       sprintf(name, "%s (%u)", ppm->comm, ppm->pid);
 
                dump_io_stats(NULL, &ppi->io_stats, name);
                dump_wait_stats(ppi);
@@ -1450,6 +1820,7 @@ static void show_device_and_cpu_stats(void)
        int i, j, pci_events;
        char line[3 + 8/*cpu*/ + 2 + 32/*dev*/ + 3];
        char name[32];
+       double ratio;
 
        for (pdi = devices, i = 0; i < ndevices; i++, pdi++) {
 
@@ -1480,6 +1851,34 @@ static void show_device_and_cpu_stats(void)
                        total.cwrite_kb += ios->cwrite_kb;
                        total.iread_kb += ios->iread_kb;
                        total.iwrite_kb += ios->iwrite_kb;
+                       total.mread_kb += ios->mread_kb;
+                       total.mwrite_kb += ios->mwrite_kb;
+                       total.qread_b += ios->qread_b;
+                       total.qwrite_b += ios->qwrite_b;
+                       total.cread_b += ios->cread_b;
+                       total.cwrite_b += ios->cwrite_b;
+                       total.iread_b += ios->iread_b;
+                       total.iwrite_b += ios->iwrite_b;
+                       total.mread_b += ios->mread_b;
+                       total.mwrite_b += ios->mwrite_b;
+
+                       total.qreads_pc += ios->qreads_pc;
+                       total.qwrites_pc += ios->qwrites_pc;
+                       total.creads_pc += ios->creads_pc;
+                       total.cwrites_pc += ios->cwrites_pc;
+                       total.ireads_pc += ios->ireads_pc;
+                       total.iwrites_pc += ios->iwrites_pc;
+                       total.rrqueue_pc += ios->rrqueue_pc;
+                       total.wrqueue_pc += ios->wrqueue_pc;
+                       total.qread_kb_pc += ios->qread_kb_pc;
+                       total.qwrite_kb_pc += ios->qwrite_kb_pc;
+                       total.iread_kb_pc += ios->iread_kb_pc;
+                       total.iwrite_kb_pc += ios->iwrite_kb_pc;
+                       total.qread_b_pc += ios->qread_b_pc;
+                       total.qwrite_b_pc += ios->qwrite_b_pc;
+                       total.iread_b_pc += ios->iread_b_pc;
+                       total.iwrite_b_pc += ios->iwrite_b_pc;
+
                        total.timer_unplugs += ios->timer_unplugs;
                        total.io_unplugs += ios->io_unplugs;
 
@@ -1499,8 +1898,10 @@ static void show_device_and_cpu_stats(void)
                wrate = rrate = 0;
                msec = (pdi->last_reported_time - pdi->first_reported_time) / 1000000;
                if (msec) {
-                       rrate = 1000 * total.cread_kb / msec;
-                       wrate = 1000 * total.cwrite_kb / msec;
+                       rrate = ((1000 * total.cread_kb) + total.cread_b) /
+                                                                       msec;
+                       wrate = ((1000 * total.cwrite_kb) + total.cwrite_b) /
+                                                                       msec;
                }
 
                fprintf(ofp, "\nThroughput (R/W): %'LuKiB/s / %'LuKiB/s\n",
@@ -1509,10 +1910,13 @@ static void show_device_and_cpu_stats(void)
                        get_dev_name(pdi, line, sizeof(line)), pdi->events);
 
                collect_pdi_skips(pdi);
+               if (!pdi->skips && !pdi->events)
+                       ratio = 0.0;
+               else
+                       ratio = 100.0 * ((double)pdi->seq_skips /
+                                       (double)(pdi->events + pdi->seq_skips));
                fprintf(ofp, "Skips: %'lu forward (%'llu - %5.1lf%%)\n",
-                       pdi->skips,pdi->seq_skips,
-                       100.0 * ((double)pdi->seq_skips /
-                               (double)(pdi->events + pdi->seq_skips)));
+                       pdi->skips, pdi->seq_skips, ratio);
        }
 }
 
@@ -1527,6 +1931,25 @@ static void find_genesis(void)
 
                t = t->next;
        }
+
+       /* The time stamp record will usually be the first
+        * record in the trace, but not always.
+        */
+       if (start_timestamp
+        && start_timestamp != genesis_time) {
+               long delta = genesis_time - start_timestamp;
+
+               abs_start_time.tv_sec  += SECONDS(delta);
+               abs_start_time.tv_nsec += NANO_SECONDS(delta);
+               if (abs_start_time.tv_nsec < 0) {
+                       abs_start_time.tv_nsec += 1000000000;
+                       abs_start_time.tv_sec -= 1;
+               } else
+               if (abs_start_time.tv_nsec > 1000000000) {
+                       abs_start_time.tv_nsec -= 1000000000;
+                       abs_start_time.tv_sec += 1;
+               }
+       }
 }
 
 static inline int check_stopwatch(struct blk_io_trace *bit)
@@ -1600,6 +2023,7 @@ static int check_cpu_map(struct per_dev_info *pdi)
         * create a map of the cpus we have traces for
         */
        cpu_map = malloc(pdi->cpu_map_max / sizeof(long));
+       memset(cpu_map, 0, sizeof(*cpu_map));
        n = rb_first(&rb_sort_root);
        while (n) {
                __t = rb_entry(n, struct trace, rb_node);
@@ -1703,7 +2127,8 @@ static void show_entries_rb(int force)
                        break;
                }
 
-               if (check_sequence(pdi, t, force))
+               if (!(bit->action == BLK_TN_MESSAGE) &&
+                   check_sequence(pdi, t, force))
                        break;
 
                if (!force && bit->time > last_allowed_time)
@@ -1714,7 +2139,8 @@ static void show_entries_rb(int force)
                if (!pci || pci->cpu != bit->cpu)
                        pci = get_cpu_info(pdi, bit->cpu);
 
-               pci->last_sequence = bit->sequence;
+               if (!(bit->action == BLK_TN_MESSAGE))
+                       pci->last_sequence = bit->sequence;
 
                pci->nelems++;
 
@@ -1773,6 +2199,22 @@ static int read_data(int fd, void *buffer, int bytes, int block, int *fdblock)
        return 0;
 }
 
+static inline __u16 get_pdulen(struct blk_io_trace *bit)
+{
+       if (data_is_native)
+               return bit->pdu_len;
+
+       return __bswap_16(bit->pdu_len);
+}
+
+static inline __u32 get_magic(struct blk_io_trace *bit)
+{
+       if (data_is_native)
+               return bit->magic;
+
+       return __bswap_32(bit->magic);
+}
+
 static int read_events(int fd, int always_block, int *fdblock)
 {
        struct per_dev_info *pdi = NULL;
@@ -1796,13 +2238,20 @@ static int read_events(int fd, int always_block, int *fdblock)
                        break;
                }
 
-               magic = be32_to_cpu(bit->magic);
+               /*
+                * look at first trace to check whether we need to convert
+                * data in the future
+                */
+               if (data_is_native == -1 && check_data_endianness(bit->magic))
+                       break;
+
+               magic = get_magic(bit);
                if ((magic & 0xffffff00) != BLK_IO_TRACE_MAGIC) {
                        fprintf(stderr, "Bad magic %x\n", magic);
                        break;
                }
 
-               pdu_len = be16_to_cpu(bit->pdu_len);
+               pdu_len = get_pdulen(bit);
                if (pdu_len) {
                        void *ptr = realloc(bit, sizeof(*bit) + pdu_len);
 
@@ -1821,6 +2270,15 @@ static int read_events(int fd, int always_block, int *fdblock)
                        continue;
                }
 
+               /*
+                * not a real trace, so grab and handle it here
+                */
+               if (bit->action & BLK_TC_ACT(BLK_TC_NOTIFY) && bit->action != BLK_TN_MESSAGE) {
+                       handle_notify(bit);
+                       output_binary(bit, sizeof(*bit) + bit->pdu_len);
+                       continue;
+               }
+
                t = t_alloc();
                memset(t, 0, sizeof(*t));
                t->bit = bit;
@@ -1841,125 +2299,324 @@ static int read_events(int fd, int always_block, int *fdblock)
        return events;
 }
 
-static int do_file(void)
-{
-       struct per_cpu_info *pci;
+/*
+ * Managing input streams
+ */
+
+struct ms_stream {
+       struct ms_stream *next;
+       struct trace *first, *last;
        struct per_dev_info *pdi;
-       int i, j, events, events_added;
+       unsigned int cpu;
+};
 
-       /*
-        * first prepare all files for reading
-        */
-       for (i = 0; i < ndevices; i++) {
-               pdi = &devices[i];
-               pdi->nfiles = 0;
+#define MS_HASH(d, c) ((MAJOR(d) & 0xff) ^ (MINOR(d) & 0xff) ^ (cpu & 0xff))
 
-               for (j = 0;; j++) {
-                       struct stat st;
-                       int len = 0;
-                       char *p, *dname;
+struct ms_stream *ms_head;
+struct ms_stream *ms_hash[256];
 
-                       pci = get_cpu_info(pdi, j);
-                       pci->cpu = j;
-                       pci->fd = -1;
-                       pci->fdblock = -1;
-       
-                       p = strdup(pdi->name);
-                       dname = dirname(p);
-                       if (strcmp(dname, ".")) {
-                               input_dir = dname;
-                               p = strdup(pdi->name);
-                               strcpy(pdi->name, basename(p));
-                       }
-                       free(p);
+static void ms_sort(struct ms_stream *msp);
+static int ms_prime(struct ms_stream *msp);
 
-                       if (input_dir)
-                               len = sprintf(pci->fname, "%s/", input_dir);
+static inline struct trace *ms_peek(struct ms_stream *msp)
+{
+       return (msp == NULL) ? NULL : msp->first;
+}
 
-                       snprintf(pci->fname + len, sizeof(pci->fname)-1-len,
-                                "%s.blktrace.%d", pdi->name, pci->cpu);
-                       if (stat(pci->fname, &st) < 0)
-                               break;
-                       if (st.st_size) {
-                               pci->fd = open(pci->fname, O_RDONLY);
-                               if (pci->fd < 0) {
-                                       perror(pci->fname);
-                                       continue;
-                               }
-                       }
+static inline __u64 ms_peek_time(struct ms_stream *msp)
+{
+       return ms_peek(msp)->bit->time;
+}
 
-                       printf("Input file %s added\n", pci->fname);
-                       pdi->nfiles++;
-                       cpu_mark_online(pdi, pci->cpu);
+static inline void ms_resort(struct ms_stream *msp)
+{
+       if (msp->next && ms_peek_time(msp) > ms_peek_time(msp->next)) {
+               ms_head = msp->next;
+               msp->next = NULL;
+               ms_sort(msp);
+       }
+}
+
+static inline void ms_deq(struct ms_stream *msp)
+{
+       msp->first = msp->first->next;
+       if (!msp->first) {
+               msp->last = NULL;
+               if (!ms_prime(msp)) {
+                       ms_head = msp->next;
+                       msp->next = NULL;
+                       return;
                }
        }
 
-       /*
-        * now loop over the files reading in the data
-        */
-       do {
-               unsigned long long youngest;
+       ms_resort(msp);
+}
 
-               events_added = 0;
-               last_allowed_time = -1ULL;
-               read_sequence++;
+static void ms_sort(struct ms_stream *msp)
+{
+       __u64 msp_t = ms_peek_time(msp);
+       struct ms_stream *this_msp = ms_head;
 
-               for (i = 0; i < ndevices; i++) {
-                       pdi = &devices[i];
-                       pdi->last_read_time = -1ULL;
+       if (this_msp == NULL)
+               ms_head = msp;
+       else if (msp_t < ms_peek_time(this_msp)) {
+               msp->next = this_msp;
+               ms_head = msp;
+       }
+       else {
+               while (this_msp->next && ms_peek_time(this_msp->next) < msp_t)
+                       this_msp = this_msp->next;
 
-                       for (j = 0; j < pdi->nfiles; j++) {
+               msp->next = this_msp->next;
+               this_msp->next = msp;
+       }
+}
 
-                               pci = get_cpu_info(pdi, j);
+static int ms_prime(struct ms_stream *msp)
+{
+       __u32 magic;
+       unsigned int i;
+       struct trace *t;
+       struct per_dev_info *pdi = msp->pdi;
+       struct per_cpu_info *pci = get_cpu_info(pdi, msp->cpu);
+       struct blk_io_trace *bit = NULL;
+       int ret, pdu_len, ndone = 0;
 
-                               if (pci->fd == -1)
-                                       continue;
+       for (i = 0; !is_done() && pci->fd >= 0 && i < rb_batch; i++) {
+               bit = bit_alloc();
+               ret = read_data(pci->fd, bit, sizeof(*bit), 1, &pci->fdblock);
+               if (ret)
+                       goto err;
 
-                               pci->smallest_seq_read = -1;
+               if (data_is_native == -1 && check_data_endianness(bit->magic))
+                       goto err;
 
-                               events = read_events(pci->fd, 1, &pci->fdblock);
-                               if (events <= 0) {
-                                       cpu_mark_offline(pdi, pci->cpu);
-                                       close(pci->fd);
-                                       pci->fd = -1;
-                                       continue;
-                               }
+               magic = get_magic(bit);
+               if ((magic & 0xffffff00) != BLK_IO_TRACE_MAGIC) {
+                       fprintf(stderr, "Bad magic %x\n", magic);
+                       goto err;
 
-                               if (pdi->last_read_time < last_allowed_time)
-                                       last_allowed_time = pdi->last_read_time;
+               }
 
-                               events_added += events;
+               pdu_len = get_pdulen(bit);
+               if (pdu_len) {
+                       void *ptr = realloc(bit, sizeof(*bit) + pdu_len);
+                       ret = read_data(pci->fd, ptr + sizeof(*bit), pdu_len,
+                                                            1, &pci->fdblock);
+                       if (ret) {
+                               free(ptr);
+                               bit = NULL;
+                               goto err;
                        }
+
+                       bit = ptr;
                }
 
-               if (sort_entries(&youngest))
-                       break;
+               trace_to_cpu(bit);
+               if (verify_trace(bit))
+                       goto err;
 
-               if (youngest > stopwatch_end)
-                       break;
+               if (bit->cpu != pci->cpu) {
+                       fprintf(stderr, "cpu %d trace info has error cpu %d\n",
+                               pci->cpu, bit->cpu);
+                       continue;
+               }
 
-               show_entries_rb(0);
+               if (bit->action & BLK_TC_ACT(BLK_TC_NOTIFY) && bit->action != BLK_TN_MESSAGE) {
+                       handle_notify(bit);
+                       output_binary(bit, sizeof(*bit) + bit->pdu_len);
+                       bit_free(bit);
 
-       } while (events_added);
+                       i -= 1;
+                       continue;
+               }
 
-       if (rb_sort_entries)
-               show_entries_rb(1);
+               if (bit->time > pdi->last_read_time)
+                       pdi->last_read_time = bit->time;
+
+               t = t_alloc();
+               memset(t, 0, sizeof(*t));
+               t->bit = bit;
+
+               if (msp->first == NULL)
+                       msp->first = msp->last = t;
+               else {
+                       msp->last->next = t;
+                       msp->last = t;
+               }
+
+               ndone++;
+       }
+
+       return ndone;
+
+err:
+       if (bit) bit_free(bit);
+
+       cpu_mark_offline(pdi, pci->cpu);
+       close(pci->fd);
+       pci->fd = -1;
+
+       return ndone;
+}
+
+static struct ms_stream *ms_alloc(struct per_dev_info *pdi, int cpu)
+{
+       struct ms_stream *msp = malloc(sizeof(*msp));
+
+       msp->next = NULL;
+       msp->first = msp->last = NULL;
+       msp->pdi = pdi;
+       msp->cpu = cpu;
+
+       if (ms_prime(msp))
+               ms_sort(msp);
+
+       return msp;
+}
+
+static int setup_file(struct per_dev_info *pdi, int cpu)
+{
+       int len = 0;
+       struct stat st;
+       char *p, *dname;
+       struct per_cpu_info *pci = get_cpu_info(pdi, cpu);
+
+       pci->cpu = cpu;
+       pci->fdblock = -1;
+
+       p = strdup(pdi->name);
+       dname = dirname(p);
+       if (strcmp(dname, ".")) {
+               input_dir = dname;
+               p = strdup(pdi->name);
+               strcpy(pdi->name, basename(p));
+       }
+       free(p);
+
+       if (input_dir)
+               len = sprintf(pci->fname, "%s/", input_dir);
+
+       snprintf(pci->fname + len, sizeof(pci->fname)-1-len,
+                "%s.blktrace.%d", pdi->name, pci->cpu);
+       if (stat(pci->fname, &st) < 0)
+               return 0;
+       if (!st.st_size)
+               return 1;
+
+       pci->fd = open(pci->fname, O_RDONLY);
+       if (pci->fd < 0) {
+               perror(pci->fname);
+               return 0;
+       }
+
+       printf("Input file %s added\n", pci->fname);
+       cpu_mark_online(pdi, pci->cpu);
+
+       pdi->nfiles++;
+       ms_alloc(pdi, pci->cpu);
+
+       return 1;
+}
+
+static int handle(struct ms_stream *msp)
+{
+       struct trace *t;
+       struct per_dev_info *pdi;
+       struct per_cpu_info *pci;
+       struct blk_io_trace *bit;
+
+       t = ms_peek(msp);
+
+       bit = t->bit;
+       pdi = msp->pdi;
+       pci = get_cpu_info(pdi, msp->cpu);
+       pci->nelems++;
+       bit->time -= genesis_time;
+
+       if (t->bit->time > stopwatch_end)
+               return 0;
+
+       pdi->last_reported_time = bit->time;
+       if ((bit->action & (act_mask << BLK_TC_SHIFT))&&
+           t->bit->time >= stopwatch_start)
+               dump_trace(bit, pci, pdi);
+
+       ms_deq(msp);
+
+       if (text_output)
+               trace_rb_insert_last(pdi, t);
+       else {
+               bit_free(t->bit);
+               t_free(t);
+       }
+
+       return 1;
+}
+
+/*
+ * Check if we need to sanitize the name. We allow 'foo', or if foo.blktrace.X
+ * is given, then strip back down to 'foo' to avoid missing files.
+ */
+static int name_fixup(char *name)
+{
+       char *b;
+
+       if (!name)
+               return 1;
+
+       b = strstr(name, ".blktrace.");
+       if (b)
+               *b = '\0';
 
        return 0;
 }
 
-static int do_stdin(void)
+static int do_file(void)
 {
-       unsigned long long youngest;
-       int fd, events, fdblock;
+       int i, cpu, ret;
+       struct per_dev_info *pdi;
 
-       last_allowed_time = -1ULL;
-       fd = dup(STDIN_FILENO);
-       if (fd == -1) {
-               perror("dup stdin");
-               return -1;
+       /*
+        * first prepare all files for reading
+        */
+       for (i = 0; i < ndevices; i++) {
+               pdi = &devices[i];
+               ret = name_fixup(pdi->name);
+               if (ret)
+                       return ret;
+
+               for (cpu = 0; setup_file(pdi, cpu); cpu++)
+                       ;
+
+               if (!cpu) {
+                       fprintf(stderr,"No input files found for %s\n",
+                               pdi->name);
+                       return 1;
+               }
        }
 
+       /*
+        * Get the initial time stamp
+        */
+       if (ms_head)
+               genesis_time = ms_peek_time(ms_head);
+
+       /*
+        * Keep processing traces while any are left
+        */
+       while (!is_done() && ms_head && handle(ms_head))
+               ;
+
+       return 0;
+}
+
+static void do_pipe(int fd)
+{
+       unsigned long long youngest;
+       int events, fdblock;
+
+       last_allowed_time = -1ULL;
        fdblock = -1;
        while ((events = read_events(fd, 0, &fdblock)) > 0) {
                read_sequence++;
@@ -1979,7 +2636,23 @@ static int do_stdin(void)
 
        if (rb_sort_entries)
                show_entries_rb(1);
+}
+
+static int do_fifo(void)
+{
+       int fd;
+
+       if (!strcmp(pipename, "-"))
+               fd = dup(STDIN_FILENO);
+       else
+               fd = open(pipename, O_RDONLY);
+
+       if (fd == -1) {
+               perror("dup stdin");
+               return -1;
+       }
 
+       do_pipe(fd);
        close(fd);
        return 0;
 }
@@ -2045,36 +2718,71 @@ static int find_stopwatch_interval(char *string)
        return 0;
 }
 
-static char usage_str[] = \
-       "[ -i <input name> ] [-o <output name> [ -s ] [ -t ] [ -q ]\n" \
-       "[ -w start:stop ] [ -f output format ] [ -F format spec ] [ -v] \n\n" \
-       "\t-i Input file containing trace data, or '-' for stdin\n" \
+static int is_pipe(const char *str)
+{
+       struct stat st;
+
+       if (!strcmp(str, "-"))
+               return 1;
+       if (!stat(str, &st) && S_ISFIFO(st.st_mode))
+               return 1;
+
+       return 0;
+}
+
+#define S_OPTS  "a:A:b:D:d:f:F:hi:o:Oqstw:vVM"
+static char usage_str[] =    "\n\n" \
+       "-i <file>           | --input=<file>\n" \
+       "[ -a <action field> | --act-mask=<action field> ]\n" \
+       "[ -A <action mask>  | --set-mask=<action mask> ]\n" \
+       "[ -b <traces>       | --batch=<traces> ]\n" \
+       "[ -d <file>         | --dump-binary=<file> ]\n" \
+       "[ -D <dir>          | --input-directory=<dir> ]\n" \
+       "[ -f <format>       | --format=<format> ]\n" \
+       "[ -F <spec>         | --format-spec=<spec> ]\n" \
+       "[ -h                | --hash-by-name ]\n" \
+       "[ -o <file>         | --output=<file> ]\n" \
+       "[ -O                | --no-text-output ]\n" \
+       "[ -q                | --quiet ]\n" \
+       "[ -s                | --per-program-stats ]\n" \
+       "[ -t                | --track-ios ]\n" \
+       "[ -w <time>         | --stopwatch=<time> ]\n" \
+       "[ -M                | --no-msgs\n" \
+       "[ -v                | --verbose ]\n" \
+       "[ -V                | --version ]\n\n" \
+       "\t-a Only trace specified actions. See documentation\n" \
+       "\t-A Give trace mask as a single value. See documentation\n" \
+       "\t-b stdin read batching\n" \
+       "\t-d Output file. If specified, binary data is written to file\n" \
        "\t-D Directory to prepend to input file names\n" \
+       "\t-f Output format. Customize the output format. The format field\n" \
+       "\t   identifies can be found in the documentation\n" \
+       "\t-F Format specification. Can be found in the documentation\n" \
+       "\t-h Hash processes by name, not pid\n" \
+       "\t-i Input file containing trace data, or '-' for stdin\n" \
        "\t-o Output file. If not given, output is stdout\n" \
-       "\t-b stdin read batching\n" \
+       "\t-O Do NOT output text data\n" \
+       "\t-q Quiet. Don't display any stats at the end of the trace\n" \
        "\t-s Show per-program io statistics\n" \
-       "\t-h Hash processes by name, not pid\n" \
        "\t-t Track individual ios. Will tell you the time a request took\n" \
        "\t   to get queued, to get dispatched, and to get completed\n" \
-       "\t-q Quiet. Don't display any stats at the end of the trace\n" \
        "\t-w Only parse data between the given time interval in seconds.\n" \
        "\t   If 'start' isn't given, blkparse defaults the start time to 0\n" \
-       "\t-f Output format. Customize the output format. The format field\n" \
-       "\t   identifies can be found in the documentation\n" \
-       "\t-F Format specification. Can be found in the documentation\n" \
+       "\t-M Do not output messages to binary file\n" \
        "\t-v More verbose for marginal errors\n" \
        "\t-V Print program version info\n\n";
 
 static void usage(char *prog)
 {
-       fprintf(stderr, "Usage: %s %s %s", prog, blkparse_version, usage_str);
+       fprintf(stderr, "Usage: %s %s", prog, usage_str);
 }
 
 int main(int argc, char *argv[])
 {
-       char *ofp_buffer;
        int i, c, ret, mode;
        int act_mask_tmp = 0;
+       char *ofp_buffer = NULL;
+       char *bin_ofp_buffer = NULL;
 
        while ((c = getopt_long(argc, argv, S_OPTS, l_opts, NULL)) != -1) {
                switch (c) {
@@ -2099,9 +2807,10 @@ int main(int argc, char *argv[])
                        act_mask_tmp = i;
                        break;
                case 'i':
-                       if (!strcmp(optarg, "-") && !pipeline)
+                       if (is_pipe(optarg) && !pipeline) {
                                pipeline = 1;
-                       else if (resize_devices(optarg) != 0)
+                               pipename = strdup(optarg);
+                       } else if (resize_devices(optarg) != 0)
                                return 1;
                        break;
                case 'D':
@@ -2110,6 +2819,9 @@ int main(int argc, char *argv[])
                case 'o':
                        output_name = optarg;
                        break;
+               case 'O':
+                       text_output = 0;
+                       break;
                case 'b':
                        rb_batch = atoi(optarg);
                        if (rb_batch <= 0)
@@ -2144,6 +2856,12 @@ int main(int argc, char *argv[])
                case 'V':
                        printf("%s version %s\n", argv[0], blkparse_version);
                        return 0;
+               case 'd':
+                       dump_binary = optarg;
+                       break;
+               case 'M':
+                       bin_output_msgs = 0;
+                       break;
                default:
                        usage(argv[0]);
                        return 1;
@@ -2151,9 +2869,10 @@ int main(int argc, char *argv[])
        }
 
        while (optind < argc) {
-               if (!strcmp(argv[optind], "-") && !pipeline)
+               if (is_pipe(argv[optind]) && !pipeline) {
                        pipeline = 1;
-               else if (resize_devices(argv[optind]) != 0)
+                       pipename = strdup(argv[optind]);
+               } else if (resize_devices(argv[optind]) != 0)
                        return 1;
                optind++;
        }
@@ -2174,34 +2893,67 @@ int main(int argc, char *argv[])
 
        setlocale(LC_NUMERIC, "en_US");
 
-       if (!output_name) {
-               ofp = fdopen(STDOUT_FILENO, "w");
-               mode = _IOLBF;
-       } else {
-               char ofname[128];
+       if (text_output) {
+               if (!output_name) {
+                       ofp = fdopen(STDOUT_FILENO, "w");
+                       mode = _IOLBF;
+               } else {
+                       char ofname[PATH_MAX];
 
-               snprintf(ofname, sizeof(ofname) - 1, "%s", output_name);
-               ofp = fopen(ofname, "w");
-               mode = _IOFBF;
-       }
+                       snprintf(ofname, sizeof(ofname) - 1, "%s", output_name);
+                       ofp = fopen(ofname, "w");
+                       mode = _IOFBF;
+               }
 
-       if (!ofp) {
-               perror("fopen");
-               return 1;
+               if (!ofp) {
+                       perror("fopen");
+                       return 1;
+               }
+
+               ofp_buffer = malloc(4096);
+               if (setvbuf(ofp, ofp_buffer, mode, 4096)) {
+                       perror("setvbuf");
+                       return 1;
+               }
        }
 
-       ofp_buffer = malloc(4096);      
-       if (setvbuf(ofp, ofp_buffer, mode, 4096)) {
-               perror("setvbuf");
-               return 1;
+       if (dump_binary) {
+               if (!strcmp(dump_binary, "-"))
+                       dump_fp = stdout;
+               else {
+                       dump_fp = fopen(dump_binary, "w");
+                       if (!dump_fp) {
+                               perror(dump_binary);
+                               dump_binary = NULL;
+                               return 1;
+                       }
+               }
+               bin_ofp_buffer = malloc(128 * 1024);
+               if (setvbuf(dump_fp, bin_ofp_buffer, _IOFBF, 128 * 1024)) {
+                       perror("setvbuf binary");
+                       return 1;
+               }
        }
 
        if (pipeline)
-               ret = do_stdin();
+               ret = do_fifo();
        else
                ret = do_file();
 
-       show_stats();
-       free(ofp_buffer);
+       if (!ret)
+               show_stats();
+
+       if (have_drv_data && !dump_binary)
+               printf("\ndiscarded traces containing low-level device driver "
+                      "specific data (only available in binary output)\n");
+
+       if (ofp_buffer) {
+               fflush(ofp);
+               free(ofp_buffer);
+       }
+       if (bin_ofp_buffer) {
+               fflush(dump_fp);
+               free(bin_ofp_buffer);
+       }
        return ret;
 }