blkparse: split off the timestamp correction code in to a separate function
[blktrace.git] / blkparse.c
index db553c7d45d1149799d70357cf384959f305bfe7..cf7a87b262fed7c4e11a84eb848ac34c24963e81 100644 (file)
@@ -2,6 +2,7 @@
  * block queue tracing parse application
  *
  * Copyright (C) 2005 Jens Axboe <axboe@suse.de>
+ * Copyright (C) 2006 Jens Axboe <axboe@kernel.dk>
  *
  *  This program is free software; you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License as published by
@@ -35,7 +36,7 @@
 #include "rbtree.h"
 #include "jhash.h"
 
-static char blkparse_version[] = "0.99.1";
+static char blkparse_version[] = "1.2.0";
 
 struct skip_info {
        unsigned long start, end;
@@ -99,11 +100,23 @@ struct per_process_info {
 #define PPI_HASH_SHIFT (8)
 #define PPI_HASH_SIZE  (1 << PPI_HASH_SHIFT)
 #define PPI_HASH_MASK  (PPI_HASH_SIZE - 1)
+
+enum {
+       SORT_PROG_EVENT_N,   /* Program Name */
+       SORT_PROG_EVENT_QKB, /* KB: Queued read and write */
+       SORT_PROG_EVENT_RKB, /* KB: Queued Read */
+       SORT_PROG_EVENT_WKB, /* KB: Queued Write */
+       SORT_PROG_EVENT_CKB, /* KB: Complete */
+       SORT_PROG_EVENT_QIO, /* IO: Queued read and write */
+       SORT_PROG_EVENT_RIO, /* IO: Queued Read */
+       SORT_PROG_EVENT_WIO, /* IO: Queued Write */
+       SORT_PROG_EVENT_CIO, /* IO: Complete */
+};
+
 static struct per_process_info *ppi_hash_table[PPI_HASH_SIZE];
 static struct per_process_info *ppi_list;
 static int ppi_list_entries;
 
-#define S_OPTS "a:A:i:o:b:stqw:f:F:vVhD:"
 static struct option l_opts[] = {
        {
                .name = "act-mask",
@@ -118,34 +131,64 @@ static struct option l_opts[] = {
                .val = 'A'
        },
        {
-               .name = "input",
+               .name = "batch",
                .has_arg = required_argument,
                .flag = NULL,
-               .val = 'i'
+               .val = 'b'
        },
        {
-               .name = "output",
+               .name = "input-directory",
                .has_arg = required_argument,
                .flag = NULL,
-               .val = 'o'
+               .val = 'D'
        },
        {
-               .name = "batch",
+               .name = "dump-binary",
                .has_arg = required_argument,
                .flag = NULL,
-               .val = 'b'
+               .val = 'd'
        },
        {
-               .name = "per-program-stats",
+               .name = "format",
+               .has_arg = required_argument,
+               .flag = NULL,
+               .val = 'f'
+       },
+       {
+               .name = "format-spec",
+               .has_arg = required_argument,
+               .flag = NULL,
+               .val = 'F'
+       },
+       {
+               .name = "hash-by-name",
                .has_arg = no_argument,
                .flag = NULL,
-               .val = 's'
+               .val = 'h'
        },
        {
-               .name = "track-ios",
+               .name = "input",
+               .has_arg = required_argument,
+               .flag = NULL,
+               .val = 'i'
+       },
+       {
+               .name = "no-msgs",
                .has_arg = no_argument,
                .flag = NULL,
-               .val = 't'
+               .val = 'M'
+       },
+       {
+               .name = "output",
+               .has_arg = required_argument,
+               .flag = NULL,
+               .val = 'o'
+       },
+       {
+               .name = "no-text-output",
+               .has_arg = no_argument,
+               .flag = NULL,
+               .val = 'O'
        },
        {
                .name = "quiet",
@@ -154,28 +197,28 @@ static struct option l_opts[] = {
                .val = 'q'
        },
        {
-               .name = "stopwatch",
-               .has_arg = required_argument,
+               .name = "per-program-stats",
+               .has_arg = no_argument,
                .flag = NULL,
-               .val = 'w'
+               .val = 's'
        },
        {
-               .name = "format",
+               .name = "sort-program-stats",
                .has_arg = required_argument,
                .flag = NULL,
-               .val = 'f'
+               .val = 'S'
        },
        {
-               .name = "format-spec",
-               .has_arg = required_argument,
+               .name = "track-ios",
+               .has_arg = no_argument,
                .flag = NULL,
-               .val = 'F'
+               .val = 't'
        },
        {
-               .name = "hash-by-name",
-               .has_arg = no_argument,
+               .name = "stopwatch",
+               .has_arg = required_argument,
                .flag = NULL,
-               .val = 'h'
+               .val = 'w'
        },
        {
                .name = "verbose",
@@ -189,12 +232,6 @@ static struct option l_opts[] = {
                .flag = NULL,
                .val = 'V'
        },
-       {
-               .name = "input-directory",
-               .has_arg = required_argument,
-               .flag = NULL,
-               .val = 'D'
-       },
        {
                .name = NULL,
        }
@@ -251,14 +288,19 @@ static unsigned long long stopwatch_end = -1ULL;  /* "infinity" */
 static unsigned long read_sequence;
 
 static int per_process_stats;
+static int per_process_stats_event = SORT_PROG_EVENT_N;
 static int per_device_and_cpu_stats = 1;
 static int track_ios;
 static int ppi_hash_by_pid = 1;
 static int verbose;
 static unsigned int act_mask = -1U;
 static int stats_printed;
+static int bin_output_msgs = 1;
 int data_is_native = -1;
 
+static FILE *dump_fp;
+static char *dump_binary;
+
 static unsigned int t_alloc_cache;
 static unsigned int bit_alloc_cache;
 
@@ -266,16 +308,36 @@ static unsigned int bit_alloc_cache;
 static unsigned int rb_batch = RB_BATCH_DEFAULT;
 
 static int pipeline;
+static char *pipename;
+
+static int text_output = 1;
 
 #define is_done()      (*(volatile int *)(&done))
 static volatile int done;
 
+struct timespec                abs_start_time;
+static unsigned long long start_timestamp;
+
+static int have_drv_data = 0;
+
 #define JHASH_RANDOM   (0x3af5f2ee)
 
 #define CPUS_PER_LONG  (8 * sizeof(unsigned long))
 #define CPU_IDX(cpu)   ((cpu) / CPUS_PER_LONG)
 #define CPU_BIT(cpu)   ((cpu) & (CPUS_PER_LONG - 1))
 
+static void output_binary(void *buf, int len)
+{
+       if (dump_binary) {
+               size_t n = fwrite(buf, len, 1, dump_fp);
+               if (n != 1) {
+                       perror(dump_binary);
+                       fclose(dump_fp);
+                       dump_binary = NULL;
+               }
+       }
+}
+
 static void resize_cpu_info(struct per_dev_info *pdi, int cpu)
 {
        struct per_cpu_info *cpus = pdi->cpus;
@@ -510,7 +572,7 @@ static struct process_pid_map *find_ppm(pid_t pid)
        return NULL;
 }
 
-static void add_ppm_hash(pid_t pid, const char *name)
+static struct process_pid_map *add_ppm_hash(pid_t pid, const char *name)
 {
        const int hash_idx = ppm_hash_pid(pid);
        struct process_pid_map *ppm;
@@ -520,10 +582,64 @@ static void add_ppm_hash(pid_t pid, const char *name)
                ppm = malloc(sizeof(*ppm));
                memset(ppm, 0, sizeof(*ppm));
                ppm->pid = pid;
-               strcpy(ppm->comm, name);
+               memset(ppm->comm, 0, sizeof(ppm->comm));
+               strncpy(ppm->comm, name, sizeof(ppm->comm));
+               ppm->comm[sizeof(ppm->comm) - 1] = '\0';
                ppm->hash_next = ppm_hash_table[hash_idx];
                ppm_hash_table[hash_idx] = ppm;
        }
+
+       return ppm;
+}
+
+static void handle_notify(struct blk_io_trace *bit)
+{
+       void    *payload = (caddr_t) bit + sizeof(*bit);
+       __u32   two32[2];
+
+       switch (bit->action) {
+       case BLK_TN_PROCESS:
+               add_ppm_hash(bit->pid, payload);
+               break;
+
+       case BLK_TN_TIMESTAMP:
+               if (bit->pdu_len != sizeof(two32))
+                       return;
+               memcpy(two32, payload, sizeof(two32));
+               if (!data_is_native) {
+                       two32[0] = be32_to_cpu(two32[0]);
+                       two32[1] = be32_to_cpu(two32[1]);
+               }
+               start_timestamp = bit->time;
+               abs_start_time.tv_sec  = two32[0];
+               abs_start_time.tv_nsec = two32[1];
+               if (abs_start_time.tv_nsec < 0) {
+                       abs_start_time.tv_sec--;
+                       abs_start_time.tv_nsec += 1000000000;
+               }
+
+               break;
+
+       case BLK_TN_MESSAGE:
+               if (bit->pdu_len > 0) {
+                       char msg[bit->pdu_len+1];
+
+                       memcpy(msg, (char *)payload, bit->pdu_len);
+                       msg[bit->pdu_len] = '\0';
+
+                       fprintf(ofp,
+                               "%3d,%-3d %2d %8s %5d.%09lu %5u %2s %3s %s\n",
+                               MAJOR(bit->device), MINOR(bit->device),
+                               bit->cpu, "0", (int) SECONDS(bit->time),
+                               (unsigned long) NANO_SECONDS(bit->time),
+                               0, "m", "N", msg);
+               }
+               break;
+
+       default:
+               /* Ignore unknown notify events */
+               ;
+       }
 }
 
 char *find_process_name(pid_t pid)
@@ -869,6 +985,8 @@ static struct io_track *find_track(struct per_dev_info *pdi, pid_t pid,
        if (!iot) {
                iot = malloc(sizeof(*iot));
                iot->ppm = find_ppm(pid);
+               if (!iot->ppm)
+                       iot->ppm = add_ppm_hash(pid, "unknown");
                iot->sector = sector;
                track_rb_insert(pdi, iot);
        }
@@ -1051,6 +1169,8 @@ static struct io_stats *find_process_io_stats(pid_t pid)
                ppi = malloc(sizeof(*ppi));
                memset(ppi, 0, sizeof(*ppi));
                ppi->ppm = find_ppm(pid);
+               if (!ppi->ppm)
+                       ppi->ppm = add_ppm_hash(pid, "unknown");
                add_ppi_to_hash(ppi);
                add_ppi_to_list(ppi);
        }
@@ -1081,10 +1201,12 @@ static inline void __account_m(struct io_stats *ios, struct blk_io_trace *t,
 {
        if (rw) {
                ios->mwrites++;
-               ios->qwrite_kb += t_kb(t);
+               ios->mwrite_kb += t_kb(t);
+               ios->mwrite_b += t_b(t);
        } else {
                ios->mreads++;
-               ios->qread_kb += t_kb(t);
+               ios->mread_kb += t_kb(t);
+               ios->mread_b += t_b(t);
        }
 }
 
@@ -1100,15 +1222,115 @@ static inline void account_m(struct blk_io_trace *t, struct per_cpu_info *pci,
        }
 }
 
+static inline void __account_pc_queue(struct io_stats *ios,
+                                     struct blk_io_trace *t, int rw)
+{
+       if (rw) {
+               ios->qwrites_pc++;
+               ios->qwrite_kb_pc += t_kb(t);
+               ios->qwrite_b_pc += t_b(t);
+       } else {
+               ios->qreads_pc++;
+               ios->qread_kb += t_kb(t);
+               ios->qread_b_pc += t_b(t);
+       }
+}
+
+static inline void account_pc_queue(struct blk_io_trace *t,
+                                   struct per_cpu_info *pci, int rw)
+{
+       __account_pc_queue(&pci->io_stats, t, rw);
+
+       if (per_process_stats) {
+               struct io_stats *ios = find_process_io_stats(t->pid);
+
+               __account_pc_queue(ios, t, rw);
+       }
+}
+
+static inline void __account_pc_issue(struct io_stats *ios, int rw,
+                                     unsigned int bytes)
+{
+       if (rw) {
+               ios->iwrites_pc++;
+               ios->iwrite_kb_pc += bytes >> 10;
+               ios->iwrite_b_pc += bytes & 1023;
+       } else {
+               ios->ireads_pc++;
+               ios->iread_kb_pc += bytes >> 10;
+               ios->iread_b_pc += bytes & 1023;
+       }
+}
+
+static inline void account_pc_issue(struct blk_io_trace *t,
+                                   struct per_cpu_info *pci, int rw)
+{
+       __account_pc_issue(&pci->io_stats, rw, t->bytes);
+
+       if (per_process_stats) {
+               struct io_stats *ios = find_process_io_stats(t->pid);
+
+               __account_pc_issue(ios, rw, t->bytes);
+       }
+}
+
+static inline void __account_pc_requeue(struct io_stats *ios,
+                                       struct blk_io_trace *t, int rw)
+{
+       if (rw) {
+               ios->wrqueue_pc++;
+               ios->iwrite_kb_pc -= t_kb(t);
+               ios->iwrite_b_pc -= t_b(t);
+       } else {
+               ios->rrqueue_pc++;
+               ios->iread_kb_pc -= t_kb(t);
+               ios->iread_b_pc -= t_b(t);
+       }
+}
+
+static inline void account_pc_requeue(struct blk_io_trace *t,
+                                     struct per_cpu_info *pci, int rw)
+{
+       __account_pc_requeue(&pci->io_stats, t, rw);
+
+       if (per_process_stats) {
+               struct io_stats *ios = find_process_io_stats(t->pid);
+
+               __account_pc_requeue(ios, t, rw);
+       }
+}
+
+static inline void __account_pc_c(struct io_stats *ios, int rw)
+{
+       if (rw)
+               ios->cwrites_pc++;
+       else
+               ios->creads_pc++;
+}
+
+static inline void account_pc_c(struct blk_io_trace *t,
+                               struct per_cpu_info *pci, int rw)
+{
+       __account_pc_c(&pci->io_stats, rw);
+
+       if (per_process_stats) {
+               struct io_stats *ios = find_process_io_stats(t->pid);
+
+               __account_pc_c(ios, rw);
+       }
+}
+
 static inline void __account_queue(struct io_stats *ios, struct blk_io_trace *t,
                                   int rw)
 {
        if (rw) {
                ios->qwrites++;
                ios->qwrite_kb += t_kb(t);
+               ios->qwrite_b += t_b(t);
        } else {
                ios->qreads++;
                ios->qread_kb += t_kb(t);
+               ios->qread_b += t_b(t);
        }
 }
 
@@ -1129,9 +1351,11 @@ static inline void __account_c(struct io_stats *ios, int rw, int bytes)
        if (rw) {
                ios->cwrites++;
                ios->cwrite_kb += bytes >> 10;
+               ios->cwrite_b += bytes & 1023;
        } else {
                ios->creads++;
                ios->cread_kb += bytes >> 10;
+               ios->cread_b += bytes & 1023;
        }
 }
 
@@ -1153,9 +1377,11 @@ static inline void __account_issue(struct io_stats *ios, int rw,
        if (rw) {
                ios->iwrites++;
                ios->iwrite_kb += bytes >> 10;
+               ios->iwrite_b  += bytes & 1023;
        } else {
                ios->ireads++;
                ios->iread_kb += bytes >> 10;
+               ios->iread_b  += bytes & 1023;
        }
 }
 
@@ -1197,9 +1423,11 @@ static inline void __account_requeue(struct io_stats *ios,
        if (rw) {
                ios->wrqueue++;
                ios->iwrite_kb -= t_kb(t);
+               ios->iwrite_b -= t_b(t);
        } else {
                ios->rrqueue++;
                ios->iread_kb -= t_kb(t);
+               ios->iread_b -= t_b(t);
        }
 }
 
@@ -1279,13 +1507,16 @@ static void log_pc(struct per_cpu_info *pci, struct blk_io_trace *t, char *act)
        process_fmt(act, pci, t, -1ULL, t->pdu_len, buf);
 }
 
-static void dump_trace_pc(struct blk_io_trace *t, struct per_cpu_info *pci)
+static void dump_trace_pc(struct blk_io_trace *t, struct per_dev_info *pdi,
+                         struct per_cpu_info *pci)
 {
+       int w = (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0;
        int act = t->action & 0xffff;
 
        switch (act) {
                case __BLK_TA_QUEUE:
                        log_generic(pci, t, "Q");
+                       account_pc_queue(t, pci, w);
                        break;
                case __BLK_TA_GETRQ:
                        log_generic(pci, t, "G");
@@ -1294,13 +1525,27 @@ static void dump_trace_pc(struct blk_io_trace *t, struct per_cpu_info *pci)
                        log_generic(pci, t, "S");
                        break;
                case __BLK_TA_REQUEUE:
+                       /*
+                        * can happen if we miss traces, don't let it go
+                        * below zero
+                        */
+                       if (pdi->cur_depth[w])
+                               pdi->cur_depth[w]--;
+                       account_pc_requeue(t, pci, w);
                        log_generic(pci, t, "R");
                        break;
                case __BLK_TA_ISSUE:
+                       account_pc_issue(t, pci, w);
+                       pdi->cur_depth[w]++;
+                       if (pdi->cur_depth[w] > pdi->max_depth[w])
+                               pdi->max_depth[w] = pdi->cur_depth[w];
                        log_pc(pci, t, "D");
                        break;
                case __BLK_TA_COMPLETE:
+                       if (pdi->cur_depth[w])
+                               pdi->cur_depth[w]--;
                        log_pc(pci, t, "C");
+                       account_pc_c(t, pci, w);
                        break;
                case __BLK_TA_INSERT:
                        log_pc(pci, t, "I");
@@ -1384,6 +1629,10 @@ static void dump_trace_fs(struct blk_io_trace *t, struct per_dev_info *pdi,
                case __BLK_TA_REMAP:
                        log_generic(pci, t, "A");
                        break;
+               case __BLK_TA_DRV_DATA:
+                       have_drv_data = 1;
+                       /* dump to binary file only */
+                       break;
                default:
                        fprintf(stderr, "Bad fs action %x\n", t->action);
                        break;
@@ -1393,15 +1642,24 @@ static void dump_trace_fs(struct blk_io_trace *t, struct per_dev_info *pdi,
 static void dump_trace(struct blk_io_trace *t, struct per_cpu_info *pci,
                       struct per_dev_info *pdi)
 {
-       if (t->action & BLK_TC_ACT(BLK_TC_PC))
-               dump_trace_pc(t, pci);
-       else
-               dump_trace_fs(t, pdi, pci);
+       if (text_output) {
+               if (t->action == BLK_TN_MESSAGE)
+                       handle_notify(t);
+               else if (t->action & BLK_TC_ACT(BLK_TC_PC))
+                       dump_trace_pc(t, pdi, pci);
+               else
+                       dump_trace_fs(t, pdi, pci);
+       }
 
        if (!pdi->events)
                pdi->first_reported_time = t->time;
 
        pdi->events++;
+
+       if (bin_output_msgs ||
+                           !(t->action & BLK_TC_ACT(BLK_TC_NOTIFY) &&
+                             t->action == BLK_TN_MESSAGE))
+               output_binary(t, sizeof(*t) + t->pdu_len);
 }
 
 /*
@@ -1432,21 +1690,60 @@ static void dump_io_stats(struct per_dev_info *pdi, struct io_stats *ios,
 
        fprintf(ofp, "%s\n", msg);
 
-       fprintf(ofp, " Reads Queued:    %s, %siB\t", size_cnv(x, ios->qreads, 0), size_cnv(y, ios->qread_kb, 1));
-       fprintf(ofp, " Writes Queued:    %s, %siB\n", size_cnv(x, ios->qwrites, 0), size_cnv(y, ios->qwrite_kb, 1));
-
-       fprintf(ofp, " Read Dispatches: %s, %siB\t", size_cnv(x, ios->ireads, 0), size_cnv(y, ios->iread_kb, 1));
-       fprintf(ofp, " Write Dispatches: %s, %siB\n", size_cnv(x, ios->iwrites, 0), size_cnv(y, ios->iwrite_kb, 1));
+       fprintf(ofp, " Reads Queued:    %s, %siB\t",
+                       size_cnv(x, ios->qreads, 0),
+                       size_cnv(y, ios->qread_kb + (ios->qread_b>>10), 1));
+       fprintf(ofp, " Writes Queued:    %s, %siB\n",
+                       size_cnv(x, ios->qwrites, 0),
+                       size_cnv(y, ios->qwrite_kb + (ios->qwrite_b>>10), 1));
+       fprintf(ofp, " Read Dispatches: %s, %siB\t",
+                       size_cnv(x, ios->ireads, 0),
+                       size_cnv(y, ios->iread_kb + (ios->iread_b>>10), 1));
+       fprintf(ofp, " Write Dispatches: %s, %siB\n",
+                       size_cnv(x, ios->iwrites, 0),
+                       size_cnv(y, ios->iwrite_kb + (ios->iwrite_b>>10), 1));
        fprintf(ofp, " Reads Requeued:  %s\t\t", size_cnv(x, ios->rrqueue, 0));
        fprintf(ofp, " Writes Requeued:  %s\n", size_cnv(x, ios->wrqueue, 0));
-       fprintf(ofp, " Reads Completed: %s, %siB\t", size_cnv(x, ios->creads, 0), size_cnv(y, ios->cread_kb, 1));
-       fprintf(ofp, " Writes Completed: %s, %siB\n", size_cnv(x, ios->cwrites, 0), size_cnv(y, ios->cwrite_kb, 1));
-       fprintf(ofp, " Read Merges:     %'8lu%8c\t", ios->mreads, ' ');
-       fprintf(ofp, " Write Merges:     %'8lu\n", ios->mwrites);
+       fprintf(ofp, " Reads Completed: %s, %siB\t",
+                       size_cnv(x, ios->creads, 0),
+                       size_cnv(y, ios->cread_kb + (ios->cread_b>>10), 1));
+       fprintf(ofp, " Writes Completed: %s, %siB\n",
+                       size_cnv(x, ios->cwrites, 0),
+                       size_cnv(y, ios->cwrite_kb + (ios->cwrite_b>>10), 1));
+       fprintf(ofp, " Read Merges:     %s, %siB\t",
+                       size_cnv(x, ios->mreads, 0),
+                       size_cnv(y, ios->mread_kb + (ios->mread_b>>10), 1));
+       fprintf(ofp, " Write Merges:     %s, %siB\n",
+                       size_cnv(x, ios->mwrites, 0),
+                       size_cnv(y, ios->mwrite_kb + (ios->mwrite_b>>10), 1));
        if (pdi) {
                fprintf(ofp, " Read depth:      %'8u%8c\t", pdi->max_depth[0], ' ');
                fprintf(ofp, " Write depth:      %'8u\n", pdi->max_depth[1]);
        }
+       if (ios->qreads_pc || ios->qwrites_pc || ios->ireads_pc || ios->iwrites_pc ||
+           ios->rrqueue_pc || ios->wrqueue_pc || ios->creads_pc || ios->cwrites_pc) {
+               fprintf(ofp, " PC Reads Queued: %s, %siB\t",
+                       size_cnv(x, ios->qreads_pc, 0),
+                       size_cnv(y,
+                               ios->qread_kb_pc + (ios->qread_b_pc>>10), 1));
+               fprintf(ofp, " PC Writes Queued: %s, %siB\n",
+                       size_cnv(x, ios->qwrites_pc, 0),
+                       size_cnv(y,
+                               ios->qwrite_kb_pc + (ios->qwrite_b_pc>>10), 1));
+               fprintf(ofp, " PC Read Disp.:   %s, %siB\t",
+                       size_cnv(x, ios->ireads_pc, 0),
+                       size_cnv(y,
+                               ios->iread_kb_pc + (ios->iread_b_pc>>10), 1));
+               fprintf(ofp, " PC Write Disp.:   %s, %siB\n",
+                       size_cnv(x, ios->iwrites_pc, 0),
+                       size_cnv(y,
+                               ios->iwrite_kb_pc + (ios->iwrite_b_pc>>10),
+                               1));
+               fprintf(ofp, " PC Reads Req.:   %s\t\t", size_cnv(x, ios->rrqueue_pc, 0));
+               fprintf(ofp, " PC Writes Req.:   %s\n", size_cnv(x, ios->wrqueue_pc, 0));
+               fprintf(ofp, " PC Reads Compl.: %s\t\t", size_cnv(x, ios->creads_pc, 0));
+               fprintf(ofp, " PC Writes Compl.: %s\n", size_cnv(x, ios->cwrites_pc, 0));
+       }
        fprintf(ofp, " IO unplugs:      %'8lu%8c\t", ios->io_unplugs, ' ');
        fprintf(ofp, " Timer unplugs:    %'8lu\n", ios->timer_unplugs);
 }
@@ -1481,6 +1778,88 @@ static int ppi_name_compare(const void *p1, const void *p2)
        return res;
 }
 
+static int ppi_event_compare(const void *p1, const void *p2)
+{
+       struct per_process_info *ppi1 = *((struct per_process_info **) p1);
+       struct per_process_info *ppi2 = *((struct per_process_info **) p2);
+       struct io_stats *ios1 = &ppi1->io_stats;
+       struct io_stats *ios2 = &ppi2->io_stats;
+       unsigned long io1, io2;
+       unsigned long long kb1,kb2;
+       int sort_by_kb = 1;
+
+       io1 = io2 = 0;
+       kb1 = kb2 = 0;
+
+       switch (per_process_stats_event) {
+       case SORT_PROG_EVENT_QKB: /* KB: Queued read and write */
+               kb1 = ios1->qwrite_kb + (ios1->qwrite_b>>10) +
+                       ios1->qread_kb + (ios1->qread_b>>10);
+               kb2 = ios2->qwrite_kb + (ios2->qwrite_b>>10) +
+                       ios2->qread_kb + (ios2->qread_b>>10);
+               break;
+       case SORT_PROG_EVENT_RKB: /* KB: Queued Read */
+               kb1 = ios1->qread_kb + (ios1->qread_b>>10);
+               kb2 = ios2->qread_kb + (ios2->qread_b>>10);
+               break;
+       case SORT_PROG_EVENT_WKB: /* KB: Queued Write */
+               kb1 = ios1->qwrite_kb + (ios1->qwrite_b>>10);
+               kb2 = ios2->qwrite_kb + (ios2->qwrite_b>>10);
+               break;
+       case SORT_PROG_EVENT_CKB: /* KB: Complete */
+               kb1 = ios1->cwrite_kb + (ios1->cwrite_b>>10) +
+                       ios1->cread_kb + (ios1->cread_b>>10);
+               kb2 = ios2->cwrite_kb + (ios2->cwrite_b>>10) +
+                       ios2->cread_kb + (ios2->cread_b>>10);
+               break;
+       case SORT_PROG_EVENT_QIO: /* IO: Queued read and write */
+               sort_by_kb = 0;
+               io1 = ios1->qreads + ios1->qwrites;
+               io2 = ios2->qreads + ios2->qwrites;
+               break;
+       case SORT_PROG_EVENT_RIO: /* IO: Queued Read */
+               sort_by_kb = 0;
+               io1 = ios1->qreads;
+               io2 = ios2->qreads;
+               break;
+       case SORT_PROG_EVENT_WIO: /* IO: Queued Write */
+               sort_by_kb = 0;
+               io1 = ios1->qwrites;
+               io2 = ios2->qwrites;
+               break;
+       case SORT_PROG_EVENT_CIO: /* IO: Complete */
+               sort_by_kb = 0;
+               io1 = ios1->creads + ios1->cwrites;
+               io2 = ios2->creads + ios2->cwrites;
+               break;
+       }
+
+
+       /* compare kb */
+       if (sort_by_kb) {
+               if (kb1 > kb2)
+                       return 1;
+               else if (kb1 == kb2)
+                       return 0;
+               return -1;
+       }
+
+       /* compare io */
+       if (io1 > io2)
+               return 1;
+       else if (io1 == io2)
+               return 0;
+       return -1;
+}
+
+static int ppi_compare(const void *p1, const void *p2)
+{
+       if (per_process_stats_event == SORT_PROG_EVENT_N)
+               return ppi_name_compare(p1, p2);
+
+       return ppi_event_compare(p1, p2);
+}
+
 static void sort_process_list(void)
 {
        struct per_process_info **ppis;
@@ -1495,7 +1874,7 @@ static void sort_process_list(void)
                ppi = ppi->list_next;
        }
 
-       qsort(ppis, ppi_list_entries, sizeof(ppi), ppi_name_compare);
+       qsort(ppis, ppi_list_entries, sizeof(ppi), ppi_compare);
 
        i = ppi_list_entries - 1;
        ppi_list = NULL;
@@ -1543,6 +1922,7 @@ static void show_device_and_cpu_stats(void)
        int i, j, pci_events;
        char line[3 + 8/*cpu*/ + 2 + 32/*dev*/ + 3];
        char name[32];
+       double ratio;
 
        for (pdi = devices, i = 0; i < ndevices; i++, pdi++) {
 
@@ -1573,6 +1953,34 @@ static void show_device_and_cpu_stats(void)
                        total.cwrite_kb += ios->cwrite_kb;
                        total.iread_kb += ios->iread_kb;
                        total.iwrite_kb += ios->iwrite_kb;
+                       total.mread_kb += ios->mread_kb;
+                       total.mwrite_kb += ios->mwrite_kb;
+                       total.qread_b += ios->qread_b;
+                       total.qwrite_b += ios->qwrite_b;
+                       total.cread_b += ios->cread_b;
+                       total.cwrite_b += ios->cwrite_b;
+                       total.iread_b += ios->iread_b;
+                       total.iwrite_b += ios->iwrite_b;
+                       total.mread_b += ios->mread_b;
+                       total.mwrite_b += ios->mwrite_b;
+
+                       total.qreads_pc += ios->qreads_pc;
+                       total.qwrites_pc += ios->qwrites_pc;
+                       total.creads_pc += ios->creads_pc;
+                       total.cwrites_pc += ios->cwrites_pc;
+                       total.ireads_pc += ios->ireads_pc;
+                       total.iwrites_pc += ios->iwrites_pc;
+                       total.rrqueue_pc += ios->rrqueue_pc;
+                       total.wrqueue_pc += ios->wrqueue_pc;
+                       total.qread_kb_pc += ios->qread_kb_pc;
+                       total.qwrite_kb_pc += ios->qwrite_kb_pc;
+                       total.iread_kb_pc += ios->iread_kb_pc;
+                       total.iwrite_kb_pc += ios->iwrite_kb_pc;
+                       total.qread_b_pc += ios->qread_b_pc;
+                       total.qwrite_b_pc += ios->qwrite_b_pc;
+                       total.iread_b_pc += ios->iread_b_pc;
+                       total.iwrite_b_pc += ios->iwrite_b_pc;
+
                        total.timer_unplugs += ios->timer_unplugs;
                        total.io_unplugs += ios->io_unplugs;
 
@@ -1592,8 +2000,10 @@ static void show_device_and_cpu_stats(void)
                wrate = rrate = 0;
                msec = (pdi->last_reported_time - pdi->first_reported_time) / 1000000;
                if (msec) {
-                       rrate = 1000 * total.cread_kb / msec;
-                       wrate = 1000 * total.cwrite_kb / msec;
+                       rrate = ((1000 * total.cread_kb) + total.cread_b) /
+                                                                       msec;
+                       wrate = ((1000 * total.cwrite_kb) + total.cwrite_b) /
+                                                                       msec;
                }
 
                fprintf(ofp, "\nThroughput (R/W): %'LuKiB/s / %'LuKiB/s\n",
@@ -1602,10 +2012,29 @@ static void show_device_and_cpu_stats(void)
                        get_dev_name(pdi, line, sizeof(line)), pdi->events);
 
                collect_pdi_skips(pdi);
+               if (!pdi->skips && !pdi->events)
+                       ratio = 0.0;
+               else
+                       ratio = 100.0 * ((double)pdi->seq_skips /
+                                       (double)(pdi->events + pdi->seq_skips));
                fprintf(ofp, "Skips: %'lu forward (%'llu - %5.1lf%%)\n",
-                       pdi->skips,pdi->seq_skips,
-                       100.0 * ((double)pdi->seq_skips /
-                               (double)(pdi->events + pdi->seq_skips)));
+                       pdi->skips, pdi->seq_skips, ratio);
+       }
+}
+
+static void correct_abs_start_time(void)
+{
+       long delta = genesis_time - start_timestamp;
+
+       abs_start_time.tv_sec  += SECONDS(delta);
+       abs_start_time.tv_nsec += NANO_SECONDS(delta);
+       if (abs_start_time.tv_nsec < 0) {
+               abs_start_time.tv_nsec += 1000000000;
+               abs_start_time.tv_sec -= 1;
+       } else
+       if (abs_start_time.tv_nsec > 1000000000) {
+               abs_start_time.tv_nsec -= 1000000000;
+               abs_start_time.tv_sec += 1;
        }
 }
 
@@ -1620,6 +2049,14 @@ static void find_genesis(void)
 
                t = t->next;
        }
+
+       /* The time stamp record will usually be the first
+        * record in the trace, but not always.
+        */
+       if (start_timestamp
+        && start_timestamp != genesis_time) {
+               correct_abs_start_time();
+       }
 }
 
 static inline int check_stopwatch(struct blk_io_trace *bit)
@@ -1693,6 +2130,7 @@ static int check_cpu_map(struct per_dev_info *pdi)
         * create a map of the cpus we have traces for
         */
        cpu_map = malloc(pdi->cpu_map_max / sizeof(long));
+       memset(cpu_map, 0, sizeof(*cpu_map));
        n = rb_first(&rb_sort_root);
        while (n) {
                __t = rb_entry(n, struct trace, rb_node);
@@ -1796,7 +2234,8 @@ static void show_entries_rb(int force)
                        break;
                }
 
-               if (check_sequence(pdi, t, force))
+               if (!(bit->action == BLK_TN_MESSAGE) &&
+                   check_sequence(pdi, t, force))
                        break;
 
                if (!force && bit->time > last_allowed_time)
@@ -1807,7 +2246,8 @@ static void show_entries_rb(int force)
                if (!pci || pci->cpu != bit->cpu)
                        pci = get_cpu_info(pdi, bit->cpu);
 
-               pci->last_sequence = bit->sequence;
+               if (!(bit->action == BLK_TN_MESSAGE))
+                       pci->last_sequence = bit->sequence;
 
                pci->nelems++;
 
@@ -1940,8 +2380,9 @@ static int read_events(int fd, int always_block, int *fdblock)
                /*
                 * not a real trace, so grab and handle it here
                 */
-               if (bit->action & BLK_TC_ACT(BLK_TC_NOTIFY)) {
-                       add_ppm_hash(bit->pid, (char *) bit + sizeof(*bit));
+               if (bit->action & BLK_TC_ACT(BLK_TC_NOTIFY) && bit->action != BLK_TN_MESSAGE) {
+                       handle_notify(bit);
+                       output_binary(bit, sizeof(*bit) + bit->pdu_len);
                        continue;
                }
 
@@ -1965,125 +2406,324 @@ static int read_events(int fd, int always_block, int *fdblock)
        return events;
 }
 
-static int do_file(void)
-{
-       struct per_cpu_info *pci;
+/*
+ * Managing input streams
+ */
+
+struct ms_stream {
+       struct ms_stream *next;
+       struct trace *first, *last;
        struct per_dev_info *pdi;
-       int i, j, events, events_added;
+       unsigned int cpu;
+};
 
-       /*
-        * first prepare all files for reading
-        */
-       for (i = 0; i < ndevices; i++) {
-               pdi = &devices[i];
-               pdi->nfiles = 0;
+#define MS_HASH(d, c) ((MAJOR(d) & 0xff) ^ (MINOR(d) & 0xff) ^ (cpu & 0xff))
 
-               for (j = 0;; j++) {
-                       struct stat st;
-                       int len = 0;
-                       char *p, *dname;
+struct ms_stream *ms_head;
+struct ms_stream *ms_hash[256];
 
-                       pci = get_cpu_info(pdi, j);
-                       pci->cpu = j;
-                       pci->fd = -1;
-                       pci->fdblock = -1;
-       
-                       p = strdup(pdi->name);
-                       dname = dirname(p);
-                       if (strcmp(dname, ".")) {
-                               input_dir = dname;
-                               p = strdup(pdi->name);
-                               strcpy(pdi->name, basename(p));
-                       }
-                       free(p);
+static void ms_sort(struct ms_stream *msp);
+static int ms_prime(struct ms_stream *msp);
 
-                       if (input_dir)
-                               len = sprintf(pci->fname, "%s/", input_dir);
+static inline struct trace *ms_peek(struct ms_stream *msp)
+{
+       return (msp == NULL) ? NULL : msp->first;
+}
 
-                       snprintf(pci->fname + len, sizeof(pci->fname)-1-len,
-                                "%s.blktrace.%d", pdi->name, pci->cpu);
-                       if (stat(pci->fname, &st) < 0)
-                               break;
-                       if (st.st_size) {
-                               pci->fd = open(pci->fname, O_RDONLY);
-                               if (pci->fd < 0) {
-                                       perror(pci->fname);
-                                       continue;
-                               }
-                       }
+static inline __u64 ms_peek_time(struct ms_stream *msp)
+{
+       return ms_peek(msp)->bit->time;
+}
+
+static inline void ms_resort(struct ms_stream *msp)
+{
+       if (msp->next && ms_peek_time(msp) > ms_peek_time(msp->next)) {
+               ms_head = msp->next;
+               msp->next = NULL;
+               ms_sort(msp);
+       }
+}
 
-                       printf("Input file %s added\n", pci->fname);
-                       pdi->nfiles++;
-                       cpu_mark_online(pdi, pci->cpu);
+static inline void ms_deq(struct ms_stream *msp)
+{
+       msp->first = msp->first->next;
+       if (!msp->first) {
+               msp->last = NULL;
+               if (!ms_prime(msp)) {
+                       ms_head = msp->next;
+                       msp->next = NULL;
+                       return;
                }
        }
 
-       /*
-        * now loop over the files reading in the data
-        */
-       do {
-               unsigned long long youngest;
+       ms_resort(msp);
+}
 
-               events_added = 0;
-               last_allowed_time = -1ULL;
-               read_sequence++;
+static void ms_sort(struct ms_stream *msp)
+{
+       __u64 msp_t = ms_peek_time(msp);
+       struct ms_stream *this_msp = ms_head;
 
-               for (i = 0; i < ndevices; i++) {
-                       pdi = &devices[i];
-                       pdi->last_read_time = -1ULL;
+       if (this_msp == NULL)
+               ms_head = msp;
+       else if (msp_t < ms_peek_time(this_msp)) {
+               msp->next = this_msp;
+               ms_head = msp;
+       }
+       else {
+               while (this_msp->next && ms_peek_time(this_msp->next) < msp_t)
+                       this_msp = this_msp->next;
 
-                       for (j = 0; j < pdi->nfiles; j++) {
+               msp->next = this_msp->next;
+               this_msp->next = msp;
+       }
+}
 
-                               pci = get_cpu_info(pdi, j);
+static int ms_prime(struct ms_stream *msp)
+{
+       __u32 magic;
+       unsigned int i;
+       struct trace *t;
+       struct per_dev_info *pdi = msp->pdi;
+       struct per_cpu_info *pci = get_cpu_info(pdi, msp->cpu);
+       struct blk_io_trace *bit = NULL;
+       int ret, pdu_len, ndone = 0;
 
-                               if (pci->fd == -1)
-                                       continue;
+       for (i = 0; !is_done() && pci->fd >= 0 && i < rb_batch; i++) {
+               bit = bit_alloc();
+               ret = read_data(pci->fd, bit, sizeof(*bit), 1, &pci->fdblock);
+               if (ret)
+                       goto err;
 
-                               pci->smallest_seq_read = -1;
+               if (data_is_native == -1 && check_data_endianness(bit->magic))
+                       goto err;
 
-                               events = read_events(pci->fd, 1, &pci->fdblock);
-                               if (events <= 0) {
-                                       cpu_mark_offline(pdi, pci->cpu);
-                                       close(pci->fd);
-                                       pci->fd = -1;
-                                       continue;
-                               }
+               magic = get_magic(bit);
+               if ((magic & 0xffffff00) != BLK_IO_TRACE_MAGIC) {
+                       fprintf(stderr, "Bad magic %x\n", magic);
+                       goto err;
 
-                               if (pdi->last_read_time < last_allowed_time)
-                                       last_allowed_time = pdi->last_read_time;
+               }
 
-                               events_added += events;
+               pdu_len = get_pdulen(bit);
+               if (pdu_len) {
+                       void *ptr = realloc(bit, sizeof(*bit) + pdu_len);
+                       ret = read_data(pci->fd, ptr + sizeof(*bit), pdu_len,
+                                                            1, &pci->fdblock);
+                       if (ret) {
+                               free(ptr);
+                               bit = NULL;
+                               goto err;
                        }
+
+                       bit = ptr;
                }
 
-               if (sort_entries(&youngest))
-                       break;
+               trace_to_cpu(bit);
+               if (verify_trace(bit))
+                       goto err;
 
-               if (youngest > stopwatch_end)
-                       break;
+               if (bit->cpu != pci->cpu) {
+                       fprintf(stderr, "cpu %d trace info has error cpu %d\n",
+                               pci->cpu, bit->cpu);
+                       continue;
+               }
 
-               show_entries_rb(0);
+               if (bit->action & BLK_TC_ACT(BLK_TC_NOTIFY) && bit->action != BLK_TN_MESSAGE) {
+                       handle_notify(bit);
+                       output_binary(bit, sizeof(*bit) + bit->pdu_len);
+                       bit_free(bit);
 
-       } while (events_added);
+                       i -= 1;
+                       continue;
+               }
 
-       if (rb_sort_entries)
-               show_entries_rb(1);
+               if (bit->time > pdi->last_read_time)
+                       pdi->last_read_time = bit->time;
+
+               t = t_alloc();
+               memset(t, 0, sizeof(*t));
+               t->bit = bit;
+
+               if (msp->first == NULL)
+                       msp->first = msp->last = t;
+               else {
+                       msp->last->next = t;
+                       msp->last = t;
+               }
+
+               ndone++;
+       }
+
+       return ndone;
+
+err:
+       if (bit) bit_free(bit);
+
+       cpu_mark_offline(pdi, pci->cpu);
+       close(pci->fd);
+       pci->fd = -1;
+
+       return ndone;
+}
+
+static struct ms_stream *ms_alloc(struct per_dev_info *pdi, int cpu)
+{
+       struct ms_stream *msp = malloc(sizeof(*msp));
+
+       msp->next = NULL;
+       msp->first = msp->last = NULL;
+       msp->pdi = pdi;
+       msp->cpu = cpu;
+
+       if (ms_prime(msp))
+               ms_sort(msp);
+
+       return msp;
+}
+
+static int setup_file(struct per_dev_info *pdi, int cpu)
+{
+       int len = 0;
+       struct stat st;
+       char *p, *dname;
+       struct per_cpu_info *pci = get_cpu_info(pdi, cpu);
+
+       pci->cpu = cpu;
+       pci->fdblock = -1;
+
+       p = strdup(pdi->name);
+       dname = dirname(p);
+       if (strcmp(dname, ".")) {
+               input_dir = dname;
+               p = strdup(pdi->name);
+               strcpy(pdi->name, basename(p));
+       }
+       free(p);
+
+       if (input_dir)
+               len = sprintf(pci->fname, "%s/", input_dir);
+
+       snprintf(pci->fname + len, sizeof(pci->fname)-1-len,
+                "%s.blktrace.%d", pdi->name, pci->cpu);
+       if (stat(pci->fname, &st) < 0)
+               return 0;
+       if (!st.st_size)
+               return 1;
+
+       pci->fd = open(pci->fname, O_RDONLY);
+       if (pci->fd < 0) {
+               perror(pci->fname);
+               return 0;
+       }
+
+       printf("Input file %s added\n", pci->fname);
+       cpu_mark_online(pdi, pci->cpu);
+
+       pdi->nfiles++;
+       ms_alloc(pdi, pci->cpu);
+
+       return 1;
+}
+
+static int handle(struct ms_stream *msp)
+{
+       struct trace *t;
+       struct per_dev_info *pdi;
+       struct per_cpu_info *pci;
+       struct blk_io_trace *bit;
+
+       t = ms_peek(msp);
+
+       bit = t->bit;
+       pdi = msp->pdi;
+       pci = get_cpu_info(pdi, msp->cpu);
+       pci->nelems++;
+       bit->time -= genesis_time;
+
+       if (t->bit->time > stopwatch_end)
+               return 0;
+
+       pdi->last_reported_time = bit->time;
+       if ((bit->action & (act_mask << BLK_TC_SHIFT))&&
+           t->bit->time >= stopwatch_start)
+               dump_trace(bit, pci, pdi);
+
+       ms_deq(msp);
+
+       if (text_output)
+               trace_rb_insert_last(pdi, t);
+       else {
+               bit_free(t->bit);
+               t_free(t);
+       }
+
+       return 1;
+}
+
+/*
+ * Check if we need to sanitize the name. We allow 'foo', or if foo.blktrace.X
+ * is given, then strip back down to 'foo' to avoid missing files.
+ */
+static int name_fixup(char *name)
+{
+       char *b;
+
+       if (!name)
+               return 1;
+
+       b = strstr(name, ".blktrace.");
+       if (b)
+               *b = '\0';
 
        return 0;
 }
 
-static int do_stdin(void)
+static int do_file(void)
 {
-       unsigned long long youngest;
-       int fd, events, fdblock;
+       int i, cpu, ret;
+       struct per_dev_info *pdi;
 
-       last_allowed_time = -1ULL;
-       fd = dup(STDIN_FILENO);
-       if (fd == -1) {
-               perror("dup stdin");
-               return -1;
+       /*
+        * first prepare all files for reading
+        */
+       for (i = 0; i < ndevices; i++) {
+               pdi = &devices[i];
+               ret = name_fixup(pdi->name);
+               if (ret)
+                       return ret;
+
+               for (cpu = 0; setup_file(pdi, cpu); cpu++)
+                       ;
+
+               if (!cpu) {
+                       fprintf(stderr,"No input files found for %s\n",
+                               pdi->name);
+                       return 1;
+               }
        }
 
+       /*
+        * Get the initial time stamp
+        */
+       if (ms_head)
+               genesis_time = ms_peek_time(ms_head);
+
+       /*
+        * Keep processing traces while any are left
+        */
+       while (!is_done() && ms_head && handle(ms_head))
+               ;
+
+       return 0;
+}
+
+static void do_pipe(int fd)
+{
+       unsigned long long youngest;
+       int events, fdblock;
+
+       last_allowed_time = -1ULL;
        fdblock = -1;
        while ((events = read_events(fd, 0, &fdblock)) > 0) {
                read_sequence++;
@@ -2103,7 +2743,23 @@ static int do_stdin(void)
 
        if (rb_sort_entries)
                show_entries_rb(1);
+}
+
+static int do_fifo(void)
+{
+       int fd;
 
+       if (!strcmp(pipename, "-"))
+               fd = dup(STDIN_FILENO);
+       else
+               fd = open(pipename, O_RDONLY);
+
+       if (fd == -1) {
+               perror("dup stdin");
+               return -1;
+       }
+
+       do_pipe(fd);
        close(fd);
        return 0;
 }
@@ -2169,36 +2825,116 @@ static int find_stopwatch_interval(char *string)
        return 0;
 }
 
-static char usage_str[] = \
-       "[ -i <input name> ] [-o <output name> [ -s ] [ -t ] [ -q ]\n" \
-       "[ -w start:stop ] [ -f output format ] [ -F format spec ] [ -v] \n\n" \
-       "\t-i Input file containing trace data, or '-' for stdin\n" \
+static int is_pipe(const char *str)
+{
+       struct stat st;
+
+       if (!strcmp(str, "-"))
+               return 1;
+       if (!stat(str, &st) && S_ISFIFO(st.st_mode))
+               return 1;
+
+       return 0;
+}
+
+static int get_program_sort_event(const char *str)
+{
+       char evt = str[0];
+
+       switch (evt) {
+       case 'N':
+               per_process_stats_event = SORT_PROG_EVENT_N;
+               break;
+       case 'Q':
+               per_process_stats_event = SORT_PROG_EVENT_QKB;
+               break;
+       case 'q':
+               per_process_stats_event = SORT_PROG_EVENT_QIO;
+               break;
+       case 'R':
+               per_process_stats_event = SORT_PROG_EVENT_RKB;
+               break;
+       case 'r':
+               per_process_stats_event = SORT_PROG_EVENT_RIO;
+               break;
+       case 'W':
+               per_process_stats_event = SORT_PROG_EVENT_WKB;
+               break;
+       case 'w':
+               per_process_stats_event = SORT_PROG_EVENT_WIO;
+               break;
+       case 'C':
+               per_process_stats_event = SORT_PROG_EVENT_CKB;
+               break;
+       case 'c':
+               per_process_stats_event = SORT_PROG_EVENT_CIO;
+               break;
+       default:
+               return 1;
+       }
+
+       return 0;
+}
+
+#define S_OPTS  "a:A:b:D:d:f:F:hi:o:OqsS:tw:vVM"
+static char usage_str[] =    "\n\n" \
+       "-i <file>           | --input=<file>\n" \
+       "[ -a <action field> | --act-mask=<action field> ]\n" \
+       "[ -A <action mask>  | --set-mask=<action mask> ]\n" \
+       "[ -b <traces>       | --batch=<traces> ]\n" \
+       "[ -d <file>         | --dump-binary=<file> ]\n" \
+       "[ -D <dir>          | --input-directory=<dir> ]\n" \
+       "[ -f <format>       | --format=<format> ]\n" \
+       "[ -F <spec>         | --format-spec=<spec> ]\n" \
+       "[ -h                | --hash-by-name ]\n" \
+       "[ -o <file>         | --output=<file> ]\n" \
+       "[ -O                | --no-text-output ]\n" \
+       "[ -q                | --quiet ]\n" \
+       "[ -s                | --per-program-stats ]\n" \
+       "[ -S <event>        | --sort-program-stats=<event> ]\n" \
+       "[ -t                | --track-ios ]\n" \
+       "[ -w <time>         | --stopwatch=<time> ]\n" \
+       "[ -M                | --no-msgs\n" \
+       "[ -v                | --verbose ]\n" \
+       "[ -V                | --version ]\n\n" \
+       "\t-a Only trace specified actions. See documentation\n" \
+       "\t-A Give trace mask as a single value. See documentation\n" \
+       "\t-b stdin read batching\n" \
+       "\t-d Output file. If specified, binary data is written to file\n" \
        "\t-D Directory to prepend to input file names\n" \
+       "\t-f Output format. Customize the output format. The format field\n" \
+       "\t   identifies can be found in the documentation\n" \
+       "\t-F Format specification. Can be found in the documentation\n" \
+       "\t-h Hash processes by name, not pid\n" \
+       "\t-i Input file containing trace data, or '-' for stdin\n" \
        "\t-o Output file. If not given, output is stdout\n" \
-       "\t-b stdin read batching\n" \
+       "\t-O Do NOT output text data\n" \
+       "\t-q Quiet. Don't display any stats at the end of the trace\n" \
        "\t-s Show per-program io statistics\n" \
-       "\t-h Hash processes by name, not pid\n" \
+       "\t-S Show per-program io statistics sorted by N/Q/q/R/r/W/w/C/c\n" \
+       "\t   N:Name, Q/q:Queued(read & write), R/r:Queued Read, W/w:Queued Write, C/c:Complete.\n" \
+       "\t   Sort programs by how much data(KB): Q,R,W,C.\n" \
+       "\t   Sort programs by how many IO operations: q,r,w,c.\n" \
+       "\t   if -S was used, the -s parameter will be ignored.\n" \
        "\t-t Track individual ios. Will tell you the time a request took\n" \
        "\t   to get queued, to get dispatched, and to get completed\n" \
-       "\t-q Quiet. Don't display any stats at the end of the trace\n" \
        "\t-w Only parse data between the given time interval in seconds.\n" \
        "\t   If 'start' isn't given, blkparse defaults the start time to 0\n" \
-       "\t-f Output format. Customize the output format. The format field\n" \
-       "\t   identifies can be found in the documentation\n" \
-       "\t-F Format specification. Can be found in the documentation\n" \
+       "\t-M Do not output messages to binary file\n" \
        "\t-v More verbose for marginal errors\n" \
        "\t-V Print program version info\n\n";
 
 static void usage(char *prog)
 {
-       fprintf(stderr, "Usage: %s %s %s", prog, blkparse_version, usage_str);
+       fprintf(stderr, "Usage: %s %s", prog, usage_str);
 }
 
 int main(int argc, char *argv[])
 {
-       char *ofp_buffer;
        int i, c, ret, mode;
        int act_mask_tmp = 0;
+       char *ofp_buffer = NULL;
+       char *bin_ofp_buffer = NULL;
 
        while ((c = getopt_long(argc, argv, S_OPTS, l_opts, NULL)) != -1) {
                switch (c) {
@@ -2223,9 +2959,10 @@ int main(int argc, char *argv[])
                        act_mask_tmp = i;
                        break;
                case 'i':
-                       if (!strcmp(optarg, "-") && !pipeline)
+                       if (is_pipe(optarg) && !pipeline) {
                                pipeline = 1;
-                       else if (resize_devices(optarg) != 0)
+                               pipename = strdup(optarg);
+                       } else if (resize_devices(optarg) != 0)
                                return 1;
                        break;
                case 'D':
@@ -2234,6 +2971,9 @@ int main(int argc, char *argv[])
                case 'o':
                        output_name = optarg;
                        break;
+               case 'O':
+                       text_output = 0;
+                       break;
                case 'b':
                        rb_batch = atoi(optarg);
                        if (rb_batch <= 0)
@@ -2242,6 +2982,11 @@ int main(int argc, char *argv[])
                case 's':
                        per_process_stats = 1;
                        break;
+               case 'S':
+                       per_process_stats = 1;
+                       if (get_program_sort_event(optarg))
+                               return 1;
+                       break;
                case 't':
                        track_ios = 1;
                        break;
@@ -2268,6 +3013,12 @@ int main(int argc, char *argv[])
                case 'V':
                        printf("%s version %s\n", argv[0], blkparse_version);
                        return 0;
+               case 'd':
+                       dump_binary = optarg;
+                       break;
+               case 'M':
+                       bin_output_msgs = 0;
+                       break;
                default:
                        usage(argv[0]);
                        return 1;
@@ -2275,9 +3026,10 @@ int main(int argc, char *argv[])
        }
 
        while (optind < argc) {
-               if (!strcmp(argv[optind], "-") && !pipeline)
+               if (is_pipe(argv[optind]) && !pipeline) {
                        pipeline = 1;
-               else if (resize_devices(argv[optind]) != 0)
+                       pipename = strdup(argv[optind]);
+               } else if (resize_devices(argv[optind]) != 0)
                        return 1;
                optind++;
        }
@@ -2298,34 +3050,67 @@ int main(int argc, char *argv[])
 
        setlocale(LC_NUMERIC, "en_US");
 
-       if (!output_name) {
-               ofp = fdopen(STDOUT_FILENO, "w");
-               mode = _IOLBF;
-       } else {
-               char ofname[128];
+       if (text_output) {
+               if (!output_name) {
+                       ofp = fdopen(STDOUT_FILENO, "w");
+                       mode = _IOLBF;
+               } else {
+                       char ofname[PATH_MAX];
 
-               snprintf(ofname, sizeof(ofname) - 1, "%s", output_name);
-               ofp = fopen(ofname, "w");
-               mode = _IOFBF;
-       }
+                       snprintf(ofname, sizeof(ofname) - 1, "%s", output_name);
+                       ofp = fopen(ofname, "w");
+                       mode = _IOFBF;
+               }
 
-       if (!ofp) {
-               perror("fopen");
-               return 1;
+               if (!ofp) {
+                       perror("fopen");
+                       return 1;
+               }
+
+               ofp_buffer = malloc(4096);
+               if (setvbuf(ofp, ofp_buffer, mode, 4096)) {
+                       perror("setvbuf");
+                       return 1;
+               }
        }
 
-       ofp_buffer = malloc(4096);      
-       if (setvbuf(ofp, ofp_buffer, mode, 4096)) {
-               perror("setvbuf");
-               return 1;
+       if (dump_binary) {
+               if (!strcmp(dump_binary, "-"))
+                       dump_fp = stdout;
+               else {
+                       dump_fp = fopen(dump_binary, "w");
+                       if (!dump_fp) {
+                               perror(dump_binary);
+                               dump_binary = NULL;
+                               return 1;
+                       }
+               }
+               bin_ofp_buffer = malloc(128 * 1024);
+               if (setvbuf(dump_fp, bin_ofp_buffer, _IOFBF, 128 * 1024)) {
+                       perror("setvbuf binary");
+                       return 1;
+               }
        }
 
        if (pipeline)
-               ret = do_stdin();
+               ret = do_fifo();
        else
                ret = do_file();
 
-       show_stats();
-       free(ofp_buffer);
+       if (!ret)
+               show_stats();
+
+       if (have_drv_data && !dump_binary)
+               printf("\ndiscarded traces containing low-level device driver "
+                      "specific data (only available in binary output)\n");
+
+       if (ofp_buffer) {
+               fflush(ofp);
+               free(ofp_buffer);
+       }
+       if (bin_ofp_buffer) {
+               fflush(dump_fp);
+               free(bin_ofp_buffer);
+       }
        return ret;
 }