[PATCH] blkparse: make skip detection per-CPU as well
[blktrace.git] / blkparse.c
index 5a94dfe18301b82fda2342e11100a31b841c0b3c..f051ee7e055e4d355b4f8baef69624cdf6eff9d4 100644 (file)
 
 static char blkparse_version[] = "0.99";
 
+struct skip_info {
+       unsigned long start, end;
+       struct skip_info *prev, *next;
+};
+
 struct per_dev_info {
        dev_t dev;
        char *name;
@@ -47,11 +52,10 @@ struct per_dev_info {
        unsigned long long last_reported_time;
        unsigned long long last_read_time;
        struct io_stats io_stats;
-       unsigned long last_sequence;
-       unsigned long skips;
-
-       struct rb_root rb_last;
-       unsigned long rb_last_entries;
+       unsigned long skips, nskips;
+       unsigned long long seq_skips, seq_nskips;
+       unsigned int max_depth[2];
+       unsigned int cur_depth[2];
 
        struct rb_root rb_track;
 
@@ -221,6 +225,7 @@ struct io_track {
 static int ndevices;
 static struct per_dev_info *devices;
 static char *get_dev_name(struct per_dev_info *, char *, int);
+static int trace_rb_insert_last(struct per_dev_info *, struct trace *);
 
 FILE *ofp = NULL;
 static char *output_name;
@@ -228,7 +233,6 @@ static char *input_dir;
 
 static unsigned long long genesis_time;
 static unsigned long long last_allowed_time;
-static unsigned int smallest_seq_read;
 static unsigned long long stopwatch_start;     /* start from zero by default */
 static unsigned long long stopwatch_end = -1ULL;       /* "infinity" */
 
@@ -257,6 +261,189 @@ static volatile int done;
 #define CPU_IDX(cpu)   ((cpu) / CPUS_PER_LONG)
 #define CPU_BIT(cpu)   ((cpu) & (CPUS_PER_LONG - 1))
 
+static void resize_cpu_info(struct per_dev_info *pdi, int cpu)
+{
+       struct per_cpu_info *cpus = pdi->cpus;
+       int ncpus = pdi->ncpus;
+       int new_count = cpu + 1;
+       int new_space, size;
+       char *new_start;
+
+       size = new_count * sizeof(struct per_cpu_info);
+       cpus = realloc(cpus, size);
+       if (!cpus) {
+               char name[20];
+               fprintf(stderr, "Out of memory, CPU info for device %s (%d)\n",
+                       get_dev_name(pdi, name, sizeof(name)), size);
+               exit(1);
+       }
+
+       new_start = (char *)cpus + (ncpus * sizeof(struct per_cpu_info));
+       new_space = (new_count - ncpus) * sizeof(struct per_cpu_info);
+       memset(new_start, 0, new_space);
+
+       pdi->ncpus = new_count;
+       pdi->cpus = cpus;
+
+       for (new_count = 0; new_count < pdi->ncpus; new_count++) {
+               struct per_cpu_info *pci = &pdi->cpus[new_count];
+
+               if (!pci->fd) {
+                       pci->fd = -1;
+                       memset(&pci->rb_last, 0, sizeof(pci->rb_last));
+                       pci->rb_last_entries = 0;
+                       pci->last_sequence = -1;
+               }
+       }
+}
+
+static struct per_cpu_info *get_cpu_info(struct per_dev_info *pdi, int cpu)
+{
+       struct per_cpu_info *pci;
+
+       if (cpu >= pdi->ncpus)
+               resize_cpu_info(pdi, cpu);
+
+       pci = &pdi->cpus[cpu];
+       pci->cpu = cpu;
+       return pci;
+}
+
+
+static int resize_devices(char *name)
+{
+       int size = (ndevices + 1) * sizeof(struct per_dev_info);
+
+       devices = realloc(devices, size);
+       if (!devices) {
+               fprintf(stderr, "Out of memory, device %s (%d)\n", name, size);
+               return 1;
+       }
+       memset(&devices[ndevices], 0, sizeof(struct per_dev_info));
+       devices[ndevices].name = name;
+       ndevices++;
+       return 0;
+}
+
+static struct per_dev_info *get_dev_info(dev_t dev)
+{
+       struct per_dev_info *pdi;
+       int i;
+
+       for (i = 0; i < ndevices; i++) {
+               if (!devices[i].dev)
+                       devices[i].dev = dev;
+               if (devices[i].dev == dev)
+                       return &devices[i];
+       }
+
+       if (resize_devices(NULL))
+               return NULL;
+
+       pdi = &devices[ndevices - 1];
+       pdi->dev = dev;
+       pdi->first_reported_time = 0;
+       pdi->last_read_time = 0;
+
+       return pdi;
+}
+
+static void insert_skip(struct per_cpu_info *pci, unsigned long start,
+                       unsigned long end)
+{
+       struct skip_info *sip;
+
+       for (sip = pci->skips_tail; sip != NULL; sip = sip->prev) {
+               if (end == (sip->start - 1)) {
+                       sip->start = start;
+                       return;
+               } else if (start == (sip->end + 1)) {
+                       sip->end = end;
+                       return;
+               }
+       }
+
+       sip = malloc(sizeof(struct skip_info));
+       sip->start = start;
+       sip->end = end;
+       sip->prev = sip->next = NULL;
+       if (pci->skips_tail == NULL)
+               pci->skips_head = pci->skips_tail = sip;
+       else {
+               sip->prev = pci->skips_tail;
+               pci->skips_tail->next = sip;
+               pci->skips_tail = sip;
+       }
+}
+
+static void remove_sip(struct per_cpu_info *pci, struct skip_info *sip)
+{
+       if (sip->prev == NULL) {
+               if (sip->next == NULL)
+                       pci->skips_head = pci->skips_tail = NULL;
+               else {
+                       pci->skips_head = sip->next;
+                       sip->next->prev = NULL;
+               }
+       } else if (sip->next == NULL) {
+               pci->skips_tail = sip->prev;
+               sip->prev->next = NULL;
+       } else {
+               sip->prev->next = sip->next;
+               sip->next->prev = sip->prev;
+       }
+
+       sip->prev = sip->next = NULL;
+       free(sip);
+}
+
+#define IN_SKIP(sip,seq) (((sip)->start <= (seq)) && ((seq) <= sip->end))
+static int check_current_skips(struct per_cpu_info *pci, unsigned long seq)
+{
+       struct skip_info *sip;
+
+       for (sip = pci->skips_tail; sip != NULL; sip = sip->prev) {
+               if (IN_SKIP(sip, seq)) {
+                       if (sip->start == seq) {
+                               if (sip->end == seq)
+                                       remove_sip(pci, sip);
+                               else
+                                       sip->start += 1;
+                       } else if (sip->end == seq)
+                               sip->end -= 1;
+                       else {
+                               sip->end = seq - 1;
+                               insert_skip(pci, seq + 1, sip->end);
+                       }
+                       return 1;
+               }
+       }
+
+       return 0;
+}
+
+static void collect_pdi_skips(struct per_dev_info *pdi)
+{
+       struct skip_info *sip;
+       int cpu;
+
+       pdi->skips = 0;
+       pdi->seq_skips = 0;
+
+       for (cpu = 0; cpu < pdi->ncpus; cpu++) {
+               struct per_cpu_info *pci = &pdi->cpus[cpu];
+
+               for (sip = pci->skips_head; sip != NULL; sip = sip->next) {
+                       pdi->skips++;
+                       pdi->seq_skips += (sip->end - sip->start + 1);
+                       if (verbose)
+                               fprintf(stderr,"(%d,%d): skipping %lu -> %lu\n",
+                                       MAJOR(pdi->dev), MINOR(pdi->dev),
+                                       sip->start, sip->end);
+               }
+       }
+}
+
 static void cpu_mark_online(struct per_dev_info *pdi, unsigned int cpu)
 {
        if (cpu >= pdi->cpu_map_max || !pdi->cpu_map) {
@@ -366,8 +553,81 @@ static struct per_process_info *find_process(__u32 pid, char *name)
        return ppi;
 }
 
-static inline int trace_rb_insert(struct trace *t, struct rb_root *root,
-                                 int check_time)
+/*
+ * struct trace and blktrace allocation cache, we do potentially
+ * millions of mallocs for these structures while only using at most
+ * a few thousand at the time
+ */
+static inline void t_free(struct trace *t)
+{
+       if (t_alloc_cache < 1024) {
+               t->next = t_alloc_list;
+               t_alloc_list = t;
+               t_alloc_cache++;
+       } else
+               free(t);
+}
+
+static inline struct trace *t_alloc(void)
+{
+       struct trace *t = t_alloc_list;
+
+       if (t) {
+               t_alloc_list = t->next;
+               t_alloc_cache--;
+               return t;
+       }
+
+       return malloc(sizeof(*t));
+}
+
/*
 * Return a struct blk_io_trace to the allocation cache. Entries with a
 * trailing pdu payload are not cached (they were realloc'ed to a larger
 * size), and the cache is capped at 1024 entries.
 */
static inline void bit_free(struct blk_io_trace *bit)
{
	if (bit_alloc_cache < 1024 && !bit->pdu_len) {
		/*
		 * abuse a 64-bit field for a next pointer for the free item
		 */
		bit->time = (__u64) (unsigned long) bit_alloc_list;
		bit_alloc_list = (struct blk_io_trace *) bit;
		bit_alloc_cache++;
	} else
		free(bit);
}
+
/*
 * Get a struct blk_io_trace from the cache (the next pointer is stashed
 * in the ->time field by bit_free()), falling back to malloc when the
 * cache is empty.
 */
static inline struct blk_io_trace *bit_alloc(void)
{
	struct blk_io_trace *bit = bit_alloc_list;

	if (bit) {
		bit_alloc_list = (struct blk_io_trace *) (unsigned long) \
				 bit->time;
		bit_alloc_cache--;
		return bit;
	}

	return malloc(sizeof(*bit));
}
+
+static inline void __put_trace_last(struct per_dev_info *pdi, struct trace *t)
+{
+       struct per_cpu_info *pci = get_cpu_info(pdi, t->bit->cpu);
+
+       rb_erase(&t->rb_node, &pci->rb_last);
+       pci->rb_last_entries--;
+
+       bit_free(t->bit);
+       t_free(t);
+}
+
+static void put_trace(struct per_dev_info *pdi, struct trace *t)
+{
+       rb_erase(&t->rb_node, &rb_sort_root);
+       rb_sort_entries--;
+
+       trace_rb_insert_last(pdi, t);
+}
+
+static inline int trace_rb_insert(struct trace *t, struct rb_root *root)
 {
        struct rb_node **p = &root->rb_node;
        struct rb_node *parent = NULL;
@@ -378,16 +638,11 @@ static inline int trace_rb_insert(struct trace *t, struct rb_root *root,
 
                __t = rb_entry(parent, struct trace, rb_node);
 
-               if (check_time) {
-                       if (t->bit->time < __t->bit->time) {
-                               p = &(*p)->rb_left;
-                               continue;
-                       } else if (t->bit->time > __t->bit->time) {
-                               p = &(*p)->rb_right;
-                               continue;
-                       }
-               }
-               if (t->bit->device < __t->bit->device)
+               if (t->bit->time < __t->bit->time)
+                       p = &(*p)->rb_left;
+               else if (t->bit->time > __t->bit->time)
+                       p = &(*p)->rb_right;
+               else if (t->bit->device < __t->bit->device)
                        p = &(*p)->rb_left;
                else if (t->bit->device > __t->bit->device)
                        p = &(*p)->rb_right;
@@ -404,7 +659,7 @@ static inline int trace_rb_insert(struct trace *t, struct rb_root *root,
 
 static inline int trace_rb_insert_sort(struct trace *t)
 {
-       if (!trace_rb_insert(t, &rb_sort_root, 1)) {
+       if (!trace_rb_insert(t, &rb_sort_root)) {
                rb_sort_entries++;
                return 0;
        }
@@ -412,14 +667,23 @@ static inline int trace_rb_insert_sort(struct trace *t)
        return 1;
 }
 
-static inline int trace_rb_insert_last(struct per_dev_info *pdi,struct trace *t)
+static int trace_rb_insert_last(struct per_dev_info *pdi, struct trace *t)
 {
-       if (!trace_rb_insert(t, &pdi->rb_last, 1)) {
-               pdi->rb_last_entries++;
-               return 0;
+       struct per_cpu_info *pci = get_cpu_info(pdi, t->bit->cpu);
+
+       if (trace_rb_insert(t, &pci->rb_last))
+               return 1;
+
+       pci->rb_last_entries++;
+
+       if (pci->rb_last_entries > rb_batch * pdi->nfiles) {
+               struct rb_node *n = rb_first(&pci->rb_last);
+
+               t = rb_entry(n, struct trace, rb_node);
+               __put_trace_last(pdi, t);
        }
 
-       return 1;
+       return 0;
 }
 
 static struct trace *trace_rb_find(dev_t device, unsigned long sequence,
@@ -457,7 +721,7 @@ static struct trace *trace_rb_find(dev_t device, unsigned long sequence,
 
                while (((n = rb_next(prev)) != NULL) && max--) {
                        __t = rb_entry(n, struct trace, rb_node);
-                       
+
                        if (__t->bit->device == device &&
                            __t->bit->sequence == sequence)
                                return __t;
@@ -465,19 +729,15 @@ static struct trace *trace_rb_find(dev_t device, unsigned long sequence,
                        prev = n;
                }
        }
-                       
-       return NULL;
-}
 
-static inline struct trace *trace_rb_find_sort(dev_t dev, unsigned long seq)
-{
-       return trace_rb_find(dev, seq, &rb_sort_root, 1);
+       return NULL;
 }
 
 static inline struct trace *trace_rb_find_last(struct per_dev_info *pdi,
+                                              struct per_cpu_info *pci,
                                               unsigned long seq)
 {
-       return trace_rb_find(pdi->dev, seq, &pdi->rb_last, 0);
+       return trace_rb_find(pdi->dev, seq, &pci->rb_last, 0);
 }
 
 static inline int track_rb_insert(struct per_dev_info *pdi,struct io_track *iot)
@@ -705,84 +965,6 @@ static struct io_stats *find_process_io_stats(__u32 pid, char *name)
        return &ppi->io_stats;
 }
 
-static void resize_cpu_info(struct per_dev_info *pdi, int cpu)
-{
-       struct per_cpu_info *cpus = pdi->cpus;
-       int ncpus = pdi->ncpus;
-       int new_count = cpu + 1;
-       int new_space, size;
-       char *new_start;
-
-       size = new_count * sizeof(struct per_cpu_info);
-       cpus = realloc(cpus, size);
-       if (!cpus) {
-               char name[20];
-               fprintf(stderr, "Out of memory, CPU info for device %s (%d)\n",
-                       get_dev_name(pdi, name, sizeof(name)), size);
-               exit(1);
-       }
-
-       new_start = (char *)cpus + (ncpus * sizeof(struct per_cpu_info));
-       new_space = (new_count - ncpus) * sizeof(struct per_cpu_info);
-       memset(new_start, 0, new_space);
-
-       pdi->ncpus = new_count;
-       pdi->cpus = cpus;
-}
-
-static struct per_cpu_info *get_cpu_info(struct per_dev_info *pdi, int cpu)
-{
-       struct per_cpu_info *pci;
-
-       if (cpu >= pdi->ncpus)
-               resize_cpu_info(pdi, cpu);
-
-       pci = &pdi->cpus[cpu];
-       pci->cpu = cpu;
-       return pci;
-}
-
-
-static int resize_devices(char *name)
-{
-       int size = (ndevices + 1) * sizeof(struct per_dev_info);
-
-       devices = realloc(devices, size);
-       if (!devices) {
-               fprintf(stderr, "Out of memory, device %s (%d)\n", name, size);
-               return 1;
-       }
-       memset(&devices[ndevices], 0, sizeof(struct per_dev_info));
-       devices[ndevices].name = name;
-       ndevices++;
-       return 0;
-}
-
-static struct per_dev_info *get_dev_info(dev_t dev)
-{
-       struct per_dev_info *pdi;
-       int i;
-
-       for (i = 0; i < ndevices; i++) {
-               if (!devices[i].dev)
-                       devices[i].dev = dev;
-               if (devices[i].dev == dev)
-                       return &devices[i];
-       }
-
-       if (resize_devices(NULL))
-               return NULL;
-
-       pdi = &devices[ndevices - 1];
-       pdi->dev = dev;
-       pdi->first_reported_time = 0;
-       pdi->last_sequence = -1;
-       pdi->last_read_time = 0;
-       memset(&pdi->rb_last, 0, sizeof(pdi->rb_last));
-       pdi->rb_last_entries = 0;
-       return pdi;
-}
-
 static char *get_dev_name(struct per_dev_info *pdi, char *buffer, int size)
 {
        if (pdi->name)
@@ -1039,7 +1221,7 @@ static void dump_trace_pc(struct blk_io_trace *t, struct per_cpu_info *pci)
 static void dump_trace_fs(struct blk_io_trace *t, struct per_dev_info *pdi,
                          struct per_cpu_info *pci)
 {
-       int w = t->action & BLK_TC_ACT(BLK_TC_WRITE);
+       int w = (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0;
        int act = t->action & 0xffff;
 
        switch (act) {
@@ -1066,14 +1248,19 @@ static void dump_trace_fs(struct blk_io_trace *t, struct per_dev_info *pdi,
                        log_generic(pci, t, "S");
                        break;
                case __BLK_TA_REQUEUE:
+                       pdi->cur_depth[w]--;
                        account_requeue(t, pci, w);
                        log_queue(pci, t, "R");
                        break;
                case __BLK_TA_ISSUE:
                        account_issue(t, pci, w);
+                       pdi->cur_depth[w]++;
+                       if (pdi->cur_depth[w] > pdi->max_depth[w])
+                               pdi->max_depth[w] = pdi->cur_depth[w];
                        log_issue(pdi, pci, t, "D");
                        break;
                case __BLK_TA_COMPLETE:
+                       pdi->cur_depth[w]--;
                        account_c(t, pci, w, t->bytes);
                        log_complete(pdi, pci, t, "C");
                        break;
@@ -1138,7 +1325,8 @@ static char *size_cnv(char *dst, unsigned long long num, int in_kb)
        return dst;
 }
 
-static void dump_io_stats(struct io_stats *ios, char *msg)
+static void dump_io_stats(struct per_dev_info *pdi, struct io_stats *ios,
+                         char *msg)
 {
        static char x[256], y[256];
 
@@ -1155,6 +1343,10 @@ static void dump_io_stats(struct io_stats *ios, char *msg)
        fprintf(ofp, " Writes Completed: %s, %siB\n", size_cnv(x, ios->cwrites, 0), size_cnv(y, ios->cwrite_kb, 1));
        fprintf(ofp, " Read Merges:     %'8lu%8c\t", ios->mreads, ' ');
        fprintf(ofp, " Write Merges:     %'8lu\n", ios->mwrites);
+       if (pdi) {
+               fprintf(ofp, " Read depth:      %'8u%8c\t", pdi->max_depth[0], ' ');
+               fprintf(ofp, " Write depth:      %'8u\n", pdi->max_depth[1]);
+       }
        fprintf(ofp, " IO unplugs:      %'8lu%8c\t", ios->io_unplugs, ' ');
        fprintf(ofp, " Timer unplugs:    %'8lu\n", ios->timer_unplugs);
 }
@@ -1233,7 +1425,7 @@ static void show_process_stats(void)
                else
                        sprintf(name, "%s (%u)", ppi->name, ppi->pid);
 
-               dump_io_stats(&ppi->io_stats, name);
+               dump_io_stats(NULL, &ppi->io_stats, name);
                dump_wait_stats(ppi);
                ppi = ppi->list_next;
        }
@@ -1285,7 +1477,7 @@ static void show_device_and_cpu_stats(void)
 
                        snprintf(line, sizeof(line) - 1, "CPU%d (%s):",
                                 j, get_dev_name(pdi, name, sizeof(name)));
-                       dump_io_stats(ios, line);
+                       dump_io_stats(pdi, ios, line);
                        pci_events++;
                }
 
@@ -1293,7 +1485,7 @@ static void show_device_and_cpu_stats(void)
                        fprintf(ofp, "\n");
                        snprintf(line, sizeof(line) - 1, "Total (%s):",
                                 get_dev_name(pdi, name, sizeof(name)));
-                       dump_io_stats(&total, line);
+                       dump_io_stats(NULL, &total, line);
                }
 
                wrate = rrate = 0;
@@ -1303,66 +1495,17 @@ static void show_device_and_cpu_stats(void)
                        wrate = 1000 * total.cwrite_kb / msec;
                }
 
-               fprintf(ofp, "\nThroughput (R/W): %'LuKiB/s / %'LuKiB/s\n", rrate, wrate);
-               fprintf(ofp, "Events (%s): %'Lu entries, %'lu skips\n",
-                       get_dev_name(pdi, line, sizeof(line)), pdi->events,
-                       pdi->skips);
-       }
-}
+               fprintf(ofp, "\nThroughput (R/W): %'LuKiB/s / %'LuKiB/s\n",
+                       rrate, wrate);
+               fprintf(ofp, "Events (%s): %'Lu entries\n",
+                       get_dev_name(pdi, line, sizeof(line)), pdi->events);
 
-/*
- * struct trace and blktrace allocation cache, we do potentially
- * millions of mallocs for these structures while only using at most
- * a few thousand at the time
- */
-static inline void t_free(struct trace *t)
-{
-       if (t_alloc_cache < 1024) {
-               t->next = t_alloc_list;
-               t_alloc_list = t;
-               t_alloc_cache++;
-       } else
-               free(t);
-}
-
-static inline struct trace *t_alloc(void)
-{
-       struct trace *t = t_alloc_list;
-
-       if (t) {
-               t_alloc_list = t->next;
-               t_alloc_cache--;
-               return t;
+               collect_pdi_skips(pdi);
+               fprintf(ofp, "Skips: %'lu forward (%'llu - %5.1lf%%)\n",
+                       pdi->skips,pdi->seq_skips,
+                       100.0 * ((double)pdi->seq_skips /
+                               (double)(pdi->events + pdi->seq_skips)));
        }
-
-       return malloc(sizeof(*t));
-}
-
-static inline void bit_free(struct blk_io_trace *bit)
-{
-       if (bit_alloc_cache < 1024 && !bit->pdu_len) {
-               /*
-                * abuse a 64-bit field for a next pointer for the free item
-                */
-               bit->time = (__u64) (unsigned long) bit_alloc_list;
-               bit_alloc_list = (struct blk_io_trace *) bit;
-               bit_alloc_cache++;
-       } else
-               free(bit);
-}
-
-static inline struct blk_io_trace *bit_alloc(void)
-{
-       struct blk_io_trace *bit = bit_alloc_list;
-
-       if (bit) {
-               bit_alloc_list = (struct blk_io_trace *) (unsigned long) \
-                                bit->time;
-               bit_alloc_cache--;
-               return bit;
-       }
-
-       return malloc(sizeof(*bit));
 }
 
 static void find_genesis(void)
@@ -1392,6 +1535,8 @@ static inline int check_stopwatch(struct blk_io_trace *bit)
  */
 static int sort_entries(unsigned long long *youngest)
 {
+       struct per_dev_info *pdi = NULL;
+       struct per_cpu_info *pci = NULL;
        struct trace *t;
 
        if (!genesis_time)
@@ -1408,8 +1553,16 @@ static int sort_entries(unsigned long long *youngest)
                if (bit->time < *youngest || !*youngest)
                        *youngest = bit->time;
 
-               if (bit->sequence < smallest_seq_read)
-                       smallest_seq_read = bit->sequence;
+               if (!pdi || pdi->dev != bit->device) {
+                       pdi = get_dev_info(bit->device);
+                       pci = NULL;
+               }
+
+               if (!pci || pci->cpu != bit->cpu)
+                       pci = get_cpu_info(pdi, bit->cpu);
+
+               if (bit->sequence < pci->smallest_seq_read)
+                       pci->smallest_seq_read = bit->sequence;
 
                if (check_stopwatch(bit)) {
                        bit_free(bit);
@@ -1424,30 +1577,6 @@ static int sort_entries(unsigned long long *youngest)
        return 0;
 }
 
-static inline void __put_trace_last(struct per_dev_info *pdi, struct trace *t)
-{
-       rb_erase(&t->rb_node, &pdi->rb_last);
-       pdi->rb_last_entries--;
-
-       bit_free(t->bit);
-       t_free(t);
-}
-
-static void put_trace(struct per_dev_info *pdi, struct trace *t)
-{
-       rb_erase(&t->rb_node, &rb_sort_root);
-       rb_sort_entries--;
-
-       trace_rb_insert_last(pdi, t);
-
-       if (pdi->rb_last_entries > rb_batch * pdi->nfiles) {
-               struct rb_node *n = rb_first(&pdi->rb_last);
-
-               t = rb_entry(n, struct trace, rb_node);
-               __put_trace_last(pdi, t);
-       }
-}
-
 /*
  * to continue, we must have traces from all online cpus in the tree
  */
@@ -1490,16 +1619,22 @@ static int check_cpu_map(struct per_dev_info *pdi)
 
 static int check_sequence(struct per_dev_info *pdi, struct trace *t, int force)
 {
-       unsigned long expected_sequence = pdi->last_sequence + 1;
        struct blk_io_trace *bit = t->bit;
+       unsigned long expected_sequence;
+       struct per_cpu_info *pci;
        struct trace *__t;
-       
+
+       pci = get_cpu_info(pdi, bit->cpu);
+       expected_sequence = pci->last_sequence + 1;
+
        if (!expected_sequence) {
                /*
                 * 1 should be the first entry, just allow it
                 */
                if (bit->sequence == 1)
                        return 0;
+               if (bit->sequence == pci->smallest_seq_read)
+                       return 0;
 
                return check_cpu_map(pdi);
        }
@@ -1511,8 +1646,8 @@ static int check_sequence(struct per_dev_info *pdi, struct trace *t, int force)
         * we may not have seen that sequence yet. if we are not doing
         * the final run, break and wait for more entries.
         */
-       if (expected_sequence < smallest_seq_read) {
-               __t = trace_rb_find_last(pdi, expected_sequence);
+       if (expected_sequence < pci->smallest_seq_read) {
+               __t = trace_rb_find_last(pdi, pci, expected_sequence);
                if (!__t)
                        goto skip;
 
@@ -1522,12 +1657,11 @@ static int check_sequence(struct per_dev_info *pdi, struct trace *t, int force)
                return 1;
        } else {
 skip:
-               if (verbose) {
-                       fprintf(stderr, "(%d,%d): skipping %lu -> %u\n",
-                               MAJOR(pdi->dev), MINOR(pdi->dev),
-                               pdi->last_sequence, bit->sequence);
-               }
-               pdi->skips++;
+               if (check_current_skips(pci, bit->sequence))
+                       return 0;
+
+               if (expected_sequence < bit->sequence)
+                       insert_skip(pci, expected_sequence, bit->sequence - 1);
                return 0;
        }
 }
@@ -1547,8 +1681,10 @@ static void show_entries_rb(int force)
                t = rb_entry(n, struct trace, rb_node);
                bit = t->bit;
 
-               if (!pdi || pdi->dev != bit->device)
+               if (!pdi || pdi->dev != bit->device) {
                        pdi = get_dev_info(bit->device);
+                       pci = NULL;
+               }
 
                if (!pdi) {
                        fprintf(stderr, "Unknown device ID? (%d,%d)\n",
@@ -1562,33 +1698,38 @@ static void show_entries_rb(int force)
                if (!force && bit->time > last_allowed_time)
                        break;
 
-               pdi->last_sequence = bit->sequence;
-
                check_time(pdi, bit);
 
                if (!pci || pci->cpu != bit->cpu)
                        pci = get_cpu_info(pdi, bit->cpu);
 
+               pci->last_sequence = bit->sequence;
+
                pci->nelems++;
 
-               if (bit->action & (act_mask << BLK_TC_SHIFT)) 
+               if (bit->action & (act_mask << BLK_TC_SHIFT))
                        dump_trace(bit, pci, pdi);
 
                put_trace(pdi, t);
        }
 }
 
-static int read_data(int fd, void *buffer, int bytes, int block)
+static int read_data(int fd, void *buffer, int bytes, int block, int *fdblock)
 {
        int ret, bytes_left, fl;
        void *p;
 
-       fl = fcntl(fd, F_GETFL);
+       if (block != *fdblock) {
+               fl = fcntl(fd, F_GETFL);
 
-       if (!block)
-               fcntl(fd, F_SETFL, fl | O_NONBLOCK);
-       else
-               fcntl(fd, F_SETFL, fl & ~O_NONBLOCK);
+               if (!block) {
+                       *fdblock = 0;
+                       fcntl(fd, F_SETFL, fl | O_NONBLOCK);
+               } else {
+                       *fdblock = 1;
+                       fcntl(fd, F_SETFL, fl & ~O_NONBLOCK);
+               }
+       }
 
        bytes_left = bytes;
        p = buffer;
@@ -1597,10 +1738,21 @@ static int read_data(int fd, void *buffer, int bytes, int block)
                if (!ret)
                        return 1;
                else if (ret < 0) {
-                       if (errno != EAGAIN)
+                       if (errno != EAGAIN) {
                                perror("read");
+                               return -1;
+                       }
 
-                       return -1;
+                       /*
+                        * never do partial reads. we can return if we
+                        * didn't read anything and we should not block,
+                        * otherwise wait for data
+                        */
+                       if ((bytes_left == bytes) && !block)
+                               return 1;
+
+                       usleep(10);
+                       continue;
                } else {
                        p += ret;
                        bytes_left -= ret;
@@ -1610,7 +1762,7 @@ static int read_data(int fd, void *buffer, int bytes, int block)
        return 0;
 }
 
-static int read_events(int fd, int always_block)
+static int read_events(int fd, int always_block, int *fdblock)
 {
        struct per_dev_info *pdi = NULL;
        unsigned int events = 0;
@@ -1618,13 +1770,18 @@ static int read_events(int fd, int always_block)
        while (!is_done() && events < rb_batch) {
                struct blk_io_trace *bit;
                struct trace *t;
-               int pdu_len;
+               int pdu_len, should_block, ret;
                __u32 magic;
 
                bit = bit_alloc();
 
-               if (read_data(fd, bit, sizeof(*bit), !events || always_block)) {
+               should_block = !events || always_block;
+
+               ret = read_data(fd, bit, sizeof(*bit), should_block, fdblock);
+               if (ret) {
                        bit_free(bit);
+                       if (!events && ret < 0)
+                               events = ret;
                        break;
                }
 
@@ -1638,7 +1795,7 @@ static int read_events(int fd, int always_block)
                if (pdu_len) {
                        void *ptr = realloc(bit, sizeof(*bit) + pdu_len);
 
-                       if (read_data(fd, ptr + sizeof(*bit), pdu_len, 1)) {
+                       if (read_data(fd, ptr + sizeof(*bit), pdu_len, 1, fdblock)) {
                                bit_free(ptr);
                                break;
                        }
@@ -1684,7 +1841,6 @@ static int do_file(void)
        for (i = 0; i < ndevices; i++) {
                pdi = &devices[i];
                pdi->nfiles = 0;
-               pdi->last_sequence = -1;
 
                for (j = 0;; j++) {
                        struct stat st;
@@ -1694,6 +1850,7 @@ static int do_file(void)
                        pci = get_cpu_info(pdi, j);
                        pci->cpu = j;
                        pci->fd = -1;
+                       pci->fdblock = -1;
        
                        p = strdup(pdi->name);
                        dname = dirname(p);
@@ -1733,10 +1890,10 @@ static int do_file(void)
 
                events_added = 0;
                last_allowed_time = -1ULL;
-               smallest_seq_read = -1U;
 
                for (i = 0; i < ndevices; i++) {
                        pdi = &devices[i];
+                       pdi->last_read_time = -1ULL;
 
                        for (j = 0; j < pdi->nfiles; j++) {
 
@@ -1745,8 +1902,10 @@ static int do_file(void)
                                if (pci->fd == -1)
                                        continue;
 
-                               events = read_events(pci->fd, 1);
-                               if (!events) {
+                               pci->smallest_seq_read = -1;
+
+                               events = read_events(pci->fd, 1, &pci->fdblock);
+                               if (events <= 0) {
                                        cpu_mark_offline(pdi, pci->cpu);
                                        close(pci->fd);
                                        pci->fd = -1;
@@ -1779,7 +1938,7 @@ static int do_file(void)
 static int do_stdin(void)
 {
        unsigned long long youngest;
-       int fd, events;
+       int fd, events, fdblock;
 
        last_allowed_time = -1ULL;
        fd = dup(STDIN_FILENO);
@@ -1788,9 +1947,12 @@ static int do_stdin(void)
                return -1;
        }
 
-       while ((events = read_events(fd, 0)) != 0) {
+       fdblock = -1;
+       while ((events = read_events(fd, 0, &fdblock)) > 0) {
        
+#if 0
                smallest_seq_read = -1U;
+#endif
 
                if (sort_entries(&youngest))
                        break;
@@ -1829,7 +1991,6 @@ static void show_stats(void)
 static void handle_sigint(__attribute__((__unused__)) int sig)
 {
        done = 1;
-       show_stats();
 }
 
 /*
@@ -1878,7 +2039,7 @@ static char usage_str[] = \
        "\t-o Output file. If not given, output is stdout\n" \
        "\t-b stdin read batching\n" \
        "\t-s Show per-program io statistics\n" \
-       "\t-n Hash processes by name, not pid\n" \
+       "\t-h Hash processes by name, not pid\n" \
        "\t-t Track individual ios. Will tell you the time a request took\n" \
        "\t   to get queued, to get dispatched, and to get completed\n" \
        "\t-q Quiet. Don't display any stats at the end of the trace\n" \