0518083a108964f4b4f049115692c99260330de1
[blktrace.git] / iowatcher / blkparse.c
1 /*
2  * Copyright (C) 2012 Fusion-io
3  *
4  *  This program is free software; you can redistribute it and/or
5  *  modify it under the terms of the GNU General Public
6  *  License v2 as published by the Free Software Foundation.
7  *
8  *  This program is distributed in the hope that it will be useful,
9  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
10  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11  *  GNU General Public License for more details.
12  *
13  *  You should have received a copy of the GNU General Public License
14  *  along with this program; if not, write to the Free Software
15  *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16  *
17  *  Parts of this file were imported from Jens Axboe's blktrace sources (also GPL)
18  */
19 #include <sys/types.h>
20 #include <sys/stat.h>
21 #include <fcntl.h>
22 #include <unistd.h>
23 #include <stdlib.h>
24 #include <stdio.h>
25 #include <math.h>
26 #include <inttypes.h>
27 #include <string.h>
28 #include <asm/types.h>
29 #include <errno.h>
30 #include <sys/mman.h>
31 #include <time.h>
32 #include <math.h>
33 #include <dirent.h>
34
35 #include "plot.h"
36 #include "blkparse.h"
37 #include "list.h"
38 #include "tracers.h"
39 #include "../blktrace_api.h"
40
/* Hash of in-flight IOs, keyed by sector via hash_sector() */
#define IO_HASH_TABLE_BITS  11
#define IO_HASH_TABLE_SIZE (1 << IO_HASH_TABLE_BITS)
static struct list_head io_hash_table[IO_HASH_TABLE_SIZE];

/* Hash of processes seen in the trace, keyed by pid via hash_pid() */
#define PROCESS_HASH_TABLE_BITS 7
#define PROCESS_HASH_TABLE_SIZE (1 << PROCESS_HASH_TABLE_BITS)
static struct list_head process_hash_table[PROCESS_HASH_TABLE_SIZE];

/* defined elsewhere; control which action is plotted and whether
 * per-process plots are made */
extern int plot_io_action;
extern int io_per_process;

/* extract the read/write direction bits from a trace action */
#define BLK_DATADIR(a) (((a) >> BLK_TC_SHIFT) & (BLK_TC_READ | BLK_TC_WRITE))
/* action code with the cgroup flag masked off */
#define BLK_TA_MASK (((1 << BLK_TC_SHIFT) - 1) & ~__BLK_TA_CGROUP)
54
/*
 * One IO that has been queued/dispatched but not yet completed.
 * Lives in io_hash_table, keyed by (sector, device).
 */
struct pending_io {
	/* sector offset of this IO */
	u64 sector;

	/* dev_t for this IO */
	u32 device;

	/* time this IO was dispatched */
	u64 dispatch_time;
	/* time this IO was finished */
	u64 completion_time;
	struct list_head hash_list;
	/* process which queued this IO */
	u32 pid;
};
70
71 struct pid_map {
72         struct list_head hash_list;
73         u32 pid;
74         int index;
75         char name[0];
76 };
77
78 u64 get_record_time(struct trace *trace)
79 {
80         return trace->io->time;
81 }
82
83 void init_io_hash_table(void)
84 {
85         int i;
86         struct list_head *head;
87
88         for (i = 0; i < IO_HASH_TABLE_SIZE; i++) {
89                 head = io_hash_table + i;
90                 INIT_LIST_HEAD(head);
91         }
92 }
93
94 /* taken from the kernel hash.h */
95 static inline u64 hash_sector(u64 val)
96 {
97         u64 hash = val;
98
99         /*  Sigh, gcc can't optimise this alone like it does for 32 bits. */
100         u64 n = hash;
101         n <<= 18;
102         hash -= n;
103         n <<= 33;
104         hash -= n;
105         n <<= 3;
106         hash += n;
107         n <<= 3;
108         hash -= n;
109         n <<= 4;
110         hash += n;
111         n <<= 2;
112         hash += n;
113
114         /* High bits are more random, so use them. */
115         return hash >> (64 - IO_HASH_TABLE_BITS);
116 }
117
118 static int io_hash_table_insert(struct pending_io *ins_pio)
119 {
120         u64 sector = ins_pio->sector;
121         u32 dev = ins_pio->device;
122         int slot = hash_sector(sector);
123         struct list_head *head;
124         struct pending_io *pio;
125
126         head = io_hash_table + slot;
127         list_for_each_entry(pio, head, hash_list) {
128                 if (pio->sector == sector && pio->device == dev)
129                         return -EEXIST;
130         }
131         list_add_tail(&ins_pio->hash_list, head);
132         return 0;
133 }
134
135 static struct pending_io *io_hash_table_search(u64 sector, u32 dev)
136 {
137         int slot = hash_sector(sector);
138         struct list_head *head;
139         struct pending_io *pio;
140
141         head = io_hash_table + slot;
142         list_for_each_entry(pio, head, hash_list) {
143                 if (pio->sector == sector && pio->device == dev)
144                         return pio;
145         }
146         return NULL;
147 }
148
149 static struct pending_io *hash_queued_io(struct blk_io_trace *io)
150 {
151         struct pending_io *pio;
152         int ret;
153
154         pio = calloc(1, sizeof(*pio));
155         pio->sector = io->sector;
156         pio->device = io->device;
157         pio->pid = io->pid;
158
159         ret = io_hash_table_insert(pio);
160         if (ret < 0) {
161                 /* crud, the IO is there already */
162                 free(pio);
163                 return NULL;
164         }
165         return pio;
166 }
167
168 static struct pending_io *hash_dispatched_io(struct blk_io_trace *io)
169 {
170         struct pending_io *pio;
171
172         pio = io_hash_table_search(io->sector, io->device);
173         if (!pio) {
174                 pio = hash_queued_io(io);
175                 if (!pio)
176                         return NULL;
177         }
178         pio->dispatch_time = io->time;
179         return pio;
180 }
181
182 static struct pending_io *hash_completed_io(struct blk_io_trace *io)
183 {
184         struct pending_io *pio;
185
186         pio = io_hash_table_search(io->sector, io->device);
187
188         if (!pio)
189                 return NULL;
190         return pio;
191 }
192
193 void init_process_hash_table(void)
194 {
195         int i;
196         struct list_head *head;
197
198         for (i = 0; i < PROCESS_HASH_TABLE_SIZE; i++) {
199                 head = process_hash_table + i;
200                 INIT_LIST_HEAD(head);
201         }
202 }
203
204 static u32 hash_pid(u32 pid)
205 {
206         u32 hash = pid;
207
208         hash ^= pid >> 3;
209         hash ^= pid >> 3;
210         hash ^= pid >> 4;
211         hash ^= pid >> 6;
212         return (hash & (PROCESS_HASH_TABLE_SIZE - 1));
213 }
214
215 static struct pid_map *process_hash_search(u32 pid)
216 {
217         int slot = hash_pid(pid);
218         struct list_head *head;
219         struct pid_map *pm;
220
221         head = process_hash_table + slot;
222         list_for_each_entry(pm, head, hash_list) {
223                 if (pm->pid == pid)
224                         return pm;
225         }
226         return NULL;
227 }
228
229 static struct pid_map *process_hash_insert(u32 pid, char *name)
230 {
231         int slot = hash_pid(pid);
232         struct pid_map *pm;
233         int old_index = 0;
234         char buf[16];
235
236         pm = process_hash_search(pid);
237         if (pm) {
238                 /* Entry exists and name shouldn't be changed? */
239                 if (!name || !strcmp(name, pm->name))
240                         return pm;
241                 list_del(&pm->hash_list);
242                 old_index = pm->index;
243                 free(pm);
244         }
245         if (!name) {
246                 sprintf(buf, "[%u]", pid);
247                 name = buf;
248         }
249         pm = malloc(sizeof(struct pid_map) + strlen(name) + 1);
250         pm->pid = pid;
251         pm->index = old_index;
252         strcpy(pm->name, name);
253         list_add_tail(&pm->hash_list, process_hash_table + slot);
254
255         return pm;
256 }
257
258 static void handle_notify(struct trace *trace)
259 {
260         struct blk_io_trace *io = trace->io;
261         void *payload = (char *)io + sizeof(*io);
262         int pdu_len = io->pdu_len;
263         u32 two32[2];
264
265         if (io->action & __BLK_TN_CGROUP) {
266                 payload += sizeof(struct blk_io_cgroup_payload);
267                 pdu_len -= sizeof(struct blk_io_cgroup_payload);
268         }
269         if ((io->action & ~__BLK_TN_CGROUP) == BLK_TN_PROCESS) {
270                 if (io_per_process)
271                         process_hash_insert(io->pid, payload);
272                 return;
273         }
274
275         if ((io->action & ~__BLK_TN_CGROUP) != BLK_TN_TIMESTAMP)
276                 return;
277
278         if (pdu_len != sizeof(two32))
279                 return;
280
281         memcpy(two32, payload, sizeof(two32));
282         trace->start_timestamp = io->time;
283         trace->abs_start_time.tv_sec = two32[0];
284         trace->abs_start_time.tv_nsec = two32[1];
285         if (trace->abs_start_time.tv_nsec < 0) {
286                 trace->abs_start_time.tv_sec--;
287                 trace->abs_start_time.tv_nsec += 1000000000;
288         }
289 }
290
291 int next_record(struct trace *trace)
292 {
293         int skip = trace->io->pdu_len;
294         u64 offset;
295
296         trace->cur += sizeof(*trace->io) + skip;
297         offset = trace->cur - trace->start;
298         if (offset >= trace->len)
299                 return 1;
300
301         trace->io = (struct blk_io_trace *)trace->cur;
302         return 0;
303 }
304
305 void first_record(struct trace *trace)
306 {
307         trace->cur = trace->start;
308         trace->io = (struct blk_io_trace *)trace->cur;
309 }
310
311 static int is_io_event(struct blk_io_trace *test)
312 {
313         char *message;
314         if (!(test->action & BLK_TC_ACT(BLK_TC_NOTIFY)))
315                 return 1;
316         if ((test->action & ~__BLK_TN_CGROUP) == BLK_TN_MESSAGE) {
317                 int len = test->pdu_len;
318
319                 message = (char *)(test + 1);
320                 if (test->action & __BLK_TN_CGROUP) {
321                         len -= sizeof(struct blk_io_cgroup_payload);
322                         message += sizeof(struct blk_io_cgroup_payload);
323                 }
324                 if (len < 3)
325                         return 0;
326                 if (strncmp(message, "fio ", 4) == 0) {
327                         return 1;
328                 }
329         }
330         return 0;
331 }
332
/*
 * Return the timestamp of the last IO event in the trace.
 *
 * Fast path: probe backwards from EOF for a byte position that parses as
 * a valid record (magic matches, is an IO event) whose header + pdu ends
 * exactly at EOF.  Only the final 8K bytes are probed.  If that fails,
 * fall back to a full forward scan remembering the last IO event time.
 * The cursor is rewound to the first record before returning.
 */
u64 find_last_time(struct trace *trace)
{
	char *p = trace->start + trace->len;
	struct blk_io_trace *test;
	int search_len = 0;
	u64 found = 0;

	/* file too small to hold even one record header */
	if (trace->len < sizeof(*trace->io))
		return 0;
	p -= sizeof(*trace->io);
	while (p >= trace->start) {
		test = (struct blk_io_trace *)p;
		if (CHECK_MAGIC(test) && is_io_event(test)) {
			u64 offset = p - trace->start;
			/* a genuine final record ends exactly at EOF */
			if (offset + sizeof(*test) + test->pdu_len == trace->len) {
				return test->time;
			}
		}
		p--;
		search_len++;
		if (search_len > 8192) {
			break;
		}
	}

	/* searching backwards didn't work out, we'll have to scan the file */
	first_record(trace);
	while (1) {
		if (is_io_event(trace->io))
			found = trace->io->time;
		if (next_record(trace))
			break;
	}
	first_record(trace);
	return found;
}
369
370 static int parse_fio_bank_message(struct trace *trace, u64 *bank_ret, u64 *offset_ret,
371                            u64 *num_banks_ret)
372 {
373         char *s;
374         char *next;
375         char *message;
376         struct blk_io_trace *test = trace->io;
377         int len = test->pdu_len;
378         u64 bank;
379         u64 offset;
380         u64 num_banks;
381
382         if (!(test->action & BLK_TC_ACT(BLK_TC_NOTIFY)))
383                 return -1;
384         if ((test->action & ~__BLK_TN_CGROUP) != BLK_TN_MESSAGE)
385                 return -1;
386
387         message = (char *)(test + 1);
388         if (test->action & __BLK_TN_CGROUP) {
389                 len -= sizeof(struct blk_io_cgroup_payload);
390                 message += sizeof(struct blk_io_cgroup_payload);
391         }
392         /* the message is fio rw bank offset num_banks */
393         if (len < 3)
394                 return -1;
395         if (strncmp(message, "fio r ", 6) != 0)
396                 return -1;
397
398         message = strndup(message, len);
399         s = strchr(message, ' ');
400         if (!s)
401                 goto out;
402         s++;
403         s = strchr(s, ' ');
404         if (!s)
405                 goto out;
406
407         bank = strtoll(s, &next, 10);
408         if (s == next)
409                 goto out;
410         s = next;
411
412         offset = strtoll(s, &next, 10);
413         if (s == next)
414                 goto out;
415         s = next;
416
417         num_banks = strtoll(s, &next, 10);
418         if (s == next)
419                 goto out;
420
421         *bank_ret = bank;
422         *offset_ret = offset;
423         *num_banks_ret = num_banks;
424
425         return 0;
426 out:
427         free(message);
428         return -1;
429 }
430
431 static struct dev_info *lookup_dev(struct trace *trace, struct blk_io_trace *io)
432 {
433         u32 dev = io->device;
434         int i;
435         struct dev_info *di = NULL;
436
437         for (i = 0; i < trace->num_devices; i++) {
438                 if (trace->devices[i].device == dev) {
439                         di = trace->devices + i;
440                         goto found;
441                 }
442         }
443         i = trace->num_devices++;
444         if (i >= MAX_DEVICES_PER_TRACE) {
445                 fprintf(stderr, "Trace contains too many devices (%d)\n", i);
446                 exit(1);
447         }
448         di = trace->devices + i;
449         di->device = dev;
450 found:
451         return di;
452 }
453
454 static void map_devices(struct trace *trace)
455 {
456         struct dev_info *di;
457         u64 found;
458         u64 map_start = 0;
459         int i;
460
461         first_record(trace);
462         while (1) {
463                 if (!(trace->io->action & BLK_TC_ACT(BLK_TC_NOTIFY))) {
464                         di = lookup_dev(trace, trace->io);
465                         found = trace->io->sector << 9;
466                         if (found < di->min)
467                                 di->min = found;
468
469                         found += trace->io->bytes;
470                         if (di->max < found)
471                                 di->max = found;
472                 }
473                 if (next_record(trace))
474                         break;
475         }
476         first_record(trace);
477         for (i = 0; i < trace->num_devices; i++) {
478                 di = trace->devices + i;
479                 di->map = map_start;
480                 map_start += di->max - di->min;
481         }
482 }
483
484 static u64 map_io(struct trace *trace, struct blk_io_trace *io)
485 {
486         struct dev_info *di = lookup_dev(trace, io);
487         u64 val = trace->io->sector << 9;
488         return di->map + val - di->min;
489 }
490
491 void find_extreme_offsets(struct trace *trace, u64 *min_ret, u64 *max_ret, u64 *max_bank_ret,
492                           u64 *max_offset_ret)
493 {
494         u64 found = 0;
495         u64 max = 0, min = ~(u64)0;
496         u64 max_bank = 0;
497         u64 max_bank_offset = 0;
498         u64 num_banks = 0;
499
500         map_devices(trace);
501
502         first_record(trace);
503         while (1) {
504                 if (!(trace->io->action & BLK_TC_ACT(BLK_TC_NOTIFY))) {
505                         found = map_io(trace, trace->io);
506                         if (found < min)
507                                 min = found;
508
509                         found += trace->io->bytes;
510                         if (max < found)
511                                 max = found;
512                 } else {
513                         u64 bank;
514                         u64 offset;
515                         if (!parse_fio_bank_message(trace, &bank,
516                                                     &offset, &num_banks)) {
517                                 if (bank > max_bank)
518                                         max_bank = bank;
519                                 if (offset > max_bank_offset)
520                                         max_bank_offset = offset;
521                         }
522                 }
523                 if (next_record(trace))
524                         break;
525         }
526         first_record(trace);
527         *min_ret = min;
528         *max_ret = max;
529         *max_bank_ret = max_bank;
530         *max_offset_ret = max_bank_offset;
531 }
532
533 static void check_io_types(struct trace *trace)
534 {
535         struct blk_io_trace *io = trace->io;
536         int action = io->action & BLK_TA_MASK;
537
538         if (!(io->action & BLK_TC_ACT(BLK_TC_NOTIFY))) {
539                 switch (action) {
540                 case __BLK_TA_COMPLETE:
541                         trace->found_completion = 1;
542                         break;
543                 case __BLK_TA_ISSUE:
544                         trace->found_issue = 1;
545                         break;
546                 case __BLK_TA_QUEUE:
547                         trace->found_queue = 1;
548                         break;
549                 };
550         }
551 }
552
553
554 int filter_outliers(struct trace *trace, u64 min_offset, u64 max_offset,
555                     u64 *yzoom_min, u64 *yzoom_max)
556 {
557         int hits[11];
558         u64 max_per_bucket[11];
559         u64 min_per_bucket[11];
560         u64 bytes_per_bucket = (max_offset - min_offset + 1) / 10;
561         int slot;
562         int fat_count = 0;
563
564         memset(hits, 0, sizeof(int) * 11);
565         memset(max_per_bucket, 0, sizeof(u64) * 11);
566         memset(min_per_bucket, 0xff, sizeof(u64) * 11);
567         first_record(trace);
568         while (1) {
569                 check_io_types(trace);
570                 if (!(trace->io->action & BLK_TC_ACT(BLK_TC_NOTIFY)) &&
571                     (trace->io->action & BLK_TA_MASK) == __BLK_TA_QUEUE) {
572                         u64 off = map_io(trace, trace->io) - min_offset;
573
574                         slot = (int)(off / bytes_per_bucket);
575                         hits[slot]++;
576                         if (off < min_per_bucket[slot])
577                                 min_per_bucket[slot] = off;
578
579                         off += trace->io->bytes;
580                         slot = (int)(off / bytes_per_bucket);
581                         hits[slot]++;
582                         if (off > max_per_bucket[slot])
583                                 max_per_bucket[slot] = off;
584                 }
585                 if (next_record(trace))
586                         break;
587         }
588         first_record(trace);
589         for (slot = 0; slot < 11; slot++) {
590                 if (hits[slot] > fat_count) {
591                         fat_count = hits[slot];
592                 }
593         }
594
595         *yzoom_max = max_offset;
596         for (slot = 10; slot >= 0; slot--) {
597                 double d = hits[slot];
598
599                 if (d >= (double)fat_count * .05) {
600                         *yzoom_max = max_per_bucket[slot] + min_offset;
601                         break;
602                 }
603         }
604
605         *yzoom_min = min_offset;
606         for (slot = 0; slot < 10; slot++) {
607                 double d = hits[slot];
608
609                 if (d >= (double)fat_count * .05) {
610                         *yzoom_min = min_per_bucket[slot] + min_offset;
611                         break;
612                 }
613         }
614         return 0;
615 }
616
static char footer[] = ".blktrace.0";
static int footer_len = sizeof(footer) - 1;

/*
 * Return 1 if name looks like a per-cpu blktrace file ("*.blktrace.0"),
 * storing its length in *len when len is non-NULL; return 0 otherwise.
 */
static int match_trace(char *name, int *len)
{
	int name_len = strlen(name);

	if (name_len <= footer_len)
		return 0;
	if (strcmp(name + name_len - footer_len, footer) != 0)
		return 0;

	if (len)
		*len = name_len;
	return 1;
}
637
/* Singly linked list of trace file paths found in a directory. */
struct tracelist {
	struct tracelist *next;
	/* points just past this struct, into the same allocation */
	char *name;
};
642
643 static struct tracelist *traces_list(char *dir_name, int *len)
644 {
645         int count = 0;
646         struct tracelist *traces = NULL;
647         int dlen = strlen(dir_name);
648         DIR *dir = opendir(dir_name);
649         if (!dir)
650                 return NULL;
651
652         while (1) {
653                 int n = 0;
654                 struct tracelist *tl;
655                 struct dirent *d = readdir(dir);
656                 if (!d)
657                         break;
658
659                 if (!match_trace(d->d_name, &n))
660                         continue;
661
662                 n += dlen + 1; /* dir + '/' + file */
663                 /* Allocate space for tracelist + filename */
664                 tl = calloc(1, sizeof(struct tracelist) + (sizeof(char) * (n + 1)));
665                 if (!tl) {
666                         closedir(dir);
667                         return NULL;
668                 }
669                 tl->next = traces;
670                 tl->name = (char *)(tl + 1);
671                 snprintf(tl->name, n, "%s/%s", dir_name, d->d_name);
672                 traces = tl;
673                 count++;
674         }
675
676         closedir(dir);
677
678         if (len)
679                 *len = count;
680
681         return traces;
682 }
683
684 static void traces_free(struct tracelist *traces)
685 {
686         while (traces) {
687                 struct tracelist *tl = traces;
688                 traces = traces->next;
689                 free(tl);
690         }
691 }
692
693 static int dump_traces(struct tracelist *traces, int count, char *dumpfile)
694 {
695         struct tracelist *tl;
696         char **argv = NULL;
697         int argc = 0;
698         int i;
699         int err = 0;
700
701         argc = count * 2; /* {"-i", trace } */
702         argc += 4; /* See below */
703         argv = calloc(argc + 1, sizeof(char *));
704         if (!argv)
705                 return -errno;
706
707         i = 0;
708         argv[i++] = "blkparse";
709         argv[i++] = "-O";
710         argv[i++] = "-d";
711         argv[i++] = dumpfile;
712         for (tl = traces; tl != NULL; tl = tl->next) {
713                 argv[i++] = "-i";
714                 argv[i++] = tl->name;
715         }
716
717         err = run_program(argc, argv, 1, NULL, NULL);
718         if (err)
719                 fprintf(stderr, "%s exited with %d, expected 0\n", argv[0], err);
720         free(argv);
721         return err;
722 }
723
724 static char *find_trace_file(char *filename)
725 {
726         int ret;
727         struct stat st;
728         char *dot;
729         int found_dir = 0;
730         char *dumpfile;
731         int len = strlen(filename);
732
733         /* look for an exact match of whatever they pass in.
734          * If it is a file, assume it is the dump file.
735          * If a directory, remember that it existed so we
736          * can combine traces in that directory later
737          */
738         ret = stat(filename, &st);
739         if (ret == 0) {
740                 if (S_ISREG(st.st_mode))
741                         return strdup(filename);
742
743                 if (S_ISDIR(st.st_mode))
744                         found_dir = 1;
745         }
746
747         if (found_dir) {
748                 int i;
749                 /* Eat up trailing '/'s */
750                 for (i = len - 1; filename[i] == '/'; i--)
751                         filename[i] = '\0';
752         }
753
754         /*
755          * try tacking .dump onto the end and see if that already
756          * has been generated
757          */
758         ret = asprintf(&dumpfile, "%s.dump", filename);
759         if (ret == -1) {
760                 perror("Error building dump file name");
761                 return NULL;
762         }
763         ret = stat(dumpfile, &st);
764         if (ret == 0)
765                 return dumpfile;
766
767         /*
768          * try to generate the .dump from all the traces in
769          * a single dir.
770          */
771         if (found_dir) {
772                 int count;
773                 struct tracelist *traces = traces_list(filename, &count);
774                 if (traces) {
775                         ret = dump_traces(traces, count, dumpfile);
776                         traces_free(traces);
777                         if (ret == 0)
778                                 return dumpfile;
779                 }
780         }
781         free(dumpfile);
782
783         /*
784          * try to generate the .dump from all the blktrace
785          * files for a named trace
786          */
787         dot = strrchr(filename, '.');
788         if (!dot || strcmp(".dump", dot) != 0) {
789                 struct tracelist trace = {0 ,NULL};
790                 if (dot && dot != filename)
791                         len = dot - filename;
792
793                 ret = asprintf(&trace.name, "%*s.blktrace.0", len, filename);
794                 if (ret == -1)
795                         return NULL;
796                 ret = asprintf(&dumpfile, "%*s.dump", len, filename);
797                 if (ret == -1) {
798                         free(trace.name);
799                         return NULL;
800                 }
801
802                 ret = dump_traces(&trace, 1, dumpfile);
803                 if (ret == 0) {
804                         free(trace.name);
805                         return dumpfile;
806                 }
807                 free(trace.name);
808                 free(dumpfile);
809         }
810         return NULL;
811 }
812 struct trace *open_trace(char *filename)
813 {
814         int fd;
815         char *p;
816         struct stat st;
817         int ret;
818         struct trace *trace;
819         char *found_filename;
820
821         trace = calloc(1, sizeof(*trace));
822         if (!trace) {
823                 fprintf(stderr, "unable to allocate memory for trace\n");
824                 return NULL;
825         }
826
827         found_filename = find_trace_file(filename);
828         if (!found_filename) {
829                 fprintf(stderr, "Unable to find trace file %s\n", filename);
830                 goto fail;
831         }
832         filename = found_filename;
833
834         fd = open(filename, O_RDONLY);
835         if (fd < 0) {
836                 fprintf(stderr, "Unable to open trace file %s err %s\n", filename, strerror(errno));
837                 goto fail;
838         }
839         ret = fstat(fd, &st);
840         if (ret < 0) {
841                 fprintf(stderr, "stat failed on %s err %s\n", filename, strerror(errno));
842                 goto fail_fd;
843         }
844         p = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
845         if (p == MAP_FAILED) {
846                 fprintf(stderr, "Unable to mmap trace file %s, err %s\n", filename, strerror(errno));
847                 goto fail_fd;
848         }
849         trace->fd = fd;
850         trace->len = st.st_size;
851         trace->start = p;
852         trace->cur = p;
853         trace->io = (struct blk_io_trace *)p;
854         return trace;
855
856 fail_fd:
857         close(fd);
858 fail:
859         free(trace);
860         return NULL;
861 }
862 static inline int tput_event(struct trace *trace)
863 {
864         if (trace->found_completion)
865                 return __BLK_TA_COMPLETE;
866         if (trace->found_issue)
867                 return __BLK_TA_ISSUE;
868         if (trace->found_queue)
869                 return __BLK_TA_QUEUE;
870
871         return __BLK_TA_COMPLETE;
872 }
873
874 int action_char_to_num(char action)
875 {
876         switch (action) {
877         case 'Q':
878                 return __BLK_TA_QUEUE;
879         case 'D':
880                 return __BLK_TA_ISSUE;
881         case 'C':
882                 return __BLK_TA_COMPLETE;
883         }
884         return -1;
885 }
886
887 static inline int io_event(struct trace *trace)
888 {
889         if (plot_io_action)
890                 return plot_io_action;
891         if (trace->found_queue)
892                 return __BLK_TA_QUEUE;
893         if (trace->found_issue)
894                 return __BLK_TA_ISSUE;
895         if (trace->found_completion)
896                 return __BLK_TA_COMPLETE;
897
898         return __BLK_TA_COMPLETE;
899 }
900
901 void add_tput(struct trace *trace, struct graph_line_data *writes_gld,
902               struct graph_line_data *reads_gld)
903 {
904         struct blk_io_trace *io = trace->io;
905         struct graph_line_data *gld;
906         int action = io->action & BLK_TA_MASK;
907         int seconds;
908
909         if (io->action & BLK_TC_ACT(BLK_TC_NOTIFY))
910                 return;
911
912         if (action != tput_event(trace))
913                 return;
914
915         if (BLK_DATADIR(io->action) & BLK_TC_READ)
916                 gld = reads_gld;
917         else
918                 gld = writes_gld;
919
920         seconds = SECONDS(io->time);
921         gld->data[seconds].sum += io->bytes;
922
923         gld->data[seconds].count = 1;
924         if (gld->data[seconds].sum > gld->max)
925                 gld->max = gld->data[seconds].sum;
926 }
927
928 #define GDD_PTR_ALLOC_STEP 16
929
930 static struct pid_map *get_pid_map(struct trace_file *tf, u32 pid)
931 {
932         struct pid_map *pm;
933
934         if (!io_per_process) {
935                 if (!tf->io_plots)
936                         tf->io_plots = 1;
937                 return NULL;
938         }
939
940         pm = process_hash_insert(pid, NULL);
941         /* New entry? */
942         if (!pm->index) {
943                 if (tf->io_plots == tf->io_plots_allocated) {
944                         tf->io_plots_allocated += GDD_PTR_ALLOC_STEP;
945                         tf->gdd_reads = realloc(tf->gdd_reads, tf->io_plots_allocated * sizeof(struct graph_dot_data *));
946                         if (!tf->gdd_reads)
947                                 abort();
948                         tf->gdd_writes = realloc(tf->gdd_writes, tf->io_plots_allocated * sizeof(struct graph_dot_data *));
949                         if (!tf->gdd_writes)
950                                 abort();
951                         memset(tf->gdd_reads + tf->io_plots_allocated - GDD_PTR_ALLOC_STEP,
952                                0, GDD_PTR_ALLOC_STEP * sizeof(struct graph_dot_data *));
953                         memset(tf->gdd_writes + tf->io_plots_allocated - GDD_PTR_ALLOC_STEP,
954                                0, GDD_PTR_ALLOC_STEP * sizeof(struct graph_dot_data *));
955                 }
956                 pm->index = tf->io_plots++;
957
958                 return pm;
959         }
960         return pm;
961 }
962
963 void add_io(struct trace *trace, struct trace_file *tf)
964 {
965         struct blk_io_trace *io = trace->io;
966         int action = io->action & BLK_TA_MASK;
967         u64 offset;
968         int index;
969         char *label;
970         struct pid_map *pm;
971
972         if (io->action & BLK_TC_ACT(BLK_TC_NOTIFY))
973                 return;
974
975         if (action != io_event(trace))
976                 return;
977
978         offset = map_io(trace, io);
979
980         pm = get_pid_map(tf, io->pid);
981         if (!pm) {
982                 index = 0;
983                 label = "";
984         } else {
985                 index = pm->index;
986                 label = pm->name;
987         }
988         if (BLK_DATADIR(io->action) & BLK_TC_READ) {
989                 if (!tf->gdd_reads[index])
990                         tf->gdd_reads[index] = alloc_dot_data(tf->min_seconds, tf->max_seconds, tf->min_offset, tf->max_offset, tf->stop_seconds, pick_color(), strdup(label));
991                 set_gdd_bit(tf->gdd_reads[index], offset, io->bytes, io->time);
992         } else if (BLK_DATADIR(io->action) & BLK_TC_WRITE) {
993                 if (!tf->gdd_writes[index])
994                         tf->gdd_writes[index] = alloc_dot_data(tf->min_seconds, tf->max_seconds, tf->min_offset, tf->max_offset, tf->stop_seconds, pick_color(), strdup(label));
995                 set_gdd_bit(tf->gdd_writes[index], offset, io->bytes, io->time);
996         }
997 }
998
/*
 * Update the I/O depth graph (running average of ios_in_flight per
 * second) from one trace record.  Which event type drives the
 * accounting depends on what the trace contains: issue (D) events are
 * preferred, queue (Q) events are the fallback.
 */
void add_pending_io(struct trace *trace, struct graph_line_data *gld)
{
	unsigned int seconds;
	struct blk_io_trace *io = trace->io;
	int action = io->action & BLK_TA_MASK;
	double avg;
	struct pending_io *pio;

	/* notify records carry metadata, not I/O */
	if (io->action & BLK_TC_ACT(BLK_TC_NOTIFY))
		return;

	if (action == __BLK_TA_QUEUE) {
		if (io->sector == 0)
			return;
		/*
		 * If D (issue) events are available, use them for I/O
		 * accounting.  Nothing needs to be done for Q.
		 */
		if (trace->found_issue)
			return;
		/*
		 * If there are no D or C events, then all that can be
		 * done is to account the Q event (and make sure not to
		 * add the I/O to the hash, because it will never be
		 * removed).
		 */
		if (!trace->found_completion)
			goto account_io;
		/*
		 * When there are no ISSUE events, count depth and
		 * latency from queue events.
		 */
		pio = hash_queued_io(trace->io);
		if (pio) {
			pio->dispatch_time = io->time;
			goto account_io;
		}
		/* already hashed (duplicate Q): don't double-count */
		return;
	}
	if (action == __BLK_TA_REQUEUE) {
		/* requeued I/O is no longer in flight */
		if (trace->ios_in_flight > 0)
			trace->ios_in_flight--;
		return;
	}
	if (action != __BLK_TA_ISSUE)
		return;

	pio = hash_dispatched_io(trace->io);
	if (!pio)
		return;

	/*
	 * No completion events will ever remove this entry from the
	 * hash, so drop it right away to avoid leaking it.
	 */
	if (!trace->found_completion) {
		list_del(&pio->hash_list);
		free(pio);
	}

account_io:
	trace->ios_in_flight++;

	/* fold this sample into the per-second depth average */
	seconds = SECONDS(io->time);
	gld->data[seconds].sum += trace->ios_in_flight;
	gld->data[seconds].count++;

	avg = (double)gld->data[seconds].sum / gld->data[seconds].count;
	if (gld->max < (u64)avg) {
		gld->max = avg;
	}
}
1067
1068 void add_completed_io(struct trace *trace,
1069                       struct graph_line_data *latency_gld)
1070 {
1071         struct blk_io_trace *io = trace->io;
1072         int seconds;
1073         int action = io->action & BLK_TA_MASK;
1074         struct pending_io *pio;
1075         double avg;
1076         u64 latency;
1077
1078         if (io->action & BLK_TC_ACT(BLK_TC_NOTIFY))
1079                 return;
1080
1081         if (action != __BLK_TA_COMPLETE)
1082                 return;
1083
1084         seconds = SECONDS(io->time);
1085
1086         pio = hash_completed_io(trace->io);
1087         if (!pio)
1088                 return;
1089
1090         if (trace->ios_in_flight > 0)
1091                 trace->ios_in_flight--;
1092         if (io->time >= pio->dispatch_time) {
1093                 latency = io->time - pio->dispatch_time;
1094                 latency_gld->data[seconds].sum += latency;
1095                 latency_gld->data[seconds].count++;
1096         }
1097
1098         list_del(&pio->hash_list);
1099         free(pio);
1100
1101         avg = (double)latency_gld->data[seconds].sum /
1102                 latency_gld->data[seconds].count;
1103         if (latency_gld->max < (u64)avg) {
1104                 latency_gld->max = avg;
1105         }
1106 }
1107
1108 void add_iop(struct trace *trace, struct graph_line_data *gld)
1109 {
1110         struct blk_io_trace *io = trace->io;
1111         int action = io->action & BLK_TA_MASK;
1112         int seconds;
1113
1114         if (io->action & BLK_TC_ACT(BLK_TC_NOTIFY))
1115                 return;
1116
1117         /* iops and tput use the same events */
1118         if (action != tput_event(trace))
1119                 return;
1120
1121         seconds = SECONDS(io->time);
1122         gld->data[seconds].sum += 1;
1123         gld->data[seconds].count = 1;
1124         if (gld->data[seconds].sum > gld->max)
1125                 gld->max = gld->data[seconds].sum;
1126 }
1127
/*
 * Inspect one trace record and process any notify payload it carries;
 * delegates entirely to handle_notify().
 */
void check_record(struct trace *trace)
{
	handle_notify(trace);
}