blktrace.c: Add support for read_iolog_chunked
/*
 * blktrace support code for fio
 */
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <errno.h>

#include "flist.h"
#include "fio.h"
#include "iolog.h"
#include "blktrace.h"
#include "blktrace_api.h"
#include "oslib/linux-dev-lookup.h"

/*
 * Just discard the pdu by seeking past it.
 */
static int discard_pdu(FILE *f, struct blk_io_trace *t)
{
	if (t->pdu_len == 0)
		return 0;

	dprint(FD_BLKTRACE, "discard pdu len %u\n", t->pdu_len);
	if (fseek(f, t->pdu_len, SEEK_CUR) < 0)
		return -errno;

	return t->pdu_len;
}
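
/*
 * On-disk format: a blktrace file is a stream of records, each a
 * fixed-size struct blk_io_trace header followed by pdu_len bytes of
 * optional payload (the pdu). fio never interprets the payload, so
 * discard_pdu() above simply seeks past it.
 */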

/*
 * Check if this is a blktrace binary data file. We read a single trace
 * into memory and check for the magic signature.
 */
bool is_blktrace(const char *filename, int *need_swap)
{
	struct blk_io_trace t;
	int fd, ret;

	fd = open(filename, O_RDONLY);
	if (fd < 0)
		return false;

	ret = read(fd, &t, sizeof(t));
	close(fd);

	if (ret < 0) {
		perror("read blktrace");
		return false;
	} else if (ret != sizeof(t)) {
		log_err("fio: short read on blktrace file\n");
		return false;
	}

	if ((t.magic & 0xffffff00) == BLK_IO_TRACE_MAGIC) {
		*need_swap = 0;
		return true;
	}

	/*
	 * Maybe it needs to be endian swapped...
	 */
	t.magic = fio_swap32(t.magic);
	if ((t.magic & 0xffffff00) == BLK_IO_TRACE_MAGIC) {
		*need_swap = 1;
		return true;
	}

	return false;
}
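
/*
 * Example: BLK_IO_TRACE_MAGIC is 0x65617400 and the low byte of t.magic
 * carries the format version, so a valid version-7 trace starts with
 * 0x65617407 in native endianness. Read on an opposite-endian machine,
 * the same word appears as 0x07746165 and only matches after the
 * fio_swap32() probe above.
 */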

#define FMINORBITS	20
#define FMINORMASK	((1U << FMINORBITS) - 1)
#define FMAJOR(dev)	((unsigned int) ((dev) >> FMINORBITS))
#define FMINOR(dev)	((unsigned int) ((dev) & FMINORMASK))

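/*
 * Worked example: blktrace packs the kernel dev_t as major << 20 | minor,
 * so device = (8 << 20) | 16 = 0x800010 decodes to FMAJOR() = 8 and
 * FMINOR() = 16, i.e. block device 8:16 (typically /dev/sdb).
 */
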
static void trace_add_open_close_event(struct thread_data *td, int fileno, enum file_log_act action)
{
	struct io_piece *ipo;

	ipo = calloc(1, sizeof(*ipo));
	init_ipo(ipo);

	ipo->ddir = DDIR_INVAL;
	ipo->fileno = fileno;
	ipo->file_action = action;
	flist_add_tail(&ipo->list, &td->io_log_list);
}

static int trace_add_file(struct thread_data *td, __u32 device)
{
	static unsigned int last_maj, last_min, last_fileno;
	unsigned int maj = FMAJOR(device);
	unsigned int min = FMINOR(device);
	struct fio_file *f;
	char dev[256];
	unsigned int i;

	if (last_maj == maj && last_min == min)
		return last_fileno;

	last_maj = maj;
	last_min = min;

	/*
	 * check for this file in our list
	 */
	for_each_file(td, f, i)
		if (f->major == maj && f->minor == min) {
			last_fileno = f->fileno;
			return last_fileno;
		}

	strcpy(dev, "/dev");
	if (blktrace_lookup_device(td->o.replay_redirect, dev, maj, min)) {
		int fileno;

		if (td->o.replay_redirect)
			dprint(FD_BLKTRACE, "device lookup: %d/%d overridden"
					" with: %s\n", maj, min,
					td->o.replay_redirect);
		else
			dprint(FD_BLKTRACE, "device lookup: %d/%d\n", maj, min);

		dprint(FD_BLKTRACE, "add device %s\n", dev);
		fileno = add_file_exclusive(td, dev);
		td->o.open_files++;
		td->files[fileno]->major = maj;
		td->files[fileno]->minor = min;
		trace_add_open_close_event(td, fileno, FIO_LOG_OPEN_FILE);
		last_fileno = fileno;
	}

	return last_fileno;
}

static void t_bytes_align(struct thread_options *o, struct blk_io_trace *t)
{
	if (!o->replay_align)
		return;

	t->bytes = (t->bytes + o->replay_align - 1) & ~(o->replay_align - 1);
}
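
/*
 * Rounding example for t_bytes_align() above, assuming replay_align is
 * a power of two: with replay_align=4096, a 5000-byte request becomes
 * (5000 + 4095) & ~4095 = 8192 bytes.
 */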

/*
 * Store blk_io_trace data in an ipo for later retrieval. The trace
 * reports the offset in 512-byte sectors and the inter-trace time in
 * nanoseconds; convert them to fio's native bytes and microseconds.
 */
static void store_ipo(struct thread_data *td, unsigned long long offset,
		      unsigned int bytes, int rw, unsigned long long ttime,
		      int fileno)
{
	struct io_piece *ipo;

	ipo = calloc(1, sizeof(*ipo));
	init_ipo(ipo);

	ipo->offset = offset * 512;
	if (td->o.replay_scale)
		ipo->offset = ipo->offset / td->o.replay_scale;
	ipo_bytes_align(td->o.replay_align, ipo);
	ipo->len = bytes;
	ipo->delay = ttime / 1000;
	if (rw)
		ipo->ddir = DDIR_WRITE;
	else
		ipo->ddir = DDIR_READ;
	ipo->fileno = fileno;

	dprint(FD_BLKTRACE, "store ddir=%d, off=%llu, len=%lu, delay=%lu\n",
							ipo->ddir, ipo->offset,
							ipo->len, ipo->delay);
	queue_io_piece(td, ipo);
}

static bool handle_trace_notify(struct blk_io_trace *t)
{
	switch (t->action) {
	case BLK_TN_PROCESS:
		dprint(FD_BLKTRACE, "got process notify: %x, %d\n",
				t->action, t->pid);
		break;
	case BLK_TN_TIMESTAMP:
		dprint(FD_BLKTRACE, "got timestamp notify: %x, %d\n",
				t->action, t->pid);
		break;
	case BLK_TN_MESSAGE:
		break;
	default:
		dprint(FD_BLKTRACE, "unknown trace act %x\n", t->action);
		break;
	}
	return false;
}

static bool handle_trace_discard(struct thread_data *td,
				 struct blk_io_trace *t,
				 unsigned long long ttime,
				 unsigned long *ios, unsigned long long *bs)
{
	struct io_piece *ipo;
	int fileno;

	if (td->o.replay_skip & (1u << DDIR_TRIM))
		return false;

	ipo = calloc(1, sizeof(*ipo));
	init_ipo(ipo);
	fileno = trace_add_file(td, t->device);

	ios[DDIR_TRIM]++;
	if (t->bytes > bs[DDIR_TRIM])
		bs[DDIR_TRIM] = t->bytes;

	td->o.size += t->bytes;

	INIT_FLIST_HEAD(&ipo->list);

	ipo->offset = t->sector * 512;
	if (td->o.replay_scale)
		ipo->offset = ipo->offset / td->o.replay_scale;
	ipo_bytes_align(td->o.replay_align, ipo);
	ipo->len = t->bytes;
	ipo->delay = ttime / 1000;
	ipo->ddir = DDIR_TRIM;
	ipo->fileno = fileno;

	dprint(FD_BLKTRACE, "store discard, off=%llu, len=%lu, delay=%lu\n",
							ipo->offset, ipo->len,
							ipo->delay);
	queue_io_piece(td, ipo);
	return true;
}

static void dump_trace(struct blk_io_trace *t)
{
	log_err("blktrace: ignoring zero byte trace: action=%x\n", t->action);
}

static bool handle_trace_fs(struct thread_data *td, struct blk_io_trace *t,
			    unsigned long long ttime, unsigned long *ios,
			    unsigned long long *bs)
{
	int rw;
	int fileno;

	fileno = trace_add_file(td, t->device);

	rw = (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0;

	if (rw) {
		if (td->o.replay_skip & (1u << DDIR_WRITE))
			return false;
	} else {
		if (td->o.replay_skip & (1u << DDIR_READ))
			return false;
	}

	if (!t->bytes) {
		if (!fio_did_warn(FIO_WARN_BTRACE_ZERO))
			dump_trace(t);
		return false;
	}

	if (t->bytes > bs[rw])
		bs[rw] = t->bytes;

	ios[rw]++;
	td->o.size += t->bytes;
	store_ipo(td, t->sector, t->bytes, rw, ttime, fileno);
	return true;
}

static bool handle_trace_flush(struct thread_data *td, struct blk_io_trace *t,
			       unsigned long long ttime, unsigned long *ios)
{
	struct io_piece *ipo;
	int fileno;

	if (td->o.replay_skip & (1u << DDIR_SYNC))
		return false;

	ipo = calloc(1, sizeof(*ipo));
	init_ipo(ipo);
	fileno = trace_add_file(td, t->device);

	ipo->delay = ttime / 1000;
	ipo->ddir = DDIR_SYNC;
	ipo->fileno = fileno;

	ios[DDIR_SYNC]++;
	dprint(FD_BLKTRACE, "store flush delay=%lu\n", ipo->delay);
	queue_io_piece(td, ipo);
	return true;
}

/*
 * We only care about queue traces; most of the others are side effects
 * of the block layer's internal workings.
 */
static bool queue_trace(struct thread_data *td, struct blk_io_trace *t,
			 unsigned long *ios, unsigned long long *bs)
{
	static unsigned long long last_ttime;
	unsigned long long delay = 0;

	if ((t->action & 0xffff) != __BLK_TA_QUEUE)
		return false;

	if (!(t->action & BLK_TC_ACT(BLK_TC_NOTIFY))) {
		if (!last_ttime || td->o.no_stall)
			delay = 0;
		else if (td->o.replay_time_scale == 100)
			delay = t->time - last_ttime;
		else {
			double tmp = t->time - last_ttime;
			double scale;

			scale = (double) 100.0 / (double) td->o.replay_time_scale;
			tmp *= scale;
			delay = tmp;
		}
		last_ttime = t->time;
	}

	t_bytes_align(&td->o, t);

	if (t->action & BLK_TC_ACT(BLK_TC_NOTIFY))
		return handle_trace_notify(t);
	else if (t->action & BLK_TC_ACT(BLK_TC_DISCARD))
		return handle_trace_discard(td, t, delay, ios, bs);
	else if (t->action & BLK_TC_ACT(BLK_TC_FLUSH))
		return handle_trace_flush(td, t, delay, ios);
	else
		return handle_trace_fs(td, t, delay, ios, bs);
}

static void byteswap_trace(struct blk_io_trace *t)
{
	t->magic = fio_swap32(t->magic);
	t->sequence = fio_swap32(t->sequence);
	t->time = fio_swap64(t->time);
	t->sector = fio_swap64(t->sector);
	t->bytes = fio_swap32(t->bytes);
	t->action = fio_swap32(t->action);
	t->pid = fio_swap32(t->pid);
	t->device = fio_swap32(t->device);
	t->cpu = fio_swap32(t->cpu);
	t->error = fio_swap16(t->error);
	t->pdu_len = fio_swap16(t->pdu_len);
}
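
/*
 * The action word packs two fields (see blktrace_api.h): the low 16 bits
 * hold the action code (__BLK_TA_QUEUE, __BLK_TA_COMPLETE, ...) and the
 * upper 16 bits hold the category mask, shifted into place by BLK_TC_ACT().
 * So t->action & 0xffff selects the event type, while
 * t->action & BLK_TC_ACT(BLK_TC_WRITE) tests the write category bit.
 */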

static bool t_is_write(struct blk_io_trace *t)
{
	return (t->action & BLK_TC_ACT(BLK_TC_WRITE | BLK_TC_DISCARD)) != 0;
}

static enum fio_ddir t_get_ddir(struct blk_io_trace *t)
{
	if (t->action & BLK_TC_ACT(BLK_TC_READ))
		return DDIR_READ;
	else if (t->action & BLK_TC_ACT(BLK_TC_WRITE))
		return DDIR_WRITE;
	else if (t->action & BLK_TC_ACT(BLK_TC_DISCARD))
		return DDIR_TRIM;

	return DDIR_INVAL;
}

static void depth_inc(struct blk_io_trace *t, int *depth)
{
	enum fio_ddir ddir;

	ddir = t_get_ddir(t);
	if (ddir != DDIR_INVAL)
		depth[ddir]++;
}

static void depth_dec(struct blk_io_trace *t, int *depth)
{
	enum fio_ddir ddir;

	ddir = t_get_ddir(t);
	if (ddir != DDIR_INVAL)
		depth[ddir]--;
}

static void depth_end(struct blk_io_trace *t, int *this_depth, int *depth)
{
	enum fio_ddir ddir;

	ddir = t_get_ddir(t);
	if (ddir != DDIR_INVAL) {
		depth[ddir] = max(depth[ddir], this_depth[ddir]);
		this_depth[ddir] = 0;
	}
}
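
/*
 * The helpers above probe per-direction queue depth from the trace:
 * QUEUE raises the in-flight count, front/back merges lower it (the
 * merged request never completes on its own), and COMPLETE folds the
 * high-water mark into depth[]. read_blktrace() later uses the probed
 * depth as the default iodepth.
 */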

/*
 * Load a blktrace file by reading all the blk_io_trace entries and
 * storing them as io_pieces, just like the text iolog version does.
 */
bool init_blktrace_read(struct thread_data *td, const char *filename, int need_swap)
{
	int old_state;

	td->io_log_rfile = fopen(filename, "rb");
	if (!td->io_log_rfile) {
		td_verror(td, errno, "open blktrace file");
		goto err;
	}
	td->io_log_blktrace_swap = need_swap;
	td->o.size = 0;

	free_release_files(td);

	old_state = td_bump_runstate(td, TD_SETTING_UP);

	if (!read_blktrace(td))
		goto err;

	td_restore_runstate(td, old_state);

	if (!td->files_index) {
		log_err("fio: did not find replay device(s)\n");
		return false;
	}

	return true;

err:
	if (td->io_log_rfile) {
		fclose(td->io_log_rfile);
		td->io_log_rfile = NULL;
	}
	return false;
}
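
/*
 * When read_iolog_chunked is set, init_blktrace_read() above leaves the
 * file handle open and read_blktrace() below only parses a batch of
 * entries sized by iolog_items_to_fetch(); the replay path then calls
 * it again to refill td->io_log_list as entries are consumed. Without
 * the option, the whole trace is parsed up front in a single pass.
 */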

bool read_blktrace(struct thread_data *td)
{
	struct blk_io_trace t;
	unsigned long ios[DDIR_RWDIR_SYNC_CNT] = { };
	unsigned long long rw_bs[DDIR_RWDIR_CNT] = { };
	unsigned long skipped_writes;
	FILE *f = td->io_log_rfile;
	int i, max_depth;
	struct fio_file *fiof;
	int this_depth[DDIR_RWDIR_CNT] = { };
	int depth[DDIR_RWDIR_CNT] = { };
	int64_t items_to_fetch = 0;

	if (td->o.read_iolog_chunked) {
		items_to_fetch = iolog_items_to_fetch(td);
		if (!items_to_fetch)
			return true;
	}

	skipped_writes = 0;
	do {
		int ret = fread(&t, 1, sizeof(t), f);

		if (ferror(f)) {
			td_verror(td, errno, "read blktrace file");
			goto err;
		} else if (feof(f)) {
			break;
		} else if (ret < (int) sizeof(t)) {
			log_err("fio: iolog short read\n");
			break;
		}

		if (td->io_log_blktrace_swap)
			byteswap_trace(&t);

		if ((t.magic & 0xffffff00) != BLK_IO_TRACE_MAGIC) {
			log_err("fio: bad magic in blktrace data: %x\n",
								t.magic);
			goto err;
		}
		if ((t.magic & 0xff) != BLK_IO_TRACE_VERSION) {
			log_err("fio: bad blktrace version %d\n",
								t.magic & 0xff);
			goto err;
		}
		ret = discard_pdu(f, &t);
		if (ret < 0) {
			td_verror(td, -ret, "blktrace lseek");
			goto err;
		}
		if ((t.action & BLK_TC_ACT(BLK_TC_NOTIFY)) == 0) {
			if ((t.action & 0xffff) == __BLK_TA_QUEUE)
				depth_inc(&t, this_depth);
			else if (((t.action & 0xffff) == __BLK_TA_BACKMERGE) ||
				((t.action & 0xffff) == __BLK_TA_FRONTMERGE))
				depth_dec(&t, this_depth);
			else if ((t.action & 0xffff) == __BLK_TA_COMPLETE)
				depth_end(&t, this_depth, depth);

			if (t_is_write(&t) && read_only) {
				skipped_writes++;
				continue;
			}
		}

		if (!queue_trace(td, &t, ios, rw_bs))
			continue;

		if (td->o.read_iolog_chunked) {
			td->io_log_current++;
			items_to_fetch--;
			if (items_to_fetch == 0)
				break;
		}
	} while (1);

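	/*
	 * Refill bookkeeping for chunked reads: remember how many entries
	 * are queued right now (the high watermark) and arm a check at
	 * half that, so the replay path can fetch the next batch before
	 * the queue runs dry.
	 */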
	if (td->o.read_iolog_chunked) {
		td->io_log_highmark = td->io_log_current;
		td->io_log_checkmark = (td->io_log_highmark + 1) / 2;
		fio_gettime(&td->io_log_highmark_time, NULL);
	}

	if (skipped_writes)
		log_err("fio: %s skips replay of %lu writes due to read-only\n",
						td->o.name, skipped_writes);

	if (td->o.read_iolog_chunked) {
		if (td->io_log_current == 0)
			return false;
		td->o.td_ddir = TD_DDIR_RW;
		if ((rw_bs[DDIR_READ] > td->o.max_bs[DDIR_READ] ||
		     rw_bs[DDIR_WRITE] > td->o.max_bs[DDIR_WRITE] ||
		     rw_bs[DDIR_TRIM] > td->o.max_bs[DDIR_TRIM]) &&
		    td->orig_buffer) {
			td->o.max_bs[DDIR_READ] = max(td->o.max_bs[DDIR_READ], rw_bs[DDIR_READ]);
			td->o.max_bs[DDIR_WRITE] = max(td->o.max_bs[DDIR_WRITE], rw_bs[DDIR_WRITE]);
			td->o.max_bs[DDIR_TRIM] = max(td->o.max_bs[DDIR_TRIM], rw_bs[DDIR_TRIM]);
			io_u_quiesce(td);
			free_io_mem(td);
			init_io_u_buffers(td);
		}
		return true;
	}

	for_each_file(td, fiof, i)
		trace_add_open_close_event(td, fiof->fileno, FIO_LOG_CLOSE_FILE);

	fclose(td->io_log_rfile);
	td->io_log_rfile = NULL;

	/*
	 * For stacked devices, we don't always get a COMPLETE event so
	 * the depth grows to insane values. Limit it to something sane(r).
	 */
	max_depth = 0;
	for (i = 0; i < DDIR_RWDIR_CNT; i++) {
		if (depth[i] > 1024)
			depth[i] = 1024;
		else if (!depth[i] && ios[i])
			depth[i] = 1;
		max_depth = max(depth[i], max_depth);
	}

	if (!ios[DDIR_READ] && !ios[DDIR_WRITE] && !ios[DDIR_TRIM] &&
	    !ios[DDIR_SYNC]) {
		log_err("fio: found no ios in blktrace data\n");
		return false;
	}

	td->o.td_ddir = 0;
	if (ios[DDIR_READ]) {
		td->o.td_ddir |= TD_DDIR_READ;
		td->o.max_bs[DDIR_READ] = rw_bs[DDIR_READ];
	}
	if (ios[DDIR_WRITE]) {
		td->o.td_ddir |= TD_DDIR_WRITE;
		td->o.max_bs[DDIR_WRITE] = rw_bs[DDIR_WRITE];
	}
	if (ios[DDIR_TRIM]) {
		td->o.td_ddir |= TD_DDIR_TRIM;
		td->o.max_bs[DDIR_TRIM] = rw_bs[DDIR_TRIM];
	}

	/*
	 * We need to do direct/raw ios to the device, to avoid getting
	 * read-ahead in our way. But only do so if the minimum block size
	 * is a multiple of 4k, otherwise we don't know if it's safe to do so.
	 */
	if (!fio_option_is_set(&td->o, odirect) && !(td_min_bs(td) & 4095))
		td->o.odirect = 1;

	/*
	 * If depth wasn't manually set, use probed depth
	 */
	if (!fio_option_is_set(&td->o, iodepth))
		td->o.iodepth = td->o.iodepth_low = max_depth;

	return true;
err:
	/* clear the handle so the caller's error path does not close it again */
	fclose(f);
	td->io_log_rfile = NULL;
	return false;
}

static int init_merge_param_list(fio_fp64_t *vals, struct blktrace_cursor *bcs,
				 int nr_logs, int def, size_t off)
{
	int i, len = 0;

	while (len < FIO_IO_U_LIST_MAX_LEN && vals[len].u.f != 0.0)
		len++;

	if (len && len != nr_logs)
		return len;

	for (i = 0; i < nr_logs; i++) {
		int *val = (int *)((char *)&bcs[i] + off);
		*val = def;
		if (len)
			*val = (int)vals[i].u.f;
	}

	return 0;
}
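
/*
 * Example: with read_iolog=a.bin:b.bin and merge_blktrace_scalars=50:100,
 * vals holds { 50.0, 100.0 }, nr_logs is 2, and the loop above stores 50
 * and 100 into the scalar field of each cursor through the offsetof()-based
 * pointer. An empty list falls back to def for every log; a non-empty list
 * whose length differs from nr_logs is returned to the caller as an error.
 */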

static int find_earliest_io(struct blktrace_cursor *bcs, int nr_logs)
{
	__u64 time = ~(__u64)0;
	int idx = 0, i;

	for (i = 0; i < nr_logs; i++) {
		if (bcs[i].t.time < time) {
			time = bcs[i].t.time;
			idx = i;
		}
	}

	return idx;
}

static void merge_finish_file(struct blktrace_cursor *bcs, int i, int *nr_logs)
{
	bcs[i].iter++;
	if (bcs[i].iter < bcs[i].nr_iter) {
		fseek(bcs[i].f, 0, SEEK_SET);
		return;
	}

	*nr_logs -= 1;

	/* close file */
	fclose(bcs[i].f);

	/* keep active files contiguous */
	memmove(&bcs[i], &bcs[*nr_logs], sizeof(bcs[i]));
}

static int read_trace(struct thread_data *td, struct blktrace_cursor *bc)
{
	int ret = 0;
	struct blk_io_trace *t = &bc->t;

read_skip:
	/* read an io trace */
	ret = fread(t, 1, sizeof(*t), bc->f);
	if (ferror(bc->f)) {
		td_verror(td, errno, "read blktrace file");
		return -1;
	} else if (feof(bc->f)) {
		if (!bc->length)
			bc->length = bc->t.time;
		return ret;
	} else if (ret < (int) sizeof(*t)) {
		log_err("fio: iolog short read\n");
		return -1;
	}

	if (bc->swap)
		byteswap_trace(t);

	/* skip over actions that fio does not care about */
	if ((t->action & 0xffff) != __BLK_TA_QUEUE ||
	    t_get_ddir(t) == DDIR_INVAL) {
		ret = discard_pdu(bc->f, t);
		if (ret < 0) {
			td_verror(td, -ret, "blktrace lseek");
			return ret;
		}
		goto read_skip;
	}

	t->time = (t->time + bc->iter * bc->length) * bc->scalar / 100;

	return ret;
}

static int write_trace(FILE *fp, struct blk_io_trace *t)
{
	/* pdu is not used so just write out only the io trace */
	t->pdu_len = 0;
	return fwrite((void *)t, sizeof(*t), 1, fp);
}

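/*
 * Merge several blktrace files into one time-ordered trace. Each input's
 * timestamps are scaled by merge_blktrace_scalars (a percentage) and each
 * input is looped merge_blktrace_iters times. For example, a job with
 *
 *	read_iolog=dev0.bin:dev1.bin
 *	merge_blktrace_file=merged.bin
 *	merge_blktrace_scalars=50:100
 *	merge_blktrace_iters=2:1
 *
 * merges two passes of dev0 replayed at twice the recorded speed with a
 * single pass of dev1 at native speed.
 */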
int merge_blktrace_iologs(struct thread_data *td)
{
	int nr_logs = get_max_str_idx(td->o.read_iolog_file);
	struct blktrace_cursor *bcs = malloc(sizeof(struct blktrace_cursor) *
					     nr_logs);
	struct blktrace_cursor *bc;
	FILE *merge_fp;
	char *str, *ptr, *name, *merge_buf;
	int i, ret;

	ret = init_merge_param_list(td->o.merge_blktrace_scalars, bcs, nr_logs,
				    100, offsetof(struct blktrace_cursor,
						  scalar));
	if (ret) {
		log_err("fio: merge_blktrace_scalars(%d) != nr_logs(%d)\n",
			ret, nr_logs);
		goto err_param;
	}

	ret = init_merge_param_list(td->o.merge_blktrace_iters, bcs, nr_logs,
				    1, offsetof(struct blktrace_cursor,
						nr_iter));
	if (ret) {
		log_err("fio: merge_blktrace_iters(%d) != nr_logs(%d)\n",
			ret, nr_logs);
		goto err_param;
	}

	/* setup output file */
	merge_fp = fopen(td->o.merge_blktrace_file, "w");
	if (!merge_fp) {
		ret = -errno;
		goto err_param;
	}
	merge_buf = malloc(128 * 1024);
	if (!merge_buf) {
		ret = -ENOMEM;
		goto err_out_file;
	}
	ret = setvbuf(merge_fp, merge_buf, _IOFBF, 128 * 1024);
	if (ret)
		goto err_merge_buf;

	/* setup input files */
	str = ptr = strdup(td->o.read_iolog_file);
	nr_logs = 0;
	for (i = 0; (name = get_next_str(&ptr)) != NULL; i++) {
		bcs[i].f = fopen(name, "rb");
		if (!bcs[i].f) {
			log_err("fio: could not open file: %s\n", name);
			ret = -errno;
			free(str);
			goto err_file;
		}
		nr_logs++;

		if (!is_blktrace(name, &bcs[i].swap)) {
			log_err("fio: file is not a blktrace: %s\n", name);
			ret = -EINVAL;
			free(str);
			goto err_file;
		}

		ret = read_trace(td, &bcs[i]);
		if (ret < 0) {
			free(str);
			goto err_file;
		} else if (!ret) {
			merge_finish_file(bcs, i, &nr_logs);
			i--;
		}
	}
	free(str);

	/* merge files */
	while (nr_logs) {
		i = find_earliest_io(bcs, nr_logs);
		bc = &bcs[i];
		/* skip over the pdu */
		ret = discard_pdu(bc->f, &bc->t);
		if (ret < 0) {
			td_verror(td, -ret, "blktrace lseek");
			goto err_file;
		}

		ret = write_trace(merge_fp, &bc->t);
		if (ret != 1) {
			ret = -EIO;
			goto err_file;
		}
		ret = read_trace(td, bc);
		if (ret < 0)
			goto err_file;
		else if (!ret)
			merge_finish_file(bcs, i, &nr_logs);
	}

	/* set iolog file to read from the newly merged file */
	td->o.read_iolog_file = td->o.merge_blktrace_file;
	ret = 0;

err_file:
	/* cleanup */
	for (i = 0; i < nr_logs; i++)
		fclose(bcs[i].f);
err_merge_buf:
	free(merge_buf);
err_out_file:
	fflush(merge_fp);
	fclose(merge_fp);
err_param:
	free(bcs);

	return ret;
}