/*
 * blktrace support code for fio
 */
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#include "flist.h"
#include "fio.h"
#include "blktrace.h"
#include "blktrace_api.h"
#include "oslib/linux-dev-lookup.h"

#define TRACE_FIFO_SIZE 8192

/*
 * fifo refill frontend, to avoid reading data in trace-sized bites
 */
static int refill_fifo(struct thread_data *td, struct fifo *fifo, int fd)
{
	char buf[TRACE_FIFO_SIZE];
	unsigned int total;
	int ret;

	total = sizeof(buf);
	if (total > fifo_room(fifo))
		total = fifo_room(fifo);

	ret = read(fd, buf, total);
	if (ret < 0) {
		td_verror(td, errno, "read blktrace file");
		return -1;
	}

	if (ret > 0)
		ret = fifo_put(fifo, buf, ret);

	dprint(FD_BLKTRACE, "refill: filled %d bytes\n", ret);
	return ret;
}

/*
 * Retrieve 'len' bytes from the fifo, refilling if necessary.
 */
static int trace_fifo_get(struct thread_data *td, struct fifo *fifo, int fd,
			  void *buf, unsigned int len)
{
	if (fifo_len(fifo) < len) {
		int ret = refill_fifo(td, fifo, fd);

		if (ret < 0)
			return ret;
	}

	return fifo_get(fifo, buf, len);
}

/*
 * Just discard the pdu by reading past it.
 */
static int discard_pdu(struct thread_data *td, struct fifo *fifo, int fd,
		       struct blk_io_trace *t)
{
	if (t->pdu_len == 0)
		return 0;

	dprint(FD_BLKTRACE, "discard pdu len %u\n", t->pdu_len);
	return trace_fifo_get(td, fifo, fd, NULL, t->pdu_len);
}

/*
 * Check if this is a blktrace binary data file. We read a single trace
 * into memory and check for the magic signature.
 */
bool is_blktrace(const char *filename, int *need_swap)
{
	struct blk_io_trace t;
	int fd, ret;

	fd = open(filename, O_RDONLY);
	if (fd < 0)
		return false;

	ret = read(fd, &t, sizeof(t));
	close(fd);

	if (ret < 0) {
		perror("read blktrace");
		return false;
	} else if (ret != sizeof(t)) {
		log_err("fio: short read on blktrace file\n");
		return false;
	}

	if ((t.magic & 0xffffff00) == BLK_IO_TRACE_MAGIC) {
		*need_swap = 0;
		return true;
	}

	/*
	 * Maybe it needs to be endian swapped...
	 */
	t.magic = fio_swap32(t.magic);
	if ((t.magic & 0xffffff00) == BLK_IO_TRACE_MAGIC) {
		*need_swap = 1;
		return true;
	}

	return false;
}

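/*
 * Decode a kernel dev_t-style device number: the low FMINORBITS bits
 * hold the minor number, the remainder the major number.
 */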
#define FMINORBITS	20
#define FMINORMASK	((1U << FMINORBITS) - 1)
#define FMAJOR(dev)	((unsigned int) ((dev) >> FMINORBITS))
#define FMINOR(dev)	((unsigned int) ((dev) & FMINORMASK))

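/*
 * Queue a synthetic open or close action for 'fileno' in the io log, so
 * replay opens and closes the device at the right point in the stream.
 */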
static void trace_add_open_close_event(struct thread_data *td, int fileno, enum file_log_act action)
{
	struct io_piece *ipo;

	ipo = calloc(1, sizeof(*ipo));
	init_ipo(ipo);

	ipo->ddir = DDIR_INVAL;
	ipo->fileno = fileno;
	ipo->file_action = action;
	flist_add_tail(&ipo->list, &td->io_log_list);
}

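/*
 * Map the device in a trace to an fio file, adding the device node (or
 * its replay_redirect substitute) on first sight. The last lookup is
 * cached, since traces tend to arrive in runs from the same device.
 */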
static int trace_add_file(struct thread_data *td, __u32 device)
{
	static unsigned int last_maj, last_min, last_fileno;
	unsigned int maj = FMAJOR(device);
	unsigned int min = FMINOR(device);
	struct fio_file *f;
	char dev[256];
	unsigned int i;

	if (last_maj == maj && last_min == min)
		return last_fileno;

	last_maj = maj;
	last_min = min;

	/*
	 * check for this file in our list
	 */
	for_each_file(td, f, i)
		if (f->major == maj && f->minor == min) {
			last_fileno = f->fileno;
			return last_fileno;
		}

	strcpy(dev, "/dev");
	if (blktrace_lookup_device(td->o.replay_redirect, dev, maj, min)) {
		int fileno;

		if (td->o.replay_redirect)
			dprint(FD_BLKTRACE, "device lookup: %d/%d overridden"
					" with: %s\n", maj, min,
					td->o.replay_redirect);
		else
			dprint(FD_BLKTRACE, "device lookup: %d/%d\n", maj, min);

		dprint(FD_BLKTRACE, "add device %s\n", dev);
		fileno = add_file_exclusive(td, dev);
		td->o.open_files++;
		td->files[fileno]->major = maj;
		td->files[fileno]->minor = min;
		trace_add_open_close_event(td, fileno, FIO_LOG_OPEN_FILE);
		last_fileno = fileno;
	}

	return last_fileno;
}

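/*
 * Round the trace byte count up to the replay alignment, which is
 * assumed to be a power of two.
 */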
static void t_bytes_align(struct thread_options *o, struct blk_io_trace *t)
{
	if (!o->replay_align)
		return;

	t->bytes = (t->bytes + o->replay_align - 1) & ~(o->replay_align - 1);
}

/*
 * Store blk_io_trace data in an ipo for later retrieval.
 */
static void store_ipo(struct thread_data *td, unsigned long long offset,
		      unsigned int bytes, int rw, unsigned long long ttime,
		      int fileno)
{
	struct io_piece *ipo;

	ipo = calloc(1, sizeof(*ipo));
	init_ipo(ipo);

	ipo->offset = offset * 512;
	if (td->o.replay_scale)
		ipo->offset = ipo->offset / td->o.replay_scale;
	ipo_bytes_align(td->o.replay_align, ipo);
	ipo->len = bytes;
	ipo->delay = ttime / 1000;
	if (rw)
		ipo->ddir = DDIR_WRITE;
	else
		ipo->ddir = DDIR_READ;
	ipo->fileno = fileno;

	dprint(FD_BLKTRACE, "store ddir=%d, off=%llu, len=%lu, delay=%lu\n",
							ipo->ddir, ipo->offset,
							ipo->len, ipo->delay);
	queue_io_piece(td, ipo);
}

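/*
 * Notify traces carry no io to replay; just log the ones of interest
 * for debugging.
 */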
static void handle_trace_notify(struct blk_io_trace *t)
{
	switch (t->action) {
	case BLK_TN_PROCESS:
		dprint(FD_BLKTRACE, "got process notify: %x, %d\n",
				t->action, t->pid);
		break;
	case BLK_TN_TIMESTAMP:
		dprint(FD_BLKTRACE, "got timestamp notify: %x, %d\n",
				t->action, t->pid);
		break;
	case BLK_TN_MESSAGE:
		break;
	default:
		dprint(FD_BLKTRACE, "unknown trace act %x\n", t->action);
		break;
	}
}

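/*
 * Queue a discard trace as a DDIR_TRIM io_piece, and account for it in
 * the per-direction io count and maximum block size.
 */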
static void handle_trace_discard(struct thread_data *td,
				 struct blk_io_trace *t,
				 unsigned long long ttime,
				 unsigned long *ios, unsigned int *bs)
{
	struct io_piece *ipo;
	int fileno;

	if (td->o.replay_skip & (1u << DDIR_TRIM))
		return;

	ipo = calloc(1, sizeof(*ipo));
	init_ipo(ipo);
	fileno = trace_add_file(td, t->device);

	ios[DDIR_TRIM]++;
	if (t->bytes > bs[DDIR_TRIM])
		bs[DDIR_TRIM] = t->bytes;

	td->o.size += t->bytes;

	INIT_FLIST_HEAD(&ipo->list);

	ipo->offset = t->sector * 512;
	if (td->o.replay_scale)
		ipo->offset = ipo->offset / td->o.replay_scale;
	ipo_bytes_align(td->o.replay_align, ipo);
	ipo->len = t->bytes;
	ipo->delay = ttime / 1000;
	ipo->ddir = DDIR_TRIM;
	ipo->fileno = fileno;

	dprint(FD_BLKTRACE, "store discard, off=%llu, len=%lu, delay=%lu\n",
							ipo->offset, ipo->len,
							ipo->delay);
	queue_io_piece(td, ipo);
}

static void dump_trace(struct blk_io_trace *t)
{
	log_err("blktrace: ignoring zero byte trace: action=%x\n", t->action);
}

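/*
 * Queue a regular read or write trace as an io_piece, unless that
 * direction is being skipped or the trace carries no data.
 */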
static void handle_trace_fs(struct thread_data *td, struct blk_io_trace *t,
			    unsigned long long ttime, unsigned long *ios,
			    unsigned int *bs)
{
	int rw;
	int fileno;

	fileno = trace_add_file(td, t->device);

	rw = (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0;

	if (rw) {
		if (td->o.replay_skip & (1u << DDIR_WRITE))
			return;
	} else {
		if (td->o.replay_skip & (1u << DDIR_READ))
			return;
	}

	if (!t->bytes) {
		if (!fio_did_warn(FIO_WARN_BTRACE_ZERO))
			dump_trace(t);
		return;
	}

	if (t->bytes > bs[rw])
		bs[rw] = t->bytes;

	ios[rw]++;
	td->o.size += t->bytes;
	store_ipo(td, t->sector, t->bytes, rw, ttime, fileno);
}

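/*
 * Queue a flush trace as a DDIR_SYNC io_piece; it has no offset or
 * length, only a delay.
 */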
static void handle_trace_flush(struct thread_data *td, struct blk_io_trace *t,
			       unsigned long long ttime, unsigned long *ios)
{
	struct io_piece *ipo;
	int fileno;

	if (td->o.replay_skip & (1u << DDIR_SYNC))
		return;

	ipo = calloc(1, sizeof(*ipo));
	init_ipo(ipo);
	fileno = trace_add_file(td, t->device);

	ipo->delay = ttime / 1000;
	ipo->ddir = DDIR_SYNC;
	ipo->fileno = fileno;

	ios[DDIR_SYNC]++;
	dprint(FD_BLKTRACE, "store flush delay=%lu\n", ipo->delay);
	queue_io_piece(td, ipo);
}

/*
 * We only care about queue traces; most of the others are side effects
 * of the internal workings of the block layer.
 */
static void handle_trace(struct thread_data *td, struct blk_io_trace *t,
			 unsigned long *ios, unsigned int *bs)
{
	static unsigned long long last_ttime;
	unsigned long long delay = 0;

	if ((t->action & 0xffff) != __BLK_TA_QUEUE)
		return;

	if (!(t->action & BLK_TC_ACT(BLK_TC_NOTIFY))) {
		if (!last_ttime || td->o.no_stall)
			delay = 0;
		else if (td->o.replay_time_scale == 100)
			delay = t->time - last_ttime;
		else {
			double tmp = t->time - last_ttime;
			double scale;

			scale = (double) 100.0 / (double) td->o.replay_time_scale;
			tmp *= scale;
			delay = tmp;
		}
		last_ttime = t->time;
	}

	t_bytes_align(&td->o, t);

	if (t->action & BLK_TC_ACT(BLK_TC_NOTIFY))
		handle_trace_notify(t);
	else if (t->action & BLK_TC_ACT(BLK_TC_DISCARD))
		handle_trace_discard(td, t, delay, ios, bs);
	else if (t->action & BLK_TC_ACT(BLK_TC_FLUSH))
		handle_trace_flush(td, t, delay, ios);
	else
		handle_trace_fs(td, t, delay, ios, bs);
}

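/*
 * Fix up a trace that was recorded with the opposite endianness.
 */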
static void byteswap_trace(struct blk_io_trace *t)
{
	t->magic = fio_swap32(t->magic);
	t->sequence = fio_swap32(t->sequence);
	t->time = fio_swap64(t->time);
	t->sector = fio_swap64(t->sector);
	t->bytes = fio_swap32(t->bytes);
	t->action = fio_swap32(t->action);
	t->pid = fio_swap32(t->pid);
	t->device = fio_swap32(t->device);
	t->cpu = fio_swap32(t->cpu);
	t->error = fio_swap16(t->error);
	t->pdu_len = fio_swap16(t->pdu_len);
}

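/* discards modify the device, so treat them as writes */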
static bool t_is_write(struct blk_io_trace *t)
{
	return (t->action & BLK_TC_ACT(BLK_TC_WRITE | BLK_TC_DISCARD)) != 0;
}

static enum fio_ddir t_get_ddir(struct blk_io_trace *t)
{
	if (t->action & BLK_TC_ACT(BLK_TC_READ))
		return DDIR_READ;
	else if (t->action & BLK_TC_ACT(BLK_TC_WRITE))
		return DDIR_WRITE;
	else if (t->action & BLK_TC_ACT(BLK_TC_DISCARD))
		return DDIR_TRIM;

	return DDIR_INVAL;
}

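/*
 * Per-direction queue depth accounting: queue events increment the
 * running depth, merges decrement it, and completions fold the high
 * watermark into 'depth' before resetting the running count.
 */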
static void depth_inc(struct blk_io_trace *t, int *depth)
{
	enum fio_ddir ddir;

	ddir = t_get_ddir(t);
	if (ddir != DDIR_INVAL)
		depth[ddir]++;
}

static void depth_dec(struct blk_io_trace *t, int *depth)
{
	enum fio_ddir ddir;

	ddir = t_get_ddir(t);
	if (ddir != DDIR_INVAL)
		depth[ddir]--;
}

static void depth_end(struct blk_io_trace *t, int *this_depth, int *depth)
{
	enum fio_ddir ddir;

	ddir = t_get_ddir(t);
	if (ddir != DDIR_INVAL) {
		depth[ddir] = max(depth[ddir], this_depth[ddir]);
		this_depth[ddir] = 0;
	}
}

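/*
 * Replay is normally driven by pointing read_iolog at a blktrace binary;
 * an illustrative job section (file name hypothetical):
 *
 *	[replay]
 *	read_iolog=sdb.blktrace.bin
 */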
/*
 * Load a blktrace file by reading all the blk_io_trace entries, and storing
 * them as io_pieces like the fio text version would do.
 */
bool load_blktrace(struct thread_data *td, const char *filename, int need_swap)
{
	struct blk_io_trace t;
	unsigned long ios[DDIR_RWDIR_SYNC_CNT] = { };
	unsigned int rw_bs[DDIR_RWDIR_CNT] = { };
	unsigned long skipped_writes;
	struct fifo *fifo;
	int fd, i, old_state, max_depth;
	struct fio_file *f;
	int this_depth[DDIR_RWDIR_CNT] = { };
	int depth[DDIR_RWDIR_CNT] = { };

	fd = open(filename, O_RDONLY);
	if (fd < 0) {
		td_verror(td, errno, "open blktrace file");
		return false;
	}

	fifo = fifo_alloc(TRACE_FIFO_SIZE);

	old_state = td_bump_runstate(td, TD_SETTING_UP);

	td->o.size = 0;
	skipped_writes = 0;
	do {
		int ret = trace_fifo_get(td, fifo, fd, &t, sizeof(t));

		if (ret < 0)
			goto err;
		else if (!ret)
			break;
		else if (ret < (int) sizeof(t)) {
			log_err("fio: short fifo get\n");
			break;
		}

		if (need_swap)
			byteswap_trace(&t);

		if ((t.magic & 0xffffff00) != BLK_IO_TRACE_MAGIC) {
			log_err("fio: bad magic in blktrace data: %x\n",
								t.magic);
			goto err;
		}
		if ((t.magic & 0xff) != BLK_IO_TRACE_VERSION) {
			log_err("fio: bad blktrace version %d\n",
								t.magic & 0xff);
			goto err;
		}
		ret = discard_pdu(td, fifo, fd, &t);
		if (ret < 0) {
			td_verror(td, ret, "blktrace lseek");
			goto err;
		} else if (t.pdu_len != ret) {
			log_err("fio: discarded %d of %d\n", ret, t.pdu_len);
			goto err;
		}
		if ((t.action & BLK_TC_ACT(BLK_TC_NOTIFY)) == 0) {
			if ((t.action & 0xffff) == __BLK_TA_QUEUE)
				depth_inc(&t, this_depth);
			else if (((t.action & 0xffff) == __BLK_TA_BACKMERGE) ||
				((t.action & 0xffff) == __BLK_TA_FRONTMERGE))
				depth_dec(&t, this_depth);
			else if ((t.action & 0xffff) == __BLK_TA_COMPLETE)
				depth_end(&t, this_depth, depth);

			if (t_is_write(&t) && read_only) {
				skipped_writes++;
				continue;
			}
		}

		handle_trace(td, &t, ios, rw_bs);
	} while (1);

	for_each_file(td, f, i)
		trace_add_open_close_event(td, f->fileno, FIO_LOG_CLOSE_FILE);

	fifo_free(fifo);
	close(fd);

	td_restore_runstate(td, old_state);

	if (!td->files_index) {
		log_err("fio: did not find replay device(s)\n");
		return false;
	}

	/*
	 * For stacked devices, we don't always get a COMPLETE event so
	 * the depth grows to insane values. Limit it to something sane(r).
	 */
	max_depth = 0;
	for (i = 0; i < DDIR_RWDIR_CNT; i++) {
		if (depth[i] > 1024)
			depth[i] = 1024;
		else if (!depth[i] && ios[i])
			depth[i] = 1;
		max_depth = max(depth[i], max_depth);
	}

	if (skipped_writes)
		log_err("fio: %s skips replay of %lu writes due to read-only\n",
						td->o.name, skipped_writes);

	if (!ios[DDIR_READ] && !ios[DDIR_WRITE] && !ios[DDIR_TRIM] &&
	    !ios[DDIR_SYNC]) {
		log_err("fio: found no ios in blktrace data\n");
		return false;
	} else if (ios[DDIR_READ] && !ios[DDIR_WRITE]) {
		td->o.td_ddir = TD_DDIR_READ;
		td->o.max_bs[DDIR_READ] = rw_bs[DDIR_READ];
	} else if (!ios[DDIR_READ] && ios[DDIR_WRITE]) {
		td->o.td_ddir = TD_DDIR_WRITE;
		td->o.max_bs[DDIR_WRITE] = rw_bs[DDIR_WRITE];
	} else {
		td->o.td_ddir = TD_DDIR_RW;
		td->o.max_bs[DDIR_READ] = rw_bs[DDIR_READ];
		td->o.max_bs[DDIR_WRITE] = rw_bs[DDIR_WRITE];
		td->o.max_bs[DDIR_TRIM] = rw_bs[DDIR_TRIM];
	}

	/*
	 * We need to do direct/raw ios to the device, to avoid getting
	 * read-ahead in our way. But only do so if the minimum block size
	 * is a multiple of 4k, otherwise we don't know if it's safe to do so.
	 */
	if (!fio_option_is_set(&td->o, odirect) && !(td_min_bs(td) & 4095))
		td->o.odirect = 1;

	/*
	 * If depth wasn't manually set, use probed depth
	 */
	if (!fio_option_is_set(&td->o, iodepth))
		td->o.iodepth = td->o.iodepth_low = max_depth;

	return true;
err:
	close(fd);
	fifo_free(fifo);
	return false;
}

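/*
 * Seed the per-cursor int field at offset 'off' from the option list
 * 'vals', or with 'def' if the list is empty. Returns the list length
 * if it is non-zero and does not match nr_logs, else 0.
 */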
static int init_merge_param_list(fio_fp64_t *vals, struct blktrace_cursor *bcs,
				 int nr_logs, int def, size_t off)
{
	int i = 0, len = 0;

	while (len < FIO_IO_U_LIST_MAX_LEN && vals[len].u.f != 0.0)
		len++;

	if (len && len != nr_logs)
		return len;

	for (i = 0; i < nr_logs; i++) {
		int *val = (int *)((char *)&bcs[i] + off);
		*val = def;
		if (len)
			*val = (int)vals[i].u.f;
	}

	return 0;
}

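/*
 * Return the index of the cursor whose pending trace has the earliest
 * timestamp.
 */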
static int find_earliest_io(struct blktrace_cursor *bcs, int nr_logs)
{
	__u64 time = ~(__u64)0;
	int idx = 0, i;

	for (i = 0; i < nr_logs; i++) {
		if (bcs[i].t.time < time) {
			time = bcs[i].t.time;
			idx = i;
		}
	}

	return idx;
}

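/*
 * A cursor has hit end of file: rewind it if more iterations were
 * requested, otherwise close it and keep the active cursors contiguous.
 */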
static void merge_finish_file(struct blktrace_cursor *bcs, int i, int *nr_logs)
{
	bcs[i].iter++;
	if (bcs[i].iter < bcs[i].nr_iter) {
		lseek(bcs[i].fd, 0, SEEK_SET);
		return;
	}

	*nr_logs -= 1;

	/* close file */
	fifo_free(bcs[i].fifo);
	close(bcs[i].fd);

	/* keep active files contiguous */
	memmove(&bcs[i], &bcs[*nr_logs], sizeof(bcs[i]));
}

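/*
 * Read the next queue trace into the cursor, skipping actions fio does
 * not replay, and rebase its timestamp by the iteration count and the
 * per-file time scalar.
 */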
static int read_trace(struct thread_data *td, struct blktrace_cursor *bc)
{
	int ret = 0;
	struct blk_io_trace *t = &bc->t;

read_skip:
	/* read an io trace */
	ret = trace_fifo_get(td, bc->fifo, bc->fd, t, sizeof(*t));
	if (ret < 0) {
		return ret;
	} else if (!ret) {
		if (!bc->length)
			bc->length = bc->t.time;
		return ret;
	} else if (ret < (int) sizeof(*t)) {
		log_err("fio: short fifo get\n");
		return -1;
	}

	if (bc->swap)
		byteswap_trace(t);

	/* skip over actions that fio does not care about */
	if ((t->action & 0xffff) != __BLK_TA_QUEUE ||
	    t_get_ddir(t) == DDIR_INVAL) {
		ret = discard_pdu(td, bc->fifo, bc->fd, t);
		if (ret < 0) {
			td_verror(td, ret, "blktrace lseek");
			return ret;
		} else if (t->pdu_len != ret) {
			log_err("fio: discarded %d of %d\n", ret,
				t->pdu_len);
			return -1;
		}
		goto read_skip;
	}

	t->time = (t->time + bc->iter * bc->length) * bc->scalar / 100;

	return ret;
}

static int write_trace(FILE *fp, struct blk_io_trace *t)
{
	/* pdu is not used so just write out only the io trace */
	t->pdu_len = 0;
	return fwrite((void *)t, sizeof(*t), 1, fp);
}

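/*
 * Merge the colon-separated blktrace files in read_iolog into one
 * time-ordered trace, then point the job at the merged file. An
 * illustrative job section (file names hypothetical):
 *
 *	[merged-replay]
 *	read_iolog=dev0.bin:dev1.bin
 *	merge_blktrace_file=merged.bin
 *	merge_blktrace_scalars=50:100
 *	merge_blktrace_iters=2:1
 */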
int merge_blktrace_iologs(struct thread_data *td)
{
	int nr_logs = get_max_str_idx(td->o.read_iolog_file);
	struct blktrace_cursor *bcs = malloc(sizeof(struct blktrace_cursor) *
					     nr_logs);
	struct blktrace_cursor *bc;
	FILE *merge_fp;
	char *str, *ptr, *name, *merge_buf;
	int i, ret;

	ret = init_merge_param_list(td->o.merge_blktrace_scalars, bcs, nr_logs,
				    100, offsetof(struct blktrace_cursor,
						  scalar));
	if (ret) {
		log_err("fio: merge_blktrace_scalars(%d) != nr_logs(%d)\n",
			ret, nr_logs);
		goto err_param;
	}

	ret = init_merge_param_list(td->o.merge_blktrace_iters, bcs, nr_logs,
				    1, offsetof(struct blktrace_cursor,
						nr_iter));
	if (ret) {
		log_err("fio: merge_blktrace_iters(%d) != nr_logs(%d)\n",
			ret, nr_logs);
		goto err_param;
	}

	/* setup output file */
	merge_fp = fopen(td->o.merge_blktrace_file, "w");
	if (!merge_fp) {
		log_err("fio: could not open merge output file: %s\n",
			td->o.merge_blktrace_file);
		ret = -errno;
		goto err_param;
	}
	merge_buf = malloc(128 * 1024);
	ret = setvbuf(merge_fp, merge_buf, _IOFBF, 128 * 1024);
	if (ret)
		goto err_out_file;

	/* setup input files */
	str = ptr = strdup(td->o.read_iolog_file);
	nr_logs = 0;
	for (i = 0; (name = get_next_str(&ptr)) != NULL; i++) {
		bcs[i].fd = open(name, O_RDONLY);
		if (bcs[i].fd < 0) {
			log_err("fio: could not open file: %s\n", name);
			ret = bcs[i].fd;
			goto err_file;
		}
		bcs[i].fifo = fifo_alloc(TRACE_FIFO_SIZE);
		nr_logs++;

		if (!is_blktrace(name, &bcs[i].swap)) {
			log_err("fio: file is not a blktrace: %s\n", name);
			ret = -EINVAL;
			goto err_file;
		}

		ret = read_trace(td, &bcs[i]);
		if (ret < 0) {
			goto err_file;
		} else if (!ret) {
			merge_finish_file(bcs, i, &nr_logs);
			i--;
		}
	}
	free(str);

	/* merge files */
	while (nr_logs) {
		i = find_earliest_io(bcs, nr_logs);
		bc = &bcs[i];
		/* skip over the pdu */
		ret = discard_pdu(td, bc->fifo, bc->fd, &bc->t);
		if (ret < 0) {
			td_verror(td, ret, "blktrace lseek");
			goto err_file;
		} else if (bc->t.pdu_len != ret) {
			log_err("fio: discarded %d of %d\n", ret,
				bc->t.pdu_len);
			goto err_file;
		}

		ret = write_trace(merge_fp, &bc->t);
		ret = read_trace(td, bc);
		if (ret < 0)
			goto err_file;
		else if (!ret)
			merge_finish_file(bcs, i, &nr_logs);
	}

	/* set iolog file to read from the newly merged file */
	td->o.read_iolog_file = td->o.merge_blktrace_file;
	ret = 0;

err_file:
	/* cleanup */
	for (i = 0; i < nr_logs; i++) {
		fifo_free(bcs[i].fifo);
		close(bcs[i].fd);
	}
err_out_file:
	fflush(merge_fp);
	fclose(merge_fp);
	free(merge_buf);
err_param:
	free(bcs);

	return ret;
}