t/zbd/test-zbd-support: Report a test summary when finished
[fio.git] / blktrace.c
1 /*
2  * blktrace support code for fio
3  */
4 #include <stdio.h>
5 #include <stdlib.h>
6 #include <sys/ioctl.h>
7 #include <unistd.h>
8 #include <linux/fs.h>
9
10 #include "flist.h"
11 #include "fio.h"
12 #include "blktrace.h"
13 #include "blktrace_api.h"
14 #include "oslib/linux-dev-lookup.h"
15
16 #define TRACE_FIFO_SIZE 8192
17
18 /*
19  * fifo refill frontend, to avoid reading data in trace sized bites
20  */
21 static int refill_fifo(struct thread_data *td, struct fifo *fifo, int fd)
22 {
23         char buf[TRACE_FIFO_SIZE];
24         unsigned int total;
25         int ret;
26
27         total = sizeof(buf);
28         if (total > fifo_room(fifo))
29                 total = fifo_room(fifo);
30
31         ret = read(fd, buf, total);
32         if (ret < 0) {
33                 td_verror(td, errno, "read blktrace file");
34                 return -1;
35         }
36
37         if (ret > 0)
38                 ret = fifo_put(fifo, buf, ret);
39
40         dprint(FD_BLKTRACE, "refill: filled %d bytes\n", ret);
41         return ret;
42 }
43
/*
 * Fetch 'len' bytes from the fifo, topping it up from the file first
 * when it does not hold enough.
 */
static int trace_fifo_get(struct thread_data *td, struct fifo *fifo, int fd,
			  void *buf, unsigned int len)
{
	int ret;

	if (fifo_len(fifo) < len) {
		ret = refill_fifo(td, fifo, fd);
		if (ret < 0)
			return ret;
	}

	return fifo_get(fifo, buf, len);
}
59
60 /*
61  * Just discard the pdu by seeking past it.
62  */
63 static int discard_pdu(struct thread_data *td, struct fifo *fifo, int fd,
64                        struct blk_io_trace *t)
65 {
66         if (t->pdu_len == 0)
67                 return 0;
68
69         dprint(FD_BLKTRACE, "discard pdu len %u\n", t->pdu_len);
70         return trace_fifo_get(td, fifo, fd, NULL, t->pdu_len);
71 }
72
73 /*
74  * Check if this is a blktrace binary data file. We read a single trace
75  * into memory and check for the magic signature.
76  */
77 bool is_blktrace(const char *filename, int *need_swap)
78 {
79         struct blk_io_trace t;
80         int fd, ret;
81
82         fd = open(filename, O_RDONLY);
83         if (fd < 0)
84                 return false;
85
86         ret = read(fd, &t, sizeof(t));
87         close(fd);
88
89         if (ret < 0) {
90                 perror("read blktrace");
91                 return false;
92         } else if (ret != sizeof(t)) {
93                 log_err("fio: short read on blktrace file\n");
94                 return false;
95         }
96
97         if ((t.magic & 0xffffff00) == BLK_IO_TRACE_MAGIC) {
98                 *need_swap = 0;
99                 return true;
100         }
101
102         /*
103          * Maybe it needs to be endian swapped...
104          */
105         t.magic = fio_swap32(t.magic);
106         if ((t.magic & 0xffffff00) == BLK_IO_TRACE_MAGIC) {
107                 *need_swap = 1;
108                 return true;
109         }
110
111         return false;
112 }
113
114 #define FMINORBITS      20
115 #define FMINORMASK      ((1U << FMINORBITS) - 1)
116 #define FMAJOR(dev)     ((unsigned int) ((dev) >> FMINORBITS))
117 #define FMINOR(dev)     ((unsigned int) ((dev) & FMINORMASK))
118
119 static void trace_add_open_close_event(struct thread_data *td, int fileno, enum file_log_act action)
120 {
121         struct io_piece *ipo;
122
123         ipo = calloc(1, sizeof(*ipo));
124         init_ipo(ipo);
125
126         ipo->ddir = DDIR_INVAL;
127         ipo->fileno = fileno;
128         ipo->file_action = action;
129         flist_add_tail(&ipo->list, &td->io_log_list);
130 }
131
132 static int get_dev_blocksize(const char *dev, unsigned int *bs)
133 {
134         int fd;
135
136         fd = open(dev, O_RDONLY);
137         if (fd < 0)
138                 return 1;
139
140         if (ioctl(fd, BLKSSZGET, bs) < 0) {
141                 close(fd);
142                 return 1;
143         }
144
145         close(fd);
146         return 0;
147 }
148
149 static int trace_add_file(struct thread_data *td, __u32 device,
150                           unsigned int *bs)
151 {
152         static unsigned int last_maj, last_min, last_fileno, last_bs;
153         unsigned int maj = FMAJOR(device);
154         unsigned int min = FMINOR(device);
155         struct fio_file *f;
156         unsigned int i;
157         char dev[256];
158
159         if (last_maj == maj && last_min == min) {
160                 *bs = last_bs;
161                 return last_fileno;
162         }
163
164         last_maj = maj;
165         last_min = min;
166
167         /*
168          * check for this file in our list
169          */
170         for_each_file(td, f, i) {
171                 if (f->major == maj && f->minor == min) {
172                         last_fileno = f->fileno;
173                         last_bs = f->bs;
174                         goto out;
175                 }
176         }
177
178         strcpy(dev, "/dev");
179         if (blktrace_lookup_device(td->o.replay_redirect, dev, maj, min)) {
180                 unsigned int this_bs;
181                 int fileno;
182
183                 if (td->o.replay_redirect)
184                         dprint(FD_BLKTRACE, "device lookup: %d/%d\n overridden"
185                                         " with: %s\n", maj, min,
186                                         td->o.replay_redirect);
187                 else
188                         dprint(FD_BLKTRACE, "device lookup: %d/%d\n", maj, min);
189
190                 dprint(FD_BLKTRACE, "add devices %s\n", dev);
191                 fileno = add_file_exclusive(td, dev);
192
193                 if (get_dev_blocksize(dev, &this_bs))
194                         this_bs = 512;
195
196                 td->o.open_files++;
197                 td->files[fileno]->major = maj;
198                 td->files[fileno]->minor = min;
199                 td->files[fileno]->bs = this_bs;
200                 trace_add_open_close_event(td, fileno, FIO_LOG_OPEN_FILE);
201
202                 last_fileno = fileno;
203                 last_bs = this_bs;
204         }
205
206 out:
207         *bs = last_bs;
208         return last_fileno;
209 }
210
211 static void t_bytes_align(struct thread_options *o, struct blk_io_trace *t)
212 {
213         if (!o->replay_align)
214                 return;
215
216         t->bytes = (t->bytes + o->replay_align - 1) & ~(o->replay_align - 1);
217 }
218
219 /*
220  * Store blk_io_trace data in an ipo for later retrieval.
221  */
222 static void store_ipo(struct thread_data *td, unsigned long long offset,
223                       unsigned int bytes, int rw, unsigned long long ttime,
224                       int fileno, unsigned int bs)
225 {
226         struct io_piece *ipo;
227
228         ipo = calloc(1, sizeof(*ipo));
229         init_ipo(ipo);
230
231         ipo->offset = offset * bs;
232         if (td->o.replay_scale)
233                 ipo->offset = ipo->offset / td->o.replay_scale;
234         ipo_bytes_align(td->o.replay_align, ipo);
235         ipo->len = bytes;
236         ipo->delay = ttime / 1000;
237         if (rw)
238                 ipo->ddir = DDIR_WRITE;
239         else
240                 ipo->ddir = DDIR_READ;
241         ipo->fileno = fileno;
242
243         dprint(FD_BLKTRACE, "store ddir=%d, off=%llu, len=%lu, delay=%lu\n",
244                                                         ipo->ddir, ipo->offset,
245                                                         ipo->len, ipo->delay);
246         queue_io_piece(td, ipo);
247 }
248
/*
 * Handle a notify trace (process/timestamp/message). These carry no IO;
 * they are only logged for debugging, and messages are ignored outright.
 */
static void handle_trace_notify(struct blk_io_trace *t)
{
	switch (t->action) {
	case BLK_TN_PROCESS:
		dprint(FD_BLKTRACE, "got process notify: %x, %d\n",
				t->action, t->pid);
		break;
	case BLK_TN_TIMESTAMP:
		dprint(FD_BLKTRACE, "got timestamp notify: %x, %d\n",
				t->action, t->pid);
		break;
	case BLK_TN_MESSAGE:
		/* intentionally ignored */
		break;
	default:
		dprint(FD_BLKTRACE, "unknown trace act %x\n", t->action);
		break;
	}
}
267
268 static void handle_trace_discard(struct thread_data *td,
269                                  struct blk_io_trace *t,
270                                  unsigned long long ttime,
271                                  unsigned long *ios, unsigned int *rw_bs)
272 {
273         struct io_piece *ipo;
274         unsigned int bs;
275         int fileno;
276
277         if (td->o.replay_skip & (1u << DDIR_TRIM))
278                 return;
279
280         ipo = calloc(1, sizeof(*ipo));
281         init_ipo(ipo);
282         fileno = trace_add_file(td, t->device, &bs);
283
284         ios[DDIR_TRIM]++;
285         if (t->bytes > rw_bs[DDIR_TRIM])
286                 rw_bs[DDIR_TRIM] = t->bytes;
287
288         td->o.size += t->bytes;
289
290         INIT_FLIST_HEAD(&ipo->list);
291
292         ipo->offset = t->sector * bs;
293         if (td->o.replay_scale)
294                 ipo->offset = ipo->offset / td->o.replay_scale;
295         ipo_bytes_align(td->o.replay_align, ipo);
296         ipo->len = t->bytes;
297         ipo->delay = ttime / 1000;
298         ipo->ddir = DDIR_TRIM;
299         ipo->fileno = fileno;
300
301         dprint(FD_BLKTRACE, "store discard, off=%llu, len=%lu, delay=%lu\n",
302                                                         ipo->offset, ipo->len,
303                                                         ipo->delay);
304         queue_io_piece(td, ipo);
305 }
306
/*
 * Report a trace entry with a zero byte count; the caller gates this
 * behind fio_did_warn() so it is only printed once per run.
 */
static void dump_trace(struct blk_io_trace *t)
{
	log_err("blktrace: ignoring zero byte trace: action=%x\n", t->action);
}
311
312 static void handle_trace_fs(struct thread_data *td, struct blk_io_trace *t,
313                             unsigned long long ttime, unsigned long *ios,
314                             unsigned int *rw_bs)
315 {
316         unsigned int bs;
317         int rw;
318         int fileno;
319
320         fileno = trace_add_file(td, t->device, &bs);
321
322         rw = (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0;
323
324         if (rw) {
325                 if (td->o.replay_skip & (1u << DDIR_WRITE))
326                         return;
327         } else {
328                 if (td->o.replay_skip & (1u << DDIR_READ))
329                         return;
330         }
331
332         if (!t->bytes) {
333                 if (!fio_did_warn(FIO_WARN_BTRACE_ZERO))
334                         dump_trace(t);
335                 return;
336         }
337
338         if (t->bytes > rw_bs[rw])
339                 rw_bs[rw] = t->bytes;
340
341         ios[rw]++;
342         td->o.size += t->bytes;
343         store_ipo(td, t->sector, t->bytes, rw, ttime, fileno, bs);
344 }
345
/*
 * Turn a flush trace into a DDIR_SYNC io_piece. Flushes carry no offset
 * or length, only a delay. Honors the replay_skip option for syncs.
 */
static void handle_trace_flush(struct thread_data *td, struct blk_io_trace *t,
			       unsigned long long ttime, unsigned long *ios)
{
	struct io_piece *ipo;
	unsigned int bs;
	int fileno;

	if (td->o.replay_skip & (1u << DDIR_SYNC))
		return;

	ipo = calloc(1, sizeof(*ipo));
	init_ipo(ipo);
	/* still resolve the device so the file gets added/opened */
	fileno = trace_add_file(td, t->device, &bs);

	ipo->delay = ttime / 1000;
	ipo->ddir = DDIR_SYNC;
	ipo->fileno = fileno;

	ios[DDIR_SYNC]++;
	dprint(FD_BLKTRACE, "store flush delay=%lu\n", ipo->delay);
	queue_io_piece(td, ipo);
}
368
/*
 * We only care for queue traces, most of the others are side effects
 * due to internal workings of the block layer.
 *
 * Computes the inter-io delay from the previous trace's timestamp
 * (optionally scaled by replay_time_scale, a percentage) and dispatches
 * to the notify/discard/flush/fs handler based on the action class bits.
 */
static void handle_trace(struct thread_data *td, struct blk_io_trace *t,
			 unsigned long *ios, unsigned int *bs)
{
	/* timestamp of the previous non-notify trace; persists across calls */
	static unsigned long long last_ttime;
	unsigned long long delay = 0;

	if ((t->action & 0xffff) != __BLK_TA_QUEUE)
		return;

	if (!(t->action & BLK_TC_ACT(BLK_TC_NOTIFY))) {
		/* first trace, or stalls disabled: replay with no delay */
		if (!last_ttime || td->o.no_stall)
			delay = 0;
		else if (td->o.replay_time_scale == 100)
			delay = t->time - last_ttime;
		else {
			/* scale the gap: replay_time_scale is a percentage */
			double tmp = t->time - last_ttime;
			double scale;

			scale = (double) 100.0 / (double) td->o.replay_time_scale;
			tmp *= scale;
			delay = tmp;
		}
		last_ttime = t->time;
	}

	t_bytes_align(&td->o, t);

	/* dispatch on the trace class bits */
	if (t->action & BLK_TC_ACT(BLK_TC_NOTIFY))
		handle_trace_notify(t);
	else if (t->action & BLK_TC_ACT(BLK_TC_DISCARD))
		handle_trace_discard(td, t, delay, ios, bs);
	else if (t->action & BLK_TC_ACT(BLK_TC_FLUSH))
		handle_trace_flush(td, t, delay, ios);
	else
		handle_trace_fs(td, t, delay, ios, bs);
}
409
/*
 * Byteswap every multi-byte field of a trace record in place, for
 * traces recorded on a host of the opposite endianness.
 */
static void byteswap_trace(struct blk_io_trace *t)
{
	t->magic = fio_swap32(t->magic);
	t->sequence = fio_swap32(t->sequence);
	t->time = fio_swap64(t->time);
	t->sector = fio_swap64(t->sector);
	t->bytes = fio_swap32(t->bytes);
	t->action = fio_swap32(t->action);
	t->pid = fio_swap32(t->pid);
	t->device = fio_swap32(t->device);
	t->cpu = fio_swap32(t->cpu);
	t->error = fio_swap16(t->error);
	t->pdu_len = fio_swap16(t->pdu_len);
}
424
425 static bool t_is_write(struct blk_io_trace *t)
426 {
427         return (t->action & BLK_TC_ACT(BLK_TC_WRITE | BLK_TC_DISCARD)) != 0;
428 }
429
430 static enum fio_ddir t_get_ddir(struct blk_io_trace *t)
431 {
432         if (t->action & BLK_TC_ACT(BLK_TC_READ))
433                 return DDIR_READ;
434         else if (t->action & BLK_TC_ACT(BLK_TC_WRITE))
435                 return DDIR_WRITE;
436         else if (t->action & BLK_TC_ACT(BLK_TC_DISCARD))
437                 return DDIR_TRIM;
438
439         return DDIR_INVAL;
440 }
441
442 static void depth_inc(struct blk_io_trace *t, int *depth)
443 {
444         enum fio_ddir ddir;
445
446         ddir = t_get_ddir(t);
447         if (ddir != DDIR_INVAL)
448                 depth[ddir]++;
449 }
450
451 static void depth_dec(struct blk_io_trace *t, int *depth)
452 {
453         enum fio_ddir ddir;
454
455         ddir = t_get_ddir(t);
456         if (ddir != DDIR_INVAL)
457                 depth[ddir]--;
458 }
459
460 static void depth_end(struct blk_io_trace *t, int *this_depth, int *depth)
461 {
462         enum fio_ddir ddir = DDIR_INVAL;
463
464         ddir = t_get_ddir(t);
465         if (ddir != DDIR_INVAL) {
466                 depth[ddir] = max(depth[ddir], this_depth[ddir]);
467                 this_depth[ddir] = 0;
468         }
469 }
470
471 /*
472  * Load a blktrace file by reading all the blk_io_trace entries, and storing
473  * them as io_pieces like the fio text version would do.
474  */
475 bool load_blktrace(struct thread_data *td, const char *filename, int need_swap)
476 {
477         struct blk_io_trace t;
478         unsigned long ios[DDIR_RWDIR_SYNC_CNT] = { };
479         unsigned int rw_bs[DDIR_RWDIR_CNT] = { };
480         unsigned long skipped_writes;
481         struct fifo *fifo;
482         int fd, i, old_state, max_depth;
483         struct fio_file *f;
484         int this_depth[DDIR_RWDIR_CNT] = { };
485         int depth[DDIR_RWDIR_CNT] = { };
486
487         fd = open(filename, O_RDONLY);
488         if (fd < 0) {
489                 td_verror(td, errno, "open blktrace file");
490                 return false;
491         }
492
493         fifo = fifo_alloc(TRACE_FIFO_SIZE);
494
495         old_state = td_bump_runstate(td, TD_SETTING_UP);
496
497         td->o.size = 0;
498         skipped_writes = 0;
499         do {
500                 int ret = trace_fifo_get(td, fifo, fd, &t, sizeof(t));
501
502                 if (ret < 0)
503                         goto err;
504                 else if (!ret)
505                         break;
506                 else if (ret < (int) sizeof(t)) {
507                         log_err("fio: short fifo get\n");
508                         break;
509                 }
510
511                 if (need_swap)
512                         byteswap_trace(&t);
513
514                 if ((t.magic & 0xffffff00) != BLK_IO_TRACE_MAGIC) {
515                         log_err("fio: bad magic in blktrace data: %x\n",
516                                                                 t.magic);
517                         goto err;
518                 }
519                 if ((t.magic & 0xff) != BLK_IO_TRACE_VERSION) {
520                         log_err("fio: bad blktrace version %d\n",
521                                                                 t.magic & 0xff);
522                         goto err;
523                 }
524                 ret = discard_pdu(td, fifo, fd, &t);
525                 if (ret < 0) {
526                         td_verror(td, ret, "blktrace lseek");
527                         goto err;
528                 } else if (t.pdu_len != ret) {
529                         log_err("fio: discarded %d of %d\n", ret, t.pdu_len);
530                         goto err;
531                 }
532                 if ((t.action & BLK_TC_ACT(BLK_TC_NOTIFY)) == 0) {
533                         if ((t.action & 0xffff) == __BLK_TA_QUEUE)
534                                 depth_inc(&t, this_depth);
535                         else if (((t.action & 0xffff) == __BLK_TA_BACKMERGE) ||
536                                 ((t.action & 0xffff) == __BLK_TA_FRONTMERGE))
537                                 depth_dec(&t, this_depth);
538                         else if ((t.action & 0xffff) == __BLK_TA_COMPLETE)
539                                 depth_end(&t, this_depth, depth);
540
541                         if (t_is_write(&t) && read_only) {
542                                 skipped_writes++;
543                                 continue;
544                         }
545                 }
546
547                 handle_trace(td, &t, ios, rw_bs);
548         } while (1);
549
550         for_each_file(td, f, i)
551                 trace_add_open_close_event(td, f->fileno, FIO_LOG_CLOSE_FILE);
552
553         fifo_free(fifo);
554         close(fd);
555
556         td_restore_runstate(td, old_state);
557
558         if (!td->files_index) {
559                 log_err("fio: did not find replay device(s)\n");
560                 return false;
561         }
562
563         /*
564          * For stacked devices, we don't always get a COMPLETE event so
565          * the depth grows to insane values. Limit it to something sane(r).
566          */
567         max_depth = 0;
568         for (i = 0; i < DDIR_RWDIR_CNT; i++) {
569                 if (depth[i] > 1024)
570                         depth[i] = 1024;
571                 else if (!depth[i] && ios[i])
572                         depth[i] = 1;
573                 max_depth = max(depth[i], max_depth);
574         }
575
576         if (skipped_writes)
577                 log_err("fio: %s skips replay of %lu writes due to read-only\n",
578                                                 td->o.name, skipped_writes);
579
580         if (!ios[DDIR_READ] && !ios[DDIR_WRITE] && !ios[DDIR_TRIM] &&
581             !ios[DDIR_SYNC]) {
582                 log_err("fio: found no ios in blktrace data\n");
583                 return false;
584         } else if (ios[DDIR_READ] && !ios[DDIR_WRITE]) {
585                 td->o.td_ddir = TD_DDIR_READ;
586                 td->o.max_bs[DDIR_READ] = rw_bs[DDIR_READ];
587         } else if (!ios[DDIR_READ] && ios[DDIR_WRITE]) {
588                 td->o.td_ddir = TD_DDIR_WRITE;
589                 td->o.max_bs[DDIR_WRITE] = rw_bs[DDIR_WRITE];
590         } else {
591                 td->o.td_ddir = TD_DDIR_RW;
592                 td->o.max_bs[DDIR_READ] = rw_bs[DDIR_READ];
593                 td->o.max_bs[DDIR_WRITE] = rw_bs[DDIR_WRITE];
594                 td->o.max_bs[DDIR_TRIM] = rw_bs[DDIR_TRIM];
595         }
596
597         /*
598          * We need to do direct/raw ios to the device, to avoid getting
599          * read-ahead in our way. But only do so if the minimum block size
600          * is a multiple of 4k, otherwise we don't know if it's safe to do so.
601          */
602         if (!fio_option_is_set(&td->o, odirect) && !(td_min_bs(td) & 4095))
603                 td->o.odirect = 1;
604
605         /*
606          * If depth wasn't manually set, use probed depth
607          */
608         if (!fio_option_is_set(&td->o, iodepth))
609                 td->o.iodepth = td->o.iodepth_low = max_depth;
610
611         return true;
612 err:
613         close(fd);
614         fifo_free(fifo);
615         return false;
616 }
617
618 static int init_merge_param_list(fio_fp64_t *vals, struct blktrace_cursor *bcs,
619                                  int nr_logs, int def, size_t off)
620 {
621         int i = 0, len = 0;
622
623         while (len < FIO_IO_U_LIST_MAX_LEN && vals[len].u.f != 0.0)
624                 len++;
625
626         if (len && len != nr_logs)
627                 return len;
628
629         for (i = 0; i < nr_logs; i++) {
630                 int *val = (int *)((char *)&bcs[i] + off);
631                 *val = def;
632                 if (len)
633                         *val = (int)vals[i].u.f;
634         }
635
636         return 0;
637
638 }
639
640 static int find_earliest_io(struct blktrace_cursor *bcs, int nr_logs)
641 {
642         __u64 time = ~(__u64)0;
643         int idx = 0, i;
644
645         for (i = 0; i < nr_logs; i++) {
646                 if (bcs[i].t.time < time) {
647                         time = bcs[i].t.time;
648                         idx = i;
649                 }
650         }
651
652         return idx;
653 }
654
/*
 * Called when a cursor hits EOF. If more iterations of this log were
 * requested, rewind and go again; otherwise close the file and compact
 * the active cursor array.
 */
static void merge_finish_file(struct blktrace_cursor *bcs, int i, int *nr_logs)
{
	bcs[i].iter++;
	if (bcs[i].iter < bcs[i].nr_iter) {
		/* replay the same log again from the start */
		lseek(bcs[i].fd, 0, SEEK_SET);
		return;
	}

	*nr_logs -= 1;

	/* close file */
	fifo_free(bcs[i].fifo);
	close(bcs[i].fd);

	/* keep active files contiguous */
	memmove(&bcs[i], &bcs[*nr_logs], sizeof(bcs[i]));
}
672
/*
 * Read the next queue trace fio cares about from a merge cursor,
 * skipping (and discarding the pdu of) anything else. Returns
 * sizeof(*t) on success, 0 on clean EOF, negative on error. The
 * returned trace time is offset per iteration and scaled by the
 * cursor's scalar (a percentage).
 */
static int read_trace(struct thread_data *td, struct blktrace_cursor *bc)
{
	int ret = 0;
	struct blk_io_trace *t = &bc->t;

read_skip:
	/* read an io trace */
	ret = trace_fifo_get(td, bc->fifo, bc->fd, t, sizeof(*t));
	if (ret < 0) {
		return ret;
	} else if (!ret) {
		/* EOF: latch the log's duration for iteration offsets */
		if (!bc->length)
			bc->length = bc->t.time;
		return ret;
	} else if (ret < (int) sizeof(*t)) {
		log_err("fio: short fifo get\n");
		return -1;
	}

	if (bc->swap)
		byteswap_trace(t);

	/* skip over actions that fio does not care about */
	if ((t->action & 0xffff) != __BLK_TA_QUEUE ||
	    t_get_ddir(t) == DDIR_INVAL) {
		ret = discard_pdu(td, bc->fifo, bc->fd, t);
		if (ret < 0) {
			td_verror(td, ret, "blktrace lseek");
			return ret;
		} else if (t->pdu_len != ret) {
			log_err("fio: discarded %d of %d\n", ret,
				t->pdu_len);
			return -1;
		}
		goto read_skip;
	}

	/* shift by completed iterations, then scale time by 'scalar' percent */
	t->time = (t->time + bc->iter * bc->length) * bc->scalar / 100;

	return ret;
}
714
715 static int write_trace(FILE *fp, struct blk_io_trace *t)
716 {
717         /* pdu is not used so just write out only the io trace */
718         t->pdu_len = 0;
719         return fwrite((void *)t, sizeof(*t), 1, fp);
720 }
721
722 int merge_blktrace_iologs(struct thread_data *td)
723 {
724         int nr_logs = get_max_str_idx(td->o.read_iolog_file);
725         struct blktrace_cursor *bcs = malloc(sizeof(struct blktrace_cursor) *
726                                              nr_logs);
727         struct blktrace_cursor *bc;
728         FILE *merge_fp;
729         char *str, *ptr, *name, *merge_buf;
730         int i, ret;
731
732         ret = init_merge_param_list(td->o.merge_blktrace_scalars, bcs, nr_logs,
733                                     100, offsetof(struct blktrace_cursor,
734                                                   scalar));
735         if (ret) {
736                 log_err("fio: merge_blktrace_scalars(%d) != nr_logs(%d)\n",
737                         ret, nr_logs);
738                 goto err_param;
739         }
740
741         ret = init_merge_param_list(td->o.merge_blktrace_iters, bcs, nr_logs,
742                                     1, offsetof(struct blktrace_cursor,
743                                                 nr_iter));
744         if (ret) {
745                 log_err("fio: merge_blktrace_iters(%d) != nr_logs(%d)\n",
746                         ret, nr_logs);
747                 goto err_param;
748         }
749
750         /* setup output file */
751         merge_fp = fopen(td->o.merge_blktrace_file, "w");
752         merge_buf = malloc(128 * 1024);
753         ret = setvbuf(merge_fp, merge_buf, _IOFBF, 128 * 1024);
754         if (ret)
755                 goto err_out_file;
756
757         /* setup input files */
758         str = ptr = strdup(td->o.read_iolog_file);
759         nr_logs = 0;
760         for (i = 0; (name = get_next_str(&ptr)) != NULL; i++) {
761                 bcs[i].fd = open(name, O_RDONLY);
762                 if (bcs[i].fd < 0) {
763                         log_err("fio: could not open file: %s\n", name);
764                         ret = bcs[i].fd;
765                         goto err_file;
766                 }
767                 bcs[i].fifo = fifo_alloc(TRACE_FIFO_SIZE);
768                 nr_logs++;
769
770                 if (!is_blktrace(name, &bcs[i].swap)) {
771                         log_err("fio: file is not a blktrace: %s\n", name);
772                         goto err_file;
773                 }
774
775                 ret = read_trace(td, &bcs[i]);
776                 if (ret < 0) {
777                         goto err_file;
778                 } else if (!ret) {
779                         merge_finish_file(bcs, i, &nr_logs);
780                         i--;
781                 }
782         }
783         free(str);
784
785         /* merge files */
786         while (nr_logs) {
787                 i = find_earliest_io(bcs, nr_logs);
788                 bc = &bcs[i];
789                 /* skip over the pdu */
790                 ret = discard_pdu(td, bc->fifo, bc->fd, &bc->t);
791                 if (ret < 0) {
792                         td_verror(td, ret, "blktrace lseek");
793                         goto err_file;
794                 } else if (bc->t.pdu_len != ret) {
795                         log_err("fio: discarded %d of %d\n", ret,
796                                 bc->t.pdu_len);
797                         goto err_file;
798                 }
799
800                 ret = write_trace(merge_fp, &bc->t);
801                 ret = read_trace(td, bc);
802                 if (ret < 0)
803                         goto err_file;
804                 else if (!ret)
805                         merge_finish_file(bcs, i, &nr_logs);
806         }
807
808         /* set iolog file to read from the newly merged file */
809         td->o.read_iolog_file = td->o.merge_blktrace_file;
810         ret = 0;
811
812 err_file:
813         /* cleanup */
814         for (i = 0; i < nr_logs; i++) {
815                 fifo_free(bcs[i].fifo);
816                 close(bcs[i].fd);
817         }
818 err_out_file:
819         fflush(merge_fp);
820         fclose(merge_fp);
821         free(merge_buf);
822 err_param:
823         free(bcs);
824
825         return ret;
826 }