static unsigned int rt_threshold = 1000000;
static unsigned int ios_threshold = 10;
+static unsigned int rate_threshold;
+static unsigned int set_rate;
+static unsigned int max_depth = 256;
static int output_ascii = 1;
+static char *filename;
struct bs {
unsigned int bs;
struct btrace_out {
unsigned long ios[DDIR_RWDIR_CNT];
- unsigned long rw_bs[DDIR_RWDIR_CNT];
unsigned long merges[DDIR_RWDIR_CNT];
uint64_t last_end[DDIR_RWDIR_CNT];
int inflight;
unsigned int depth;
- uint64_t first_ttime;
- uint64_t last_ttime;
+ int depth_disabled;
+ int complete_seen;
- struct trace_file *files;
- int nr_files;
- unsigned int last_major, last_minor;
+ uint64_t first_ttime[DDIR_RWDIR_CNT];
+ uint64_t last_ttime[DDIR_RWDIR_CNT];
+ uint64_t kb[DDIR_RWDIR_CNT];
uint64_t start_delay;
};
struct flist_head hash_list;
struct flist_head pid_list;
pid_t pid;
+
+ struct trace_file *files;
+ int nr_files;
+ unsigned int last_major, last_minor;
+
struct btrace_out o;
};
static struct flist_head pid_hash[PID_HASH_SIZE];
static FLIST_HEAD(pid_list);
-static FLIST_HEAD(inflight_list);
+#define INFLIGHT_HASH_BITS 8
+#define INFLIGHT_HASH_SIZE (1U << INFLIGHT_HASH_BITS)
+static struct flist_head inflight_hash[INFLIGHT_HASH_SIZE];
static uint64_t first_ttime = -1ULL;
static struct inflight *inflight_find(uint64_t sector)
{
+ struct flist_head *inflight_list;
struct flist_head *e;
- flist_for_each(e, &inflight_list) {
+ inflight_list = &inflight_hash[hash_long(sector, INFLIGHT_HASH_BITS)];
+
+ flist_for_each(e, inflight_list) {
struct inflight *i = flist_entry(e, struct inflight, list);
if (i->end_sector == sector)
free(i);
}
-static void inflight_merge(struct inflight *i, int rw, unsigned int size)
+static void __inflight_add(struct inflight *i)
{
- i->p->o.merges[rw]++;
- if (size)
- i->end_sector += (size >> 9);
+ struct flist_head *list;
+
+ list = &inflight_hash[hash_long(i->end_sector, INFLIGHT_HASH_BITS)];
+ flist_add_tail(&i->list, list);
}
static void inflight_add(struct btrace_pid *p, uint64_t sector, uint32_t len)
i = calloc(1, sizeof(*i));
i->p = p;
o->inflight++;
- o->depth = max((int) o->depth, o->inflight);
+ if (!o->depth_disabled) {
+ o->depth = max((int) o->depth, o->inflight);
+ if (o->depth >= max_depth && !o->complete_seen) {
+ o->depth_disabled = 1;
+ o->depth = max_depth;
+ }
+ }
i->end_sector = sector + (len >> 9);
- flist_add_tail(&i->list, &inflight_list);
+ __inflight_add(i);
+}
+
+/*
+ * Account a merge in direction 'rw' to the owner of inflight entry 'i'.
+ * A non-zero 'size' (bytes) extends the entry's end sector. The inflight
+ * hash is keyed on end_sector, so the entry is unlinked and re-inserted
+ * to land in the bucket matching its new end sector.
+ */
+static void inflight_merge(struct inflight *i, int rw, unsigned int size)
+{
+	struct btrace_out *owner = &i->p->o;
+
+	owner->merges[rw]++;
+
+	/* nothing grew, so the entry stays where it is */
+	if (!size)
+		return;
+
+	flist_del(&i->list);
+	i->end_sector += size >> 9;
+	__inflight_add(i);
}
/*
return trace_fifo_get(fifo, fd, NULL, t->pdu_len);
}
-static void handle_trace_notify(struct blk_io_trace *t)
+static int handle_trace_notify(struct blk_io_trace *t)
{
switch (t->action) {
case BLK_TN_PROCESS:
case BLK_TN_MESSAGE:
break;
default:
- fprintf(stderr, "unknown trace act %x\n", t->action);
- break;
+ log_err("unknown trace act %x\n", t->action);
+ return 1;
}
+
+ return 0;
}
static void __add_bs(struct btrace_out *o, unsigned int len, int rw)
#define FMAJOR(dev) ((unsigned int) ((dev) >> FMINORBITS))
#define FMINOR(dev) ((unsigned int) ((dev) & FMINORMASK))
-static void btrace_add_file(struct btrace_out *o, uint32_t devno)
+static int btrace_add_file(struct btrace_pid *p, uint32_t devno)
{
unsigned int maj = FMAJOR(devno);
unsigned int min = FMINOR(devno);
unsigned int i;
char dev[256];
- if (o->last_major == maj && o->last_minor == min)
- return;
+ if (filename)
+ return 0;
+ if (p->last_major == maj && p->last_minor == min)
+ return 0;
- o->last_major = maj;
- o->last_minor = min;
+ p->last_major = maj;
+ p->last_minor = min;
/*
* check for this file in our list
*/
- for (i = 0; i < o->nr_files; i++) {
- f = &o->files[i];
+ for (i = 0; i < p->nr_files; i++) {
+ f = &p->files[i];
if (f->major == maj && f->minor == min)
- return;
+ return 0;
}
strcpy(dev, "/dev");
if (!blktrace_lookup_device(NULL, dev, maj, min)) {
log_err("fio: failed to find device %u/%u\n", maj, min);
- return;
+ if (!output_ascii) {
+ log_err("fio: use -d to specify device\n");
+ return 1;
+ }
+ return 0;
}
- o->files = realloc(o->files, (o->nr_files + 1) * sizeof(*f));
- f = &o->files[o->nr_files];
+ p->files = realloc(p->files, (p->nr_files + 1) * sizeof(*f));
+ f = &p->files[p->nr_files];
f->name = strdup(dev);
f->major = maj;
f->minor = min;
- o->nr_files++;
+ p->nr_files++;
+ return 0;
}
-static void handle_trace_discard(struct blk_io_trace *t, struct btrace_out *o)
+static int t_to_rwdir(struct blk_io_trace *t)
{
- btrace_add_file(o, t->device);
+ if (t->action & BLK_TC_ACT(BLK_TC_DISCARD))
+ return DDIR_TRIM;
+
+ return (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0;
+}
- if (o->first_ttime == -1ULL)
- o->first_ttime = t->time;
+/*
+ * Account a queued discard to 'p': register the device via
+ * btrace_add_file(), stamp the first trim timestamp if it is still unset
+ * (-1ULL), bump the trim I/O count and record the request size via
+ * add_bs(). The trim slot is addressed with DDIR_TRIM throughout, the
+ * same index t_to_rwdir() yields for discards.
+ *
+ * Returns non-zero if btrace_add_file() fails, 0 otherwise.
+ */
+static int handle_trace_discard(struct blk_io_trace *t, struct btrace_pid *p)
+{
+	struct btrace_out *o = &p->o;
+
+	if (btrace_add_file(p, t->device))
+		return 1;
+
+	if (o->first_ttime[DDIR_TRIM] == -1ULL)
+		o->first_ttime[DDIR_TRIM] = t->time;
o->ios[DDIR_TRIM]++;
add_bs(o, t->bytes, DDIR_TRIM);
+	return 0;
}
-static void handle_trace_fs(struct blk_io_trace *t, struct btrace_out *o)
+static int handle_trace_fs(struct blk_io_trace *t, struct btrace_pid *p)
{
+ struct btrace_out *o = &p->o;
int rw;
- btrace_add_file(o, t->device);
+ if (btrace_add_file(p, t->device))
+ return 1;
first_ttime = min(first_ttime, (uint64_t) t->time);
- if (o->first_ttime == -1ULL)
- o->first_ttime = t->time;
-
rw = (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0;
+ if (o->first_ttime[rw] == -1ULL)
+ o->first_ttime[rw] = t->time;
+
add_bs(o, t->bytes, rw);
o->ios[rw]++;
o->seq[rw]++;
o->last_end[rw] = t->sector + (t->bytes >> 9);
+ return 0;
}
-static void handle_queue_trace(struct blk_io_trace *t, struct btrace_out *o)
+static int handle_queue_trace(struct blk_io_trace *t, struct btrace_pid *p)
{
if (t->action & BLK_TC_ACT(BLK_TC_NOTIFY))
- handle_trace_notify(t);
+ return handle_trace_notify(t);
else if (t->action & BLK_TC_ACT(BLK_TC_DISCARD))
- handle_trace_discard(t, o);
+ return handle_trace_discard(t, p);
else
- handle_trace_fs(t, o);
+ return handle_trace_fs(t, p);
}
-static void handle_trace(struct blk_io_trace *t, struct btrace_pid *p)
+static int handle_trace(struct blk_io_trace *t, struct btrace_pid *p)
{
unsigned int act = t->action & 0xffff;
+ int ret = 0;
if (act == __BLK_TA_QUEUE) {
inflight_add(p, t->sector, t->bytes);
- handle_queue_trace(t, &p->o);
+ ret = handle_queue_trace(t, p);
} else if (act == __BLK_TA_REQUEUE) {
p->o.inflight--;
} else if (act == __BLK_TA_BACKMERGE) {
inflight_remove(i);
i = inflight_find(t->sector);
- if (i) {
- int rw = (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0;
-
- inflight_merge(i, rw, t->bytes);
- }
+ if (i)
+ inflight_merge(i, t_to_rwdir(t), t->bytes);
} else if (act == __BLK_TA_FRONTMERGE) {
struct inflight *i;
inflight_remove(i);
i = inflight_find(t->sector);
- if (i) {
- int rw = (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0;
-
- inflight_merge(i, rw, 0);
- }
+ if (i)
+ inflight_merge(i, t_to_rwdir(t), 0);
} else if (act == __BLK_TA_COMPLETE) {
struct inflight *i;
i = inflight_find(t->sector + (t->bytes >> 9));
- if (i)
+ if (i) {
+ i->p->o.kb[t_to_rwdir(t)] += (t->bytes >> 10);
+ i->p->o.complete_seen = 1;
inflight_remove(i);
+ }
}
+
+ return ret;
}
static void byteswap_trace(struct blk_io_trace *t)
int i;
p = calloc(1, sizeof(*p));
- p->o.first_ttime = -1ULL;
- p->o.last_ttime = -1ULL;
- for (i = 0; i < DDIR_RWDIR_CNT; i++)
+ for (i = 0; i < DDIR_RWDIR_CNT; i++) {
+ p->o.first_ttime[i] = -1ULL;
+ p->o.last_ttime[i] = -1ULL;
p->o.last_end[i] = -1ULL;
+ }
p->pid = pid;
flist_add_tail(&p->hash_list, hash_list);
* Load a blktrace file by reading all the blk_io_trace entries, and storing
* them as io_pieces like the fio text version would do.
*/
-static int load_blktrace(const char *filename, int need_swap)
+static int load_blktrace(const char *fname, int need_swap)
{
struct btrace_pid *p;
unsigned long traces;
struct blk_io_trace t;
struct fifo *fifo;
- int fd;
+ int fd, ret = 0;
- fd = open(filename, O_RDONLY);
+ fd = open(fname, O_RDONLY);
if (fd < 0) {
perror("open trace file\n");
return 1;
traces = 0;
do {
- int ret = trace_fifo_get(fifo, fd, &t, sizeof(t));
-
+ ret = trace_fifo_get(fifo, fd, &t, sizeof(t));
if (ret < 0)
goto err;
else if (!ret)
break;
else if (ret < (int) sizeof(t)) {
- fprintf(stderr, "fio: short fifo get\n");
+ log_err("fio: short fifo get\n");
break;
}
byteswap_trace(&t);
if ((t.magic & 0xffffff00) != BLK_IO_TRACE_MAGIC) {
- fprintf(stderr, "fio: bad magic in blktrace data: %x\n",
- t.magic);
+ log_err("fio: bad magic in blktrace data: %x\n", t.magic);
goto err;
}
if ((t.magic & 0xff) != BLK_IO_TRACE_VERSION) {
- fprintf(stderr, "fio: bad blktrace version %d\n",
- t.magic & 0xff);
+ log_err("fio: bad blktrace version %d\n", t.magic & 0xff);
goto err;
}
ret = discard_pdu(fifo, fd, &t);
if (ret < 0) {
- fprintf(stderr, "blktrace lseek\n");
+ log_err("blktrace lseek\n");
goto err;
} else if (t.pdu_len != ret) {
- fprintf(stderr, "fio: discarded %d of %d\n", ret, t.pdu_len);
+ log_err("fio: discarded %d of %d\n", ret, t.pdu_len);
goto err;
}
p = pid_hash_get(t.pid);
- handle_trace(&t, p);
- p->o.last_ttime = t.time;
+ ret = handle_trace(&t, p);
+ if (ret)
+ break;
+ p->o.last_ttime[t_to_rwdir(&t)] = t.time;
traces++;
} while (1);
fifo_free(fifo);
close(fd);
+ if (ret)
+ return ret;
+
if (output_ascii)
printf("Traces loaded: %lu\n", traces);
return bsb->nr - bsa->nr;
}
+/*
+ * Average throughput of data direction 'rw', in KB/sec.
+ *
+ * The first/last timestamps are divided down from nanoseconds and
+ * o->kb[rw] counts KB, so the rate is kb * 1000 / elapsed-msec.
+ *
+ * Returns 0 when no meaningful rate exists: either no queue event was
+ * seen for this direction (first_ttime still holds the -1ULL sentinel)
+ * or the span is shorter than one millisecond. The latter case used to
+ * divide by zero for spans between 1 and 999 usec.
+ */
+static unsigned long o_to_kb_rate(struct btrace_out *o, int rw)
+{
+	uint64_t msec;
+
+	if (o->first_ttime[rw] == -1ULL)
+		return 0;
+
+	/* nsec -> msec in one step, so the divisor itself is what we test */
+	msec = (o->last_ttime[rw] - o->first_ttime[rw]) / 1000000ULL;
+	if (!msec)
+		return 0;
+
+	return (o->kb[rw] * 1000ULL) / msec;
+}
+
+/*
+ * Smallest of the three per-direction first-seen timestamps
+ * (first_ttime[0], [1] and [2]).
+ */
+static uint64_t o_first_ttime(struct btrace_out *o)
+{
+	uint64_t earliest = o->first_ttime[0];
+
+	if (o->first_ttime[1] < earliest)
+		earliest = o->first_ttime[1];
+	if (o->first_ttime[2] < earliest)
+		earliest = o->first_ttime[2];
+
+	return earliest;
+}
+
+/*
+ * Longest first-to-last timestamp span over all data directions, in the
+ * same units as the stored timestamps.
+ *
+ * A direction is skipped while either of its timestamps still holds the
+ * -1ULL "unset" sentinel. last_ttime is stamped for every trace of a
+ * direction, but first_ttime only on queue events, so a direction that
+ * saw e.g. only completions would otherwise yield last - (-1ULL), which
+ * wraps around to a bogus span.
+ */
+static uint64_t o_longest_ttime(struct btrace_out *o)
+{
+	uint64_t longest = 0;
+	int i;
+
+	for (i = 0; i < DDIR_RWDIR_CNT; i++) {
+		uint64_t span;
+
+		if (o->first_ttime[i] == -1ULL || o->last_ttime[i] == -1ULL)
+			continue;
+
+		span = o->last_ttime[i] - o->first_ttime[i];
+		if (span > longest)
+			longest = span;
+	}
+
+	return longest;
+}
+
static void __output_p_ascii(struct btrace_pid *p, unsigned long *ios)
{
const char *msg[] = { "reads", "writes", "trims" };
struct btrace_out *o = &p->o;
- unsigned long total;
+ unsigned long total, usec;
int i, j;
printf("[pid:\t%u]\n", p->pid);
perc = ((float) o->merges[i] * 100.0) / (float) total;
printf("\tmerges: %lu (perc=%3.2f%%)\n", o->merges[i], perc);
perc = ((float) o->seq[i] * 100.0) / (float) o->ios[i];
- printf("\tseq: %lu (perc=%3.2f%%)\n", o->seq[i], perc);
+ printf("\tseq: %lu (perc=%3.2f%%)\n", (unsigned long) o->seq[i], perc);
+ printf("\trate: %lu KB/sec\n", o_to_kb_rate(o, i));
for (j = 0; j < o->nr_bs[i]; j++) {
struct bs *bs = &o->bs[i][j];
}
printf("depth:\t%u\n", o->depth);
- printf("usec:\t%llu (delay=%llu)\n", (o->last_ttime - o->first_ttime) / 1000ULL, (unsigned long long) o->start_delay);
+ usec = o_longest_ttime(o) / 1000ULL;
+ printf("usec:\t%lu (delay=%llu)\n", usec, (unsigned long long) o->start_delay);
printf("files:\t");
- for (i = 0; i < o->nr_files; i++)
- printf("%s,", o->files[i].name);
+ for (i = 0; i < p->nr_files; i++)
+ printf("%s,", p->files[i].name);
printf("\n");
printf("\n");
log_err("fio: trace has both read/write and trim\n");
return 1;
}
+ if (!p->nr_files) {
+ log_err("fio: no devices found\n");
+ return 1;
+ }
printf("[pid%u]\n", p->pid);
printf("direct=1\n");
printf("rwmixread=%u\n", (int) (perc + 0.99));
}
- printf("percentage_sequential=");
+ printf("percentage_random=");
for (i = 0; i < DDIR_RWDIR_CNT; i++) {
if (o->seq[i] && o->ios[i]) {
perc = ((float) o->seq[i] * 100.0) / (float) o->ios[i];
if (i)
printf(",");
+ perc = 100.0 - perc;
printf("%u", (int) perc);
}
printf("\n");
printf("filename=");
- for (i = 0; i < o->nr_files; i++) {
+ for (i = 0; i < p->nr_files; i++) {
if (i)
printf(":");
- printf("%s", o->files[i].name);
+ printf("%s", p->files[i].name);
}
printf("\n");
printf("startdelay=%llus\n", o->start_delay / 1000000ULL);
- time = o->last_ttime - o->first_ttime;
+ time = o_longest_ttime(o);
time = (time + 1000000000ULL - 1) / 1000000000ULL;
printf("runtime=%llus\n", time);
printf("%u/%u", bs->bs, (int) perc);
}
}
- printf("\n\n");
+ printf("\n");
+
+ if (set_rate) {
+ printf("rate=");
+ for (i = 0; i < DDIR_RWDIR_CNT; i++) {
+ unsigned long rate;
+
+ rate = o_to_kb_rate(o, i);
+ if (i)
+ printf(",");
+ if (rate)
+ printf("%luk", rate);
+ }
+ printf("\n");
+ }
+ printf("\n");
return 0;
}
qsort(o->bs[i], o->nr_bs[i], sizeof(struct bs), bs_cmp);
}
+ if (filename) {
+ p->files = malloc(sizeof(struct trace_file));
+ p->nr_files++;
+ p->files[0].name = filename;
+ }
+
if (output_ascii)
__output_p_ascii(p, ios);
else
return ret;
}
+/*
+ * Drop data direction 'rw' from the entry by zeroing its I/O count.
+ */
+static void remove_ddir(struct btrace_out *o, int rw)
+{
+	unsigned long *count = &o->ios[rw];
+
+	*count = 0;
+}
+
static int prune_entry(struct btrace_out *o)
{
+ unsigned long rate;
uint64_t time;
+ int i;
if (ddir_rw_sum(o->ios) < ios_threshold)
return 1;
- time = (o->last_ttime - o->first_ttime) / 1000ULL;
+ time = o_longest_ttime(o) / 1000ULL;
if (time < rt_threshold)
return 1;
+ rate = 0;
+ for (i = 0; i < DDIR_RWDIR_CNT; i++) {
+ unsigned long this_rate;
+
+ this_rate = o_to_kb_rate(o, i);
+ if (this_rate < rate_threshold) {
+ remove_ddir(o, i);
+ this_rate = 0;
+ }
+ rate += this_rate;
+ }
+
+ if (rate < rate_threshold)
+ return 1;
+
return 0;
}
return ddir_rw_sum(pb->o.ios) - ddir_rw_sum(pa->o.ios);
}
+/*
+ * Unlink 'p' from the pid list and the pid hash, then release what it
+ * owns: the per-direction block size arrays, the file names, the file
+ * array and finally 'p' itself.
+ */
+static void free_p(struct btrace_pid *p)
+{
+	int idx;
+
+	flist_del(&p->pid_list);
+	flist_del(&p->hash_list);
+
+	for (idx = 0; idx < DDIR_RWDIR_CNT; idx++)
+		free(p->o.bs[idx]);
+
+	for (idx = 0; idx < p->nr_files; idx++) {
+		char *name = p->files[idx].name;
+
+		/* the global 'filename' string may be shared; never free it here */
+		if (name == filename)
+			continue;
+		free(name);
+	}
+
+	free(p->files);
+	free(p);
+}
+
static int output_p(void)
{
unsigned long ios[DDIR_RWDIR_CNT];
struct flist_head *e, *tmp;
+ int depth_disabled = 0;
int ret = 0;
flist_for_each_safe(e, tmp, &pid_list) {
p = flist_entry(e, struct btrace_pid, pid_list);
if (prune_entry(&p->o)) {
- flist_del(&p->pid_list);
- flist_del(&p->hash_list);
- free(p);
+ free_p(p);
continue;
}
- p->o.start_delay = (p->o.first_ttime / 1000ULL) - first_ttime;
+ p->o.start_delay = (o_first_ttime(&p->o) / 1000ULL) - first_ttime;
+ depth_disabled += p->o.depth_disabled;
}
+ if (depth_disabled)
+ log_err("fio: missing completion traces, depths capped at %u\n", max_depth);
+
memset(ios, 0, sizeof(ios));
flist_sort(NULL, &pid_list, entry_cmp);
p = flist_entry(e, struct btrace_pid, pid_list);
ret |= __output_p(p, ios);
+ if (ret && !output_ascii)
+ break;
}
if (output_ascii)
static int usage(char *argv[])
{
- fprintf(stderr, "%s: <blktrace bin file>\n", argv[0]);
- fprintf(stderr, "\t-t\tUsec threshold to ignore task\n");
- fprintf(stderr, "\t-n\tNumber IOS threshold to ignore task\n");
- fprintf(stderr, "\t-f\tFio job file output\n");
+ log_err("%s: <blktrace bin file>\n", argv[0]);
+ log_err("\t-t\tUsec threshold to ignore task\n");
+ log_err("\t-n\tNumber IOS threshold to ignore task\n");
+ log_err("\t-f\tFio job file output\n");
+ log_err("\t-d\tUse this file/device for replay\n");
+ log_err("\t-r\tIgnore jobs with less than this KB/sec rate\n");
+ log_err("\t-R\tSet rate in fio job\n");
+ log_err("\t-D\tCap queue depth at this value (def=%u)\n", max_depth);
return 1;
}
-int main(int argc, char *argv[])
+static int trace_needs_swap(const char *trace_file, int *swap)
{
- int fd, ret, need_swap = -1;
struct blk_io_trace t;
- int i, c;
-
- if (argc < 2)
- return usage(argv);
+ int fd, ret;
- while ((c = getopt(argc, argv, "t:n:f")) != -1) {
- switch (c) {
- case 't':
- rt_threshold = atoi(optarg);
- break;
- case 'n':
- ios_threshold = atoi(optarg);
- break;
- case 'f':
- output_ascii = 0;
- break;
- case '?':
- default:
- return usage(argv);
- }
- }
-
- if (argc == optind)
- return usage(argv);
-
- fd = open(argv[optind], O_RDONLY);
+ *swap = -1;
+
+ fd = open(trace_file, O_RDONLY);
if (fd < 0) {
perror("open");
return 1;
ret = read(fd, &t, sizeof(t));
if (ret < 0) {
+ close(fd);
perror("read");
return 1;
} else if (ret != sizeof(t)) {
- fprintf(stderr, "fio: short read on trace file\n");
+ close(fd);
+ log_err("fio: short read on trace file\n");
return 1;
}
close(fd);
if ((t.magic & 0xffffff00) == BLK_IO_TRACE_MAGIC)
- need_swap = 0;
+ *swap = 0;
else {
/*
* Maybe it needs to be endian swapped...
*/
t.magic = fio_swap32(t.magic);
if ((t.magic & 0xffffff00) == BLK_IO_TRACE_MAGIC)
- need_swap = 1;
+ *swap = 1;
}
- if (need_swap == -1) {
- fprintf(stderr, "fio: blktrace appears corrupt\n");
+ if (*swap == -1) {
+ log_err("fio: blktrace appears corrupt\n");
return 1;
}
+ return 0;
+}
+
+int main(int argc, char *argv[])
+{
+ int need_swap, i, c;
+
+ if (argc < 2)
+ return usage(argv);
+
+ while ((c = getopt(argc, argv, "t:n:fd:r:RD:")) != -1) {
+ switch (c) {
+ case 'R':
+ set_rate = 1;
+ break;
+ case 'r':
+ rate_threshold = atoi(optarg);
+ break;
+ case 't':
+ rt_threshold = atoi(optarg);
+ break;
+ case 'n':
+ ios_threshold = atoi(optarg);
+ break;
+ case 'f':
+ output_ascii = 0;
+ break;
+ case 'd':
+ filename = strdup(optarg);
+ break;
+ case 'D':
+ max_depth = atoi(optarg);
+ break;
+ case '?':
+ default:
+ return usage(argv);
+ }
+ }
+
+ if (argc == optind)
+ return usage(argv);
+
+ if (trace_needs_swap(argv[optind], &need_swap))
+ return 1;
+
for (i = 0; i < PID_HASH_SIZE; i++)
INIT_FLIST_HEAD(&pid_hash[i]);
+ for (i = 0; i < INFLIGHT_HASH_SIZE; i++)
+ INIT_FLIST_HEAD(&inflight_hash[i]);
load_blktrace(argv[optind], need_swap);
first_ttime /= 1000ULL;