Collect a block-wise histogram of trim and write errors
authorDan Ehrenberg <dehrenberg@chromium.org>
Tue, 14 Apr 2015 22:58:16 +0000 (15:58 -0700)
committerJens Axboe <axboe@fb.com>
Wed, 15 Apr 2015 00:16:20 +0000 (18:16 -0600)
This patch collects information on how many trims or writes each trim
block sized unit can undergo before its first error. It also records
the way in which an error occurred, if one did.

Output is provided with a histogram which can be output either in
normal or json output. Terse output is omitted, since it seemed like
that would require a version rev bump.

Signed-off-by: Dan Ehrenberg <dehrenberg@chromium.org>
Signed-off-by: Jens Axboe <axboe@fb.com>
12 files changed:
HOWTO
client.c
filesetup.c
fio.1
init.c
io_u.c
iolog.c
options.c
server.c
stat.c
stat.h
thread_options.h

diff --git a/HOWTO b/HOWTO
index 3f258db..6c7abcc 100644 (file)
--- a/HOWTO
+++ b/HOWTO
@@ -1468,6 +1468,11 @@ log_store_compressed=bool        If set, and log_compression is also set,
                command line parameter. The files will be stored with a
                .fz suffix.
 
+block_error_percentiles=bool   If set, record errors in trim block-sized
+               units from writes and trims and output a histogram of
+               how many trims it took to get to errors, and what kind
+               of error was encountered.
+
 lockmem=int    Pin down the specified amount of memory with mlock(2). Can
                potentially be used instead of removing memory or booting
                with less memory to simulate a smaller amount of memory.
@@ -1507,13 +1512,13 @@ clat_percentiles=bool Enable the reporting of percentiles of
                 completion latencies.
 
 percentile_list=float_list Overwrite the default list of percentiles
-               for completion latencies. Each number is a floating
-               number in the range (0,100], and the maximum length of
-               the list is 20. Use ':' to separate the numbers, and
-               list the numbers in ascending order. For example,
-               --percentile_list=99.5:99.9 will cause fio to report
-               the values of completion latency below which 99.5% and
-               99.9% of the observed latencies fell, respectively.
+               for completion latencies and the block error histogram.
+               Each number is a floating number in the range (0,100],
+               and the maximum length of the list is 20. Use ':'
+               to separate the numbers, and list the numbers in ascending
+               order. For example, --percentile_list=99.5:99.9 will cause
+               fio to report the values of completion latency below which
+               99.5% and 99.9% of the observed latencies fell, respectively.
 
 clocksource=str        Use the given clocksource as the base of timing. The
                supported options are:
index 760ec85..af3407c 100644 (file)
--- a/client.c
+++ b/client.c
@@ -891,6 +891,10 @@ static void convert_ts(struct thread_stat *dst, struct thread_stat *src)
        dst->latency_target     = le64_to_cpu(src->latency_target);
        dst->latency_window     = le64_to_cpu(src->latency_window);
        dst->latency_percentile.u.f = fio_uint64_to_double(le64_to_cpu(src->latency_percentile.u.i));
+
+       dst->nr_block_infos     = le64_to_cpu(src->nr_block_infos);
+       for (i = 0; i < dst->nr_block_infos; i++)
+               dst->block_infos[i] = le32_to_cpu(src->block_infos[i]);
 }
 
 static void convert_gs(struct group_run_stats *dst, struct group_run_stats *src)
index 79e5df1..10fc415 100644 (file)
@@ -880,6 +880,27 @@ int setup_files(struct thread_data *td)
                }
        }
 
+       if (td->o.block_error_hist) {
+               int len;
+
+               assert(td->o.nr_files == 1);    /* checked in fixup_options */
+               f = td->files[0];
+               len = f->io_size / td->o.bs[DDIR_TRIM];
+               if (len > MAX_NR_BLOCK_INFOS || len <= 0) {
+                       log_err("fio: cannot calculate block histogram with "
+                               "%d trim blocks, maximum %d\n",
+                               len, MAX_NR_BLOCK_INFOS);
+                       td_verror(td, EINVAL, "block_error_hist");
+                       goto err_out;
+               }
+
+               td->ts.nr_block_infos = len;
+               for (int i = 0; i < len; i++)
+                       td->ts.block_infos[i] =
+                               BLOCK_INFO(0, BLOCK_STATE_UNINIT);
+       } else
+               td->ts.nr_block_infos = 0;
+
        if (!o->size || (total_size && o->size > total_size))
                o->size = total_size;
 
diff --git a/fio.1 b/fio.1
index 0164f42..0f66b3a 100644 (file)
--- a/fio.1
+++ b/fio.1
@@ -1292,6 +1292,11 @@ a compressed format. They can be decompressed with fio, using the
 \fB\-\-inflate-log\fR command line parameter. The files will be stored with a
 \fB\.fz\fR suffix.
 .TP
+.BI block_error_percentiles \fR=\fPbool
+If set, record errors in trim block-sized units from writes and trims and output
+a histogram of how many trims it took to get to errors, and what kind of error
+was encountered.
+.TP
 .BI disable_lat \fR=\fPbool
 Disable measurements of total latency numbers. Useful only for cutting
 back the number of calls to \fBgettimeofday\fR\|(2), as that does impact performance at
@@ -1450,9 +1455,9 @@ exceeded before retrying operations
 Enable the reporting of percentiles of completion latencies.
 .TP
 .BI percentile_list \fR=\fPfloat_list
-Overwrite the default list of percentiles for completion
-latencies. Each number is a floating number in the range (0,100], and
-the maximum length of the list is 20. Use ':' to separate the
+Overwrite the default list of percentiles for completion latencies and the
+block error histogram. Each number is a floating number in the range (0,100],
+and the maximum length of the list is 20. Use ':' to separate the
 numbers. For example, \-\-percentile_list=99.5:99.9 will cause fio to
 report the values of completion latency below which 99.5% and 99.9% of
 the observed latencies fell, respectively.
diff --git a/init.c b/init.c
index 95e26b8..a126f79 100644 (file)
--- a/init.c
+++ b/init.c
@@ -767,6 +767,13 @@ static int fixup_options(struct thread_data *td)
        if (!td->loops)
                td->loops = 1;
 
+       if (td->o.block_error_hist && td->o.nr_files != 1) {
+               log_err("fio: block error histogram only available with "
+                       "with a single file per job, but %d files "
+                       "provided\n", td->o.nr_files);
+               ret = 1;
+       }
+
        return ret;
 }
 
diff --git a/io_u.c b/io_u.c
index aecc217..b3e72e9 100644 (file)
--- a/io_u.c
+++ b/io_u.c
@@ -1606,6 +1606,19 @@ static void account_io_completion(struct thread_data *td, struct io_u *io_u,
 
        if (!gtod_reduce(td))
                add_iops_sample(td, idx, bytes, &icd->time);
+
+       if (td->ts.nr_block_infos && io_u->ddir == DDIR_TRIM) {
+               uint32_t *info = io_u_block_info(td, io_u);
+               if (BLOCK_INFO_STATE(*info) < BLOCK_STATE_TRIM_FAILURE) {
+                       if (io_u->ddir == DDIR_TRIM) {
+                               *info = BLOCK_INFO(BLOCK_STATE_TRIMMED,
+                                               BLOCK_INFO_TRIMS(*info) + 1);
+                       } else if (io_u->ddir == DDIR_WRITE) {
+                               *info = BLOCK_INFO_SET_STATE(BLOCK_STATE_WRITTEN,
+                                                               *info);
+                       }
+               }
+       }
 }
 
 static long long usec_for_io(struct thread_data *td, enum fio_ddir ddir)
diff --git a/iolog.c b/iolog.c
index dfa329f..b29684a 100644 (file)
--- a/iolog.c
+++ b/iolog.c
@@ -291,6 +291,18 @@ void unlog_io_piece(struct thread_data *td, struct io_u *io_u)
 {
        struct io_piece *ipo = io_u->ipo;
 
+       if (td->ts.nr_block_infos) {
+               uint32_t *info = io_u_block_info(td, io_u);
+               if (BLOCK_INFO_STATE(*info) < BLOCK_STATE_TRIM_FAILURE) {
+                       if (io_u->ddir == DDIR_TRIM)
+                               *info = BLOCK_INFO_SET_STATE(*info,
+                                               BLOCK_STATE_TRIM_FAILURE);
+                       else if (io_u->ddir == DDIR_WRITE)
+                               *info = BLOCK_INFO_SET_STATE(*info,
+                                               BLOCK_STATE_WRITE_FAILURE);
+               }
+       }
+
        if (!ipo)
                return;
 
index d7ce94b..e0b7ff2 100644 (file)
--- a/options.c
+++ b/options.c
@@ -3196,6 +3196,16 @@ struct fio_option fio_options[FIO_MAX_OPTS] = {
                .group  = FIO_OPT_G_INVALID,
        },
 #endif
+       {
+               .name   = "block_error_percentiles",
+               .lname  = "Block error percentiles",
+               .type   = FIO_OPT_BOOL,
+               .off1   = td_var_offset(block_error_hist),
+               .help   = "Record trim block errors and make a histogram",
+               .def    = "0",
+               .category = FIO_OPT_C_LOG,
+               .group  = FIO_OPT_G_INVALID,
+       },
        {
                .name   = "bwavgtime",
                .lname  = "Bandwidth average time",
@@ -3320,11 +3330,12 @@ struct fio_option fio_options[FIO_MAX_OPTS] = {
        },
        {
                .name   = "percentile_list",
-               .lname  = "Completion latency percentile list",
+               .lname  = "Percentile list",
                .type   = FIO_OPT_FLOAT_LIST,
                .off1   = td_var_offset(percentile_list),
                .off2   = td_var_offset(percentile_precision),
-               .help   = "Specify a custom list of percentiles to report",
+               .help   = "Specify a custom list of percentiles to report for "
+                         "completion latency and block errors",
                .def    = "1:5:10:20:30:40:50:60:70:80:90:95:99:99.5:99.9:99.95:99.99",
                .maxlen = FIO_IO_U_LIST_MAX_LEN,
                .minfp  = 0.0,
index c249849..93c0987 100644 (file)
--- a/server.c
+++ b/server.c
@@ -1158,6 +1158,10 @@ void fio_server_send_ts(struct thread_stat *ts, struct group_run_stats *rs)
        p.ts.latency_window     = cpu_to_le64(ts->latency_window);
        p.ts.latency_percentile.u.i = cpu_to_le64(fio_double_to_uint64(ts->latency_percentile.u.f));
 
+       p.ts.nr_block_infos     = le64_to_cpu(ts->nr_block_infos);
+       for (i = 0; i < p.ts.nr_block_infos; i++)
+               p.ts.block_infos[i] = le32_to_cpu(ts->block_infos[i]);
+
        convert_gs(&p.rs, rs);
 
        fio_net_send_cmd(server_fd, FIO_NET_CMD_TS, &p, sizeof(p), NULL, NULL);
diff --git a/stat.c b/stat.c
index 252b2dc..34e3792 100644 (file)
--- a/stat.c
+++ b/stat.c
@@ -497,6 +497,147 @@ static void show_latencies(struct thread_stat *ts)
        show_lat_m(io_u_lat_m);
 }
 
+static int block_state_category(int block_state)
+{
+       switch (block_state) {
+       case BLOCK_STATE_UNINIT:
+               return 0;
+       case BLOCK_STATE_TRIMMED:
+       case BLOCK_STATE_WRITTEN:
+               return 1;
+       case BLOCK_STATE_WRITE_FAILURE:
+       case BLOCK_STATE_TRIM_FAILURE:
+               return 2;
+       default:
+               assert(0);
+       }
+}
+
+static int compare_block_infos(const void *bs1, const void *bs2)
+{
+       uint32_t block1 = *(uint32_t *)bs1;
+       uint32_t block2 = *(uint32_t *)bs2;
+       int state1 = BLOCK_INFO_STATE(block1);
+       int state2 = BLOCK_INFO_STATE(block2);
+       int bscat1 = block_state_category(state1);
+       int bscat2 = block_state_category(state2);
+       int cycles1 = BLOCK_INFO_TRIMS(block1);
+       int cycles2 = BLOCK_INFO_TRIMS(block2);
+
+       if (bscat1 < bscat2)
+               return -1;
+       if (bscat1 > bscat2)
+               return 1;
+
+       if (cycles1 < cycles2)
+               return -1;
+       if (cycles1 > cycles2)
+               return 1;
+
+       if (state1 < state2)
+               return -1;
+       if (state1 > state2)
+               return 1;
+
+       assert(block1 == block2);
+       return 0;
+}
+
+static int calc_block_percentiles(int nr_block_infos, uint32_t *block_infos,
+                                 fio_fp64_t *plist, unsigned int **percentiles,
+                                 unsigned int *types)
+{
+       int len = 0;
+       int i, nr_uninit;
+
+       qsort(block_infos, nr_block_infos, sizeof(uint32_t), compare_block_infos);
+
+       while (len < FIO_IO_U_LIST_MAX_LEN && plist[len].u.f != 0.0)
+               len++;
+
+       if (!len)
+               return 0;
+
+       /*
+        * Sort the percentile list. Note that it may already be sorted if
+        * we are using the default values, but since it's a short list this
+        * isn't a worry. Also note that this does not work for NaN values.
+        */
+       if (len > 1)
+               qsort((void *)plist, len, sizeof(plist[0]), double_cmp);
+
+       nr_uninit = 0;
+       /* Start only after the uninit entries end */
+       for (nr_uninit = 0;
+            nr_uninit < nr_block_infos
+               && BLOCK_INFO_STATE(block_infos[nr_uninit]) == BLOCK_STATE_UNINIT;
+            nr_uninit ++)
+               ;
+
+       if (nr_uninit == nr_block_infos)
+               return 0;
+
+       *percentiles = calloc(len, sizeof(**percentiles));
+
+       for (i = 0; i < len; i++) {
+               int idx = (plist[i].u.f * (nr_block_infos - nr_uninit) / 100)
+                               + nr_uninit;
+               (*percentiles)[i] = BLOCK_INFO_TRIMS(block_infos[idx]);
+       }
+
+       memset(types, 0, sizeof(*types) * BLOCK_STATE_COUNT);
+       for (i = 0; i < nr_block_infos; i++)
+               types[BLOCK_INFO_STATE(block_infos[i])]++;
+
+       return len;
+}
+
+static const char *block_state_names[] = {
+       [BLOCK_STATE_UNINIT] = "unwritten",
+       [BLOCK_STATE_TRIMMED] = "trimmed",
+       [BLOCK_STATE_WRITTEN] = "written",
+       [BLOCK_STATE_TRIM_FAILURE] = "trim failure",
+       [BLOCK_STATE_WRITE_FAILURE] = "write failure",
+};
+
+static void show_block_infos(int nr_block_infos, uint32_t *block_infos,
+                            fio_fp64_t *plist)
+{
+       int len, pos, i;
+       unsigned int *percentiles = NULL;
+       unsigned int block_state_counts[BLOCK_STATE_COUNT];
+
+       len = calc_block_percentiles(nr_block_infos, block_infos, plist,
+                                    &percentiles, block_state_counts);
+
+       log_info("  block lifetime percentiles :\n   |");
+       pos = 0;
+       for (i = 0; i < len; i++) {
+               uint32_t block_info = percentiles[i];
+#define LINE_LENGTH    75
+               char str[LINE_LENGTH];
+               int strln = snprintf(str, LINE_LENGTH, " %3.2fth=%u%c",
+                                    plist[i].u.f, block_info,
+                                    i == len - 1 ? '\n' : ',');
+               assert(strln < LINE_LENGTH);
+               if (pos + strln > LINE_LENGTH) {
+                       pos = 0;
+                       log_info("\n   |");
+               }
+               log_info("%s", str);
+               pos += strln;
+#undef LINE_LENGTH
+       }
+       if (percentiles)
+               free(percentiles);
+
+       log_info("        states               :");
+       for (i = 0; i < BLOCK_STATE_COUNT; i++)
+               log_info(" %s=%u%c",
+                        block_state_names[i], block_state_counts[i],
+                        i == BLOCK_STATE_COUNT - 1 ? '\n' : ',');
+}
+
 static void show_thread_status_normal(struct thread_stat *ts,
                                      struct group_run_stats *rs)
 {
@@ -596,6 +737,10 @@ static void show_thread_status_normal(struct thread_stat *ts,
                                        ts->latency_percentile.u.f,
                                        ts->latency_depth);
        }
+
+       if (ts->nr_block_infos)
+               show_block_infos(ts->nr_block_infos, ts->block_infos,
+                                 ts->percentile_list);
 }
 
 static void show_ddir_status_terse(struct thread_stat *ts,
@@ -998,6 +1143,45 @@ static struct json_object *show_thread_status_json(struct thread_stat *ts,
        if (strlen(ts->description))
                json_object_add_value_string(root, "desc", ts->description);
 
+       if (ts->nr_block_infos) {
+               /* Block error histogram and types */
+               int len;
+               unsigned int *percentiles = NULL;
+               unsigned int block_state_counts[BLOCK_STATE_COUNT];
+
+               len = calc_block_percentiles(ts->nr_block_infos, ts->block_infos,
+                                            ts->percentile_list,
+                                            &percentiles, block_state_counts);
+
+               if (len) {
+                       struct json_object *block, *percentile_object, *states;
+                       int state, i;
+                       block = json_create_object();
+                       json_object_add_value_object(root, "block", block);
+
+                       percentile_object = json_create_object();
+                       json_object_add_value_object(block, "percentiles",
+                                                    percentile_object);
+                       for (i = 0; i < len; i++) {
+                               char buf[20];
+                               snprintf(buf, sizeof(buf), "%f",
+                                        ts->percentile_list[i].u.f);
+                               json_object_add_value_int(percentile_object,
+                                                         (const char *)buf,
+                                                         percentiles[i]);
+                       }
+
+                       states = json_create_object();
+                       json_object_add_value_object(block, "states", states);
+                       for (state = 0; state < BLOCK_STATE_COUNT; state++) {
+                               json_object_add_value_int(states,
+                                       block_state_names[state],
+                                       block_state_counts[state]);
+                       }
+                       free(percentiles);
+               }
+       }
+
        return root;
 }
 
@@ -1913,3 +2097,14 @@ void show_running_run_stats(void)
        helper_do_stat = 1;
        pthread_cond_signal(&helper_cond);
 }
+
+uint32_t *io_u_block_info(struct thread_data *td, struct io_u *io_u)
+{
+       /* Ignore io_u's which span multiple blocks--they will just get
+        * inaccurate counts. */
+       int idx = (io_u->offset - io_u->file->file_offset)
+                       / td->o.bs[DDIR_TRIM];
+       uint32_t *info = &td->ts.block_infos[idx];
+       assert(idx < td->ts.nr_block_infos);
+       return info;
+}
diff --git a/stat.h b/stat.h
index 8b4416c..dd34012 100644 (file)
--- a/stat.h
+++ b/stat.h
@@ -112,6 +112,28 @@ struct group_run_stats {
 #define FIO_IO_U_LIST_MAX_LEN 20 /* The size of the default and user-specified
                                        list of percentiles */
 
+/*
+ * Trim cycle count measurements
+ */
+#define MAX_NR_BLOCK_INFOS     8192
+#define BLOCK_INFO_STATE_SHIFT 29
+#define BLOCK_INFO_TRIMS(block_info)   \
+       ((block_info) & ((1 << BLOCK_INFO_STATE_SHIFT) - 1))
+#define BLOCK_INFO_STATE(block_info)           \
+       ((block_info) >> BLOCK_INFO_STATE_SHIFT)
+#define BLOCK_INFO(state, trim_cycles) \
+       ((trim_cycles) | ((state) << BLOCK_INFO_STATE_SHIFT))
+#define BLOCK_INFO_SET_STATE(block_info, state)        \
+       BLOCK_INFO(state, BLOCK_INFO_TRIMS(block_info))
+enum block_info_state {
+       BLOCK_STATE_UNINIT,
+       BLOCK_STATE_TRIMMED,
+       BLOCK_STATE_WRITTEN,
+       BLOCK_STATE_TRIM_FAILURE,
+       BLOCK_STATE_WRITE_FAILURE,
+       BLOCK_STATE_COUNT,
+} state;
+
 #define MAX_PATTERN_SIZE       512
 #define FIO_JOBNAME_SIZE       128
 #define FIO_JOBDESC_SIZE       256
@@ -180,6 +202,9 @@ struct thread_stat {
        uint64_t total_err_count;
        uint32_t first_error;
 
+       uint64_t nr_block_infos;
+       uint32_t block_infos[MAX_NR_BLOCK_INFOS];
+
        uint32_t kb_base;
        uint32_t unit_base;
 
@@ -261,4 +286,6 @@ static inline int usec_to_msec(unsigned long *min, unsigned long *max,
 #define __THREAD_RUNSTR_SZ(nr) ((nr) * 5)
 #define THREAD_RUNSTR_SZ       __THREAD_RUNSTR_SZ(thread_number)
 
+uint32_t *io_u_block_info(struct thread_data *td, struct io_u *io_u);
+
 #endif
index ee1114d..488ca6e 100644 (file)
@@ -261,6 +261,8 @@ struct thread_options {
        unsigned long long latency_target;
        unsigned long long latency_window;
        fio_fp64_t latency_percentile;
+
+       unsigned block_error_hist;
 };
 
 #define FIO_TOP_STR_MAX                256
@@ -488,6 +490,8 @@ struct thread_options_pack {
        uint64_t latency_window;
        uint32_t pad3;
        fio_fp64_t latency_percentile;
+
+       uint32_t block_error_hist;
 } __attribute__((packed));
 
 extern void convert_thread_options_to_cpu(struct thread_options *o, struct thread_options_pack *top);