#!/usr/bin/python2.7
# Note: this script is python2 and python3 compatible.
-#
-# fio_jsonplus_clat2csv
-#
-# This script converts fio's json+ completion latency data to CSV format.
-#
-# For example:
-#
-# Run the following fio jobs:
-# ../fio --output=fio-jsonplus.output --output-format=json+ --name=test1
-# --ioengine=null --time_based --runtime=5s --size=1G --rw=randrw
-# --name=test2 --ioengine=null --time_based --runtime=3s --size=1G
-# --rw=read --name=test3 --ioengine=null --time_based --runtime=4s
-# --size=8G --rw=write
-#
-# Then run:
-# fio_jsonplus_clat2csv fio-jsonplus.output fio-latency.csv
-#
-# You will end up with the following 3 files
-#
-# -rw-r--r-- 1 root root 6467 Jun 27 14:57 fio-latency_job0.csv
-# -rw-r--r-- 1 root root 3985 Jun 27 14:57 fio-latency_job1.csv
-# -rw-r--r-- 1 root root 4490 Jun 27 14:57 fio-latency_job2.csv
-#
-# fio-latency_job0.csv will look something like:
-#
-# clat_nsec, read_count, read_cumulative, read_percentile, write_count,
-# write_cumulative, write_percentile, trim_count, trim_cumulative,
-# trim_percentile,
-# 25, 1, 1, 1.50870705013e-07, , , , , , ,
-# 26, 12, 13, 1.96131916517e-06, 947, 947, 0.000142955890032, , , ,
-# 27, 843677, 843690, 0.127288105112, 838347, 839294, 0.126696959629, , , ,
-# 28, 1877982, 2721672, 0.410620573454, 1870189, 2709483, 0.409014312345, , , ,
-# 29, 4471, 2726143, 0.411295116376, 7718, 2717201, 0.410179395301, , , ,
-# 30, 2142885, 4869028, 0.734593687087, 2138164, 4855365, 0.732949340025, , , ,
-# ...
-# 2544, , , , 2, 6624404, 0.999997433738, , , ,
-# 2576, 3, 6628178, 0.99999788781, 4, 6624408, 0.999998037564, , , ,
-# 2608, 4, 6628182, 0.999998491293, 4, 6624412, 0.999998641391, , , ,
-# 2640, 3, 6628185, 0.999998943905, 2, 6624414, 0.999998943304, , , ,
-# 2672, 1, 6628186, 0.999999094776, 3, 6624417, 0.999999396174, , , ,
-# 2736, 1, 6628187, 0.999999245646, 1, 6624418, 0.99999954713, , , ,
-# 2768, 2, 6628189, 0.999999547388, 1, 6624419, 0.999999698087, , , ,
-# 2800, , , , 1, 6624420, 0.999999849043, , , ,
-# 2832, 1, 6628190, 0.999999698259, , , , , , ,
-# 4192, 1, 6628191, 0.999999849129, , , , , , ,
-# 5792, , , , 1, 6624421, 1.0, , , ,
-# 10304, 1, 6628192, 1.0, , , , , , ,
-#
-# The first line says that you had one read IO with 25ns clat,
-# the cumulative number of read IOs at or below 25ns is 1, and
-# 25ns is the 0.00001509th percentile for read latency
-#
-# The job had 2 write IOs complete in 2544ns,
-# 6624404 write IOs completed in 2544ns or less,
-# and this represents the 99.99974th percentile for write latency
-#
-# The last line says that one read IO had 10304ns clat,
-# 6628192 read IOs had 10304ns or shorter clat, and
-# 10304ns is the 100th percentile for read latency
-#
+
+"""
+fio_jsonplus_clat2csv
+
+This script converts fio's json+ latency data to CSV format.
+
+For example:
+
+Run the following fio jobs:
+$ fio --output=fio-jsonplus.output --output-format=json+ --ioengine=null \
+ --time_based --runtime=3s --size=1G --slat_percentiles=1 \
+ --clat_percentiles=1 --lat_percentiles=1 \
+ --name=test1 --rw=randrw \
+ --name=test2 --rw=read \
+ --name=test3 --rw=write
+
+Then run:
+$ fio_jsonplus_clat2csv fio-jsonplus.output fio-jsonplus.csv
+
+You will end up with the following 3 files:
+
+-rw-r--r-- 1 root root 77547 Mar 24 15:17 fio-jsonplus_job0.csv
+-rw-r--r-- 1 root root 65413 Mar 24 15:17 fio-jsonplus_job1.csv
+-rw-r--r-- 1 root root 63291 Mar 24 15:17 fio-jsonplus_job2.csv
+
+fio-jsonplus_job0.csv will look something like:
+
+nsec, read_slat_ns_count, read_slat_ns_cumulative, read_slat_ns_percentile, read_clat_ns_count, read_clat_ns_cumulative, read_clat_ns_percentile, read_lat_ns_count, read_lat_ns_cumulative, read_lat_ns_percentile, write_slat_ns_count, write_slat_ns_cumulative, write_slat_ns_percentile, write_clat_ns_count, write_clat_ns_cumulative, write_clat_ns_percentile, write_lat_ns_count, write_lat_ns_cumulative, write_lat_ns_percentile, trim_slat_ns_count, trim_slat_ns_cumulative, trim_slat_ns_percentile, trim_clat_ns_count, trim_clat_ns_cumulative, trim_clat_ns_percentile, trim_lat_ns_count, trim_lat_ns_cumulative, trim_lat_ns_percentile,
+12, , , , 3, 3, 6.11006798673e-07, , , , , , , 2, 2, 4.07580840603e-07, , , , , , , , , , , , ,
+13, , , , 1364, 1367, 0.000278415431262, , , , , , , 1776, 1778, 0.000362339367296, , , , , , , , , , , , ,
+14, , , , 181872, 183239, 0.037320091594, , , , , , , 207436, 209214, 0.0426358089929, , , , , , , , , , , , ,
+15, , , , 1574811, 1758050, 0.358060167469, , , , , , , 1661435, 1870649, 0.381220345946, , , , , , , , , , , , ,
+16, , , , 2198478, 3956528, 0.805821835713, , , , , , , 2154571, 4025220, 0.820301275606, , , , , , , , , , , , ,
+17, , , , 724335, 4680863, 0.953346372218, , , , , , , 645351, 4670571, 0.951817627138, , , , , , , , , , , , ,
+18, , , , 71837, 4752700, 0.96797733735, , , , , , , 61084, 4731655, 0.964265961171, , , , , , , , , , , , ,
+19, , , , 15915, 4768615, 0.971218728417, , , , , , , 18419, 4750074, 0.968019576923, , , , , , , , , , , , ,
+20, , , , 12651, 4781266, 0.973795344087, , , , , , , 14176, 4764250, 0.970908509921, , , , , , , , , , , , ,
+...
+168960, , , , , , , , , , , , , 1, 4906999, 0.999999388629, 1, 4906997, 0.999998981048, , , , , , , , , ,
+177152, , , , , , , , , , , , , 1, 4907000, 0.999999592419, 1, 4906998, 0.999999184838, , , , , , , , , ,
+183296, , , , , , , , , , , , , 1, 4907001, 0.99999979621, 1, 4906999, 0.999999388629, , , , , , , , , ,
+189440, , , , , , , 1, 4909925, 0.999999185324, , , , , , , , , , , , , , , , , , ,
+214016, , , , 1, 4909928, 0.999999796331, 2, 4909927, 0.999999592662, , , , , , , , , , , , , , , , , , ,
+246784, , , , , , , , , , , , , , , , 1, 4907000, 0.999999592419, , , , , , , , , ,
+272384, , , , 1, 4909929, 1.0, 1, 4909928, 0.999999796331, , , , , , , , , , , , , , , , , , ,
+329728, , , , , , , , , , , , , 1, 4907002, 1.0, 1, 4907001, 0.99999979621, , , , , , , , , ,
+1003520, , , , , , , , , , , , , , , , 1, 4907002, 1.0, , , , , , , , , ,
+1089536, , , , , , , 1, 4909929, 1.0, , , , , , , , , , , , , , , , , , ,
+
+The first line says that there were three read IOs with 12ns clat,
+the cumulative number of read IOs at or below 12ns was three, and
+12ns was the 0.0000611th percentile for read latency. There were
+two write IOs with 12ns clat, the cumulative number of write IOs
+at or below 12ns was two, and 12ns was the 0.0000408th percentile
+for write latency.
+
+The job had one write IO with a clat of 168960ns, and 4906999 write
+IOs completed with a clat at or below this duration, making 168960ns
+the 99.99994th percentile for write clat. One write IO had a total
+latency of 168960ns; this duration had a cumulative frequency of
+4906997 write IOs and was the 99.9998981048th percentile for write
+total latency.
+
+The last line says that one read IO had a total latency of 1089536ns;
+this duration had a cumulative frequency of 4909929 read IOs and
+represented the 100th percentile for read total latency.
+
+Running the following:
+
+$ fio_jsonplus_clat2csv fio-jsonplus.output fio-jsonplus.csv --validate
+fio-jsonplus_job0.csv validated
+fio-jsonplus_job1.csv validated
+fio-jsonplus_job2.csv validated
+
+will check the CSV data against the json+ output to confirm that the CSV
+data matches.
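+
+The generated CSV files can be inspected or plotted with any tool you
+like. For instance, a minimal sketch using pandas and matplotlib
+(neither is required by this script):
+
+    import pandas
+    df = pandas.read_csv('fio-jsonplus_job0.csv', skipinitialspace=True)
+    df.plot(x='nsec', y='read_clat_ns_percentile', drawstyle='steps-post')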
+"""
from __future__ import absolute_import
from __future__ import print_function
import os
import json
import argparse
+import itertools
import six
-from six.moves import range
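+# data directions and latency series that may appear in fio's json+ output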
+DDIR_LIST = ['read', 'write', 'trim']
+LAT_LIST = ['slat_ns', 'clat_ns', 'lat_ns']
def parse_args():
+ """Parse command-line arguments."""
+
parser = argparse.ArgumentParser()
parser.add_argument('source',
                        help='fio json+ output file containing completion '
                        'latency data')
parser.add_argument('dest',
help='destination file stub for latency data in CSV '
'format. job number will be appended to filename')
+ parser.add_argument('--debug', '-d', action='store_true',
+ help='enable debug prints')
+ parser.add_argument('--validate', action='store_true',
+ help='validate CSV against JSON output')
args = parser.parse_args()
return args
def percentile(idx, run_total):
+ """Return a percentile for a specified index based on a running total.
+
+ Parameters:
+ idx index for which to generate percentile.
+ run_total list of cumulative sums.
+
+ Returns:
+ Percentile represented by the specified index.
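+
+    Example (hypothetical running total):
+        percentile(1, [2, 6, 10]) returns 6/10 = 0.6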
+ """
+
total = run_total[len(run_total)-1]
if total == 0:
return 0
return float(run_total[idx]) / total
-def more_lines(indices, bins):
+def more_bins(indices, bins):
+ """Determine whether we have more bins to process.
+
+ Parameters:
+ indices a dict containing the last index processed in each bin.
+        bins a dict containing a set of bins to process.
+
+ Returns:
+ True if the indices do not yet point to the end of each bin in bins.
+        False if the indices point beyond their respective bins.
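+
+    Example (hypothetical bins for a single series):
+        more_bins({'a': 1}, {'a': [[12, 3], [13, 1]]}) returns True
+        more_bins({'a': 2}, {'a': [[12, 3], [13, 1]]}) returns False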
+ """
+
for key, value in six.iteritems(indices):
if value < len(bins[key]):
return True
return False
+def debug_print(debug, *args):
+ """Print debug messages.
+
+ Parameters:
+ debug emit messages if True.
+ *args arguments for print().
+ """
+
+ if debug:
+ print(*args)
+
+
+def get_csvfile(dest, jobnum):
+ """Generate CSV filename from command-line arguments and job numbers.
+
+    Parameters:
+ dest file specification for CSV filename.
+ jobnum job number.
+
+ Returns:
+ A string that is a new filename that incorporates the job number.
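+
+    Example:
+        get_csvfile('fio-jsonplus.csv', 0) returns 'fio-jsonplus_job0.csv'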
+ """
+
+ stub, ext = os.path.splitext(dest)
+ return stub + '_job' + str(jobnum) + ext
+
+
+def validate(args, jsondata, col_labels):
+ """Validate CSV data against json+ output.
+
+ This function checks the CSV data to make sure that it was correctly
+ generated from the original json+ output. json+ 'bins' objects are
+ constructed from the CSV data and then compared to the corresponding
+    objects in the json+ data. An AssertionError is raised if a mismatch
+    is found.
+
+ Percentiles and cumulative counts are not checked.
+
+ Parameters:
+ args command-line arguments for this script.
+ jsondata json+ output to compare against.
+ col_labels column labels for CSV data.
+
+    Returns:
+ 0 if no mismatches found.
+ """
+
+ colnames = [c.strip() for c in col_labels.split(',')]
+
+ for jobnum in range(len(jsondata['jobs'])):
+ job_data = jsondata['jobs'][jobnum]
+ csvfile = get_csvfile(args.dest, jobnum)
+
+ with open(csvfile, 'r') as csvsource:
+ csvlines = csvsource.read().split('\n')
+
+ assert csvlines[0] == col_labels
+ debug_print(args.debug, 'col_labels match for', csvfile)
+
+ # create 'bins' objects from the CSV data
+ counts = {}
+ for ddir in DDIR_LIST:
+ counts[ddir] = {}
+ for lat in LAT_LIST:
+ counts[ddir][lat] = {}
+
+ csvlines.pop(0)
+ for line in csvlines:
+ if line.strip() == "":
+ continue
+ values = line.split(',')
+ nsec = values[0]
+ for col in colnames:
+ if 'count' in col:
+ val = values[colnames.index(col)]
+ if val.strip() != "":
+ count = int(val)
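+                        # e.g. 'read_slat_ns_count' splits into
+                        # ('read', 'slat', 'ns', 'count')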
+ ddir, lat, _, _ = col.split('_')
+ lat = lat + '_ns'
+ counts[ddir][lat][nsec] = count
+ try:
+ assert count == job_data[ddir][lat]['bins'][nsec]
+                        except (AssertionError, KeyError):
+ print("mismatch:", csvfile, ddir, lat, nsec, "ns")
+ return 1
+
+ # compare 'bins' objects created from the CSV data
+ # with corresponding 'bins' objects in the json+ output
+ for ddir in DDIR_LIST:
+ for lat in LAT_LIST:
+ if lat in job_data[ddir] and 'bins' in job_data[ddir][lat]:
+ assert job_data[ddir][lat]['bins'] == counts[ddir][lat]
+ debug_print(args.debug, csvfile, ddir, lat, "bins match")
+ else:
+ assert counts[ddir][lat] == {}
+ debug_print(args.debug, csvfile, ddir, lat, "bins empty")
+
+ print(csvfile, "validated")
+
+ return 0
+
+
def main():
+ """Starting point for this script.
+
+ In standard mode, this script will generate CSV data from fio json+ output.
+ In validation mode it will check to make sure that counts in CSV files
+ match the counts in the json+ data.
+ """
+
args = parse_args()
with open(args.source, 'r') as source:
jsondata = json.loads(source.read())
+    ddir_lat_list = [ddir + '_' + lat for ddir, lat in
+                     itertools.product(DDIR_LIST, LAT_LIST)]
+ debug_print(args.debug, 'ddir_lat_list: ', ddir_lat_list)
+ col_labels = 'nsec, '
+ for ddir_lat in ddir_lat_list:
+ col_labels += "{0}_count, {0}_cumulative, {0}_percentile, ".format(ddir_lat)
+ debug_print(args.debug, 'col_labels: ', col_labels)
+
+ if args.validate:
+ return validate(args, jsondata, col_labels)
+
for jobnum in range(0, len(jsondata['jobs'])):
bins = {}
run_total = {}
- ddir_set = set(['read', 'write', 'trim'])
-
- prev_ddir = None
- for ddir in ddir_set:
- if 'bins' in jsondata['jobs'][jobnum][ddir]['clat_ns']:
- bins_loc = 'clat_ns'
- elif 'bins' in jsondata['jobs'][jobnum][ddir]['lat_ns']:
- bins_loc = 'lat_ns'
- else:
- raise RuntimeError("Latency bins not found. "
- "Are you sure you are using json+ output?")
-
- bins[ddir] = [[int(key), value] for key, value in
- six.iteritems(jsondata['jobs'][jobnum][ddir][bins_loc]
- ['bins'])]
- bins[ddir] = sorted(bins[ddir], key=lambda bin: bin[0])
-
- run_total[ddir] = [0 for x in range(0, len(bins[ddir]))]
- if len(bins[ddir]) > 0:
- run_total[ddir][0] = bins[ddir][0][1]
- for x in range(1, len(bins[ddir])):
- run_total[ddir][x] = run_total[ddir][x-1] + \
- bins[ddir][x][1]
-
- stub, ext = os.path.splitext(args.dest)
- outfile = stub + '_job' + str(jobnum) + ext
-
- with open(outfile, 'w') as output:
- output.write("{0}ec, ".format(bins_loc))
- ddir_list = list(ddir_set)
- for ddir in ddir_list:
- output.write("{0}_count, {0}_cumulative, {0}_percentile, ".
- format(ddir))
- output.write("\n")
+
+ for ddir in DDIR_LIST:
+ ddir_data = jsondata['jobs'][jobnum][ddir]
+ for lat in LAT_LIST:
+ ddir_lat = ddir + '_' + lat
+ if lat not in ddir_data or 'bins' not in ddir_data[lat]:
+ bins[ddir_lat] = []
+ debug_print(args.debug, 'job', jobnum, ddir_lat, 'not found')
+ continue
+
+ debug_print(args.debug, 'job', jobnum, ddir_lat, 'processing')
+ bins[ddir_lat] = [[int(key), value] for key, value in
+ six.iteritems(ddir_data[lat]['bins'])]
+ bins[ddir_lat] = sorted(bins[ddir_lat], key=lambda bin: bin[0])
+
+                run_total[ddir_lat] = [0 for x in range(0, len(bins[ddir_lat]))]
+                if bins[ddir_lat]:
+                    run_total[ddir_lat][0] = bins[ddir_lat][0][1]
+ for index in range(1, len(bins[ddir_lat])):
+ run_total[ddir_lat][index] = run_total[ddir_lat][index-1] + \
+ bins[ddir_lat][index][1]
+
+ csvfile = get_csvfile(args.dest, jobnum)
+ with open(csvfile, 'w') as output:
+ output.write(col_labels + "\n")
#
-# Have a counter for each ddir
+# Have a counter for each ddir_lat pairing
# In each round, pick the shortest remaining duration
# and output a line with any values for that duration
#
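+# (effectively a k-way merge of the sorted per-series bin lists)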
- indices = {x: 0 for x in ddir_list}
- while more_lines(indices, bins):
+ indices = {x: 0 for x in ddir_lat_list}
+ while more_bins(indices, bins):
+ debug_print(args.debug, 'indices: ', indices)
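+                # start each round with a large sentinel; min() below
+                # replaces it with the smallest bin duration still pending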
min_lat = 17112760320
- for ddir in ddir_list:
- if indices[ddir] < len(bins[ddir]):
- min_lat = min(bins[ddir][indices[ddir]][0], min_lat)
+ for ddir_lat in ddir_lat_list:
+ if indices[ddir_lat] < len(bins[ddir_lat]):
+ min_lat = min(bins[ddir_lat][indices[ddir_lat]][0], min_lat)
output.write("{0}, ".format(min_lat))
- for ddir in ddir_list:
- if indices[ddir] < len(bins[ddir]) and \
- min_lat == bins[ddir][indices[ddir]][0]:
- count = bins[ddir][indices[ddir]][1]
- cumulative = run_total[ddir][indices[ddir]]
- ptile = percentile(indices[ddir], run_total[ddir])
- output.write("{0}, {1}, {2}, ".format(count,
- cumulative, ptile))
- indices[ddir] += 1
+ for ddir_lat in ddir_lat_list:
+ if indices[ddir_lat] < len(bins[ddir_lat]) and \
+ min_lat == bins[ddir_lat][indices[ddir_lat]][0]:
+ count = bins[ddir_lat][indices[ddir_lat]][1]
+ cumulative = run_total[ddir_lat][indices[ddir_lat]]
+ ptile = percentile(indices[ddir_lat], run_total[ddir_lat])
+ output.write("{0}, {1}, {2}, ".format(count, cumulative, ptile))
+ indices[ddir_lat] += 1
else:
output.write(", , , ")
output.write("\n")
- print("{0} generated".format(outfile))
+ print("{0} generated".format(csvfile))
if __name__ == '__main__':