#!/usr/bin/env python3
"""
    Utility for converting *_clat_hist* files generated by fio into latency statistics.

    Example usage:

            $ fiologparser_hist.py *_clat_hist*
            end-time, samples, min, avg, median, 90%, 95%, 99%, max
            1000, 15, 192, 1678.107, 1788.859, 1856.076, 1880.040, 1899.208, 1888.000
            2000, 43, 152, 1642.368, 1714.099, 1816.659, 1845.552, 1888.131, 1888.000
            4000, 39, 1152, 1546.962, 1545.785, 1627.192, 1640.019, 1691.204, 1744
            ...

    @author Karl Cronburg <karl.cronburg@gmail.com>
"""
import os
import sys
import pandas
import re
import numpy as np

runascmd = False

err = sys.stderr.write

class HistFileRdr():
    """ Class to read a hist file line by line, buffering
        a value array for the latest line, and allowing a preview
        of the timestamp on the next line.
        Note: this does not follow a generator pattern; the caller must
        explicitly advance to the next bin array with nextData().
    """
    def __init__(self, file):
        self.fp = open(file, 'r')
        self.data = self.nextData()

    def close(self):
        self.fp.close()
        self.fp = None

    def nextData(self):
        self.data = None
        if self.fp:
            line = self.fp.readline()
            if line == "":
                self.close()
            else:
                self.data = [int(x) for x in line.replace(' ', '').rstrip().split(',')]

        return self.data

    @property
    def curTS(self):
        ts = None
        if self.data:
            ts = self.data[0]
        return ts

    @property
    def curDir(self):
        d = None
        if self.data:
            d = self.data[1]
        return d

    @property
    def curBins(self):
        return self.data[3:]

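# Illustrative usage sketch for HistFileRdr (not part of the original tool); the
# log file name below is hypothetical. The reader buffers one parsed log line at
# a time: callers peek at curTS / curDir / curBins and advance explicitly with
# nextData().
#
#   rdr = HistFileRdr('job_clat_hist.1.log')
#   while rdr.curTS is not None:
#       print(rdr.curTS, rdr.curDir, sum(rdr.curBins))
#       rdr.nextData()
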
def weighted_percentile(percs, vs, ws):
    """ Use linear interpolation to calculate the weighted percentile.

        Value and weight arrays are first sorted by value. The cumulative
        distribution function (cdf) is then computed, after which np.interp
        finds the two values closest to our desired weighted percentile(s)
        and linearly interpolates them.

        percs  :: List of percentiles we want to calculate
        vs     :: Array of values we are computing the percentile of
        ws     :: Array of weights for our corresponding values
        return :: Array of percentiles
    """
    idx = np.argsort(vs)
    vs, ws = vs[idx], ws[idx] # weights and values sorted by value
    cdf = 100 * (ws.cumsum() - ws / 2.0) / ws.sum()
    return np.interp(percs, cdf, vs) # linear interpolation
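# Worked example (illustrative only): for values [100, 200, 400] with weights
# [1, 1, 2], the cdf above evaluates to [12.5, 37.5, 75.0], so
#   weighted_percentile([50.0], np.array([100., 200., 400.]), np.array([1., 1., 2.]))
# interpolates between 200 (at 37.5%) and 400 (at 75.0%) and returns ~266.67.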

def weights(start_ts, end_ts, start, end):
    """ Calculate weights based on fraction of sample falling in the
        given interval [start,end]. Weights computed using vector / array
        computation instead of for-loops.

        Note that samples with zero time length are effectively ignored
        (we set their weight to zero).

        start_ts :: Array of start times for a set of samples
        end_ts   :: Array of end times for a set of samples
        start    :: int
        end      :: int
        return   :: Array of weights
    """
    sbounds = np.maximum(start_ts, start).astype(float)
    ebounds = np.minimum(end_ts, end).astype(float)
    ws = (ebounds - sbounds) / (end_ts - start_ts)
    if np.any(np.isnan(ws)):
        err("WARNING: zero-length sample(s) detected. Log file corrupt"
            " / bad time values? Ignoring these samples.\n")
    ws[np.where(np.isnan(ws))] = 0.0
    return ws
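# Worked example (illustrative only): a sample spanning [500, 1500] ms clipped to
# the interval [1000, 2000] contributes (1500 - 1000) / (1500 - 500) = 0.5, i.e.
#   weights(np.array([500]), np.array([1500]), 1000, 2000) -> array([0.5])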

def weighted_average(vs, ws):
    return np.sum(vs * ws) / np.sum(ws)
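# Worked example (illustrative only):
#   weighted_average(np.array([100., 400.]), np.array([3., 1.])) == 175.0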


percs = None
columns = None

def gen_output_columns(ctx):
    global percs, columns
    strpercs = re.split('[,:]', ctx.percentiles)
    percs = [50.0] # always print 50% in 'median' column
    percs.extend(list(map(float, strpercs)))
    if ctx.directions:
        columns = ["end-time", "dir", "samples", "min", "avg", "median"]
    else:
        columns = ["end-time", "samples", "min", "avg", "median"]
    columns.extend(list([x + '%' for x in strpercs]))
    columns.append("max")

def fmt_float_list(ctx, num=1):
    """ Return a comma-separated list of float formatters to the required number
        of decimal places. For instance, with ctx.decimals == 4:

            fmt_float_list(ctx, num=3) == "%.4f, %.4f, %.4f"
    """
    return ', '.join(["%%.%df" % ctx.decimals] * num)

# Default values - see beginning of main() for how we detect the number of
# columns in the input files:
__HIST_COLUMNS = 1216
__NON_HIST_COLUMNS = 3
__TOTAL_COLUMNS = __HIST_COLUMNS + __NON_HIST_COLUMNS

def read_chunk(rdr, sz):
    """ Read the next chunk of size sz from the given reader. """
    try:
        # StopIteration occurs when the pandas reader is empty, and AttributeError
        # occurs if rdr is None because the underlying file was empty.
        new_arr = rdr.read().values
    except (StopIteration, AttributeError):
        return None

    # Leave the array as-is, and let later code ignore the block size column.
    return new_arr

    #""" Extract array of the times, directions wo times, and histograms matrix without times column. """
    #times, rws, szs = new_arr[:,0], new_arr[:,1], new_arr[:,2]
    #hists = new_arr[:,__NON_HIST_COLUMNS:]
    #times = times.reshape((len(times),1))
    #dirs = rws.reshape((len(rws),1))
    #arr = np.append(times, hists, axis=1)
    #return arr

def get_min(fps, arrs):
    """ Find the file with the current first row with the smallest start time """
    return min([fp for fp in fps if arrs[fp] is not None], key=lambda fp: arrs.get(fp)[0][0])

def histogram_generator(ctx, fps, sz):

    # Create a chunked pandas reader for each of the files:
    rdrs = {}
    for fp in fps:
        try:
            rdrs[fp] = pandas.read_csv(fp, dtype=int, header=None, chunksize=sz)
        except ValueError as e:
            if str(e) == 'No columns to parse from file':
                if ctx.warn: sys.stderr.write("WARNING: Empty input file encountered.\n")
                rdrs[fp] = None
            else:
                raise

    # Initial histograms from disk:
    arrs = {fp: read_chunk(rdr, sz) for fp, rdr in rdrs.items()}
    while True:

        try:
            # ValueError occurs when there is nothing more to read
            fp = get_min(fps, arrs)
        except ValueError:
            return
        arr = arrs[fp]
        arri = np.insert(arr[0], 1, fps.index(fp))
        yield arri
        arrs[fp] = arr[1:]

        if arrs[fp].shape[0] == 0:
            arrs[fp] = read_chunk(rdrs[fp], sz)

def _plat_idx_to_val(idx, edge=0.5, FIO_IO_U_PLAT_BITS=6, FIO_IO_U_PLAT_VAL=64):
    """ Taken from fio's stat.c for calculating the latency value of a bin
        from that bin's index.

            idx  : the value of the index into the histogram bins
            edge : fractional value in the range [0,1]** indicating how far into
                   the bin we wish to compute the latency value of.

        ** edge = 0.0 and 1.0 computes the lower and upper latency bounds
           respectively of the given bin index. """

    # MSB <= (FIO_IO_U_PLAT_BITS-1), cannot be rounded off. Use
    # all bits of the sample as index
    if (idx < (FIO_IO_U_PLAT_VAL << 1)):
        return idx

    # Find the group and compute the minimum value of that group
    error_bits = (idx >> FIO_IO_U_PLAT_BITS) - 1
    base = 1 << (error_bits + FIO_IO_U_PLAT_BITS)

    # Find its bucket number of the group
    k = idx % FIO_IO_U_PLAT_VAL

    # Return the mean (if edge=0.5) of the range of the bucket
    return base + ((k + edge) * (1 << error_bits))
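# Worked example (illustrative only): for bin index 130 with the default
# parameters, error_bits = (130 >> 6) - 1 = 1, base = 1 << 7 = 128 and
# k = 130 % 64 = 2, so the bin spans [132, 134] and _plat_idx_to_val(130)
# returns its midpoint, 133.0. Indices below 128 map directly to themselves.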

def plat_idx_to_val_coarse(idx, coarseness, edge=0.5):
    """ Converts the given *coarse* index into a non-coarse index as used by fio
        in stat.h:plat_idx_to_val(), subsequently computing the appropriate
        latency value for that bin.
    """

    # Multiply the index by the power-of-2 coarseness to get the non-coarse
    # bin index, with a max of 1536 bins (FIO_IO_U_PLAT_GROUP_NR = 24 in stat.h)
    stride = 1 << coarseness
    idx = idx * stride
    lower = _plat_idx_to_val(idx, edge=0.0)
    upper = _plat_idx_to_val(idx + stride, edge=1.0)
    return lower + (upper - lower) * edge
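# Worked example (illustrative only): with coarseness=1 each coarse bin covers a
# stride of two non-coarse bins, so coarse index 65 starts at non-coarse index 130.
# plat_idx_to_val_coarse(65, 1) takes lower = _plat_idx_to_val(130, edge=0.0) = 132
# and upper = _plat_idx_to_val(132, edge=1.0) = 138, returning the midpoint 135.0.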

def print_all_stats(ctx, end, mn, ss_cnt, vs, ws, mx, dir=None):
    ps = weighted_percentile(percs, vs, ws)

    avg = weighted_average(vs, ws)
    values = [mn, avg] + list(ps) + [mx]
    if ctx.directions:
        row = [end, dir, ss_cnt]
        fmt = "%d, %s, %d, "
    else:
        row = [end, ss_cnt]
        fmt = "%d, %d, "
    row = row + [float(x) / ctx.divisor for x in values]
    if ctx.divisor > 1:
        fmt = fmt + fmt_float_list(ctx, len(percs) + 3)
    else:
        # min and max are printed as integers when there is no divisor
        fmt = fmt + "%d, " + fmt_float_list(ctx, len(percs) + 1) + ", %d"

    print(fmt % tuple(row))

def update_extreme(val, fncn, new_val):
    """ Calculate min / max in the presence of None values """
    if val is None: return new_val
    else: return fncn(val, new_val)
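# Illustrative behaviour: update_extreme(None, min, 5) returns 5 (first value
# seen), while update_extreme(3, min, 5) returns min(3, 5) == 3.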

# See beginning of main() for how bin_vals are computed
bin_vals = []
lower_bin_vals = [] # lower edge of each bin
upper_bin_vals = [] # upper edge of each bin

def process_interval(ctx, iHist, iEnd, dir):
    """ Print estimated percentiles for the given merged sample. """
    ss_cnt = 0 # number of samples affecting this interval
    mn_bin_val, mx_bin_val = None, None

    # Update total number of samples affecting current interval histogram:
    ss_cnt += np.sum(iHist)

    # Update min and max bin values
    idxs = np.nonzero(iHist != 0)[0]
    if idxs.size > 0:
        mn_bin_val = bin_vals[idxs[0]]
        mx_bin_val = bin_vals[idxs[-1]]

    if ss_cnt > 0: print_all_stats(ctx, iEnd, mn_bin_val, ss_cnt, bin_vals, iHist, mx_bin_val, dir=dir)


dir_map = ['r', 'w', 't'] # map of directional value in log to textual representation
def process_weighted_interval(ctx, samples, iStart, iEnd, printdirs):
    """ Construct the weighted histogram for the given interval by scanning
        through all the histograms and figuring out which of their bins have
        samples with latencies which overlap with the given interval
        [iStart,iEnd].
    """

    times, files, dirs, sizes, hists = samples[:,0], samples[:,1], samples[:,2], samples[:,3], samples[:,4:]
    iHist = {}; ss_cnt = {}; mn_bin_val = {}; mx_bin_val = {}
    for dir in printdirs:
        iHist[dir] = np.zeros(__HIST_COLUMNS, dtype=float)
        ss_cnt[dir] = 0 # number of samples affecting this interval
        mn_bin_val[dir] = None
        mx_bin_val[dir] = None

    for end_time, file, dir, hist in zip(times, files, dirs, hists):

        # Only look at bins of the current histogram sample which
        # started before the end of the current time interval [start,end]
        start_times = (end_time - 0.5 * ctx.interval) - bin_vals / ctx.time_divisor
        idx = np.where(start_times < iEnd)
        s_ts, l_bvs, u_bvs, hs = start_times[idx], lower_bin_vals[idx], upper_bin_vals[idx], hist[idx]

        # Increment the current interval histogram by the weighted values of the
        # overlapping histogram, updating the total number of samples and the
        # min and max values as necessary.
        textdir = dir_map[dir]
        ws = hs * weights(s_ts, end_time, iStart, iEnd)
        mmidx = np.where(hs != 0)[0]
        if 'm' in printdirs:
            iHist['m'][idx] += ws
            ss_cnt['m'] += np.sum(hs)
            if mmidx.size > 0:
                mn_bin_val['m'] = update_extreme(mn_bin_val['m'], min, l_bvs[max(0, mmidx[0] - 1)])
                mx_bin_val['m'] = update_extreme(mx_bin_val['m'], max, u_bvs[min(len(hs) - 1, mmidx[-1] + 1)])
        if textdir in printdirs:
            iHist[textdir][idx] += ws
            ss_cnt[textdir] += np.sum(hs) # update total number of samples affecting the current interval histogram
            if mmidx.size > 0:
                mn_bin_val[textdir] = update_extreme(mn_bin_val[textdir], min, l_bvs[max(0, mmidx[0] - 1)])
                mx_bin_val[textdir] = update_extreme(mx_bin_val[textdir], max, u_bvs[min(len(hs) - 1, mmidx[-1] + 1)])

    for textdir in sorted(printdirs):
        if ss_cnt[textdir] > 0: print_all_stats(ctx, iEnd, mn_bin_val[textdir], ss_cnt[textdir], bin_vals, iHist[textdir], mx_bin_val[textdir], dir=textdir)

def guess_max_from_bins(ctx, hist_cols):
    """ Try to guess the GROUP_NR from the given number of histogram
        columns seen in an input file """
    max_coarse = 8
    if ctx.group_nr < 19 or ctx.group_nr > 26:
        bins = [ctx.group_nr * (1 << 6)]
    else:
        bins = [1216, 1280, 1344, 1408, 1472, 1536, 1600, 1664]
    coarses = range(max_coarse + 1)
    fncn = lambda z: list([z // 2**x if z % 2**x == 0 else -10 for x in coarses])

    arr = np.transpose(list(map(fncn, bins)))
    idx = np.where(arr == hist_cols)
    if len(idx[1]) == 0:
        table = repr(arr.astype(int)).replace('-10', 'N/A').replace('array', ' ')
        errmsg = ("Unable to determine bin values from input clat_hist files. Namely \n"
                  "the first line of file '%s' " % ctx.FILE[0] + "has %d \n" % (__TOTAL_COLUMNS,) +
                  "columns of which we assume %d " % (hist_cols,) + "correspond to histogram bins. \n"
                  "This number needs to be equal to one of the following numbers:\n\n"
                  + table + "\n\n"
                  "Possible reasons and corresponding solutions:\n"
                  "  - Input file(s) does not contain histograms.\n"
                  "  - You recompiled fio with a different GROUP_NR. If so please specify this\n"
                  "    new GROUP_NR on the command line with --group_nr\n")
        if runascmd:
            err(errmsg)
            sys.exit(1)
        else:
            raise RuntimeError(errmsg)

    return bins[idx[1][0]]
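# Illustrative example: with the default --group_nr of 29 (outside the 19-26
# range) the only candidate is 29 * 64 = 1856 bins, so an input file with 1856,
# 928 or 464 histogram columns matches coarseness 0, 1 or 2 respectively and
# guess_max_from_bins() returns 1856; any other column count reports the error above.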

def output_weighted_interval_data(ctx, printdirs):

    fps = [open(f, 'r') for f in ctx.FILE]
    gen = histogram_generator(ctx, fps, ctx.buff_size)

    print(', '.join(columns))

    try:
        start, end = 0, ctx.interval
        arr = np.empty(shape=(0, __TOTAL_COLUMNS + 1), dtype=int)
        more_data = True
        while more_data or len(arr) > 0:

            # Read up to ctx.max_latency (default 20 seconds) of data from end of current interval.
            while len(arr) == 0 or arr[-1][0] < ctx.max_latency * 1000 + end:
                try:
                    new_arr = next(gen)
                except StopIteration:
                    more_data = False
                    break
                nashape = new_arr.reshape((1, __TOTAL_COLUMNS + 1))
                arr = np.append(arr, nashape, axis=0)
            #arr = arr.astype(int)

            if arr.size > 0:
                # Jump immediately to the start of the input, rounding
                # down to the nearest multiple of the interval (useful when --log_unix_epoch
                # was used to create these histograms):
                if start == 0 and arr[0][0] - ctx.max_latency > end:
                    start = arr[0][0] - ctx.max_latency
                    start = start - (start % ctx.interval)
                    end = start + ctx.interval

                process_weighted_interval(ctx, arr, start, end, printdirs)

                # Update arr to throw away samples we no longer need - samples which
                # end before the start of the next interval, i.e. the end of the
                # current interval:
                idx = np.where(arr[:,0] > end)
                arr = arr[idx]

            start += ctx.interval
            end = start + ctx.interval
    finally:
        for fp in fps:
            fp.close()

def output_interval_data(ctx, directions):
    fps = [HistFileRdr(f) for f in ctx.FILE]

    print(', '.join(columns))

    start = 0
    end = ctx.interval
    while True:

        more_data = False

        # add bins from all files in target intervals
        arr = None
        numSamples = 0
        while True:
            foundSamples = False
            for fp in fps:
                ts = fp.curTS
                if ts and ts + 10 < end:  # shift sample time when very close to an end time
                    curdirect = fp.curDir
                    numSamples += 1
                    foundSamples = True
                    if arr is None:
                        arr = {}
                        for d in directions:
                            arr[d] = np.zeros(shape=(__HIST_COLUMNS), dtype=int)
                    if 'm' in arr:
                        arr['m'] = np.add(arr['m'], fp.curBins)
                    if 'r' in arr and curdirect == 0:
                        arr['r'] = np.add(arr['r'], fp.curBins)
                    if 'w' in arr and curdirect == 1:
                        arr['w'] = np.add(arr['w'], fp.curBins)
                    if 't' in arr and curdirect == 2:
                        arr['t'] = np.add(arr['t'], fp.curBins)

                    more_data = True
                    fp.nextData()
                elif ts:
                    more_data = True

            # reached end of all files,
            # or gone through all files without finding a sample in the interval
            if not more_data or not foundSamples:
                break

        if arr is not None:
            #print("{} size({}) samples({}) nonzero({}):".format(end, arr.size, numSamples, np.count_nonzero(arr)), str(arr), )
            for d in sorted(arr.keys()):
                aval = arr[d]
                process_interval(ctx, aval, end, d)

        # reached end of all files
        if not more_data:
            break

        start += ctx.interval
        end = start + ctx.interval

def main(ctx):

    if ctx.job_file:
        from configparser import ConfigParser, NoOptionError

        cp = ConfigParser(allow_no_value=True)
        with open(ctx.job_file, 'r') as fp:
            cp.read_file(fp)

        if ctx.interval is None:
            # Auto detect --interval value
            for s in cp.sections():
                try:
                    hist_msec = cp.get(s, 'log_hist_msec')
                    if hist_msec is not None:
                        ctx.interval = int(hist_msec)
                except NoOptionError:
                    pass

    if not hasattr(ctx, 'percentiles'):
        ctx.percentiles = "90,95,99"

    if ctx.directions:
        ctx.directions = ctx.directions.lower()

    if ctx.interval is None:
        ctx.interval = 1000

    if ctx.usbin:
        ctx.time_divisor = 1000.0        # bins are in us
    else:
        ctx.time_divisor = 1000000.0     # bins are in ns

    gen_output_columns(ctx)


    # Automatically detect how many columns are in the input files,
    # calculate the corresponding 'coarseness' parameter used to generate
    # those files, and calculate the appropriate bin latency values:
    with open(ctx.FILE[0], 'r') as fp:
        global bin_vals, lower_bin_vals, upper_bin_vals, __HIST_COLUMNS, __TOTAL_COLUMNS
        __TOTAL_COLUMNS = len(fp.readline().split(','))
        __HIST_COLUMNS = __TOTAL_COLUMNS - __NON_HIST_COLUMNS

        max_cols = guess_max_from_bins(ctx, __HIST_COLUMNS)
        coarseness = int(np.log2(float(max_cols) / __HIST_COLUMNS))
        bin_vals = np.array([plat_idx_to_val_coarse(x, coarseness) for x in np.arange(__HIST_COLUMNS)], dtype=float)
        lower_bin_vals = np.array([plat_idx_to_val_coarse(x, coarseness, 0.0) for x in np.arange(__HIST_COLUMNS)], dtype=float)
        upper_bin_vals = np.array([plat_idx_to_val_coarse(x, coarseness, 1.0) for x in np.arange(__HIST_COLUMNS)], dtype=float)

    # indicate which directions to output (read(0), write(1), trim(2), mixed(3))
    directions = set()
    if not ctx.directions or 'm' in ctx.directions: directions.add('m')
    if ctx.directions and 'r' in ctx.directions: directions.add('r')
    if ctx.directions and 'w' in ctx.directions: directions.add('w')
    if ctx.directions and 't' in ctx.directions: directions.add('t')

    if ctx.noweight:
        output_interval_data(ctx, directions)
    else:
        output_weighted_interval_data(ctx, directions)


if __name__ == '__main__':
    import argparse
    runascmd = True
    p = argparse.ArgumentParser()
    arg = p.add_argument
    arg("FILE", help='space-separated list of latency log filenames', nargs='+')
    arg('--buff_size',
        default=10000,
        type=int,
        help='number of samples to buffer into numpy at a time')

    arg('--max_latency',
        default=20,
        type=float,
        help='number of seconds of data to process at a time')

    arg('-i', '--interval',
        type=int,
        help='interval width (ms), default 1000 ms')

    arg('--noweight',
        action='store_true',
        default=False,
        help='do not perform weighting of samples between output intervals')

    arg('-d', '--divisor',
        required=False,
        type=int,
        default=1,
        help='divide the results by this value.')

    arg('--decimals',
        default=3,
        type=int,
        help='number of decimal places to print floats to')

    arg('--warn',
        dest='warn',
        action='store_true',
        default=False,
        help='print warning messages to stderr')

    arg('--group_nr',
        default=29,
        type=int,
        help='FIO_IO_U_PLAT_GROUP_NR as defined in stat.h')

    arg('--job-file',
        default=None,
        type=str,
        help='Optional argument pointing to the job file used to create the '
             'given histogram files. Useful for auto-detecting --log_hist_msec and '
             '--log_unix_epoch (in fio) values.')

    arg('--percentiles',
        default="90:95:99",
        type=str,
        help='Optional argument of comma or colon separated percentiles to print. '
             'The default is "90:95:99". min, median (50%%) and max are always printed.')

    arg('--usbin',
        default=False,
        action='store_true',
        help='histogram bin latencies are in us (fio versions < 2.99; fio uses ns for versions >= 2.99)')

    arg('--directions',
        default=None,
        type=str,
        help='Optionally split the output by reads, writes, trims or mixed. '
             'Value may be any combination of "rwtm" characters. '
             'By default, only "mixed" results are output without a "dir" field, '
             'but specifying the --directions option '
             'adds a "dir" field to the output and separate rows for each of the indicated '
             'directions.')

    main(p.parse_args())
