[fio.git] / tools / hist / fiologparser_hist.py

#!/usr/bin/env python2.7
""" 
    Utility for converting *_clat_hist* files generated by fio into latency statistics.
    
    Example usage:
    
            $ fiologparser_hist.py *_clat_hist*
            end-time, samples, min, avg, median, 90%, 95%, 99%, max
            1000, 15, 192, 1678.107, 1788.859, 1856.076, 1880.040, 1899.208, 1888.000
            2000, 43, 152, 1642.368, 1714.099, 1816.659, 1845.552, 1888.131, 1888.000
            4000, 39, 1152, 1546.962, 1545.785, 1627.192, 1640.019, 1691.204, 1744
            ...
    
    @author Karl Cronburg <karl.cronburg@gmail.com>
"""
import os
import sys
import pandas
import numpy as np

err = sys.stderr.write

def weighted_percentile(percs, vs, ws):
    """ Use linear interpolation to calculate the weighted percentile.
        
        Value and weight arrays are first sorted by value. The cumulative
        distribution function (cdf) is then computed, after which np.interp
        finds the two values closest to our desired weighted percentile(s)
        and linearly interpolates them.
        
        percs  :: List of percentiles we want to calculate
        vs     :: Array of values we are computing the percentile of
        ws     :: Array of weights for our corresponding values
        return :: Array of percentiles
    """
    idx = np.argsort(vs)
    vs, ws = vs[idx], ws[idx] # weights and values sorted by value
    cdf = 100 * (ws.cumsum() - ws / 2.0) / ws.sum()
    return np.interp(percs, cdf, vs) # linear interpolation

def weights(start_ts, end_ts, start, end):
    """ Calculate weights based on fraction of sample falling in the
        given interval [start,end]. Weights computed using vector / array
        computation instead of for-loops.
    
        Note that samples with zero time length are effectively ignored
        (we set their weight to zero).

        start_ts :: Array of start times for a set of samples
        end_ts   :: Array of end times for a set of samples
        start    :: int
        end      :: int
        return   :: Array of weights
    """
    sbounds = np.maximum(start_ts, start).astype(float)
    ebounds = np.minimum(end_ts,   end).astype(float)
    ws = (ebounds - sbounds) / (end_ts - start_ts)
    if np.any(np.isnan(ws)):
      err("WARNING: zero-length sample(s) detected. Log file corrupt"
          " / bad time values? Ignoring these samples.\n")
    ws[np.where(np.isnan(ws))] = 0.0;
    return ws

def weighted_average(vs, ws):
    return np.sum(vs * ws) / np.sum(ws)

columns = ["end-time", "samples", "min", "avg", "median", "90%", "95%", "99%", "max"]
percs   = [50, 90, 95, 99]

def fmt_float_list(ctx, num=1):
  """ Return a comma separated list of float formatters to the required number
      of decimal places. For instance:

        fmt_float_list(ctx.decimals=4, num=3) == "%.4f, %.4f, %.4f"
  """
  return ', '.join(["%%.%df" % ctx.decimals] * num)

# Default values - see beginning of main() for how we detect number columns in
# the input files:
__HIST_COLUMNS = 1216
__NON_HIST_COLUMNS = 3
__TOTAL_COLUMNS = __HIST_COLUMNS + __NON_HIST_COLUMNS
    
def read_chunk(rdr, sz):
    """ Read the next chunk of size sz from the given reader. """
    try:
        """ StopIteration occurs when the pandas reader is empty, and AttributeError
            occurs if rdr is None due to the file being empty. """
        new_arr = rdr.read().values
    except (StopIteration, AttributeError):
        return None    

    """ Extract array of just the times, and histograms matrix without times column. """
    times, rws, szs = new_arr[:,0], new_arr[:,1], new_arr[:,2]
    hists = new_arr[:,__NON_HIST_COLUMNS:]
    times = times.reshape((len(times),1))
    arr = np.append(times, hists, axis=1)

    return arr

def get_min(fps, arrs):
    """ Find the file with the current first row with the smallest start time """
    return min([fp for fp in fps if not arrs[fp] is None], key=lambda fp: arrs.get(fp)[0][0])

def histogram_generator(ctx, fps, sz):
    
    # Create a chunked pandas reader for each of the files:
    rdrs = {}
    for fp in fps:
        try:
            rdrs[fp] = pandas.read_csv(fp, dtype=int, header=None, chunksize=sz)
        except ValueError as e:
            if e.message == 'No columns to parse from file':
                if ctx.warn: sys.stderr.write("WARNING: Empty input file encountered.\n")
                rdrs[fp] = None
            else:
                raise(e)

    # Initial histograms from disk:
    arrs = {fp: read_chunk(rdr, sz) for fp,rdr in rdrs.items()}
    while True:

        try:
            """ ValueError occurs when nothing more to read """
            fp = get_min(fps, arrs)
        except ValueError:
            return
        arr = arrs[fp]
        yield np.insert(arr[0], 1, fps.index(fp))
        arrs[fp] = arr[1:]

        if arrs[fp].shape[0] == 0:
            arrs[fp] = read_chunk(rdrs[fp], sz)

def _plat_idx_to_val(idx, edge=0.5, FIO_IO_U_PLAT_BITS=6, FIO_IO_U_PLAT_VAL=64):
    """ Taken from fio's stat.c for calculating the latency value of a bin
        from that bin's index.
        
            idx  : the value of the index into the histogram bins
            edge : fractional value in the range [0,1]** indicating how far into
            the bin we wish to compute the latency value of.
        
        ** edge = 0.0 and 1.0 computes the lower and upper latency bounds
           respectively of the given bin index. """

    # MSB <= (FIO_IO_U_PLAT_BITS-1), cannot be rounded off. Use
    # all bits of the sample as index
    if (idx < (FIO_IO_U_PLAT_VAL << 1)):
        return idx 

    # Find the group and compute the minimum value of that group
    error_bits = (idx >> FIO_IO_U_PLAT_BITS) - 1 
    base = 1 << (error_bits + FIO_IO_U_PLAT_BITS)

    # Find its bucket number of the group
    k = idx % FIO_IO_U_PLAT_VAL

    # Return the mean (if edge=0.5) of the range of the bucket
    return base + ((k + edge) * (1 << error_bits))
    
def plat_idx_to_val_coarse(idx, coarseness, edge=0.5):
    """ Converts the given *coarse* index into a non-coarse index as used by fio
        in stat.h:plat_idx_to_val(), subsequently computing the appropriate
        latency value for that bin.
        """

    # Multiply the index by the power of 2 coarseness to get the bin
    # bin index with a max of 1536 bins (FIO_IO_U_PLAT_GROUP_NR = 24 in stat.h)
    stride = 1 << coarseness
    idx = idx * stride
    lower = _plat_idx_to_val(idx, edge=0.0)
    upper = _plat_idx_to_val(idx + stride, edge=1.0)
    return lower + (upper - lower) * edge

def print_all_stats(ctx, end, mn, ss_cnt, vs, ws, mx):
    ps = weighted_percentile(percs, vs, ws)

    avg = weighted_average(vs, ws)
    values = [mn, avg] + list(ps) + [mx]
    row = [end, ss_cnt] + map(lambda x: float(x) / ctx.divisor, values)
    fmt = "%d, %d, %d, " + fmt_float_list(ctx, 5) + ", %d"
    print (fmt % tuple(row))

def update_extreme(val, fncn, new_val):
    """ Calculate min / max in the presence of None values """
    if val is None: return new_val
    else: return fncn(val, new_val)

# See beginning of main() for how bin_vals are computed
bin_vals = []
lower_bin_vals = [] # lower edge of each bin
upper_bin_vals = [] # upper edge of each bin 

def process_interval(ctx, samples, iStart, iEnd):
    """ Construct the weighted histogram for the given interval by scanning
        through all the histograms and figuring out which of their bins have
        samples with latencies which overlap with the given interval
        [iStart,iEnd].
    """
    
    times, files, hists = samples[:,0], samples[:,1], samples[:,2:]
    iHist = np.zeros(__HIST_COLUMNS)
    ss_cnt = 0 # number of samples affecting this interval
    mn_bin_val, mx_bin_val = None, None

    for end_time,file,hist in zip(times,files,hists):
            
        # Only look at bins of the current histogram sample which
        # started before the end of the current time interval [start,end]
        start_times = (end_time - 0.5 * ctx.interval) - bin_vals / 1000.0
        idx = np.where(start_times < iEnd)
        s_ts, l_bvs, u_bvs, hs = start_times[idx], lower_bin_vals[idx], upper_bin_vals[idx], hist[idx]

        # Increment current interval histogram by weighted values of future histogram:
        ws = hs * weights(s_ts, end_time, iStart, iEnd)
        iHist[idx] += ws
    
        # Update total number of samples affecting current interval histogram:
        ss_cnt += np.sum(hs)
        
        # Update min and max bin values seen if necessary:
        idx = np.where(hs != 0)[0]
        if idx.size > 0:
            mn_bin_val = update_extreme(mn_bin_val, min, l_bvs[max(0,           idx[0]  - 1)])
            mx_bin_val = update_extreme(mx_bin_val, max, u_bvs[min(len(hs) - 1, idx[-1] + 1)])

    if ss_cnt > 0: print_all_stats(ctx, iEnd, mn_bin_val, ss_cnt, bin_vals, iHist, mx_bin_val)

def guess_max_from_bins(ctx, hist_cols):
    """ Try to guess the GROUP_NR from given # of histogram
        columns seen in an input file """
    max_coarse = 8
    if ctx.group_nr < 19 or ctx.group_nr > 26:
        bins = [ctx.group_nr * (1 << 6)]
    else:
        bins = [1216,1280,1344,1408,1472,1536,1600,1664]
    coarses = range(max_coarse + 1)
    fncn = lambda z: list(map(lambda x: z/2**x if z % 2**x == 0 else -10, coarses))
    
    arr = np.transpose(list(map(fncn, bins)))
    idx = np.where(arr == hist_cols)
    if len(idx[1]) == 0:
        table = repr(arr.astype(int)).replace('-10', 'N/A').replace('array','     ')
        err("Unable to determine bin values from input clat_hist files. Namely \n"
            "the first line of file '%s' " % ctx.FILE[0] + "has %d \n" % (__TOTAL_COLUMNS,) +
            "columns of which we assume %d " % (hist_cols,) + "correspond to histogram bins. \n"
            "This number needs to be equal to one of the following numbers:\n\n"
            + table + "\n\n"
            "Possible reasons and corresponding solutions:\n"
            "  - Input file(s) does not contain histograms.\n"
            "  - You recompiled fio with a different GROUP_NR. If so please specify this\n"
            "    new GROUP_NR on the command line with --group_nr\n")
        exit(1)
    return bins[idx[1][0]]

def main(ctx):

    if ctx.job_file:
        try:
            from configparser import SafeConfigParser
        except ImportError:
            from ConfigParser import SafeConfigParser

        cp = SafeConfigParser(allow_no_value=True)
        with open(ctx.job_file, 'r') as fp:
            cp.read(fp)

        if ctx.interval is None:
            # Auto detect --interval value
            for s in cp.sections():
                try:
                    hist_msec = cp[s]['log_hist_msec']
                    if hist_msec is not None:
                        ctx.interval = int(hist_msec)
                except KeyError:
                    pass

    if ctx.interval is None:
        ctx.interval = 1000

    # Automatically detect how many columns are in the input files,
    # calculate the corresponding 'coarseness' parameter used to generate
    # those files, and calculate the appropriate bin latency values:
    with open(ctx.FILE[0], 'r') as fp:
        global bin_vals,lower_bin_vals,upper_bin_vals,__HIST_COLUMNS,__TOTAL_COLUMNS
        __TOTAL_COLUMNS = len(fp.readline().split(','))
        __HIST_COLUMNS = __TOTAL_COLUMNS - __NON_HIST_COLUMNS

        max_cols = guess_max_from_bins(ctx, __HIST_COLUMNS)
        coarseness = int(np.log2(float(max_cols) / __HIST_COLUMNS))
        bin_vals = np.array(map(lambda x: plat_idx_to_val_coarse(x, coarseness), np.arange(__HIST_COLUMNS)), dtype=float)
        lower_bin_vals = np.array(map(lambda x: plat_idx_to_val_coarse(x, coarseness, 0.0), np.arange(__HIST_COLUMNS)), dtype=float)
        upper_bin_vals = np.array(map(lambda x: plat_idx_to_val_coarse(x, coarseness, 1.0), np.arange(__HIST_COLUMNS)), dtype=float)

    fps = [open(f, 'r') for f in ctx.FILE]
    gen = histogram_generator(ctx, fps, ctx.buff_size)

    print(', '.join(columns))

    try:
        start, end = 0, ctx.interval
        arr = np.empty(shape=(0,__TOTAL_COLUMNS - 1))
        more_data = True
        while more_data or len(arr) > 0:
            
            # Read up to ctx.max_latency (default 20 seconds) of data from end of current interval.
            while len(arr) == 0 or arr[-1][0] < ctx.max_latency * 1000 + end:
                try:
                    new_arr = next(gen)
                except StopIteration:
                    more_data = False
                    break
                arr = np.append(arr, new_arr.reshape((1,__TOTAL_COLUMNS - 1)), axis=0)
            arr = arr.astype(int)
            
            if arr.size > 0:
                # Jump immediately to the start of the input, rounding
                # down to the nearest multiple of the interval (useful when --log_unix_epoch
                # was used to create these histograms):
                if start == 0 and arr[0][0] - ctx.max_latency > end:
                    start = arr[0][0] - ctx.max_latency
                    start = start - (start % ctx.interval)
                    end = start + ctx.interval

                process_interval(ctx, arr, start, end)
                
                # Update arr to throw away samples we no longer need - samples which
                # end before the start of the next interval, i.e. the end of the
                # current interval:
                idx = np.where(arr[:,0] > end)
                arr = arr[idx]
            
            start += ctx.interval
            end = start + ctx.interval
    finally:
        map(lambda f: f.close(), fps)


if __name__ == '__main__':
    import argparse
    p = argparse.ArgumentParser()
    arg = p.add_argument
    arg("FILE", help='space separated list of latency log filenames', nargs='+')
    arg('--buff_size',
        default=10000,
        type=int,
        help='number of samples to buffer into numpy at a time')

    arg('--max_latency',
        default=20,
        type=float,
        help='number of seconds of data to process at a time')

    arg('-i', '--interval',
        type=int,
        help='interval width (ms), default 1000 ms')

    arg('-d', '--divisor',
        required=False,
        type=int,
        default=1,
        help='divide the results by this value.')

    arg('--decimals',
        default=3,
        type=int,
        help='number of decimal places to print floats to')

    arg('--warn',
        dest='warn',
        action='store_true',
        default=False,
        help='print warning messages to stderr')

    arg('--group_nr',
        default=19,
        type=int,
        help='FIO_IO_U_PLAT_GROUP_NR as defined in stat.h')

    arg('--job-file',
        default=None,
        type=str,
        help='Optional argument pointing to the job file used to create the '
             'given histogram files. Useful for auto-detecting --log_hist_msec and '
             '--log_unix_epoch (in fio) values.')

    main(p.parse_args())
Commit	Line	Data
1e613c9c KC	1	#!/usr/bin/env python2.7
	2	"""
	3	Utility for converting _clat_hist files generated by fio into latency statistics.
	4
	5	Example usage:
	6
	7	$ fiologparser_hist.py _clat_hist
	8	end-time, samples, min, avg, median, 90%, 95%, 99%, max
	9	1000, 15, 192, 1678.107, 1788.859, 1856.076, 1880.040, 1899.208, 1888.000
	10	2000, 43, 152, 1642.368, 1714.099, 1816.659, 1845.552, 1888.131, 1888.000
	11	4000, 39, 1152, 1546.962, 1545.785, 1627.192, 1640.019, 1691.204, 1744
	12	...
	13
1e613c9c KC	14	@author Karl Cronburg <karl.cronburg@gmail.com>
	15	"""
	16	import os
	17	import sys
	18	import pandas
	19	import numpy as np
	20
	21	err = sys.stderr.write
	22
	23	def weighted_percentile(percs, vs, ws):
	24	""" Use linear interpolation to calculate the weighted percentile.
	25
	26	Value and weight arrays are first sorted by value. The cumulative
	27	distribution function (cdf) is then computed, after which np.interp
	28	finds the two values closest to our desired weighted percentile(s)
	29	and linearly interpolates them.
	30
	31	percs :: List of percentiles we want to calculate
	32	vs :: Array of values we are computing the percentile of
	33	ws :: Array of weights for our corresponding values
	34	return :: Array of percentiles
	35	"""
	36	idx = np.argsort(vs)
	37	vs, ws = vs[idx], ws[idx] # weights and values sorted by value
	38	cdf = 100 * (ws.cumsum() - ws / 2.0) / ws.sum()
	39	return np.interp(percs, cdf, vs) # linear interpolation
	40
	41	def weights(start_ts, end_ts, start, end):
	42	""" Calculate weights based on fraction of sample falling in the
	43	given interval [start,end]. Weights computed using vector / array
	44	computation instead of for-loops.
	45
	46	Note that samples with zero time length are effectively ignored
	47	(we set their weight to zero).
	48
	49	start_ts :: Array of start times for a set of samples
	50	end_ts :: Array of end times for a set of samples
	51	start :: int
	52	end :: int
	53	return :: Array of weights
	54	"""
	55	sbounds = np.maximum(start_ts, start).astype(float)
	56	ebounds = np.minimum(end_ts, end).astype(float)
	57	ws = (ebounds - sbounds) / (end_ts - start_ts)
	58	if np.any(np.isnan(ws)):
	59	err("WARNING: zero-length sample(s) detected. Log file corrupt"
	60	" / bad time values? Ignoring these samples.\n")
	61	ws[np.where(np.isnan(ws))] = 0.0;
	62	return ws
	63
	64	def weighted_average(vs, ws):
	65	return np.sum(vs * ws) / np.sum(ws)
	66
	67	columns = ["end-time", "samples", "min", "avg", "median", "90%", "95%", "99%", "max"]
	68	percs = [50, 90, 95, 99]
	69
	70	def fmt_float_list(ctx, num=1):
	71	""" Return a comma separated list of float formatters to the required number
	72	of decimal places. For instance:
	73
	74	fmt_float_list(ctx.decimals=4, num=3) == "%.4f, %.4f, %.4f"
	75	"""
	76	return ', '.join(["%%.%df" % ctx.decimals] * num)
	77
78	# Default values - see beginning of main() for how we detect number columns in
79	# the input files:
80	__HIST_COLUMNS = 1216
81	__NON_HIST_COLUMNS = 3
82	__TOTAL_COLUMNS = __HIST_COLUMNS + __NON_HIST_COLUMNS
83
65a4d15c KC	84	def read_chunk(rdr, sz):
65a4d15c KC	85	""" Read the next chunk of size sz from the given reader. """
1e613c9c KC	86	try:
	87	""" StopIteration occurs when the pandas reader is empty, and AttributeError
	88	occurs if rdr is None due to the file being empty. """
	89	new_arr = rdr.read().values
	90	except (StopIteration, AttributeError):
	91	return None
	92
65a4d15c	93	""" Extract array of just the times, and histograms matrix without times column. """
1e613c9c KC	94	times, rws, szs = new_arr[:,0], new_arr[:,1], new_arr[:,2]
1e613c9c KC	95	hists = new_arr[:,__NON_HIST_COLUMNS:]
1e613c9c	96	times = times.reshape((len(times),1))
65a4d15c	97	arr = np.append(times, hists, axis=1)
1e613c9c	98
65a4d15c	99	return arr
1e613c9c KC	100
	101	def get_min(fps, arrs):
	102	""" Find the file with the current first row with the smallest start time """
65a4d15c	103	return min([fp for fp in fps if not arrs[fp] is None], key=lambda fp: arrs.get(fp)[0][0])
1e613c9c KC	104
	105	def histogram_generator(ctx, fps, sz):
	106
1e613c9c KC	107	# Create a chunked pandas reader for each of the files:
	108	rdrs = {}
	109	for fp in fps:
	110	try:
	111	rdrs[fp] = pandas.read_csv(fp, dtype=int, header=None, chunksize=sz)
	112	except ValueError as e:
	113	if e.message == 'No columns to parse from file':
d1f6fcad	114	if ctx.warn: sys.stderr.write("WARNING: Empty input file encountered.\n")
1e613c9c KC	115	rdrs[fp] = None
	116	else:
	117	raise(e)
	118
65a4d15c KC	119	# Initial histograms from disk:
65a4d15c KC	120	arrs = {fp: read_chunk(rdr, sz) for fp,rdr in rdrs.items()}
1e613c9c KC	121	while True:
	122
	123	try:
	124	""" ValueError occurs when nothing more to read """
	125	fp = get_min(fps, arrs)
	126	except ValueError:
	127	return
65a4d15c	128	arr = arrs[fp]
1e613c9c	129	yield np.insert(arr[0], 1, fps.index(fp))
65a4d15c	130	arrs[fp] = arr[1:]
1e613c9c	131
65a4d15c KC	132	if arrs[fp].shape[0] == 0:
65a4d15c KC	133	arrs[fp] = read_chunk(rdrs[fp], sz)
1e613c9c KC	134
	135	def _plat_idx_to_val(idx, edge=0.5, FIO_IO_U_PLAT_BITS=6, FIO_IO_U_PLAT_VAL=64):
	136	""" Taken from fio's stat.c for calculating the latency value of a bin
	137	from that bin's index.
	138
	139	idx : the value of the index into the histogram bins
	140	edge : fractional value in the range [0,1]** indicating how far into
	141	the bin we wish to compute the latency value of.
	142
	143	** edge = 0.0 and 1.0 computes the lower and upper latency bounds
	144	respectively of the given bin index. """
	145
	146	# MSB <= (FIO_IO_U_PLAT_BITS-1), cannot be rounded off. Use
	147	# all bits of the sample as index
	148	if (idx < (FIO_IO_U_PLAT_VAL << 1)):
	149	return idx
	150
	151	# Find the group and compute the minimum value of that group
	152	error_bits = (idx >> FIO_IO_U_PLAT_BITS) - 1
	153	base = 1 << (error_bits + FIO_IO_U_PLAT_BITS)
	154
	155	# Find its bucket number of the group
	156	k = idx % FIO_IO_U_PLAT_VAL
	157
	158	# Return the mean (if edge=0.5) of the range of the bucket
	159	return base + ((k + edge) * (1 << error_bits))
	160
	161	def plat_idx_to_val_coarse(idx, coarseness, edge=0.5):
	162	""" Converts the given coarse index into a non-coarse index as used by fio
	163	in stat.h:plat_idx_to_val(), subsequently computing the appropriate
	164	latency value for that bin.
	165	"""
	166
	167	# Multiply the index by the power of 2 coarseness to get the bin
	168	# bin index with a max of 1536 bins (FIO_IO_U_PLAT_GROUP_NR = 24 in stat.h)
	169	stride = 1 << coarseness
	170	idx = idx * stride
	171	lower = _plat_idx_to_val(idx, edge=0.0)
	172	upper = _plat_idx_to_val(idx + stride, edge=1.0)
	173	return lower + (upper - lower) * edge
	174
	175	def print_all_stats(ctx, end, mn, ss_cnt, vs, ws, mx):
	176	ps = weighted_percentile(percs, vs, ws)
	177
	178	avg = weighted_average(vs, ws)
	179	values = [mn, avg] + list(ps) + [mx]
	180	row = [end, ss_cnt] + map(lambda x: float(x) / ctx.divisor, values)
	181	fmt = "%d, %d, %d, " + fmt_float_list(ctx, 5) + ", %d"
	182	print (fmt % tuple(row))
	183
	184	def update_extreme(val, fncn, new_val):
	185	""" Calculate min / max in the presence of None values """
	186	if val is None: return new_val
	187	else: return fncn(val, new_val)
	188
	189	# See beginning of main() for how bin_vals are computed
	190	bin_vals = []
	191	lower_bin_vals = [] # lower edge of each bin
	192	upper_bin_vals = [] # upper edge of each bin
	193
	194	def process_interval(ctx, samples, iStart, iEnd):
	195	""" Construct the weighted histogram for the given interval by scanning
	196	through all the histograms and figuring out which of their bins have
	197	samples with latencies which overlap with the given interval
198	[iStart,iEnd].
199	"""
200
201	times, files, hists = samples[:,0], samples[:,1], samples[:,2:]
202	iHist = np.zeros(__HIST_COLUMNS)
203	ss_cnt = 0 # number of samples affecting this interval
204	mn_bin_val, mx_bin_val = None, None
205
206	for end_time,file,hist in zip(times,files,hists):
207
208	# Only look at bins of the current histogram sample which
209	# started before the end of the current time interval [start,end]
210	start_times = (end_time - 0.5 * ctx.interval) - bin_vals / 1000.0
211	idx = np.where(start_times < iEnd)
212	s_ts, l_bvs, u_bvs, hs = start_times[idx], lower_bin_vals[idx], upper_bin_vals[idx], hist[idx]
213
214	# Increment current interval histogram by weighted values of future histogram:
215	ws = hs * weights(s_ts, end_time, iStart, iEnd)
216	iHist[idx] += ws
217
218	# Update total number of samples affecting current interval histogram:
219	ss_cnt += np.sum(hs)
220
221	# Update min and max bin values seen if necessary:
222	idx = np.where(hs != 0)[0]
223	if idx.size > 0:
224	mn_bin_val = update_extreme(mn_bin_val, min, l_bvs[max(0, idx[0] - 1)])
225	mx_bin_val = update_extreme(mx_bin_val, max, u_bvs[min(len(hs) - 1, idx[-1] + 1)])
226
227	if ss_cnt > 0: print_all_stats(ctx, iEnd, mn_bin_val, ss_cnt, bin_vals, iHist, mx_bin_val)
228
229	def guess_max_from_bins(ctx, hist_cols):
230	""" Try to guess the GROUP_NR from given # of histogram
231	columns seen in an input file """
232	max_coarse = 8
233	if ctx.group_nr < 19 or ctx.group_nr > 26:
234	bins = [ctx.group_nr * (1 << 6)]
235	else:
236	bins = [1216,1280,1344,1408,1472,1536,1600,1664]
237	coarses = range(max_coarse + 1)
238	fncn = lambda z: list(map(lambda x: z/2x if z % 2x == 0 else -10, coarses))
239
240	arr = np.transpose(list(map(fncn, bins)))
241	idx = np.where(arr == hist_cols)
242	if len(idx[1]) == 0:
243	table = repr(arr.astype(int)).replace('-10', 'N/A').replace('array',' ')
244	err("Unable to determine bin values from input clat_hist files. Namely \n"
245	"the first line of file '%s' " % ctx.FILE[0] + "has %d \n" % (__TOTAL_COLUMNS,) +
246	"columns of which we assume %d " % (hist_cols,) + "correspond to histogram bins. \n"
247	"This number needs to be equal to one of the following numbers:\n\n"
248	+ table + "\n\n"
249	"Possible reasons and corresponding solutions:\n"
250	" - Input file(s) does not contain histograms.\n"
251	" - You recompiled fio with a different GROUP_NR. If so please specify this\n"
252	" new GROUP_NR on the command line with --group_nr\n")
253	exit(1)
254	return bins[idx[1][0]]
255
256	def main(ctx):
257
b9f93e04 KC	258	if ctx.job_file:
	259	try:
	260	from configparser import SafeConfigParser
	261	except ImportError:
	262	from ConfigParser import SafeConfigParser
	263
	264	cp = SafeConfigParser(allow_no_value=True)
	265	with open(ctx.job_file, 'r') as fp:
	266	cp.read(fp)
	267
	268	if ctx.interval is None:
	269	# Auto detect --interval value
	270	for s in cp.sections():
	271	try:
	272	hist_msec = cp[s]['log_hist_msec']
	273	if hist_msec is not None:
	274	ctx.interval = int(hist_msec)
	275	except KeyError:
	276	pass
	277
	278	if ctx.interval is None:
	279	ctx.interval = 1000
	280
1e613c9c KC	281	# Automatically detect how many columns are in the input files,
	282	# calculate the corresponding 'coarseness' parameter used to generate
	283	# those files, and calculate the appropriate bin latency values:
	284	with open(ctx.FILE[0], 'r') as fp:
	285	global bin_vals,lower_bin_vals,upper_bin_vals,__HIST_COLUMNS,__TOTAL_COLUMNS
	286	__TOTAL_COLUMNS = len(fp.readline().split(','))
	287	__HIST_COLUMNS = __TOTAL_COLUMNS - __NON_HIST_COLUMNS
	288
	289	max_cols = guess_max_from_bins(ctx, __HIST_COLUMNS)
	290	coarseness = int(np.log2(float(max_cols) / __HIST_COLUMNS))
	291	bin_vals = np.array(map(lambda x: plat_idx_to_val_coarse(x, coarseness), np.arange(__HIST_COLUMNS)), dtype=float)
	292	lower_bin_vals = np.array(map(lambda x: plat_idx_to_val_coarse(x, coarseness, 0.0), np.arange(__HIST_COLUMNS)), dtype=float)
	293	upper_bin_vals = np.array(map(lambda x: plat_idx_to_val_coarse(x, coarseness, 1.0), np.arange(__HIST_COLUMNS)), dtype=float)
	294
	295	fps = [open(f, 'r') for f in ctx.FILE]
	296	gen = histogram_generator(ctx, fps, ctx.buff_size)
	297
	298	print(', '.join(columns))
	299
	300	try:
	301	start, end = 0, ctx.interval
	302	arr = np.empty(shape=(0,__TOTAL_COLUMNS - 1))
	303	more_data = True
	304	while more_data or len(arr) > 0:
	305
	306	# Read up to ctx.max_latency (default 20 seconds) of data from end of current interval.
	307	while len(arr) == 0 or arr[-1][0] < ctx.max_latency * 1000 + end:
	308	try:
	309	new_arr = next(gen)
	310	except StopIteration:
	311	more_data = False
	312	break
	313	arr = np.append(arr, new_arr.reshape((1,__TOTAL_COLUMNS - 1)), axis=0)
	314	arr = arr.astype(int)
	315
	316	if arr.size > 0:
b9f93e04 KC	317	# Jump immediately to the start of the input, rounding
	318	# down to the nearest multiple of the interval (useful when --log_unix_epoch
	319	# was used to create these histograms):
	320	if start == 0 and arr[0][0] - ctx.max_latency > end:
	321	start = arr[0][0] - ctx.max_latency
	322	start = start - (start % ctx.interval)
	323	end = start + ctx.interval
	324
1e613c9c KC	325	process_interval(ctx, arr, start, end)
	326
	327	# Update arr to throw away samples we no longer need - samples which
	328	# end before the start of the next interval, i.e. the end of the
	329	# current interval:
	330	idx = np.where(arr[:,0] > end)
	331	arr = arr[idx]
	332
	333	start += ctx.interval
	334	end = start + ctx.interval
	335	finally:
	336	map(lambda f: f.close(), fps)
	337
	338
	339	if __name__ == '__main__':
	340	import argparse
	341	p = argparse.ArgumentParser()
	342	arg = p.add_argument
	343	arg("FILE", help='space separated list of latency log filenames', nargs='+')
	344	arg('--buff_size',
	345	default=10000,
	346	type=int,
	347	help='number of samples to buffer into numpy at a time')
	348
	349	arg('--max_latency',
	350	default=20,
	351	type=float,
	352	help='number of seconds of data to process at a time')
	353
	354	arg('-i', '--interval',
1e613c9c	355	type=int,
b9f93e04	356	help='interval width (ms), default 1000 ms')
1e613c9c KC	357
	358	arg('-d', '--divisor',
	359	required=False,
	360	type=int,
	361	default=1,
	362	help='divide the results by this value.')
	363
	364	arg('--decimals',
	365	default=3,
	366	type=int,
	367	help='number of decimal places to print floats to')
	368
d1f6fcad KC	369	arg('--warn',
	370	dest='warn',
	371	action='store_true',
	372	default=False,
	373	help='print warning messages to stderr')
1e613c9c KC	374
	375	arg('--group_nr',
	376	default=19,
	377	type=int,
	378	help='FIO_IO_U_PLAT_GROUP_NR as defined in stat.h')
	379
b9f93e04 KC	380	arg('--job-file',
	381	default=None,
	382	type=str,
	383	help='Optional argument pointing to the job file used to create the '
	384	'given histogram files. Useful for auto-detecting --log_hist_msec and '
	385	'--log_unix_epoch (in fio) values.')
	386
1e613c9c KC	387	main(p.parse_args())
1e613c9c KC	388