1 #!/usr/bin/env python2.7
3 Utility for converting *_clat_hist* files generated by fio into latency statistics.
7 $ fiologparser_hist.py *_clat_hist*
8 end-time, samples, min, avg, median, 90%, 95%, 99%, max
9 1000, 15, 192, 1678.107, 1788.859, 1856.076, 1880.040, 1899.208, 1888.000
10 2000, 43, 152, 1642.368, 1714.099, 1816.659, 1845.552, 1888.131, 1888.000
11 4000, 39, 1152, 1546.962, 1545.785, 1627.192, 1640.019, 1691.204, 1744
14 @author Karl Cronburg <karl.cronburg@gmail.com>
# Shorthand for writing warning / error text to stderr.
err = sys.stderr.write
def weighted_percentile(percs, vs, ws):
    """ Use linear interpolation to calculate the weighted percentile.

        Value and weight arrays are first sorted by value. The cumulative
        distribution function (cdf) is then computed, after which np.interp
        finds the two values closest to our desired weighted percentile(s)
        and linearly interpolates them.

        percs  :: List of percentiles we want to calculate
        vs     :: Array of values we are computing the percentile of
        ws     :: Array of weights for our corresponding values
        return :: Array of percentiles
    """
    # Sort both arrays by value; the weighted CDF below assumes sorted input.
    idx = np.argsort(vs)
    vs, ws = vs[idx], ws[idx] # weights and values sorted by value
    # Midpoint-of-mass CDF: each value sits at the center of its weight.
    cdf = 100 * (ws.cumsum() - ws / 2.0) / ws.sum()
    return np.interp(percs, cdf, vs) # linear interpolation
def weights(start_ts, end_ts, start, end):
    """ Calculate weights based on fraction of sample falling in the
        given interval [start,end]. Weights computed using vector / array
        computation instead of for-loops.

        Note that samples with zero time length are effectively ignored
        (we set their weight to zero).

        start_ts :: Array of start times for a set of samples
        end_ts   :: Array of end times for a set of samples
        start    :: Lower bound of the interval
        end      :: Upper bound of the interval
        return   :: Array of weights
    """
    # Clamp each sample's span to the interval, then take the covered fraction:
    sbounds = np.maximum(start_ts, start).astype(float)
    ebounds = np.minimum(end_ts, end).astype(float)
    ws = (ebounds - sbounds) / (end_ts - start_ts)
    if np.any(np.isnan(ws)):
        err("WARNING: zero-length sample(s) detected. Log file corrupt"
            " / bad time values? Ignoring these samples.\n")
    # Zero-length samples produce 0/0 -> NaN; give them zero weight.
    ws[np.where(np.isnan(ws))] = 0.0
    return ws
def weighted_average(vs, ws):
    """ Return the ws-weighted mean of the values in vs. """
    total_weight = np.sum(ws)
    weighted_sum = np.sum(vs * ws)
    return weighted_sum / total_weight
# CSV header for the per-interval statistics rows emitted on stdout.
columns = ["end-time", "samples", "min", "avg", "median", "90%", "95%", "99%", "max"]
# Percentiles computed for each interval (median, 90th, 95th, 99th).
percs = [50, 90, 95, 99]
def fmt_float_list(ctx, num=1):
    """ Return a comma separated list of float formatters to the required number
        of decimal places. For instance:

          fmt_float_list(ctx.decimals=4, num=3) == "%.4f, %.4f, %.4f"
    """
    single = "%%.%df" % ctx.decimals
    return ', '.join([single] * num)
# Default values - see beginning of main() for how we detect number columns in
# the input files:
# 1216 = default GROUP_NR (19) * 64 bins per group; overwritten in main().
__HIST_COLUMNS = 1216
# Leading non-histogram columns in each log row: time, r/w, block size.
__NON_HIST_COLUMNS = 3
__TOTAL_COLUMNS = __HIST_COLUMNS + __NON_HIST_COLUMNS
def read_chunk(rdr, sz):
    """ Read the next chunk of size sz from the given reader.

        rdr    :: chunked pandas reader (or None for an empty file)
        sz     :: chunk size the reader was created with
        return :: array of [time, hist...] rows, or None when exhausted
    """
    try:
        # StopIteration occurs when the pandas reader is empty, and
        # AttributeError occurs if rdr is None due to the file being empty.
        new_arr = rdr.read().values
    except (StopIteration, AttributeError):
        return None

    # Keep just the times column and the histogram columns, dropping the
    # r/w-direction and block-size columns in between:
    times = new_arr[:,0]
    hists = new_arr[:,__NON_HIST_COLUMNS:]
    times = times.reshape((len(times),1))
    arr = np.append(times, hists, axis=1)

    return arr
def get_min(fps, arrs):
    """ Find the file with the current first row with the smallest start time """
    live = [fp for fp in fps if arrs[fp] is not None]
    return min(live, key=lambda fp: arrs[fp][0][0])
def histogram_generator(ctx, fps, sz):
    """ Merge-yield rows from all input files in start-time order.

        Each yielded row is [time, file_index, hist...]; the index of the
        originating file is inserted at position 1.
    """
    # Create a chunked pandas reader for each of the files:
    rdrs = {}
    for fp in fps:
        try:
            rdrs[fp] = pandas.read_csv(fp, dtype=int, header=None, chunksize=sz)
        except ValueError as e:
            # Py3 compat: ValueError.message no longer exists; check args.
            if e.args and e.args[0] == 'No columns to parse from file':
                if ctx.warn: sys.stderr.write("WARNING: Empty input file encountered.\n")
                rdrs[fp] = None
            else:
                raise

    # Initial histograms from disk:
    arrs = {fp: read_chunk(rdr, sz) for fp,rdr in rdrs.items()}
    while True:
        try:
            # ValueError occurs when nothing more to read:
            fp = get_min(fps, arrs)
        except ValueError:
            return
        # Pop the earliest row of the chosen file, tagging it with its index:
        arr = arrs[fp]
        arrs[fp] = arr[1:]
        yield np.insert(arr[0], 1, fps.index(fp))

        # Refill this file's buffer from disk once it runs dry:
        if arrs[fp].shape[0] == 0:
            arrs[fp] = read_chunk(rdrs[fp], sz)
135 def _plat_idx_to_val(idx, edge=0.5, FIO_IO_U_PLAT_BITS=6, FIO_IO_U_PLAT_VAL=64):
136 """ Taken from fio's stat.c for calculating the latency value of a bin
137 from that bin's index.
139 idx : the value of the index into the histogram bins
140 edge : fractional value in the range [0,1]** indicating how far into
141 the bin we wish to compute the latency value of.
143 ** edge = 0.0 and 1.0 computes the lower and upper latency bounds
144 respectively of the given bin index. """
146 # MSB <= (FIO_IO_U_PLAT_BITS-1), cannot be rounded off. Use
147 # all bits of the sample as index
148 if (idx < (FIO_IO_U_PLAT_VAL << 1)):
151 # Find the group and compute the minimum value of that group
152 error_bits = (idx >> FIO_IO_U_PLAT_BITS) - 1
153 base = 1 << (error_bits + FIO_IO_U_PLAT_BITS)
155 # Find its bucket number of the group
156 k = idx % FIO_IO_U_PLAT_VAL
158 # Return the mean (if edge=0.5) of the range of the bucket
159 return base + ((k + edge) * (1 << error_bits))
161 def plat_idx_to_val_coarse(idx, coarseness, edge=0.5):
162 """ Converts the given *coarse* index into a non-coarse index as used by fio
163 in stat.h:plat_idx_to_val(), subsequently computing the appropriate
164 latency value for that bin.
167 # Multiply the index by the power of 2 coarseness to get the bin
168 # bin index with a max of 1536 bins (FIO_IO_U_PLAT_GROUP_NR = 24 in stat.h)
169 stride = 1 << coarseness
171 lower = _plat_idx_to_val(idx, edge=0.0)
172 upper = _plat_idx_to_val(idx + stride, edge=1.0)
173 return lower + (upper - lower) * edge
def print_all_stats(ctx, end, mn, ss_cnt, vs, ws, mx):
    """ Print one CSV row of statistics (see `columns`) for the interval
        ending at `end`, dividing every latency value by ctx.divisor. """
    ps = weighted_percentile(percs, vs, ws)

    avg = weighted_average(vs, ws)
    values = [mn, avg] + list(ps) + [mx]
    # list(map(...)) so list concatenation also works on Python 3,
    # where map() returns a lazy iterator:
    row = [end, ss_cnt] + list(map(lambda x: float(x) / ctx.divisor, values))
    fmt = "%d, %d, %d, " + fmt_float_list(ctx, 5) + ", %d"
    print (fmt % tuple(row))
def update_extreme(val, fncn, new_val):
    """ Calculate min / max in the presence of None values """
    return new_val if val is None else fncn(val, new_val)
# See beginning of main() for how bin_vals are computed
bin_vals = []       # latency value at the chosen edge of each bin
lower_bin_vals = [] # lower edge of each bin
upper_bin_vals = [] # upper edge of each bin
def process_interval(ctx, samples, iStart, iEnd):
    """ Construct the weighted histogram for the given interval by scanning
        through all the histograms and figuring out which of their bins have
        samples with latencies which overlap with the given interval
        [iStart,iEnd], then print that interval's statistics.
    """
    times, files, hists = samples[:,0], samples[:,1], samples[:,2:]
    iHist = np.zeros(__HIST_COLUMNS)
    ss_cnt = 0 # number of samples affecting this interval
    mn_bin_val, mx_bin_val = None, None

    for end_time,file,hist in zip(times,files,hists):

        # Only look at bins of the current histogram sample which
        # started before the end of the current time interval [start,end]
        start_times = (end_time - 0.5 * ctx.interval) - bin_vals / 1000.0
        idx = np.where(start_times < iEnd)
        s_ts, l_bvs, u_bvs, hs = start_times[idx], lower_bin_vals[idx], upper_bin_vals[idx], hist[idx]

        # Increment current interval histogram by weighted values of future histogram:
        ws = hs * weights(s_ts, end_time, iStart, iEnd)
        iHist[idx] += ws

        # Update total number of samples affecting current interval histogram:
        ss_cnt += np.sum(hs)

        # Update min and max bin values seen if necessary:
        idx = np.where(hs != 0)[0]
        if idx.size > 0:
            mn_bin_val = update_extreme(mn_bin_val, min, l_bvs[max(0, idx[0] - 1)])
            mx_bin_val = update_extreme(mx_bin_val, max, u_bvs[min(len(hs) - 1, idx[-1] + 1)])

    if ss_cnt > 0: print_all_stats(ctx, iEnd, mn_bin_val, ss_cnt, bin_vals, iHist, mx_bin_val)
def guess_max_from_bins(ctx, hist_cols):
    """ Try to guess the GROUP_NR from given # of histogram
        columns seen in an input file.

        Returns the un-coarsened bin count; exits with an error message
        if hist_cols cannot be produced by any supported coarseness. """
    max_coarse = 8
    if ctx.group_nr < 19 or ctx.group_nr > 26:
        # Non-standard GROUP_NR given explicitly on the command line:
        bins = [ctx.group_nr * (1 << 6)]
    else:
        # Bin counts for the GROUP_NR values fio has shipped with:
        bins = [1216,1280,1344,1408,1472,1536,1600,1664]
    coarses = range(max_coarse + 1)
    # Integer division keeps the table exact on Python 3 as well;
    # -10 marks coarseness levels that do not divide the bin count evenly.
    fncn = lambda z: list(map(lambda x: z // 2**x if z % 2**x == 0 else -10, coarses))

    arr = np.transpose(list(map(fncn, bins)))
    idx = np.where(arr == hist_cols)
    if len(idx[1]) == 0:
        table = repr(arr.astype(int)).replace('-10', 'N/A').replace('array',' ')
        err("Unable to determine bin values from input clat_hist files. Namely \n"
            "the first line of file '%s' " % ctx.FILE[0] + "has %d \n" % (__TOTAL_COLUMNS,) +
            "columns of which we assume %d " % (hist_cols,) + "correspond to histogram bins. \n"
            "This number needs to be equal to one of the following numbers:\n\n"
            + table + "\n\n"
            "Possible reasons and corresponding solutions:\n"
            "  - Input file(s) does not contain histograms.\n"
            "  - You recompiled fio with a different GROUP_NR. If so please specify this\n"
            "    new GROUP_NR on the command line with --group_nr\n")
        exit(1)
    return bins[idx[1][0]]
def main(ctx):
    """ Detect the histogram geometry from the first input file, then stream
        all files through interval-by-interval weighted-statistics output. """

    # Automatically detect how many columns are in the input files,
    # calculate the corresponding 'coarseness' parameter used to generate
    # those files, and calculate the appropriate bin latency values:
    with open(ctx.FILE[0], 'r') as fp:
        global bin_vals,lower_bin_vals,upper_bin_vals,__HIST_COLUMNS,__TOTAL_COLUMNS
        __TOTAL_COLUMNS = len(fp.readline().split(','))
        __HIST_COLUMNS = __TOTAL_COLUMNS - __NON_HIST_COLUMNS

        max_cols = guess_max_from_bins(ctx, __HIST_COLUMNS)
        coarseness = int(np.log2(float(max_cols) / __HIST_COLUMNS))
        # list(map(...)) so np.array receives a real sequence on Python 3,
        # where a bare map object would become a useless 0-d object array:
        bin_vals = np.array(list(map(lambda x: plat_idx_to_val_coarse(x, coarseness), np.arange(__HIST_COLUMNS))), dtype=float)
        lower_bin_vals = np.array(list(map(lambda x: plat_idx_to_val_coarse(x, coarseness, 0.0), np.arange(__HIST_COLUMNS))), dtype=float)
        upper_bin_vals = np.array(list(map(lambda x: plat_idx_to_val_coarse(x, coarseness, 1.0), np.arange(__HIST_COLUMNS))), dtype=float)

    fps = [open(f, 'r') for f in ctx.FILE]
    gen = histogram_generator(ctx, fps, ctx.buff_size)

    print(', '.join(columns))

    try:
        start, end = 0, ctx.interval
        arr = np.empty(shape=(0,__TOTAL_COLUMNS - 1))
        more_data = True
        while more_data or len(arr) > 0:

            # Read up to ctx.max_latency (default 20 seconds) of data from end of current interval.
            while len(arr) == 0 or arr[-1][0] < ctx.max_latency * 1000 + end:
                try:
                    new_arr = next(gen)
                except StopIteration:
                    more_data = False
                    break
                arr = np.append(arr, new_arr.reshape((1,__TOTAL_COLUMNS - 1)), axis=0)
            arr = arr.astype(int)

            if arr.size > 0:
                process_interval(ctx, arr, start, end)

                # Update arr to throw away samples we no longer need - samples which
                # end before the start of the next interval, i.e. the end of the
                # current interval:
                idx = np.where(arr[:,0] > end)
                arr = arr[idx]

            start += ctx.interval
            end = start + ctx.interval
    finally:
        # NOTE: the original `map(lambda f: f.close(), fps)` never runs on
        # Python 3 (map is lazy), leaking every file handle. Close explicitly.
        for f in fps:
            f.close()
if __name__ == '__main__':
    import argparse
    p = argparse.ArgumentParser()
    arg = p.add_argument
    arg("FILE", help='space separated list of latency log filenames', nargs='+')

    arg('--buff_size',
        default=10000,
        type=int,
        help='number of samples to buffer into numpy at a time')

    arg('--max_latency',
        default=20,
        type=float,
        help='number of seconds of data to process at a time')

    arg('-i', '--interval',
        default=1000,
        type=int,
        help='interval width (ms)')

    arg('-d', '--divisor',
        required=False,
        type=int,
        default=1,
        help='divide the results by this value.')

    arg('--decimals',
        default=3,
        type=int,
        help='number of decimal places to print floats to')

    arg('--warn',
        dest='warn',
        action='store_true',
        default=False,
        help='print warning messages to stderr')

    arg('--group_nr',
        default=19,
        type=int,
        help='FIO_IO_U_PLAT_GROUP_NR as defined in stat.h')

    main(p.parse_args())