summaryrefslogtreecommitdiff
path: root/btreplay
diff options
context:
space:
mode:
Diffstat (limited to 'btreplay')
-rw-r--r--btreplay/Makefile45
-rw-r--r--btreplay/btrecord.c780
-rw-r--r--btreplay/btrecord.h95
-rw-r--r--btreplay/btreplay.c1590
-rw-r--r--btreplay/doc/Makefile18
-rw-r--r--btreplay/doc/abstract.tex34
-rw-r--r--btreplay/doc/btreplay.tex521
7 files changed, 3083 insertions, 0 deletions
diff --git a/btreplay/Makefile b/btreplay/Makefile
new file mode 100644
index 0000000..a8d2e3b
--- /dev/null
+++ b/btreplay/Makefile
@@ -0,0 +1,45 @@
+#
+# OCFLAGS: optional compile-time switches (-D turns one on, -U turns it off)
+# COUNT_IOS - Counts struct io's left at end
+# DEBUG - Various and sundry debug asserts
+# NDEBUG - Defined: no asserts, Undefined: asserts
+#
+
+CC = gcc
+CFLAGS = -Wall -W -O2 -g
+INCS = -I. -I.. -I../btt
+OCFLAGS = -UCOUNT_IOS -UDEBUG -DNDEBUG
+XCFLAGS = -D_GNU_SOURCE -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64
+override CFLAGS += $(INCS) $(XCFLAGS) $(OCFLAGS)
+
+PROGS = btrecord btreplay
+LIBS = -laio -lrt
+
+all: depend $(PROGS)
+
+$(PROGS): | depend
+
+docs:
+ $(MAKE) -C doc all
+
+docsclean:
+ $(MAKE) -C doc clean
+
+clean: docsclean
+ -rm -f *.o $(PROGS) .depend
+
+%.o: %.c
+ $(CC) $(CFLAGS) -c -o $*.o $<
+
+btrecord: btrecord.o
+ $(CC) $(CFLAGS) -o $@ $(filter %.o,$^) $(LIBS)
+
+btreplay: btreplay.o
+ $(CC) $(CFLAGS) -o $@ $(filter %.o,$^) $(LIBS)
+
+depend:
+ @$(CC) -MM $(CFLAGS) *.c 1> .depend
+
+ifneq ($(wildcard .depend),)
+include .depend
+endif
diff --git a/btreplay/btrecord.c b/btreplay/btrecord.c
new file mode 100644
index 0000000..e02c153
--- /dev/null
+++ b/btreplay/btrecord.c
@@ -0,0 +1,780 @@
+/*
+ * Blktrace record utility - Convert binary trace data into bunches of IOs
+ *
+ * Copyright (C) 2007 Alan D. Brunelle <Alan.Brunelle@hp.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+static char build_date[] = __DATE__ " at "__TIME__;
+
+#include <assert.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/param.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <dirent.h>
+
+#if !defined(_GNU_SOURCE)
+# define _GNU_SOURCE
+#endif
+#include <getopt.h>
+
+#include "list.h"
+#include "btrecord.h"
+#include "blktrace.h"
+
+/*
+ * Per input file information
+ *
+ * @head: Used to link up on input_files
+ * @devnm: Device name portion of this input file
+ * @file_name: Fully qualified name for this input file
+ * @cpu: CPU that this file was collected on
+ * @ifd: Input file descriptor (when opened)
+ * @tpkts: Total number of packets processed.
+ * @genesis: Time stamp of the first matching trace read from this file
+ *           (0 until one has been seen)
+ */
+struct ifile_info {
+ struct list_head head;
+ char *devnm, *file_name;
+ int cpu, ifd;
+ __u64 tpkts, genesis;
+};
+
+/*
+ * Per IO trace information
+ *
+ * @time: Time stamp when trace was emitted (compared against the
+ *        nanosecond-based max_bunch_tm limit)
+ * @sector: IO sector identifier
+ * @bytes: Number of bytes transferred
+ * @rw: Read (1) or write (0)
+ */
+struct io_spec {
+ __u64 time;
+ __u64 sector;
+ __u32 bytes;
+ int rw;
+};
+
+/*
+ * Per output file information
+ *
+ * @ofp: Output file
+ * @vfp: Verbose output file (only opened when verbose is set)
+ * @file_name: Fully qualified name for the output file
+ * @vfn: Fully qualified name for the verbose output file
+ * @cur: Current IO bunch being collected
+ * @iip: Input file this is associated with
+ * @start_time: Start time of the current bunch
+ * @last_time: Time of last packet put in
+ * @bunches: Number of bunches processed
+ * @pkts: Number of packets stored in bunches
+ */
+struct io_stream {
+ FILE *ofp, *vfp;
+ char *file_name, *vfn;
+ struct io_bunch *cur;
+ struct ifile_info *iip;
+ __u64 start_time, last_time, bunches, pkts;
+};
+
+int data_is_native; // -1: byte order not yet checked; else non-zero means no swap needed
+static LIST_HEAD(input_files); // List of all input files
+static char *idir = "."; // Input directory base
+static char *odir = "."; // Output directory base
+static char *obase = "replay"; // Output file base
+static __u64 max_bunch_tm = (10 * 1000 * 1000); // 10 milliseconds (in nanoseconds)
+static __u64 max_pkts_per_bunch = 8; // Default # of pkts per bunch
+static int verbose = 0; // Verbosity level: each -v adds one
+static int find_traces = 0; // Boolean: Find traces in dir
+
+static char usage_str[] = \
+ "\n" \
+ "\t[ -d <dir> : --input-directory=<dir> ] Default: .\n" \
+ "\t[ -D <dir> : --output-directory=<dir>] Default: .\n" \
+ "\t[ -F : --find-traces ] Default: Off\n" \
+ "\t[ -h : --help ] Default: Off\n" \
+ "\t[ -m <nsec> : --max-bunch-time=<nsec> ] Default: 10 msec\n" \
+ "\t[ -M <pkts> : --max-pkts=<pkts> ] Default: 8\n" \
+ "\t[ -o <base> : --output-base=<base> ] Default: replay\n" \
+ "\t[ -v : --verbose ] Default: Off\n" \
+ "\t[ -V : --version ] Default: Off\n" \
+ "\t<dev>... Default: None\n" \
+ "\n";
+
+/*
+ * Command line option tables.
+ *
+ * S_OPTS is the short option string handed to getopt_long(); l_opts maps
+ * each long option onto the same short option character.
+ *
+ * The usage text advertises "--max-pkts", so that spelling is registered
+ * here. The historical "--max_pkts" spelling is kept as an alias so that
+ * existing command lines keep working.
+ */
+#define S_OPTS "d:D:Fhm:M:o:vV"
+static struct option l_opts[] = {
+	{
+		.name = "input-directory",
+		.has_arg = required_argument,
+		.flag = NULL,
+		.val = 'd'
+	},
+	{
+		.name = "output-directory",
+		.has_arg = required_argument,
+		.flag = NULL,
+		.val = 'D'
+	},
+	{
+		.name = "find-traces",
+		.has_arg = no_argument,
+		.flag = NULL,
+		.val = 'F'
+	},
+	{
+		.name = "help",
+		.has_arg = no_argument,
+		.flag = NULL,
+		.val = 'h'
+	},
+	{
+		.name = "max-bunch-time",
+		.has_arg = required_argument,
+		.flag = NULL,
+		.val = 'm'
+	},
+	{
+		.name = "max-pkts",
+		.has_arg = required_argument,
+		.flag = NULL,
+		.val = 'M'
+	},
+	{
+		/* Backward-compatible alias for --max-pkts */
+		.name = "max_pkts",
+		.has_arg = required_argument,
+		.flag = NULL,
+		.val = 'M'
+	},
+	{
+		.name = "output-base",
+		.has_arg = required_argument,
+		.flag = NULL,
+		.val = 'o'
+	},
+	{
+		.name = "verbose",
+		.has_arg = no_argument,
+		.flag = NULL,
+		.val = 'v'
+	},
+	{
+		.name = "version",
+		.has_arg = no_argument,
+		.flag = NULL,
+		.val = 'V'
+	},
+	{
+		.name = NULL
+	}
+};
+
+#define ERR_ARGS 1 /* Exit value: bad command line or input */
+#define ERR_SYSCALL 2 /* Exit value: a system/library call failed */
+#define fatal(errstring, exitval, arg...) /* perror(errstring) when non-NULL, print the message, then exit(exitval) */ \
+ do { \
+ if (errstring) perror(errstring); \
+ fprintf(stderr, ##arg); \
+ exit(exitval); \
+ /*NOTREACHED*/ \
+ } while (0)
+
+/**
+ * match - Decide whether a trace describes a QUEUE transaction
+ * @action: Action field from trace
+ *
+ * Returns 1 when the low 16 bits of @action equal __BLK_TA_QUEUE and the
+ * QUEUE category bit is set as well, 0 otherwise.
+ */
+static inline int match(__u32 action)
+{
+	const __u32 what = action & 0xffff;
+
+	if (what != __BLK_TA_QUEUE)
+		return 0;
+
+	return (action & BLK_TC_ACT(BLK_TC_QUEUE)) != 0;
+}
+
+/**
+ * usage - Print the program version followed by the option summary
+ *
+ * Output goes to stderr.
+ */
+static void usage(void)
+{
+	FILE *out = stderr;
+
+	fprintf(out, "Usage: btrecord -- version %s\n%s",
+		my_btversion, usage_str);
+}
+
+/**
+ * write_file_hdr - Seek to and write btrecord file header
+ * @stream: Output file information
+ * @hdr: Header to write; its version field is filled in here
+ *
+ * Stamps the current btrecord version into @hdr and writes it at offset 0
+ * of the output file. When verbose is set the header fields are echoed to
+ * stderr first. Exits through fatal() when the seek or the write fails.
+ */
+static void write_file_hdr(struct io_stream *stream, struct io_file_hdr *hdr)
+{
+	hdr->version = mk_btversion(btver_mjr, btver_mnr, btver_sub);
+
+	if (verbose) {
+		fprintf(stderr, "\t%s: %llx %llx %llx %llx\n",
+			stream->file_name,
+			(long long unsigned)hdr->version,
+			(long long unsigned)hdr->genesis,
+			(long long unsigned)hdr->nbunches,
+			(long long unsigned)hdr->total_pkts);
+	}
+
+	/* A failed seek would put the header at the current file position */
+	if (fseek(stream->ofp, 0, SEEK_SET) != 0) {
+		fatal(stream->file_name, ERR_SYSCALL, "Hdr seek failed\n");
+		/*NOTREACHED*/
+	}
+
+	if (fwrite(hdr, sizeof(*hdr), 1, stream->ofp) != 1) {
+		fatal(stream->file_name, ERR_SYSCALL, "Hdr write failed\n");
+		/*NOTREACHED*/
+	}
+}
+
+/**
+ * io_bunch_create - Allocate & initialize an io_bunch
+ * @stream: IO stream being added to
+ * @start_time: Time stamp recorded as the start of the new bunch
+ *
+ * The new, zero-filled bunch becomes @stream->cur and @start_time is stored
+ * both in the bunch header and in @stream->start_time. Exits through
+ * fatal() when memory cannot be allocated.
+ */
+static inline void io_bunch_create(struct io_stream *stream, __u64 start_time)
+{
+	struct io_bunch *cur = calloc(1, sizeof(*cur));
+
+	if (cur == NULL) {
+		fatal("calloc", ERR_SYSCALL, "Unable to allocate IO bunch\n");
+		/*NOTREACHED*/
+	}
+
+	cur->hdr.npkts = 0;
+	cur->hdr.time_stamp = stream->start_time = start_time;
+
+	stream->cur = cur;
+}
+
+/**
+ * io_bunch_add - Append one IO to the bunch currently being collected
+ * @stream: Per-output file stream information
+ * @spec: IO trace specification
+ *
+ * Stores the IO in the next free packet slot of the current bunch and
+ * records its time stamp as the stream's last_time. Nothing is returned.
+ */
+static void io_bunch_add(struct io_stream *stream, struct io_spec *spec)
+{
+	struct io_bunch *bunch = stream->cur;
+
+	assert(bunch != NULL);
+	assert(bunch->hdr.npkts < BT_MAX_PKTS);
+	assert(stream->last_time == 0 || stream->last_time <= spec->time);
+
+	bunch->pkts[bunch->hdr.npkts] = (struct io_pkt) {
+		.sector = spec->sector,
+		.nbytes = spec->bytes,
+		.rw = spec->rw
+	};
+	bunch->hdr.npkts++;
+
+	stream->last_time = spec->time;
+}
+
+/**
+ * rem_input_file - Tear down one input file record
+ * @iip: Per-input file information
+ *
+ * Unlinks @iip from its list, closes its descriptor and frees the record
+ * together with the two name strings it owns.
+ */
+static void rem_input_file(struct ifile_info *iip)
+{
+	list_del(&iip->head);
+	close(iip->ifd);
+
+	free(iip->devnm);
+	free(iip->file_name);
+	free(iip);
+}
+
+/**
+ * __add_input_file - Allocate and initialize per-input file structure
+ * @cpu: CPU for this file
+ * @devnm: Device name for this file
+ * @file_name: Fully qualified input file name
+ *
+ * Copies both names, opens @file_name read-only and appends the new record
+ * to input_files. Exits through fatal() on allocation or open failure.
+ */
+static void __add_input_file(int cpu, char *devnm, char *file_name)
+{
+	struct ifile_info *iip = malloc(sizeof(*iip));
+
+	if (iip == NULL) {
+		fatal("malloc", ERR_SYSCALL,
+			"Unable to allocate input file info\n");
+		/*NOTREACHED*/
+	}
+
+	iip->cpu = cpu;
+	iip->tpkts = 0;
+	iip->genesis = 0;
+	iip->devnm = strdup(devnm);
+	iip->file_name = strdup(file_name);
+	if (iip->devnm == NULL || iip->file_name == NULL) {
+		fatal("strdup", ERR_SYSCALL,
+			"Unable to copy names for %s\n", file_name);
+		/*NOTREACHED*/
+	}
+
+	iip->ifd = open(file_name, O_RDONLY);
+	if (iip->ifd < 0) {
+		fatal(file_name, ERR_ARGS, "Unable to open\n");
+		/*NOTREACHED*/
+	}
+
+	list_add_tail(&iip->head, &input_files);
+}
+
+/**
+ * add_input_file - Set up the input file name
+ * @devnm: Device name to use
+ *
+ * Does nothing when @devnm is already on input_files. Otherwise probes
+ * <idir>/<devnm>.blktrace.<cpu> for cpu = 0, 1, ... and adds every readable
+ * file until the first one that cannot be read. Exits through fatal() when
+ * no file is found or when a file name does not fit in MAXPATHLEN bytes.
+ */
+static void add_input_file(char *devnm)
+{
+	struct list_head *p;
+	int cpu, found = 0;
+
+	__list_for_each(p, &input_files) {
+		struct ifile_info *iip = list_entry(p, struct ifile_info, head);
+		if (strcmp(iip->devnm, devnm) == 0)
+			return;
+	}
+
+	for (cpu = 0; ; cpu++) {
+		char full_name[MAXPATHLEN];
+		int len;
+
+		/* idir and devnm come from the command line: bound the copy */
+		len = snprintf(full_name, sizeof(full_name),
+			"%s/%s.blktrace.%d", idir, devnm, cpu);
+		if (len < 0 || (size_t)len >= sizeof(full_name)) {
+			fatal(NULL, ERR_ARGS,
+				"Trace file name too long for %s\n", devnm);
+			/*NOTREACHED*/
+		}
+
+		if (access(full_name, R_OK) != 0)
+			break;
+
+		__add_input_file(cpu, devnm, full_name);
+		found++;
+	}
+
+	if (!found) {
+		fatal(NULL, ERR_ARGS, "No traces found for %s\n", devnm);
+		/*NOTREACHED*/
+	}
+}
+
+/**
+ * find_input_files - Add every device that has trace files in a directory
+ * @idir: Directory to scan
+ *
+ * Each directory entry whose name contains ".blktrace." is cut at its first
+ * '.' and the remaining prefix is handed to add_input_file() as a device
+ * name. Exits through fatal() when the directory cannot be opened or a name
+ * cannot be copied.
+ */
+static void find_input_files(char *idir)
+{
+	struct dirent *ent;
+	DIR *dir = opendir(idir);
+
+	if (dir == NULL) {
+		fatal(idir, ERR_ARGS, "Unable to open %s\n", idir);
+		/*NOTREACHED*/
+	}
+
+	while ((ent = readdir(dir)) != NULL) {
+		char *p, *dsf;
+
+		if (strstr(ent->d_name, ".blktrace.") == NULL)
+			continue;
+
+		/* Work on a private copy: the name is truncated in place */
+		dsf = strdup(ent->d_name);
+		if (dsf == NULL) {
+			fatal("strdup", ERR_SYSCALL,
+				"Unable to copy %s\n", ent->d_name);
+			/*NOTREACHED*/
+		}
+
+		p = strchr(dsf, '.');
+		assert(p != NULL);
+		*p = '\0';
+		add_input_file(dsf);
+		free(dsf);
+	}
+
+	closedir(dir);
+}
+
+/**
+ * handle_args - Parse passed in argument list
+ * @argc: Number of arguments in argv
+ * @argv: Arguments passed in
+ *
+ * Does rudimentary parameter verification as well.
+ *
+ * Options update the file-scope settings (idir, odir, obase, max_bunch_tm,
+ * max_pkts_per_bunch, verbose, find_traces). -h and -V print and exit(0);
+ * an invalid value or unknown option exits with ERR_ARGS. Every remaining
+ * argument is treated as a device name and passed to add_input_file(); with
+ * -F the input directory is scanned as well. It is fatal if no input file
+ * ends up on input_files.
+ *
+ * NOTE(review): -D only checks R_OK | X_OK on the output directory although
+ * output files are created there later - confirm whether W_OK was intended.
+ */
+void handle_args(int argc, char *argv[])
+{
+ int c;
+
+ while ((c = getopt_long(argc, argv, S_OPTS, l_opts, NULL)) != -1) {
+ switch (c) {
+ case 'd':
+ idir = optarg;
+ if (access(idir, R_OK | X_OK) != 0) {
+ fatal(idir, ERR_ARGS,
+ "Invalid input directory specified\n");
+ /*NOTREACHED*/
+ }
+ break;
+
+ case 'D':
+ odir = optarg;
+ if (access(odir, R_OK | X_OK) != 0) {
+ fatal(odir, ERR_ARGS,
+ "Invalid output directory specified\n");
+ /*NOTREACHED*/
+ }
+ break;
+
+ case 'F':
+ find_traces = 1;
+ break;
+
+ case 'h':
+ usage();
+ exit(0);
+ /*NOTREACHED*/
+
+ case 'm':
+ max_bunch_tm = (__u64)atoll(optarg);
+ if (max_bunch_tm < 1) {
+ fprintf(stderr, "Invalid bunch time %llu\n",
+ (unsigned long long)max_bunch_tm);
+ exit(ERR_ARGS);
+ /*NOTREACHED*/
+ }
+ break;
+
+ case 'M':
+ max_pkts_per_bunch = (__u64)atoll(optarg);
+ if (!((1 <= max_pkts_per_bunch) &&
+ (max_pkts_per_bunch < 513))) {
+ fprintf(stderr, "Invalid max pkts %llu\n",
+ (unsigned long long)max_pkts_per_bunch);
+ exit(ERR_ARGS);
+ /*NOTREACHED*/
+ }
+ break;
+
+ case 'o':
+ obase = optarg;
+ break;
+
+ case 'V':
+ fprintf(stderr, "btrecord -- version %s\n",
+ my_btversion);
+ fprintf(stderr, " Built on %s\n", build_date);
+ exit(0);
+ /*NOTREACHED*/
+
+ case 'v':
+ verbose++;
+ break;
+
+ default:
+ usage();
+ fatal(NULL, ERR_ARGS, "Invalid command line\n");
+ /*NOTREACHED*/
+ }
+ }
+
+ while (optind < argc)
+ add_input_file(argv[optind++]);
+
+ if (find_traces)
+ find_input_files(idir);
+
+ if (list_len(&input_files) == 0) {
+ fatal(NULL, ERR_ARGS, "Missing required input file name(s)\n");
+ /*NOTREACHED*/
+ }
+}
+
+/**
+ * next_io - Retrieve next Q trace from input stream
+ * @iip: Per-input file information
+ * @spec: IO specifier for trace
+ *
+ * Reads fixed-size trace records (each followed by pdu_len bytes of payload,
+ * which are read and discarded) until one satisfies match(). Every record
+ * read is counted in @iip->tpkts. Fields are byte-swapped from big endian
+ * when data_is_native is 0. The first matching trace sets @iip->genesis; a
+ * later trace with an earlier time stamp is fatal.
+ *
+ * Returns 0 on end of file or on a short read (after printing a warning),
+ * 1 if valid data returned.
+ */
+static int next_io(struct ifile_info *iip, struct io_spec *spec)
+{
+ ssize_t ret;
+ __u32 action;
+ __u16 pdu_len;
+ struct blk_io_trace t;
+
+again:
+ ret = read(iip->ifd, &t, sizeof(t));
+ if (ret < 0) {
+ fatal(iip->file_name, ERR_SYSCALL, "Read failed\n");
+ /*NOTREACHED*/
+ }
+ else if (ret == 0)
+ return 0;
+ else if (ret < (ssize_t)sizeof(t)) {
+ fprintf(stderr, "WARNING: Short read on %s (%d)\n",
+ iip->file_name, (int)ret);
+ return 0;
+ }
+
+ if (data_is_native == -1)
+ check_data_endianness(t.magic);
+
+ assert(data_is_native >= 0);
+ if (data_is_native) {
+ spec->time = t.time;
+ spec->sector = t.sector;
+ spec->bytes = t.bytes;
+ action = t.action;
+ pdu_len = t.pdu_len;
+ }
+ else {
+ spec->time = be64_to_cpu(t.time);
+ spec->sector = be64_to_cpu(t.sector);
+ spec->bytes = be32_to_cpu(t.bytes);
+ action = be32_to_cpu(t.action);
+ pdu_len = be16_to_cpu(t.pdu_len);
+ }
+
+
+ if (pdu_len) {
+ char buf[pdu_len];
+
+ ret = read(iip->ifd, buf, pdu_len);
+ if (ret < 0) {
+ fatal(iip->file_name, ERR_SYSCALL, "Read PDU failed\n");
+ /*NOTREACHED*/
+ }
+ else if (ret < (ssize_t)pdu_len) {
+ fprintf(stderr, "WARNING: Short PDU read on %s (%d)\n",
+ iip->file_name, (int)ret);
+ return 0;
+ }
+ }
+
+ iip->tpkts++;
+ if (!match(action))
+ goto again;
+
+ spec->rw = (action & BLK_TC_ACT(BLK_TC_READ)) ? 1 : 0;
+ if (verbose > 1)
+ fprintf(stderr, "%2d: %10llu+%10llu (%d) @ %10llx\n",
+ iip->cpu, (long long unsigned)spec->sector,
+ (long long unsigned)spec->bytes / 512LLU,
+ spec->rw, (long long unsigned)spec->time);
+
+ if (iip->genesis == 0) {
+ iip->genesis = spec->time;
+ if (verbose > 1)
+ fprintf(stderr, "\tSetting new genesis: %llx(%d)\n",
+ (long long unsigned)iip->genesis, iip->cpu);
+ }
+ else if (iip->genesis > spec->time)
+ fatal(NULL, ERR_SYSCALL,
+ "Time inversion? %llu ... %llu\n",
+ (long long unsigned )iip->genesis,
+ (long long unsigned )spec->time);
+
+ return 1;
+}
+
+/**
+ * bunch_output_hdr - Write the header of the current bunch
+ * @stream: Per-output file stream information
+ *
+ * The header goes to the output file; a write failure is fatal. When
+ * verbose is set, the bunch's offset from the input file's genesis (as
+ * seconds.nanoseconds) and its packet count are also written to the
+ * verbose file.
+ */
+static inline void bunch_output_hdr(struct io_stream *stream)
+{
+	struct io_bunch_hdr *hdrp = &stream->cur->hdr;
+	__u64 off;
+
+	assert(0 < hdrp->npkts && hdrp->npkts <= BT_MAX_PKTS);
+	if (fwrite(hdrp, sizeof(struct io_bunch_hdr), 1, stream->ofp) != 1) {
+		fatal(stream->file_name, ERR_SYSCALL, "fwrite(hdr) failed\n");
+		/*NOTREACHED*/
+	}
+
+	if (!verbose)
+		return;
+
+	off = hdrp->time_stamp - stream->iip->genesis;
+
+	assert(stream->vfp);
+	fprintf(stream->vfp, "------------------\n");
+	fprintf(stream->vfp, "%4llu.%09llu %3llu\n",
+		(unsigned long long)off / (1000 * 1000 * 1000),
+		(unsigned long long)off % (1000 * 1000 * 1000),
+		(unsigned long long)hdrp->npkts);
+	fprintf(stream->vfp, "------------------\n");
+}
+
+/**
+ * bunch_output_pkts - Write the IO packets of the current bunch
+ * @stream: Per-output file stream information
+ *
+ * All packets go to the output file in a single write; a short write is
+ * fatal. When verbose is set, one line per packet (direction, sector and
+ * size in 512-byte units) is also written to the verbose file.
+ */
+static inline void bunch_output_pkts(struct io_stream *stream)
+{
+	struct io_pkt *pkts = stream->cur->pkts;
+	size_t npkts = stream->cur->hdr.npkts;
+	size_t idx;
+
+	assert(0 < npkts && npkts <= BT_MAX_PKTS);
+	if (fwrite(pkts, sizeof(struct io_pkt), npkts, stream->ofp) != npkts) {
+		fatal(stream->file_name, ERR_SYSCALL, "fwrite(pkts) failed\n");
+		/*NOTREACHED*/
+	}
+
+	if (!verbose)
+		return;
+
+	assert(stream->vfp);
+	for (idx = 0; idx < npkts; idx++) {
+		const struct io_pkt *pkt = &pkts[idx];
+
+		fprintf(stream->vfp, "\t%1d %10llu\t%10llu\n",
+			pkt->rw,
+			(unsigned long long)pkt->sector,
+			(unsigned long long)pkt->nbytes / 512);
+	}
+}
+
+/**
+ * stream_flush - Flush current bunch of IOs out to the output stream
+ * @stream: Per-output file stream information
+ *
+ * A non-empty current bunch is written out (header, then packets) and
+ * counted in @stream->bunches / @stream->pkts. The bunch is then freed and
+ * @stream->cur is cleared so that no dangling pointer is left behind.
+ * Does nothing when there is no current bunch.
+ */
+static void stream_flush(struct io_stream *stream)
+{
+	struct io_bunch *cur = stream->cur;
+
+	if (cur == NULL)
+		return;
+
+	if (cur->hdr.npkts) {
+		assert(cur->hdr.npkts <= BT_MAX_PKTS);
+		bunch_output_hdr(stream);
+		bunch_output_pkts(stream);
+
+		stream->bunches++;
+		stream->pkts += cur->hdr.npkts;
+	}
+
+	free(cur);
+	stream->cur = NULL;
+}
+
+/**
+ * bunch_done - Tell whether the current bunch has to be closed
+ * @stream: Output stream information
+ * @spec: IO trace specification
+ *
+ * Returns 1 when the bunch already holds max_pkts_per_bunch packets, or
+ * when @spec is more than max_bunch_tm after the bunch's start time;
+ * 0 otherwise.
+ */
+static inline int bunch_done(struct io_stream *stream, struct io_spec *spec)
+{
+	const int full = stream->cur->hdr.npkts >= max_pkts_per_bunch;
+	const int late = (spec->time - stream->start_time) > max_bunch_tm;
+
+	return full || late;
+}
+
+/**
+ * stream_add_io - Add an IO trace to the current stream
+ * @stream: Output stream information
+ * @spec: IO trace specification
+ *
+ * Starts a new bunch when none is open, or when bunch_done() reports that
+ * the open one must be closed (it is flushed first); the IO is then added
+ * to the current bunch.
+ */
+static void stream_add_io(struct io_stream *stream, struct io_spec *spec)
+{
+	int need_new = (stream->cur == NULL);
+
+	if (!need_new && bunch_done(stream, spec)) {
+		stream_flush(stream);
+		need_new = 1;
+	}
+
+	if (need_new)
+		io_bunch_create(stream, spec->time);
+
+	io_bunch_add(stream, spec);
+}
+
+/**
+ * stream_open - Open output stream for specified input stream
+ * @iip: Per-input file information
+ *
+ * Creates <odir>/<devnm>.<obase>.<cpu>, writes a placeholder file header
+ * (rewritten by stream_close()), and, when verbose is set, also creates the
+ * matching ".rec" verbose file. Resets data_is_native to -1 so that the
+ * byte order is re-checked for the new input file.
+ *
+ * Returns the new stream. Exits through fatal() on allocation failure, on
+ * an over-long file name or when a file cannot be opened.
+ */
+static struct io_stream *stream_open(struct ifile_info *iip)
+{
+	int len;
+	char ofile_name[MAXPATHLEN];
+	struct io_stream *stream = calloc(1, sizeof(*stream));
+	struct io_file_hdr io_file_hdr = {
+		.genesis = 0,
+		.nbunches = 0,
+		.total_pkts = 0
+	};
+
+	if (stream == NULL) {
+		fatal("calloc", ERR_SYSCALL, "Unable to allocate stream\n");
+		/*NOTREACHED*/
+	}
+
+	len = snprintf(ofile_name, sizeof(ofile_name), "%s/%s.%s.%d",
+		odir, iip->devnm, obase, iip->cpu);
+	if (len < 0 || (size_t)len >= sizeof(ofile_name)) {
+		fatal(NULL, ERR_ARGS, "Output file name too long for %s\n",
+			iip->devnm);
+		/*NOTREACHED*/
+	}
+
+	stream->ofp = fopen(ofile_name, "w");
+	if (!stream->ofp) {
+		fatal(ofile_name, ERR_SYSCALL, "Open failed\n");
+		/*NOTREACHED*/
+	}
+
+	stream->iip = iip;
+	stream->cur = NULL;
+	stream->bunches = stream->pkts = 0;
+	stream->last_time = 0;
+	stream->file_name = strdup(ofile_name);
+	if (stream->file_name == NULL) {
+		fatal("strdup", ERR_SYSCALL, "Unable to copy %s\n", ofile_name);
+		/*NOTREACHED*/
+	}
+
+	write_file_hdr(stream, &io_file_hdr);
+
+	if (verbose) {
+		char vfile_name[MAXPATHLEN];
+
+		len = snprintf(vfile_name, sizeof(vfile_name),
+			"%s/%s.%s.%d.rec", odir, iip->devnm,
+			obase, iip->cpu);
+		if (len < 0 || (size_t)len >= sizeof(vfile_name)) {
+			fatal(NULL, ERR_ARGS,
+				"Verbose file name too long for %s\n",
+				iip->devnm);
+			/*NOTREACHED*/
+		}
+
+		stream->vfp = fopen(vfile_name, "w");
+		if (!stream->vfp) {
+			fatal(vfile_name, ERR_SYSCALL, "Open failed\n");
+			/*NOTREACHED*/
+		}
+
+		stream->vfn = strdup(vfile_name);
+	}
+
+	data_is_native = -1;
+	return stream;
+}
+
+/**
+ * stream_close - Release resources associated with an output stream
+ * @stream: Stream to release
+ *
+ * Flushes the bunch still being collected, then rewrites the file header
+ * with the final genesis, bunch and packet counts. The header values are
+ * taken after the flush so that the last bunch is included. Closes the
+ * output file (and the verbose file, if one was opened), prints a summary
+ * to stderr when verbose is set and at least one bunch was written, and
+ * frees @stream.
+ */
+static void stream_close(struct io_stream *stream)
+{
+	struct io_file_hdr io_file_hdr = {
+		.genesis = 0,
+		.nbunches = 0,
+		.total_pkts = 0
+	};
+
+	stream_flush(stream);
+
+	/* Only now do the counters cover every bunch in the file */
+	io_file_hdr.genesis = stream->iip->genesis;
+	io_file_hdr.nbunches = stream->bunches;
+	io_file_hdr.total_pkts = stream->pkts;
+	write_file_hdr(stream, &io_file_hdr);
+
+	/* fclose() flushes buffered data, so its result matters */
+	if (fclose(stream->ofp) != 0) {
+		fatal(stream->file_name, ERR_SYSCALL, "Close failed\n");
+		/*NOTREACHED*/
+	}
+
+	if (verbose && stream->bunches) {
+		fprintf(stderr,
+			"%s:%d: %llu pkts (tot), %llu pkts (replay), "
+			"%llu bunches, %.1lf pkts/bunch\n",
+			stream->iip->devnm, stream->iip->cpu,
+			(unsigned long long)stream->iip->tpkts,
+			(unsigned long long)stream->pkts,
+			(unsigned long long)stream->bunches,
+			(double)(stream->pkts) / (double)(stream->bunches));
+	}
+
+	/* The verbose file exists whenever it was opened, bunches or not */
+	if (stream->vfp)
+		fclose(stream->vfp);
+	free(stream->vfn);
+
+	free(stream->file_name);
+	free(stream);
+}
+
+/**
+ * process - Convert one input file into one output file
+ * @iip: Per-input file information
+ *
+ * Feeds every IO returned by next_io() into a freshly opened output
+ * stream, closes the stream and finally releases @iip.
+ */
+static void process(struct ifile_info *iip)
+{
+	struct io_stream *stream = stream_open(iip);
+	struct io_spec spec;
+
+	for (;;) {
+		if (!next_io(iip, &spec))
+			break;
+		stream_add_io(stream, &spec);
+	}
+
+	stream_close(stream);
+	rem_input_file(iip);
+}
+
+/**
+ * main - Parse the command line, then process every input file
+ * @argc: Number of arguments
+ * @argv: Array of arguments
+ *
+ * process() removes each entry from input_files, hence the removal-safe
+ * list walk. Returns 0.
+ */
+int main(int argc, char *argv[])
+{
+	struct list_head *p, *q;
+
+	handle_args(argc, argv);
+
+	list_for_each_safe(p, q, &input_files) {
+		struct ifile_info *iip = list_entry(p, struct ifile_info, head);
+
+		process(iip);
+	}
+
+	return 0;
+}
diff --git a/btreplay/btrecord.h b/btreplay/btrecord.h
new file mode 100644
index 0000000..8026206
--- /dev/null
+++ b/btreplay/btrecord.h
@@ -0,0 +1,95 @@
+/*
+ * Blktrace record utility - Convert binary trace data into bunches of IOs
+ *
+ * Copyright (C) 2007 Alan D. Brunelle <Alan.Brunelle@hp.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#if !defined(__BTRECORD_H__)
+#define __BTRECORD_H__
+
+#include <asm/types.h>
+
+#define BT_MAX_PKTS 512
+
+/*
+ * Header for each bunch
+ *
+ * @npkts: Number of IO packets to process
+ * @time_stamp: Time stamp for this bunch of IOs
+ */
+struct io_bunch_hdr {
+ __u64 npkts;
+ __u64 time_stamp;
+};
+
+/*
+ * IO specifier
+ *
+ * @sector: Sector number of IO
+ * @nbytes: Number of bytes to process
+ * @rw: IO direction: 0 = write, 1 = read
+ */
+struct io_pkt {
+ __u64 sector;
+ __u64 nbytes;
+ __u32 rw;
+};
+
+/*
+ * Shorthand notion of a bunch of IOs
+ *
+ * @hdr: Header giving the bunch time stamp and how many IO packets follow
+ * @pkts: Individual IOs are described here (room for BT_MAX_PKTS entries)
+ */
+struct io_bunch {
+ struct io_bunch_hdr hdr;
+ struct io_pkt pkts[BT_MAX_PKTS];
+};
+
+/*
+ * Header for each recorded file
+ *
+ * @version: Version information (see mk_btversion / get_btversion)
+ * @genesis: Time stamp for earliest bunch
+ * @nbunches: Number of bunches put into the file
+ * @total_pkts: Number of packets to be processed
+ */
+struct io_file_hdr {
+ __u64 version;
+ __u64 genesis;
+ __u64 nbunches;
+ __u64 total_pkts;
+};
+
+/*
+ * mk_btversion - Pack a version triple into a single value
+ *
+ * Each component is reduced to its low 8 bits; the result holds the major
+ * number in bits 16-23, the minor number in bits 8-15 and the sub number
+ * in bits 0-7.
+ */
+static inline __u64 mk_btversion(int mjr, int mnr, int sub)
+{
+	int packed = (mjr & 0xff) << 16;
+
+	packed |= (mnr & 0xff) << 8;
+	packed |= (sub & 0xff);
+
+	return packed;
+}
+
+/*
+ * get_btversion - Unpack a value produced by mk_btversion
+ *
+ * Stores bits 16-23 of @version in *@mjr, bits 8-15 in *@mnr and
+ * bits 0-7 in *@sub.
+ */
+static inline void get_btversion(__u64 version, int *mjr, int *mnr, int *sub)
+{
+	const __u64 byte_mask = 0xff;
+
+	*mjr = (int)((version >> 16) & byte_mask);
+	*mnr = (int)((version >> 8) & byte_mask);
+	*sub = (int)(version & byte_mask);
+}
+
+static char my_btversion[] = "0.9.3"; /* Printable version; keep in step with the three numbers below */
+static int btver_mjr = 0; /* Major version number */
+static int btver_mnr = 9; /* Minor version number */
+static int btver_sub = 3; /* Sub version number */
+
+#endif
diff --git a/btreplay/btreplay.c b/btreplay/btreplay.c
new file mode 100644
index 0000000..48181a4
--- /dev/null
+++ b/btreplay/btreplay.c
@@ -0,0 +1,1590 @@
+/*
+ * Blktrace replay utility - Play traces back
+ *
+ * Copyright (C) 2007 Alan D. Brunelle <Alan.Brunelle@hp.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+static char build_date[] = __DATE__ " at "__TIME__;
+
+#include <assert.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <libaio.h>
+#include <pthread.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <unistd.h>
+#include <sys/param.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <dirent.h>
+
+#if !defined(_GNU_SOURCE)
+# define _GNU_SOURCE
+#endif
+#include <getopt.h>
+
+#include "list.h"
+#include "btrecord.h"
+
+/*
+ * ========================================================================
+ * ==== STRUCTURE DEFINITIONS =============================================
+ * ========================================================================
+ */
+
+/**
+ * Each device map has one of these:
+ *
+ * @head: Linked on to map_devs
+ * @from_dev: Device name as seen on recorded system
+ * @to_dev: Device name to be used on replay system in place of @from_dev
+ */
+struct map_dev {
+ struct list_head head;
+ char *from_dev, *to_dev;
+};
+
+/**
+ * Each device name specified has one of these (until threads are created)
+ *
+ * @head: Linked onto input_devs
+ * @devnm: Device name, for example 'sd*'
+ */
+struct dev_info {
+ struct list_head head;
+ char *devnm;
+};
+
+/*
+ * Per input file information (one submit thread and one reclaim thread
+ * are recorded per input file)
+ *
+ * @head: Used to link up on input_files
+ * @free_iocbs: List of free iocb's available for use
+ * @used_iocbs: List of iocb's currently outstanding
+ * @mutex: Mutex used with condition variable to protect volatile values
+ * @cond: Condition variable used when waiting on a volatile value change
+ * @naios_out: Current number of AIOs outstanding on this context
+ * @naios_free: Number of AIOs on the free list (short cut for list_len)
+ * @send_wait: Boolean: When true, the sub thread is waiting on free IOCBs
+ * @reap_wait: Boolean: When true, the rec thread is waiting on used IOCBs
+ * @send_done: Boolean: When true, the sub thread has completed work
+ * @reap_done: Boolean: When true, the rec thread has completed work
+ * @sub_thread: Thread used to submit IOs.
+ * @rec_thread: Thread used to reclaim IOs.
+ * @ctx: IO context
+ * @devnm: Copy of the device name being managed by this thread
+ * @file_name: Full name of the input file
+ * @cpu: CPU this thread is pinned to
+ * @ifd: Input file descriptor
+ * @ofd: Output file descriptor
+ * @iterations: Remaining iterations to process
+ * @vfp: For verbose dumping of actions performed
+ */
+struct thr_info {
+ struct list_head head, free_iocbs, used_iocbs;
+ pthread_mutex_t mutex;
+ pthread_cond_t cond;
+ volatile long naios_out, naios_free;
+ volatile int send_wait, reap_wait, send_done, reap_done;
+ pthread_t sub_thread, rec_thread;
+ io_context_t ctx;
+ char *devnm, *file_name;
+ int cpu, ifd, ofd, iterations;
+ FILE *vfp;
+};
+
+/*
+ * Every Asynchronous IO used has one of these (naios per file/device).
+ *
+ * @iocb: IOCB sent down via io_submit
+ * @head: Linked onto the free_iocbs or used_iocbs list of a struct thr_info
+ * @tip: Pointer to per-thread information this IO is associated with
+ * @nbytes: Number of bytes in buffer associated with iocb
+ */
+struct iocb_pkt {
+ struct iocb iocb;
+ struct list_head head;
+ struct thr_info *tip;
+ int nbytes;
+};
+
+/*
+ * ========================================================================
+ * ==== GLOBAL VARIABLES ==================================================
+ * ========================================================================
+ */
+
+static volatile int signal_done = 0; // Boolean: Signal'ed, need to quit
+
+static char *ibase = "replay"; // Input base name
+static char *idir = "."; // Input directory base
+static int cpus_to_use = -1; // Number of CPUs to use
+static int def_iterations = 1; // Default number of iterations
+static int naios = 512; // Number of AIOs per thread
+static int ncpus = 0; // Number of CPUs in the system
+static int verbose = 0; // Boolean: Output some extra info
+static int write_enabled = 0; // Boolean: Enable writing
+static __u64 genesis = ~0; // Earliest time seen (starts at the maximum value)
+static __u64 rgenesis; // Our start time
+static size_t pgsize; // System Page size
+static int nb_sec = 512; // Number of bytes per sector
+static LIST_HEAD(input_devs); // List of devices to handle
+static LIST_HEAD(input_files); // List of input files to handle
+static LIST_HEAD(map_devs); // List of device maps
+static int nfiles = 0; // Number of files to handle
+static int no_stalls = 0; // Boolean: Disable pre-stalls
+static int find_records = 0; // Boolean: Find record files automatically
+
+/*
+ * Variables managed under control of condition variables. Each counter or
+ * flag below is declared together with its own mutex and condition variable.
+ *
+ * n_reclaims_done: Counts number of reclaim threads that have completed.
+ * n_replays_done: Counts number of replay threads that have completed.
+ * n_replays_ready: Counts number of replay threads ready to start.
+ * n_iters_done: Counts number of replay threads done one iteration.
+ * iter_start: Starts an iteration for the replay threads.
+ */
+static volatile int n_reclaims_done = 0;
+static pthread_mutex_t reclaim_done_mutex = PTHREAD_MUTEX_INITIALIZER;
+static pthread_cond_t reclaim_done_cond = PTHREAD_COND_INITIALIZER;
+
+static volatile int n_replays_done = 0;
+static pthread_mutex_t replay_done_mutex = PTHREAD_MUTEX_INITIALIZER;
+static pthread_cond_t replay_done_cond = PTHREAD_COND_INITIALIZER;
+
+static volatile int n_replays_ready = 0;
+static pthread_mutex_t replay_ready_mutex = PTHREAD_MUTEX_INITIALIZER;
+static pthread_cond_t replay_ready_cond = PTHREAD_COND_INITIALIZER;
+
+static volatile int n_iters_done = 0;
+static pthread_mutex_t iter_done_mutex = PTHREAD_MUTEX_INITIALIZER;
+static pthread_cond_t iter_done_cond = PTHREAD_COND_INITIALIZER;
+
+static volatile int iter_start = 0;
+static pthread_mutex_t iter_start_mutex = PTHREAD_MUTEX_INITIALIZER;
+static pthread_cond_t iter_start_cond = PTHREAD_COND_INITIALIZER;
+
+/*
+ * ========================================================================
+ * ==== FORWARD REFERENCES ================================================
+ * ========================================================================
+ */
+
+static void *replay_sub(void *arg);
+static void *replay_rec(void *arg);
+static char usage_str[];
+
+/*
+ * ========================================================================
+ * ==== INLINE ROUTINES ===================================================
+ * ========================================================================
+ */
+
+/*
+ * The 'fatal' macro will output a perror message (if errstring is !NULL)
+ * and display a string (with variable arguments) and then exit with the
+ * specified exit value.
+ *
+ * The variable arguments must start with the printf-style format string.
+ * ERR_ARGS and ERR_SYSCALL are the exit values used with it.
+ */
+#define ERR_ARGS 1
+#define ERR_SYSCALL 2
+#define fatal(errstring, exitval, arg...) \
+ do { \
+ if (errstring) perror(errstring); \
+ fprintf(stderr, ##arg); \
+ exit(exitval); \
+ /*NOTREACHED*/ \
+ } while (0)
+
+/**
+ * du64_to_sec - Whole seconds contained in a nanosecond value
+ */
+static inline long long unsigned du64_to_sec(__u64 du64)
+{
+	const long long unsigned ns_per_sec = 1000 * 1000 * 1000;
+	const long long unsigned ns = (long long unsigned)du64;
+
+	return ns / ns_per_sec;
+}
+
+/**
+ * du64_to_nsec - Sub-second remainder of a nanosecond value
+ *
+ * The value is interpreted as signed and its magnitude is used, so a
+ * negative difference yields the remainder of its absolute value.
+ */
+static inline long long unsigned du64_to_nsec(__u64 du64)
+{
+	const long long magnitude = llabs((long long)du64);
+
+	return magnitude % (1000 * 1000 * 1000);
+}
+
+/**
+ * min - Smaller of two int values
+ */
+static inline int min(int a, int b)
+{
+	if (a < b)
+		return a;
+
+	return b;
+}
+
+/**
+ * minl - Smaller of two long values
+ */
+static inline long minl(long a, long b)
+{
+	if (a < b)
+		return a;
+
+	return b;
+}
+
+/**
+ * usage - Print the program version followed by the option summary
+ *
+ * Output goes to stderr.
+ */
+static inline void usage(void)
+{
+	FILE *out = stderr;
+
+	fprintf(out, "Usage: btreplay -- version %s\n%s",
+		my_btversion, usage_str);
+}
+
+/**
+ * is_send_done - Tell whether the sender should stop
+ * @tip: Per-thread information
+ *
+ * Returns 1 once a signal has been flagged (signal_done) or once this
+ * thread's send_done flag is set, 0 otherwise.
+ */
+static inline int is_send_done(struct thr_info *tip)
+{
+	if (signal_done)
+		return 1;
+
+	return tip->send_done != 0;
+}
+
+/**
+ * is_reap_done - Tell whether the reaper has nothing left to wait for
+ * @tip: Per-thread information
+ *
+ * Returns 1 when the sender has finished (send_done) and no AIOs are
+ * outstanding any more, 0 otherwise.
+ */
+static inline int is_reap_done(struct thr_info *tip)
+{
+	if (!tip->send_done)
+		return 0;
+
+	return tip->naios_out == 0;
+}
+
+/**
+ * ts2ns - Convert timespec values to a nanosecond value
+ * @ts: Time as seconds + nanoseconds
+ *
+ * Returns tv_sec * 10^9 + tv_nsec.
+ */
+#define NS_TICKS ((__u64)1000 * (__u64)1000 * (__u64)1000)
+static inline __u64 ts2ns(struct timespec *ts)
+{
+	return ((__u64)(ts->tv_sec) * NS_TICKS) + (__u64)(ts->tv_nsec);
+}
+
+/**
+ * tv2ns - Convert timeval values to a nanosecond value
+ * @tp: Time as seconds + microseconds
+ *
+ * Returns tv_sec * 10^9 + tv_usec * 1000. The seconds have to be scaled
+ * just like in ts2ns(); adding them unscaled would treat whole seconds as
+ * nanoseconds.
+ */
+static inline __u64 tv2ns(struct timeval *tp)
+{
+	return ((__u64)(tp->tv_sec) * NS_TICKS) +
+		((__u64)(tp->tv_usec) * (__u64)1000);
+}
+
+/**
+ * touch_memory - Write to a buffer so that its pages really exist
+ * @buf: Buffer to touch
+ * @bsize: Size of @buf in bytes
+ *
+ * Freshly allocated memory has to be written to before it is truly backed.
+ * Otherwise write's (to storage) may not work as planned - we see Linux
+ * just use a single area to /read/ from, as no memory has been associated
+ * with the allocated virtual addresses yet.
+ *
+ * With PREP_BUFS defined the whole buffer is zeroed; otherwise one byte is
+ * written every pgsize bytes.
+ */
+static inline void touch_memory(char *buf, size_t bsize)
+{
+#if defined(PREP_BUFS)
+	memset(buf, 0, bsize);
+#else
+	size_t off = 0;
+
+	while (off < bsize) {
+		buf[off] = 0;
+		off += pgsize;
+	}
+#endif
+}
+
+/**
+ * buf_alloc - Returns a page-aligned buffer of the specified size
+ * @nbytes: Number of bytes to allocate
+ *
+ * Does not return on failure: the program exits through fatal().
+ */
+static inline void *buf_alloc(size_t nbytes)
+{
+	void *buf;
+	int rc = posix_memalign(&buf, pgsize, nbytes);
+
+	if (rc) {
+		/*
+		 * posix_memalign() hands back the error number and leaves
+		 * errno alone, so load it for the perror() inside fatal().
+		 */
+		errno = rc;
+		fatal("posix_memalign", ERR_SYSCALL, "Allocation failed\n");
+		/*NOTREACHED*/
+	}
+
+	return buf;
+}
+
+/**
+ * gettime - Returns current time
+ *
+ * The result is a nanosecond count. CLOCK_MONOTONIC is used when
+ * clock_getres() reports it as available (probed once, on the first call);
+ * otherwise gettimeofday() is used.
+ *
+ * No attempt is made to zero the clock: CLOCK_MONOTONIC cannot be set via
+ * clock_settime(), and the callers in this file only use differences
+ * between two gettime() readings.
+ */
+static inline __u64 gettime(void)
+{
+	static int use_clock_gettime = -1;	/* -1: not probed yet */
+
+	if (use_clock_gettime < 0)
+		use_clock_gettime = clock_getres(CLOCK_MONOTONIC, NULL) == 0;
+
+	if (use_clock_gettime) {
+		struct timespec ts;
+
+		clock_gettime(CLOCK_MONOTONIC, &ts);
+		return ts2ns(&ts);
+	}
+	else {
+		struct timeval tp;
+
+		gettimeofday(&tp, NULL);
+		return tv2ns(&tp);
+	}
+}
+
+/**
+ * setup_signal - Install @handler for @signum, exiting if that fails
+ * @signum: Signal number to catch
+ * @handler: Function to run when the signal arrives
+ */
+static inline void setup_signal(int signum, sighandler_t handler)
+{
+	sighandler_t previous = signal(signum, handler);
+
+	if (previous != SIG_ERR)
+		return;
+
+	fatal("signal", ERR_SYSCALL, "Failed to set signal %d\n",
+		signum);
+	/*NOTREACHED*/
+}
+
+/*
+ * ========================================================================
+ * ==== CONDITION VARIABLE ROUTINES =======================================
+ * ========================================================================
+ */
+
+/**
+ * __set_cv - Increments a variable under condition variable control.
+ * @pmp: Pointer to the associated mutex
+ * @pcp: Pointer to the associated condition variable
+ * @vp: Pointer to the variable being incremented
+ * @mxv: Max value for variable (Used only when ASSERTS are on)
+ *
+ * The counter is bumped by exactly one while @pmp is held, and
+ * pthread_cond_signal() is issued on @pcp before the mutex is released.
+ * The matching waiter is __wait_cv().
+ */
+static inline void __set_cv(pthread_mutex_t *pmp, pthread_cond_t *pcp,
+ volatile int *vp,
+ __attribute__((__unused__))int mxv)
+{
+ pthread_mutex_lock(pmp);
+ assert(*vp < mxv);
+ *vp += 1;
+ pthread_cond_signal(pcp);
+ pthread_mutex_unlock(pmp);
+}
+
+/**
+ * __wait_cv - Waits for a variable under cond var control to hit a value
+ * @pmp: Pointer to the associated mutex
+ * @pcp: Pointer to the associated condition variable
+ * @vp: Pointer to the variable being incremented
+ * @mxv: Value to wait for
+ *
+ * Blocks on @pcp until *@vp is at least @mxv. The counter is then reset to
+ * zero, still under @pmp, before returning.
+ */
+static inline void __wait_cv(pthread_mutex_t *pmp, pthread_cond_t *pcp,
+ volatile int *vp, int mxv)
+{
+ pthread_mutex_lock(pmp);
+ while (*vp < mxv)
+ pthread_cond_wait(pcp, pmp);
+ *vp = 0;
+ pthread_mutex_unlock(pmp);
+}
+
+/**
+ * set_reclaim_done - Count one more finished reclaim
+ *
+ * Bumps n_reclaims_done under its mutex/condition variable pair.
+ */
+static inline void set_reclaim_done(void)
+{
+	__set_cv(&reclaim_done_mutex,
+		 &reclaim_done_cond,
+		 &n_reclaims_done,
+		 nfiles);
+}
+
+/**
+ * wait_reclaims_done - Block until n_reclaims_done reaches nfiles
+ */
+static inline void wait_reclaims_done(void)
+{
+	__wait_cv(&reclaim_done_mutex,
+		  &reclaim_done_cond,
+		  &n_reclaims_done,
+		  nfiles);
+}
+
+/**
+ * set_replay_ready - Count one more replay that is ready to start
+ *
+ * Bumps n_replays_ready under its mutex/condition variable pair.
+ */
+static inline void set_replay_ready(void)
+{
+	__set_cv(&replay_ready_mutex,
+		 &replay_ready_cond,
+		 &n_replays_ready,
+		 nfiles);
+}
+
+/**
+ * wait_replays_ready - Block until n_replays_ready reaches nfiles
+ */
+static inline void wait_replays_ready(void)
+{
+	__wait_cv(&replay_ready_mutex,
+		  &replay_ready_cond,
+		  &n_replays_ready,
+		  nfiles);
+}
+
+/**
+ * set_replay_done - Count one more finished replay
+ *
+ * Bumps n_replays_done under its mutex/condition variable pair.
+ */
+static inline void set_replay_done(void)
+{
+	__set_cv(&replay_done_mutex,
+		 &replay_done_cond,
+		 &n_replays_done,
+		 nfiles);
+}
+
+/**
+ * wait_replays_done - Block until n_replays_done reaches nfiles
+ */
+static inline void wait_replays_done(void)
+{
+	__wait_cv(&replay_done_mutex,
+		  &replay_done_cond,
+		  &n_replays_done,
+		  nfiles);
+}
+
+/**
+ * set_iter_done - Count one more finished iteration pass
+ *
+ * Bumps n_iters_done under its mutex/condition variable pair.
+ */
+static inline void set_iter_done(void)
+{
+	__set_cv(&iter_done_mutex,
+		 &iter_done_cond,
+		 &n_iters_done,
+		 nfiles);
+}
+
+/**
+ * wait_iters_done - Block until n_iters_done reaches nfiles
+ */
+static inline void wait_iters_done(void)
+{
+	__wait_cv(&iter_done_mutex,
+		  &iter_done_cond,
+		  &n_iters_done,
+		  nfiles);
+}
+
+/**
+ * wait_iter_start - Wait for an iteration to start
+ *
+ * This is /slightly/ different: we are waiting for a value to become
+ * non-zero, and then we decrement it and go on.
+ *
+ * The value is iter_start, which start_iter() sets to nfiles; each caller
+ * that gets through consumes one unit of it under iter_start_mutex.
+ */
+static inline void wait_iter_start(void)
+{
+ pthread_mutex_lock(&iter_start_mutex);
+ while (iter_start == 0)
+ pthread_cond_wait(&iter_start_cond, &iter_start_mutex);
+ assert(1 <= iter_start && iter_start <= nfiles);
+ iter_start--;
+ pthread_mutex_unlock(&iter_start_mutex);
+}
+
+/**
+ * start_iter - Start an iteration at the replay thread level
+ *
+ * Sets iter_start to nfiles and broadcasts on iter_start_cond so that every
+ * thread blocked in wait_iter_start() is woken. The assert expects the
+ * previous iteration's count to have been fully consumed.
+ */
+static inline void start_iter(void)
+{
+ pthread_mutex_lock(&iter_start_mutex);
+ assert(iter_start == 0);
+ iter_start = nfiles;
+ pthread_cond_broadcast(&iter_start_cond);
+ pthread_mutex_unlock(&iter_start_mutex);
+}
+
+/*
+ * ========================================================================
+ * ==== CPU RELATED ROUTINES ==============================================
+ * ========================================================================
+ */
+
+/**
+ * get_ncpus - Work out the global 'ncpus' from this process's CPU mask
+ *
+ * ncpus becomes the length of the run of set bits starting at CPU 0.
+ * Exits through fatal() if the mask cannot be read or CPU 0 is not in it.
+ */
+static void get_ncpus(void)
+{
+	cpu_set_t allowed;
+
+	if (sched_getaffinity(getpid(), sizeof(allowed), &allowed) != 0) {
+		fatal("sched_getaffinity", ERR_SYSCALL, "Can't get CPU info\n");
+		/*NOTREACHED*/
+	}
+
+	/*
+	 * XXX This assumes (perhaps wrongly) that there are no /holes/
+	 * XXX in the mask.
+	 */
+	ncpus = 0;
+	while (ncpus < CPU_SETSIZE && CPU_ISSET(ncpus, &allowed))
+		ncpus++;
+
+	if (ncpus == 0) {
+		fatal(NULL, ERR_SYSCALL, "Insufficient number of CPUs\n");
+		/*NOTREACHED*/
+	}
+}
+
+/**
+ * pin_to_cpu - Pin the calling thread to a specific CPU
+ * @tip: Thread information
+ *
+ * Must be called from the thread that is to be pinned. A pid of 0 is
+ * passed to the affinity calls so that they act on the calling thread;
+ * getpid() would name the main thread of the process instead.
+ *
+ * When verbose > 1 the resulting mask is written to the thread's report
+ * stream, one digit per CPU.
+ */
+static void pin_to_cpu(struct thr_info *tip)
+{
+	cpu_set_t cpus;
+
+	assert(0 <= tip->cpu && tip->cpu < ncpus);
+
+	CPU_ZERO(&cpus);
+	CPU_SET(tip->cpu, &cpus);
+	if (sched_setaffinity(0, sizeof(cpus), &cpus)) {
+		fatal("sched_setaffinity", ERR_SYSCALL, "Failed to pin CPU\n");
+		/*NOTREACHED*/
+	}
+
+	if (verbose > 1) {
+		int i;
+		cpu_set_t now;
+
+		/* Read back this thread's mask to show what took effect */
+		(void)sched_getaffinity(0, sizeof(now), &now);
+		fprintf(tip->vfp, "Pinned to CPU %02d ", tip->cpu);
+		for (i = 0; i < ncpus; i++)
+			fprintf(tip->vfp, "%1d", CPU_ISSET(i, &now));
+		fprintf(tip->vfp, "\n");
+	}
+}
+
+/*
+ * ========================================================================
+ * ==== INPUT DEVICE HANDLERS =============================================
+ * ========================================================================
+ */
+
+/**
+ * add_input_dev - Add a device ('sd*') to the list of devices to handle
+ * @devnm: Device name; a private copy is stored
+ *
+ * A name that is already on the list is silently ignored. Exits through
+ * fatal() if memory for the new entry cannot be obtained.
+ */
+static void add_input_dev(char *devnm)
+{
+	struct list_head *p;
+	struct dev_info *dip;
+
+	__list_for_each(p, &input_devs) {
+		dip = list_entry(p, struct dev_info, head);
+		if (strcmp(dip->devnm, devnm) == 0)
+			return;
+	}
+
+	dip = malloc(sizeof(*dip));
+	if (dip == NULL) {
+		fatal("malloc", ERR_SYSCALL,
+			"Out of memory adding device %s\n", devnm);
+		/*NOTREACHED*/
+	}
+
+	dip->devnm = strdup(devnm);
+	if (dip->devnm == NULL) {
+		fatal("strdup", ERR_SYSCALL,
+			"Out of memory adding device %s\n", devnm);
+		/*NOTREACHED*/
+	}
+
+	list_add_tail(&dip->head, &input_devs);
+}
+
+/**
+ * rem_input_dev - Unlink a device entry and free everything it owns
+ * @dip: Device entry to drop
+ */
+static void rem_input_dev(struct dev_info *dip)
+{
+	char *name = dip->devnm;
+
+	list_del(&dip->head);
+
+	free(name);
+	free(dip);
+}
+
+/**
+ * find_input_devs - Add every device that has record files in a directory
+ * @idir: Directory to scan
+ *
+ * Each entry whose name contains ".replay." contributes the part of its
+ * name before the first '.' as a device name (duplicates are filtered by
+ * add_input_dev()). Exits through fatal() if the directory cannot be
+ * opened or memory runs out.
+ */
+static void find_input_devs(char *idir)
+{
+	struct dirent *ent;
+	DIR *dir = opendir(idir);
+
+	if (dir == NULL) {
+		fatal(idir, ERR_ARGS, "Unable to open %s\n", idir);
+		/*NOTREACHED*/
+	}
+
+	while ((ent = readdir(dir)) != NULL) {
+		char *p, *dsf;
+
+		if (strstr(ent->d_name, ".replay.") == NULL)
+			continue;
+
+		/* Work on a copy: the name is cut at its first '.' */
+		dsf = strdup(ent->d_name);
+		if (dsf == NULL) {
+			fatal("strdup", ERR_SYSCALL,
+				"Out of memory scanning %s\n", idir);
+			/*NOTREACHED*/
+		}
+
+		p = strchr(dsf, '.');
+		assert(p != NULL);	/* guaranteed by the strstr() match */
+		*p = '\0';
+		add_input_dev(dsf);
+		free(dsf);
+	}
+
+	closedir(dir);
+}
+
+/*
+ * ========================================================================
+ * ==== MAP DEVICE INTERFACES =============================================
+ * ========================================================================
+ */
+
+/**
+ * read_map_devs - Read in a set of device mappings from the provided file.
+ * @file_name: File containing device maps
+ *
+ * We support the notion of multiple such files being specified on the cmd
+ * line: every call appends to the map_devs list.
+ *
+ * The file is read as whitespace-separated pairs "<from> <to>"; reading
+ * stops at the first point where a complete pair cannot be parsed. The
+ * strings are allocated by fscanf() ("%ms") and are owned by the list
+ * entries until release_map_devs() frees them.
+ */
+static void read_map_devs(char *file_name)
+{
+	FILE *fp;
+
+	fp = fopen(file_name, "r");
+	if (!fp) {
+		fatal(file_name, ERR_SYSCALL, "Could not open map devs file\n");
+		/*NOTREACHED*/
+	}
+
+	for (;;) {
+		char *from_dev = NULL, *to_dev = NULL;
+		struct map_dev *mdp;
+
+		if (fscanf(fp, "%ms %ms", &from_dev, &to_dev) != 2) {
+			/* A lone first token may already have been allocated */
+			free(from_dev);
+			break;
+		}
+
+		mdp = malloc(sizeof(*mdp));
+		if (mdp == NULL) {
+			fatal("malloc", ERR_SYSCALL,
+				"Out of memory reading %s\n", file_name);
+			/*NOTREACHED*/
+		}
+
+		mdp->from_dev = from_dev;
+		mdp->to_dev = to_dev;
+		list_add_tail(&mdp->head, &map_devs);
+	}
+
+	fclose(fp);
+}
+
+/**
+ * release_map_devs - Empty the device map list, freeing each entry
+ *
+ * Both name strings of an entry are freed along with the entry itself.
+ */
+static void release_map_devs(void)
+{
+	struct list_head *cur, *nxt;
+
+	list_for_each_safe(cur, nxt, &map_devs) {
+		struct map_dev *entry;
+
+		entry = list_entry(cur, struct map_dev, head);
+		list_del(&entry->head);
+
+		free(entry->from_dev);
+		free(entry->to_dev);
+		free(entry);
+	}
+}
+
+/**
+ * map_dev - Look up the replacement device for a recorded device name
+ * @from_dev: Device name as seen on recorded system
+ *
+ * Returns the mapped name from the first matching entry. When nothing
+ * matches, @from_dev itself is handed back.
+ */
+static char *map_dev(char *from_dev)
+{
+	struct list_head *cur;
+
+	__list_for_each(cur, &map_devs) {
+		struct map_dev *entry;
+
+		entry = list_entry(cur, struct map_dev, head);
+		if (strcmp(from_dev, entry->from_dev) != 0)
+			continue;
+
+		return entry->to_dev;
+	}
+
+	return from_dev;
+}
+
+/*
+ * ========================================================================
+ * ==== IOCB MANAGEMENT ROUTINES ==========================================
+ * ========================================================================
+ */
+
+/**
+ * iocb_init - Put an IOCB packet into its initial, buffer-less state
+ * @tip: Per-thread information
+ * @iocbp: IOCB pointer to update
+ *
+ * Records the owning thread and marks the packet as having no data buffer
+ * (zero size, NULL buffer pointer).
+ */
+static void iocb_init(struct thr_info *tip, struct iocb_pkt *iocbp)
+{
+	iocbp->iocb.u.c.buf = NULL;
+	iocbp->nbytes = 0;
+	iocbp->tip = tip;
+}
+
+/**
+ * iocb_setup - Prepare an iocb for one replayed IO
+ * @iocbp: IOCB pointer to update
+ * @rw: Direction (0 == write, 1 == read)
+ * @n: Number of bytes to transfer
+ * @off: Offset (in bytes)
+ *
+ * The packet's existing data buffer is kept when it is already at least
+ * @n bytes; otherwise it is released and a new @n byte buffer is
+ * allocated. Write buffers are touched after the iocb is prepared.
+ */
+static void iocb_setup(struct iocb_pkt *iocbp, int rw, int n, long long off)
+{
+	char *data;
+	struct iocb *iop = &iocbp->iocb;
+
+	assert(rw == 0 || rw == 1);
+	assert(0 < n && (n % nb_sec) == 0);
+	assert(0 <= off);
+
+	if (iocbp->nbytes && iocbp->nbytes >= n) {
+		/* Current buffer is big enough: reuse it */
+		data = iop->u.c.buf;
+	} else {
+		if (iocbp->nbytes) {
+			/* Too small: drop it before getting a larger one */
+			assert(iop->u.c.buf);
+			free(iop->u.c.buf);
+		}
+
+		data = buf_alloc(n);
+		iocbp->nbytes = n;
+	}
+
+	if (rw) {
+		io_prep_pread(iop, iocbp->tip->ofd, data, n, off);
+	} else {
+		assert(write_enabled);
+		io_prep_pwrite(iop, iocbp->tip->ofd, data, n, off);
+		touch_memory(data, n);
+	}
+
+	iop->data = iocbp;
+}
+
+/*
+ * ========================================================================
+ * ==== PER-THREAD SET UP & TEAR DOWN =====================================
+ * ========================================================================
+ */
+
+/**
+ * tip_init - Per thread initialization function
+ * @tip: Per-thread information to set up
+ *
+ * Sets up the IOCB lists, the mutex/condition variable pair and the AIO
+ * context, allocates 'naios' free IOCB packets, opens the per-thread
+ * report file when verbose > 1, and finally starts the submit
+ * (replay_sub) and reclaim (replay_rec) threads.
+ *
+ * Any failure exits the program through fatal().
+ */
+static void tip_init(struct thr_info *tip)
+{
+	int i, rc;
+
+	INIT_LIST_HEAD(&tip->free_iocbs);
+	INIT_LIST_HEAD(&tip->used_iocbs);
+
+	pthread_mutex_init(&tip->mutex, NULL);
+	pthread_cond_init(&tip->cond, NULL);
+
+	rc = io_setup(naios, &tip->ctx);
+	if (rc) {
+		/*
+		 * Assumes libaio's convention of returning a negated error
+		 * number; make it visible to the perror() in fatal().
+		 */
+		if (rc < 0)
+			errno = -rc;
+		fatal("io_setup", ERR_SYSCALL, "io_setup failed\n");
+		/*NOTREACHED*/
+	}
+
+	tip->ofd = -1;
+	tip->naios_out = 0;
+	tip->send_done = tip->reap_done = 0;
+	tip->send_wait = tip->reap_wait = 0;
+
+	memset(&tip->sub_thread, 0, sizeof(tip->sub_thread));
+	memset(&tip->rec_thread, 0, sizeof(tip->rec_thread));
+
+	for (i = 0; i < naios; i++) {
+		struct iocb_pkt *iocbp = buf_alloc(sizeof(*iocbp));
+
+		iocb_init(tip, iocbp);
+		list_add_tail(&iocbp->head, &tip->free_iocbs);
+	}
+	tip->naios_free = naios;
+
+	if (verbose > 1) {
+		char fn[MAXPATHLEN];
+
+		/* Bounded: idir, devnm and ibase all come from the user */
+		rc = snprintf(fn, sizeof(fn), "%s/%s.%s.%d.rep", idir,
+			      tip->devnm, ibase, tip->cpu);
+		if (rc < 0 || rc >= (int)sizeof(fn)) {
+			fatal(NULL, ERR_ARGS, "Report file name too long\n");
+			/*NOTREACHED*/
+		}
+
+		tip->vfp = fopen(fn, "w");
+		if (!tip->vfp) {
+			fatal(fn, ERR_SYSCALL, "Failed to open report\n");
+			/*NOTREACHED*/
+		}
+
+		setlinebuf(tip->vfp);
+	}
+
+	/* pthread_create() returns the error number; it does not set errno */
+	rc = pthread_create(&tip->sub_thread, NULL, replay_sub, tip);
+	if (rc) {
+		errno = rc;
+		fatal("pthread_create", ERR_SYSCALL,
+			"thread create failed\n");
+		/*NOTREACHED*/
+	}
+
+	rc = pthread_create(&tip->rec_thread, NULL, replay_rec, tip);
+	if (rc) {
+		errno = rc;
+		fatal("pthread_create", ERR_SYSCALL,
+			"thread create failed\n");
+		/*NOTREACHED*/
+	}
+}
+
+/**
+ * tip_release - Release resources associated with this thread
+ * @tip: Per-thread information
+ *
+ * Joins the submit and reclaim threads, destroys the AIO context, frees
+ * every IOCB packet together with its data buffer, closes the report
+ * stream if one was opened, and destroys the mutex/condition variable.
+ */
+static void tip_release(struct thr_info *tip)
+{
+	int rc;
+	struct list_head *p, *q;
+
+	assert(tip->send_done);
+	assert(tip->reap_done);
+	assert(list_len(&tip->used_iocbs) == 0);
+	assert(tip->naios_free == naios);
+
+	/* pthread_join() returns the error number; it does not set errno */
+	rc = pthread_join(tip->sub_thread, NULL);
+	if (rc) {
+		errno = rc;
+		fatal("pthread_join", ERR_SYSCALL, "pthread sub join failed\n");
+		/*NOTREACHED*/
+	}
+	rc = pthread_join(tip->rec_thread, NULL);
+	if (rc) {
+		errno = rc;
+		fatal("pthread_join", ERR_SYSCALL, "pthread rec join failed\n");
+		/*NOTREACHED*/
+	}
+
+	io_destroy(tip->ctx);
+
+	list_splice(&tip->used_iocbs, &tip->free_iocbs);
+	list_for_each_safe(p, q, &tip->free_iocbs) {
+		struct iocb_pkt *iocbp = list_entry(p, struct iocb_pkt, head);
+
+		list_del(&iocbp->head);
+		if (iocbp->nbytes)
+			free(iocbp->iocb.u.c.buf);
+		free(iocbp);
+	}
+
+	/* Both worker threads are gone, so nothing writes the report now */
+	if (tip->vfp) {
+		fclose(tip->vfp);
+		tip->vfp = NULL;
+	}
+
+	pthread_cond_destroy(&tip->cond);
+	pthread_mutex_destroy(&tip->mutex);
+}
+
+/**
+ * add_input_file - Allocate and initialize per-input file structure
+ * @cpu: CPU for this file
+ * @devnm: Device name for this file
+ * @file_name: Fully qualified input file name
+ *
+ * Opens the file and validates its header. A file that is too short to
+ * hold a header, or whose header reports no bunches, is skipped without
+ * error. Otherwise a thr_info is queued on input_files, and the global
+ * 'genesis' is lowered if this file's genesis is earlier.
+ *
+ * Open/stat/read failures and a version mismatch exit through fatal().
+ */
+static void add_input_file(int cpu, char *devnm, char *file_name)
+{
+	struct stat buf;
+	struct io_file_hdr hdr;
+	struct thr_info *tip = buf_alloc(sizeof(*tip));
+	__u64 my_version = mk_btversion(btver_mjr, btver_mnr, btver_sub);
+
+	assert(0 <= cpu && cpu < ncpus);
+
+	memset(&hdr, 0, sizeof(hdr));
+	memset(tip, 0, sizeof(*tip));
+	tip->cpu = cpu % cpus_to_use;
+	tip->iterations = def_iterations;
+
+	tip->ifd = open(file_name, O_RDONLY);
+	if (tip->ifd < 0) {
+		fatal(file_name, ERR_ARGS, "Unable to open\n");
+		/*NOTREACHED*/
+	}
+	if (fstat(tip->ifd, &buf) < 0) {
+		fatal(file_name, ERR_SYSCALL, "fstat failed\n");
+		/*NOTREACHED*/
+	}
+	if (buf.st_size < (off_t)sizeof(hdr)) {
+		if (verbose)
+			fprintf(stderr, "\t%s empty\n", file_name);
+		goto empty_file;
+	}
+
+	if (read(tip->ifd, &hdr, sizeof(hdr)) != sizeof(hdr)) {
+		fatal(file_name, ERR_ARGS, "Header read failed\n");
+		/*NOTREACHED*/
+	}
+
+	if (hdr.version != my_version) {
+		fprintf(stderr, "%llx %llx %llx %llx\n",
+			(long long unsigned)hdr.version,
+			(long long unsigned)hdr.genesis,
+			(long long unsigned)hdr.nbunches,
+			(long long unsigned)hdr.total_pkts);
+		/* Print the full 64-bit values, whatever the width of long */
+		fatal(NULL, ERR_ARGS,
+			"BT version mismatch: %llx versus my %llx\n",
+			(long long unsigned)hdr.version,
+			(long long unsigned)my_version);
+		/*NOTREACHED*/
+	}
+
+	if (hdr.nbunches == 0)
+		goto empty_file;
+
+	if (hdr.genesis < genesis) {
+		if (verbose > 1)
+			fprintf(stderr, "Setting genesis to %llu.%llu\n",
+				du64_to_sec(hdr.genesis),
+				du64_to_nsec(hdr.genesis));
+		genesis = hdr.genesis;
+	}
+
+	tip->devnm = strdup(devnm);
+	tip->file_name = strdup(file_name);
+	if (tip->devnm == NULL || tip->file_name == NULL) {
+		fatal("strdup", ERR_SYSCALL,
+			"Out of memory adding %s\n", file_name);
+		/*NOTREACHED*/
+	}
+
+	list_add_tail(&tip->head, &input_files);
+
+	if (verbose)
+		fprintf(stderr, "Added %s %llu\n", file_name,
+			(long long unsigned)hdr.genesis);
+	return;
+
+empty_file:
+	/* Nothing to replay from this file: undo the open and allocation */
+	close(tip->ifd);
+	free(tip);
+}
+
+/**
+ * rem_input_file - Tear down one input file and its per-thread state
+ * @tip: Per-input file information
+ *
+ * The entry is unlinked from its list, its thread resources are released,
+ * both file descriptors are closed and all owned memory is freed.
+ */
+static void rem_input_file(struct thr_info *tip)
+{
+	/* Unlink first, then release the thread-level resources */
+	list_del(&tip->head);
+	tip_release(tip);
+
+	/* Output device, then input trace file */
+	close(tip->ofd);
+	close(tip->ifd);
+
+	/* Strings owned by the entry, then the entry itself */
+	free(tip->file_name);
+	free(tip->devnm);
+	free(tip);
+}
+
+/**
+ * rem_input_files - Tear down every entry on the input_files list
+ */
+static void rem_input_files(void)
+{
+	struct list_head *cur, *nxt;
+
+	list_for_each_safe(cur, nxt, &input_files) {
+		struct thr_info *tip = list_entry(cur, struct thr_info, head);
+
+		rem_input_file(tip);
+	}
+}
+
+/**
+ * __find_input_files - Find input files associated with this device (per cpu)
+ * @dip: Device whose trace files are wanted
+ *
+ * Probes <idir>/<devnm>.<ibase>.<cpu> for cpu = 0, 1, 2, ... and adds each
+ * readable file, stopping at the first one that is not readable. Finding
+ * none at all is fatal. The device entry is removed once it has been
+ * processed.
+ */
+static void __find_input_files(struct dev_info *dip)
+{
+	int cpu = 0;
+
+	for (;;) {
+		int len;
+		char full_name[MAXPATHLEN];
+
+		/* Bounded: idir, devnm and ibase all come from the user */
+		len = snprintf(full_name, sizeof(full_name), "%s/%s.%s.%d",
+			       idir, dip->devnm, ibase, cpu);
+		if (len < 0 || len >= (int)sizeof(full_name)) {
+			fatal(NULL, ERR_ARGS,
+				"Trace file name too long for %s\n",
+				dip->devnm);
+			/*NOTREACHED*/
+		}
+
+		if (access(full_name, R_OK) != 0)
+			break;
+
+		add_input_file(cpu, dip->devnm, full_name);
+		cpu++;
+	}
+
+	if (!cpu) {
+		fatal(NULL, ERR_ARGS, "No traces found for %s\n", dip->devnm);
+		/*NOTREACHED*/
+	}
+
+	rem_input_dev(dip);
+}
+
+
+/**
+ * find_input_files - Locate the trace files of every listed input device
+ *
+ * The safe iterator is needed because each device entry is removed from
+ * the list while it is being processed.
+ */
+static void find_input_files(void)
+{
+	struct list_head *cur, *nxt;
+
+	list_for_each_safe(cur, nxt, &input_devs) {
+		struct dev_info *dip = list_entry(cur, struct dev_info, head);
+
+		__find_input_files(dip);
+	}
+}
+
+/*
+ * ========================================================================
+ * ==== RECLAIM ROUTINES ==================================================
+ * ========================================================================
+ */
+
+/**
+ * reap_wait_aios - Wait for and return number of outstanding AIOs
+ * @tip: Per-thread information
+ *
+ * Will return 0 if we are done
+ *
+ * Unless is_reap_done() already holds, the caller sleeps on tip->cond
+ * (flagging itself through tip->reap_wait) until tip->naios_out is
+ * non-zero, and that count is sampled under tip->mutex.
+ *
+ * Note that the local 'naios' hides the file-level variable of the same
+ * name inside this function.
+ *
+ * NOTE(review): the wait loop only re-checks naios_out. In the code shown
+ * here replay_sub() raises send_done without signalling tip->cond, so a
+ * reaper that is already asleep with nothing outstanding looks like it
+ * could miss the end of the run -- confirm.
+ * NOTE(review): the failure message below names nfree_current; it appears
+ * to have been copied from that function.
+ */
+static int reap_wait_aios(struct thr_info *tip)
+{
+ int naios = 0;
+
+ if (!is_reap_done(tip)) {
+ pthread_mutex_lock(&tip->mutex);
+ while (tip->naios_out == 0) {
+ tip->reap_wait = 1;
+ if (pthread_cond_wait(&tip->cond, &tip->mutex)) {
+ fatal("pthread_cond_wait", ERR_SYSCALL,
+ "nfree_current cond wait failed\n");
+ /*NOTREACHED*/
+ }
+ }
+ naios = tip->naios_out;
+ pthread_mutex_unlock(&tip->mutex);
+ }
+ assert(is_reap_done(tip) || naios > 0);
+
+ return is_reap_done(tip) ? 0 : naios;
+}
+
+/**
+ * reclaim_ios - Reclaim AIOs completed, recycle IOCBs
+ * @tip: Per-thread information
+ * @naios_out: Number of AIOs we have outstanding (min)
+ *
+ * Collects between 1 and @naios_out completion events, checks that each
+ * transferred exactly the number of bytes requested, and moves the IOCBs
+ * back onto the free list. A submitter waiting for free IOCBs is woken.
+ * The whole sequence repeats while AIOs are still known to be outstanding.
+ *
+ * io_getevents() is taken to follow the libaio convention of returning a
+ * negated error number: an interrupted wait is retried, any other error
+ * is fatal.
+ */
+static void reclaim_ios(struct thr_info *tip, long naios_out)
+{
+	long i, ndone;
+	struct io_event *evp, events[naios_out];
+
+again:
+	assert(naios > 0);
+	for (;;) {
+		ndone = io_getevents(tip->ctx, 1, naios_out, events, NULL);
+		if (ndone > 0)
+			break;
+
+		if (ndone < 0 && ndone != -EINTR) {
+			/* Hand the real error to the perror() in fatal() */
+			errno = -ndone;
+			fatal("io_getevents", ERR_SYSCALL,
+				"io_getevents failed\n");
+			/*NOTREACHED*/
+		}
+	}
+	assert(0 < ndone && ndone <= naios_out);
+
+	pthread_mutex_lock(&tip->mutex);
+	for (i = 0, evp = events; i < ndone; i++, evp++) {
+		struct iocb_pkt *iocbp = evp->data;
+
+		if (evp->res != iocbp->iocb.u.c.nbytes) {
+			fatal(NULL, ERR_SYSCALL,
+				"Event failure %ld/%ld\t(%ld + %ld)\n",
+				(long)evp->res, (long)evp->res2,
+				(long)iocbp->iocb.u.c.offset / nb_sec,
+				(long)iocbp->iocb.u.c.nbytes / nb_sec);
+			/*NOTREACHED*/
+		}
+
+		list_move_tail(&iocbp->head, &tip->free_iocbs);
+	}
+
+	tip->naios_free += ndone;
+	tip->naios_out -= ndone;
+	naios_out = minl(naios_out, tip->naios_out);
+
+	if (tip->send_wait) {
+		tip->send_wait = 0;
+		pthread_cond_signal(&tip->cond);
+	}
+	pthread_mutex_unlock(&tip->mutex);
+
+	/*
+	 * Short cut: If we /know/ there are some more AIOs, go handle them
+	 */
+	if (naios_out)
+		goto again;
+}
+
+/**
+ * replay_rec - Body of the thread that reclaims completed AIOs
+ * @arg: Pointer to thread information
+ *
+ * Keeps reclaiming for as long as reap_wait_aios() reports outstanding
+ * AIOs, then marks reaping as done and reports in. Always returns NULL.
+ */
+static void *replay_rec(void *arg)
+{
+	struct thr_info *tip = arg;
+
+	for (;;) {
+		long pending = reap_wait_aios(tip);
+
+		if (pending <= 0)
+			break;
+
+		reclaim_ios(tip, pending);
+	}
+
+	assert(tip->send_done);
+	tip->reap_done = 1;
+	set_reclaim_done();
+
+	return NULL;
+}
+
+/*
+ * ========================================================================
+ * ==== REPLAY ROUTINES ===================================================
+ * ========================================================================
+ */
+
+/**
+ * next_bunch - Retrieve next bunch of AIOs to process
+ * @tip: Per-thread information
+ * @bunch: Bunch information
+ *
+ * Returns TRUE if we recovered a bunch of IOs, else hit EOF
+ *
+ * A read error, a truncated header or packet array, or a packet count
+ * above BT_MAX_PKTS exits through fatal(). The count is checked at run
+ * time because it comes from the file and sizes the read into
+ * bunch->pkts (presumably BT_MAX_PKTS entries -- the declaration is not
+ * in this file).
+ */
+static int next_bunch(struct thr_info *tip, struct io_bunch *bunch)
+{
+	size_t count;
+	ssize_t result;
+
+	result = read(tip->ifd, &bunch->hdr, sizeof(bunch->hdr));
+	if (result != (ssize_t)sizeof(bunch->hdr)) {
+		if (result == 0)
+			return 0;
+
+		fatal(tip->file_name, ERR_SYSCALL, "Short hdr(%ld)\n",
+			(long)result);
+		/*NOTREACHED*/
+	}
+
+	if (bunch->hdr.npkts > BT_MAX_PKTS) {
+		fatal(NULL, ERR_ARGS, "%s: bad packet count %llu\n",
+			tip->file_name,
+			(unsigned long long)bunch->hdr.npkts);
+		/*NOTREACHED*/
+	}
+
+	count = bunch->hdr.npkts * sizeof(struct io_pkt);
+	result = read(tip->ifd, &bunch->pkts, count);
+	if (result < 0 || (size_t)result != count) {
+		fatal(tip->file_name, ERR_SYSCALL, "Short pkts(%ld/%ld)\n",
+			(long)result, (long)count);
+		/*NOTREACHED*/
+	}
+
+	return 1;
+}
+
+/**
+ * nfree_current - Returns current number of AIOs that are free
+ * @tip: Per-thread information
+ *
+ * Will wait for available ones...
+ *
+ * Returns 0 if we have some condition that causes us to exit
+ *
+ * The exit condition is is_send_done(). While waiting, tip->send_wait is
+ * raised so that reclaim_ios() knows to signal tip->cond after it has
+ * returned IOCBs to the free list.
+ */
+static int nfree_current(struct thr_info *tip)
+{
+ int nfree = 0;
+
+ pthread_mutex_lock(&tip->mutex);
+ while (!is_send_done(tip) && ((nfree = tip->naios_free) == 0)) {
+ tip->send_wait = 1;
+ if (pthread_cond_wait(&tip->cond, &tip->mutex)) {
+ fatal("pthread_cond_wait", ERR_SYSCALL,
+ "nfree_current cond wait failed\n");
+ /*NOTREACHED*/
+ }
+ }
+ pthread_mutex_unlock(&tip->mutex);
+
+ return nfree;
+}
+
+/**
+ * stall - Stall for the number of nanoseconds requested
+ * @tip: Per-thread information
+ * @oclock: Target time, compared against (gettime() - rgenesis)
+ *
+ * We may be late, in which case we just return.
+ *
+ * Otherwise nanosleep() is called for the remaining difference and the
+ * clock is re-read, until the target is reached or is_send_done() holds.
+ * A nanosleep() failure only ends the wait early when signal_done is set.
+ */
+static void stall(struct thr_info *tip, long long oclock)
+{
+ struct timespec req;
+ long long dreal, tclock = gettime() - rgenesis;
+
+ if (verbose > 1)
+ fprintf(tip->vfp, " stall(%lld.%09lld, %lld.%09lld)\n",
+ du64_to_sec(oclock), du64_to_nsec(oclock),
+ du64_to_sec(tclock), du64_to_nsec(tclock));
+
+ while (!is_send_done(tip) && tclock < oclock) {
+ dreal = oclock - tclock;
+ req.tv_sec = dreal / (1000 * 1000 * 1000);
+ req.tv_nsec = dreal % (1000 * 1000 * 1000);
+
+ if (verbose > 1) {
+ fprintf(tip->vfp, "++ stall(%lld.%09lld) ++\n",
+ (long long)req.tv_sec,
+ (long long)req.tv_nsec);
+ }
+
+ if (nanosleep(&req, NULL) < 0 && signal_done)
+ break;
+
+ tclock = gettime() - rgenesis;
+ }
+}
+
+/**
+ * iocbs_map - Map a set of AIOs onto a set of IOCBs
+ * @tip: Per-thread information
+ * @list: List of AIOs created
+ * @pkts: AIOs to map
+ * @ntodo: Number of AIOs to map
+ *
+ * For each packet the IOCB at the head of the free list is set up and
+ * moved to the used list, and its iocb pointer is stored in @list. All of
+ * this happens under tip->mutex, and tip->naios_free drops by @ntodo.
+ *
+ * A recorded write (rw == 0) is issued as a read when write_enabled is
+ * off; the verbose trace marks such packets with '!'.
+ */
+static void iocbs_map(struct thr_info *tip, struct iocb **list,
+ struct io_pkt *pkts, int ntodo)
+{
+ int i;
+ struct io_pkt *pkt;
+
+ assert(0 < ntodo && ntodo <= naios);
+
+ pthread_mutex_lock(&tip->mutex);
+ assert(ntodo <= list_len(&tip->free_iocbs));
+ for (i = 0, pkt = pkts; i < ntodo; i++, pkt++) {
+ __u32 rw = pkt->rw;
+ struct iocb_pkt *iocbp;
+
+ if (!pkt->rw && !write_enabled)
+ rw = 1;
+
+ if (verbose > 1)
+ fprintf(tip->vfp, "\t%10llu + %10llu %c%c\n",
+ (unsigned long long)pkt->sector,
+ (unsigned long long)pkt->nbytes / nb_sec,
+ rw ? 'R' : 'W',
+ (rw == 1 && pkt->rw == 0) ? '!' : ' ');
+
+ iocbp = list_entry(tip->free_iocbs.next, struct iocb_pkt, head);
+ iocb_setup(iocbp, rw, pkt->nbytes, pkt->sector * nb_sec);
+
+ list_move_tail(&iocbp->head, &tip->used_iocbs);
+ list[i] = &iocbp->iocb;
+ }
+
+ tip->naios_free -= ntodo;
+ assert(tip->naios_free >= 0);
+ pthread_mutex_unlock(&tip->mutex);
+}
+
+/**
+ * process_bunch - Process a bunch of requests
+ * @tip: Per-thread information
+ * @bunch: Bunch to process
+ *
+ * The bunch is submitted in slices: each pass takes as many packets as
+ * there are free IOCBs (waiting for some if need be), maps them, stalls
+ * until the bunch's time stamp relative to 'genesis' unless no_stalls is
+ * set, and submits the slice with io_submit(). A partial submit is fatal.
+ * After each submit tip->naios_out is raised under tip->mutex and a
+ * waiting reaper is signalled.
+ *
+ * With asserts compiled out, ntodo can be 0 when nfree_current() returns
+ * 0; the 'if (ntodo)' test keeps that case away from io_submit().
+ */
+static void process_bunch(struct thr_info *tip, struct io_bunch *bunch)
+{
+ __u64 i = 0;
+ struct iocb *list[bunch->hdr.npkts];
+
+ assert(0 < bunch->hdr.npkts && bunch->hdr.npkts <= BT_MAX_PKTS);
+ while (!is_send_done(tip) && (i < bunch->hdr.npkts)) {
+ long ndone;
+ int ntodo = min(nfree_current(tip), bunch->hdr.npkts - i);
+
+ assert(0 < ntodo && ntodo <= naios);
+ iocbs_map(tip, list, &bunch->pkts[i], ntodo);
+ if (!no_stalls)
+ stall(tip, bunch->hdr.time_stamp - genesis);
+
+ if (ntodo) {
+ if (verbose > 1)
+ fprintf(tip->vfp, "submit(%d)\n", ntodo);
+ ndone = io_submit(tip->ctx, ntodo, list);
+ if (ndone != (long)ntodo) {
+ fatal("io_submit", ERR_SYSCALL,
+ "%d: io_submit(%d:%ld) failed (%s)\n",
+ tip->cpu, ntodo, ndone,
+ strerror(labs(ndone)));
+ /*NOTREACHED*/
+ }
+
+ pthread_mutex_lock(&tip->mutex);
+ tip->naios_out += ndone;
+ assert(tip->naios_out <= naios);
+ if (tip->reap_wait) {
+ tip->reap_wait = 0;
+ pthread_cond_signal(&tip->cond);
+ }
+ pthread_mutex_unlock(&tip->mutex);
+
+ i += ndone;
+ assert(i <= bunch->hdr.npkts);
+ }
+ }
+}
+
+/**
+ * reset_input_file - Reset the input file for the next iteration
+ * @tip: Thread information
+ *
+ * Rewinds the input file and then does a dummy read of the file header to
+ * get us to the first bunch. A failed rewind or header read exits through
+ * fatal().
+ */
+static void reset_input_file(struct thr_info *tip)
+{
+	struct io_file_hdr hdr;
+
+	if (lseek(tip->ifd, 0, SEEK_SET) == (off_t)-1) {
+		fatal(tip->file_name, ERR_SYSCALL, "Rewind failed\n");
+		/*NOTREACHED*/
+	}
+
+	if (read(tip->ifd, &hdr, sizeof(hdr)) != sizeof(hdr)) {
+		fatal(tip->file_name, ERR_ARGS, "Header reread failed\n");
+		/*NOTREACHED*/
+	}
+}
+
+/**
+ * replay_sub - Worker thread to submit AIOs that are being replayed
+ * @arg: Pointer to thread information
+ *
+ * Pins itself to its CPU, opens the (possibly remapped) device under /dev
+ * with O_RDWR | O_DIRECT and reports ready. For every iteration it waits
+ * for the start signal, replays all bunches from the input file, reports
+ * the iteration as done and rewinds the file. Finally it raises send_done
+ * and reports the replay as done. Always returns NULL.
+ *
+ * The per-thread report stream only exists when verbose > 1 (see
+ * tip_init()), so the iteration banner is written at that level only.
+ */
+static void *replay_sub(void *arg)
+{
+	int len;
+	char path[MAXPATHLEN];
+	struct io_bunch bunch;
+	struct thr_info *tip = arg;
+
+	pin_to_cpu(tip);
+
+	len = snprintf(path, sizeof(path), "/dev/%s", map_dev(tip->devnm));
+	if (len < 0 || len >= (int)sizeof(path)) {
+		fatal(NULL, ERR_ARGS, "Device path too long\n");
+		/*NOTREACHED*/
+	}
+
+	tip->ofd = open(path, O_RDWR | O_DIRECT);
+	if (tip->ofd < 0) {
+		fatal(path, ERR_SYSCALL, "Failed device open\n");
+		/*NOTREACHED*/
+	}
+
+	set_replay_ready();
+	while (!is_send_done(tip) && tip->iterations--) {
+		wait_iter_start();
+		if (verbose > 1)
+			fprintf(tip->vfp, "\n=== %d ===\n", tip->iterations);
+		while (!is_send_done(tip) && next_bunch(tip, &bunch))
+			process_bunch(tip, &bunch);
+		set_iter_done();
+		reset_input_file(tip);
+	}
+	tip->send_done = 1;
+	set_replay_done();
+
+	return NULL;
+}
+
+/*
+ * ========================================================================
+ * ==== COMMAND LINE ARGUMENT HANDLING ====================================
+ * ========================================================================
+ */
+
+/* Option summary printed by usage(), one line per command line option */
+static char usage_str[] =
+	"\n"
+	"\t[ -c <cpus> : --cpus=<cpus> ] Default: 1\n"
+	"\t[ -d <dir> : --input-directory=<dir> ] Default: .\n"
+	"\t[ -F : --find-records ] Default: Off\n"
+	"\t[ -h : --help ] Default: Off\n"
+	"\t[ -i <base> : --input-base=<base> ] Default: replay\n"
+	"\t[ -I <iters>: --iterations=<iters> ] Default: 1\n"
+	"\t[ -M <file> : --map-devs=<file> ] Default: None\n"
+	"\t[ -N : --no-stalls ] Default: Off\n"
+	"\t[ -v : --verbose ] Default: Off\n"
+	"\t[ -V : --version ] Default: Off\n"
+	"\t[ -W : --write-enable ] Default: Off\n"
+	"\t<dev...> Default: None\n"
+	"\n";
+
+/* Short option string and long option table handed to getopt_long() */
+#define S_OPTS "c:d:Fhi:I:M:Nt:vVW"
+static struct option l_opts[] = {
+	/* name,              has_arg,           flag, val */
+	{ "cpus",            required_argument, NULL, 'c' },
+	{ "input-directory", required_argument, NULL, 'd' },
+	{ "find-records",    no_argument,       NULL, 'F' },
+	{ "help",            no_argument,       NULL, 'h' },
+	{ "input-base",      required_argument, NULL, 'i' },
+	{ "iterations",      required_argument, NULL, 'I' },
+	{ "map-devs",        required_argument, NULL, 'M' },
+	{ "no-stalls",       no_argument,       NULL, 'N' },
+	{ "verbose",         no_argument,       NULL, 'v' },
+	{ "version",         no_argument,       NULL, 'V' },
+	{ "write-enable",    no_argument,       NULL, 'W' },
+	/* All-zero entry terminates the table */
+	{ NULL,              0,                 NULL, 0   }
+};
+
+/**
+ * handle_args - Process the command line
+ * @argc: Number of arguments in argv
+ * @argv: Arguments passed in
+ *
+ * Options update the corresponding file-level settings, with some basic
+ * range checking. Every remaining argument is taken as an input device
+ * name; with find-records set, the input directory is scanned for more.
+ * Ending up with no input device at all is fatal.
+ */
+static void handle_args(int argc, char *argv[])
+{
+	int opt;
+
+	for (;;) {
+		opt = getopt_long(argc, argv, S_OPTS, l_opts, NULL);
+		if (opt == -1)
+			break;
+
+		switch (opt) {
+		case 'c':
+			cpus_to_use = atoi(optarg);
+			if (cpus_to_use <= 0 || cpus_to_use > ncpus) {
+				fatal(NULL, ERR_ARGS,
+					"Invalid number of cpus %d (0<x<%d)\n",
+					cpus_to_use, ncpus);
+				/*NOTREACHED*/
+			}
+			break;
+
+		case 'd':
+			idir = optarg;
+			if (access(idir, R_OK | X_OK) != 0) {
+				fatal(idir, ERR_ARGS,
+					"Invalid input directory specified\n");
+				/*NOTREACHED*/
+			}
+			break;
+
+		case 'F':
+			find_records = 1;
+			break;
+
+		case 'h':
+			usage();
+			exit(0);
+			/*NOTREACHED*/
+
+		case 'i':
+			ibase = optarg;
+			break;
+
+		case 'I':
+			def_iterations = atoi(optarg);
+			if (def_iterations <= 0) {
+				/* No perror line wanted here: errstring is NULL */
+				fatal(NULL, ERR_ARGS,
+					"Invalid number of iterations %d\n",
+					def_iterations);
+				/*NOTREACHED*/
+			}
+			break;
+
+		case 'M':
+			read_map_devs(optarg);
+			break;
+
+		case 'N':
+			no_stalls = 1;
+			break;
+
+		case 'V':
+			fprintf(stderr, "btreplay -- version %s\n",
+				my_btversion);
+			fprintf(stderr, " Built on %s\n",
+				build_date);
+			exit(0);
+			/*NOTREACHED*/
+
+		case 'v':
+			verbose++;
+			break;
+
+		case 'W':
+			write_enabled = 1;
+			break;
+
+		default:
+			usage();
+			fatal(NULL, ERR_ARGS,
+				"Invalid command line argument %c\n", opt);
+			/*NOTREACHED*/
+		}
+	}
+
+	/* Whatever is left over names the input devices */
+	for (; optind < argc; optind++)
+		add_input_dev(argv[optind]);
+
+	if (find_records)
+		find_input_devs(idir);
+
+	if (list_len(&input_devs) == 0) {
+		fatal(NULL, ERR_ARGS, "Missing required input dev name(s)\n");
+		/*NOTREACHED*/
+	}
+
+	/* A negative count means: use every CPU that was detected */
+	if (cpus_to_use < 0)
+		cpus_to_use = ncpus;
+}
+
+/*
+ * ========================================================================
+ * ==== MAIN ROUTINE ======================================================
+ * ========================================================================
+ */
+
+/**
+ * set_signal_done - Signal handler that only raises the signal_done flag
+ * @signum: Signal that was delivered (not used)
+ *
+ * The flag is polled elsewhere, e.g. by is_send_done().
+ */
+static void set_signal_done(__attribute__((__unused__))int signum)
+{
+	signal_done = 1;
+}
+
+/**
+ * main - Set up, run the requested number of replay iterations, clean up
+ * @argc: Number of arguments
+ * @argv: Array of arguments
+ *
+ * Signal handlers and the CPU count are set up before the command line is
+ * parsed. One thr_info is then initialised per input file, which starts
+ * its worker threads. Each iteration records its own start time in
+ * rgenesis, releases the workers and waits for all of them to finish the
+ * pass. Once every replay and reclaim has reported done, the input files
+ * and device maps are released.
+ */
+int main(int argc, char *argv[])
+{
+ int i;
+ struct list_head *p;
+
+ pgsize = getpagesize();
+ assert(pgsize > 0);
+
+ setup_signal(SIGINT, set_signal_done);
+ setup_signal(SIGTERM, set_signal_done);
+
+ get_ncpus();
+ handle_args(argc, argv);
+ find_input_files();
+
+ nfiles = list_len(&input_files);
+ __list_for_each(p, &input_files) {
+ tip_init(list_entry(p, struct thr_info, head));
+ }
+
+ wait_replays_ready();
+ for (i = 0; i < def_iterations; i++) {
+ rgenesis = gettime();
+ start_iter();
+ if (verbose)
+ fprintf(stderr, "I");
+ wait_iters_done();
+ }
+
+ wait_replays_done();
+ wait_reclaims_done();
+
+ if (verbose)
+ fprintf(stderr, "\n");
+
+ rem_input_files();
+ release_map_devs();
+
+ return 0;
+}
diff --git a/btreplay/doc/Makefile b/btreplay/doc/Makefile
new file mode 100644
index 0000000..e3b383e
--- /dev/null
+++ b/btreplay/doc/Makefile
@@ -0,0 +1,18 @@
+# Builds the btreplay user guide (DVI and PDF) from the LaTeX sources.
+# DOCTMP lists the LaTeX by-products that 'clean' removes.
+DOCTMP = btreplay.log btreplay.aux btreplay.dvi btreplay.toc
+
+all: btreplay.dvi btreplay.pdf
+
+btreplay.tex:
+ @touch btreplay.tex
+
+# latex is run twice; presumably so the table of contents picks up the
+# entries written by the first pass -- TODO confirm.
+btreplay.dvi: btreplay.tex abstract.tex
+ @latex btreplay.tex
+ @latex btreplay.tex
+
+btreplay.pdf: btreplay.dvi
+ @dvipdfm -p letter btreplay
+
+clean:
+ -rm -f $(DOCTMP)
+ -rm -f *.bak *.ps *.pdf
+ @rm -rf btreplay
diff --git a/btreplay/doc/abstract.tex b/btreplay/doc/abstract.tex
new file mode 100644
index 0000000..314d820
--- /dev/null
+++ b/btreplay/doc/abstract.tex
@@ -0,0 +1,34 @@
+%
+% Copyright (C) 2007 Alan D. Brunelle <Alan.Brunelle@hp.com>
+%
+% This program is free software; you can redistribute it and/or modify
+% it under the terms of the GNU General Public License as published by
+% the Free Software Foundation; either version 2 of the License, or
+% (at your option) any later version.
+%
+% This program is distributed in the hope that it will be useful,
+% but WITHOUT ANY WARRANTY; without even the implied warranty of
+% MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+% GNU General Public License for more details.
+%
+% You should have received a copy of the GNU General Public License
+% along with this program; if not, write to the Free Software
+% Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+%
+% vi :set textwidth=75
+%
+The \texttt{btrecord} and \texttt{btreplay} tools provide the ability to
+record and replay IOs captured by the \texttt{blktrace} utility. Attempts
+are made to maintain ordering, CPU mappings and time-separation of IOs. The
+general workflow is expected to be:
+
+\begin{enumerate}
+ \item Initiate \texttt{blktrace} to capture traces
+ \item Generate traces\ldots
+ \item Stop \texttt{blktrace}
+ \item Run \texttt{btrecord} to convert traces into IO records
+ \item Utilize \texttt{btreplay} to replay IOs
+\end{enumerate}
+
+This document will discuss the operating characteristics of
+\texttt{btreplay} and provide detailed command line option descriptions.
diff --git a/btreplay/doc/btreplay.tex b/btreplay/doc/btreplay.tex
new file mode 100644
index 0000000..beec720
--- /dev/null
+++ b/btreplay/doc/btreplay.tex
@@ -0,0 +1,521 @@
+%
+% Copyright (C) 2007 Alan D. Brunelle <Alan.Brunelle@hp.com>
+%
+% This program is free software; you can redistribute it and/or modify
+% it under the terms of the GNU General Public License as published by
+% the Free Software Foundation; either version 2 of the License, or
+% (at your option) any later version.
+%
+% This program is distributed in the hope that it will be useful,
+% but WITHOUT ANY WARRANTY; without even the implied warranty of
+% MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+% GNU General Public License for more details.
+%
+% You should have received a copy of the GNU General Public License
+% along with this program; if not, write to the Free Software
+% Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+%
+% vi :set textwidth=75
+%
+\documentclass{article}
+\usepackage{multirow,graphicx,placeins}
+
+\begin{document}
+%---------------------
+\title{\texttt{btrecord} and \texttt{btreplay} User Guide}
+\author{Alan D. Brunelle (Alan.Brunelle@hp.com)}
+\date{\today}
+\maketitle
+\begin{abstract}
+\input{abstract.tex}
+\end{abstract}
+\thispagestyle{empty}\newpage
+%---------------------
+\tableofcontents\thispagestyle{empty}\newpage
+%---------------------
+\section{Introduction}
+\input{abstract.tex}
+
+\bigskip
+This document presents the command line overview for
+\texttt{btrecord} and \texttt{btreplay}, and shows some commonly used
+example usages of it in everyday work here at OSLO's Scalability and
+Performance Group.
+
+\subsection*{Build Note}
+
+To build these tools, one needs to
+place the source directory next to a valid
+\texttt{blktrace}\footnote{\texttt{git://git.kernel.dk/blktrace.git}}
+directory, as it includes \texttt{../blktrace} in the \texttt{Makefile}.
+
+
+%---------------------
+\newpage\section{\texttt{btrecord} and \texttt{btreplay} Operating Model}
+
+The \texttt{blktrace} utility provides the ability to collect detailed
+traces from the kernel for each IO processed by the block IO layer. The
+traces provide a complete timeline for each IO processed, including
+detailed information concerning when an IO was first received by the block
+IO layer -- indicating the device, CPU number, time stamp, IO direction,
+sector number and IO size (number of sectors). Using this information,
+one is able to \emph{replay} the IO again on the same machine or another
+setup entirely.
+
+\subsection{Basic Workflow}
+The basic operating work-flow to replay IOs would be something like:
+
+\begin{enumerate}
+ \item Run \texttt{blktrace} to collect traces. Here you specify the
+ device or devices that you wish to trace and later replay IOs upon. Note:
+ the only traces you are interested in are \emph{QUEUE} requests --
+ thus, to save system resources (including storage for traces), one could
+ specify the \texttt{-a queue} command line option to \texttt{blktrace}.
+
+ \item While \texttt{blktrace} is running, you run the workload that you
+ are interested in.
+
+ \item When the workload has completed, you stop the \texttt{blktrace}
+ utility (thus saving all traces over the complete workload).
+
+ \item You extract the pertinent IO information from the traces saved by
+ \texttt{blktrace} using the \texttt{btrecord} utility. This will parse
+ each trace file created by \texttt{blktrace}, and craft IO descriptions
+ to be used in the next phase of the workload processing.
+
+ \item Once \texttt{btrecord} has successfully created a series of data
+ files to be processed, you can run the \texttt{btreplay} utility which
+ attempts to generate the same IOs seen during the sample workload phase.
+\end{enumerate}
+
+\subsection{IO Stream Replay Characteristics}
+ The major characteristics of the IO stream that are kept intact include:
+
+ \begin{description}
+ \item[Device] The IOs are replayed on the same device as was seen
+ during the sample workload.
+
+ \item[IO direction] The same IO direction (read/write) is maintained.
+
+ \item[IO offset] The same device offset is maintained.
+
+ \item[IO size] The same number of sectors are transferred.
+
+ \item[Time differential] The time stamps stored during the
+ \texttt{blktrace} run are used to determine the amount of time between
+ IOs during the sample workload. \texttt{btreplay} \emph{attempts} to
+ maintain the same time differential between IOs, but no guarantees as
+ to complete accuracy are provided by the utility.
+
+ \item[Device IO Stream Ordering] All IOs on a device are submitted in
+ the precise order they were seen during the sample workload run.
+ \end{description}
+
+ As noted above, the time between IOs may not be accurately maintained
+ during replays. In addition the actual ordering of IOs \emph{between}
+ devices is not necessarily maintained. (Each device with an IO stream
+ maintains its own concept of time, and thus there may be slippage of the
+ time kept between managing threads.)
+
+ \begin{quotation}
+ We have prototyped a different approach, wherein a single managing
+ thread handles all IOs across all devices. This approach, while
+ guaranteeing correct ordering of IOs across all devices, resulted in
+ much worse timing on a per IO basis.
+ \end{quotation}
+
+\subsection{\texttt{btrecord/btreplay} Method of Operation}
+
+As noted above, \texttt{btrecord} extracts \texttt{QUEUE} operations from
+\texttt{blktrace} output. These \texttt{QUEUE} operations indicate the
+entrance of IOs into the block IO layer. In order to replay these IOs with
+some accuracy in regards to ordering and timeliness, we decided to take
+multiple sequential (in time) IOs and put them in a single \emph{bunch} of
+IOs that will be processed as a single \emph{asynchronous IO} call to the
+kernel\footnote{Attempts to do them individually resulted in too large of a
+turnaround time penalty (user-space to kernel and back). Note that in a
+number of workloads, the IOs are coming in from the page cache handling
+code, and thus are submitted to the block IO layer with \emph{very small}
+time intervals between issues.}. To manage the size of the \emph{bunches},
+the \texttt{btrecord} utility provides you with two controlling knobs:
+
+\begin{description}
+ \item[\texttt{--max-bunch-time}] This is the amount of time to encompass
+ in one bunch -- only IOs within the time specified are eligible
+ for \emph{bunching.} The default time is 10 milliseconds (10,000,000
+ nanoseconds). Refer to section~\ref{sec:c-o-m} on page~\pageref{sec:c-o-m}
+ for more information.
+
+ \item[\texttt{--max-pkts}] A \emph{bunch} size can be anywhere from
+ 1 to 512 packets in size and by default we max a bunch to contain no
+ more than 8 individual IOs. With this option, one can increase or
+ decrease the maximum \emph{bunch} size. Refer to section~\ref{sec:c-o-M}
+ on page~\pageref{sec:c-o-M} for more information.
+\end{description}
+
+Each input data file (one per device per CPU) results in a new record
+data file (again, one per device per CPU) which contains information
+about \emph{bunches} of IOs to be replayed. \texttt{btreplay} operates on
+these record data files by spawning a new pair of threads per file. One
+thread manages the submitting of AIOs per bunch in the record data file,
+while the other thread manages reclaiming AIOs completed\footnote{We
+have found that having the same thread do both results in a further
+reduction in replay timing accuracy.}.
+
+Each submitting thread simply reads the input file of \emph{bunches}
+recorded by \texttt{btrecord}, and attempts to faithfully reproduce the
+ordering and timing of IOs seen during the sample workload. The reclaiming
+thread simply waits for AIO completions, freeing up resources for the
+submitting thread to utilize to submit new AIOs.
+
+The number of CPUs being used on the replay system can be different from
+the number on the recorded system. To help with mappings here the
+\texttt{--cpus} option allows one to state how many CPUs on the replay
+system to utilize. If the number of CPUs on the replay system is less than
+on the recording system, we wrap CPU IDs. This \emph{may} result in an
+overload of CPU processing capabilities on the replay system. (Refer to
+section~\ref{sec:p-o-c} on page~\pageref{sec:p-o-c} for more details about the
+\texttt{--cpus} option.)
+
+\newpage\subsection{Known Deficiencies and Proposed Possible Fixes}
+
+The overall known deficiencies with this current set of utilities are
+outlined here; in some cases ideas on additions and/or improvements are
+included as well.
+
+\begin{enumerate}
+ \item Lack of IO ordering across devices.
+
+ \begin{quote}
+ \emph{We could institute the notion of global time across threads,
+ and thus ensure IO ordering across devices, with some reduction in
+ timing accuracy.}
+ \end{quote}
+
+ \item Lack of IO timing accuracy -- additional time between IO bunches.
+
+ \begin{quote}
+ \emph{This is the primary problem with any IO replay mechanism -- how
+ to guarantee per-IO timing accuracy with respect to other replayed IOs?
+ One idea to reduce errors in this area would be to push the IO replay
+ into the kernel, where you \emph{may} receive more responsive timings.}
+ \end{quote}
+
+ \item Bunching of IOs results in reduced time amongst IOs within a bunch.
+
+ \begin{quote}
+ \emph{The user has \emph{some} control over this (via the
+ \texttt{--max-pkts} option). One \emph{could} simply specify
+ \texttt{--max-pkts=1} and then each IO would be treated individually. Of
+ course, this would probably then run into the problem of excessive
+ inter-IO times.}
+ \end{quote}
+
+ \item 1-to-1 mapping of devices -- for now the devices on the replay
+ machine must be the same as on the recording machine.
+
+ \begin{quote}
+ \emph{It should be relatively trivial to add in the notion of
+ mapping -- simply include a file that is read which maps devices
+ on one machine to devices (with offsets and sizes) on the replay
+ machine\footnote{The notion of an offset and device size to replay on
+ could be used to both allow for a single device to masquerade as more
+ than one device, and could be utilized in case the replay device is
+ smaller than the recorded device.}.}
+
+ \medskip\emph{One could also add in the notion of CPU mappings as well --
+ device $D_{rec}$ managed by CPU $C_{rec}$ on the recorded system
+ shall be replayed on device $D_{rep}$ and CPU $C_{rep}$ on the
+ replay machine.}
+
+ \bigskip
+ \begin{quote}
+ With version 0.9.1 we now support the \texttt{-M} option to do this
+ -- see section~\ref{sec:p-o-M} on page~\pageref{sec:p-o-M} for more
+ information on device mapping.
+ \end{quote}
+ \end{quote}
+
+\end{enumerate}
+
+%---------------------
+\newpage\section{\label{sec:command-line}Command Line Options}
+\subsection{\texttt{btrecord} Command Line Options}
+\begin{figure}[h!]
+\begin{verbatim}
+Usage: btrecord -- version 0.9.3
+
+ [ -d <dir> : --input-directory=<dir> ] Default: .
+ [ -D <dir> : --output-directory=<dir>] Default: .
+ [ -F : --find-traces ] Default: Off
+ [ -h : --help ] Default: Off
+ [ -m <nsec> : --max-bunch-time=<nsec> ] Default: 10 msec
+ [ -M <pkts> : --max-pkts=<pkts> ] Default: 8
+ [ -o <base> : --output-base=<base> ] Default: replay
+ [ -v : --verbose ] Default: Off
+ [ -V : --version ] Default: Off
+ <dev>... Default: None
+\end{verbatim}
+\caption{\label{fig:btrecord--help}\texttt{btrecord --help} Output}
+\end{figure}
+\FloatBarrier
+
+\subsubsection{\label{sec:c-o-d}\texttt{-d} or
+\texttt{--input-directory}\\Set Input Directory}
+
+The \texttt{-d} option requires a single parameter providing the directory
+name for where input files are to be found. The default directory is the
+current directory (\texttt{.}).
+
+\subsubsection{\label{sec:c-o-D}\texttt{-D} or
+\texttt{--output-directory}\\Set Output Directory}
+
+The \texttt{-D} option requires a single parameter providing the directory
+name for where output files are to be placed. The default directory is the
+current directory (\texttt{.}).
+
+\subsubsection{\texttt{-F} or \texttt{--find-traces}\\Find Trace Files
+Automatically}
+
+The \texttt{-F} option instructs \texttt{btrecord} to go find all the
+trace files in the directory specified (either via the \texttt{-d}
+option, or in the default directory '.').
+
+\subsubsection{\texttt{-h} or \texttt{--help}\\Display Help Message}
+\subsubsection{\texttt{-V} or \texttt{--version}\\Display
+\texttt{btrecord} Version}
+
+The \texttt{-h} option displays the command line options and
+defaults, as presented in figure~\ref{fig:btrecord--help} on
+page~\pageref{fig:btrecord--help}.
+
+The \texttt{-V} option displays the \texttt{btrecord} version, as shown here:
+
+\begin{verbatim}
+$ btrecord --version
+btrecord -- version 0.9.3
+\end{verbatim}
+
+Both commands exit immediately after processing the option.
+
+\subsubsection{\label{sec:c-o-m}\texttt{-m} or
+\texttt{--max-bunch-time}\\Set Maximum Time Per Bunch}
+
+The \texttt{-m} option requires a single parameter which specifies an
+amount of time (in nanoseconds) to include in any one bunch of IOs that
+are to be processed. The smaller the value, the smaller the number of
+IOs processed at one time -- perhaps yielding a more realistic replay.
+However, after a certain point the amount of overhead per bunch may result
+in additional real replay time, thus yielding less accurate replay times.
+
+The default value is 10,000,000 nanoseconds (10 milliseconds).
+
+\subsubsection{\label{sec:c-o-M}\texttt{-M} or
+\texttt{--max-pkts}\\Set Maximum Packets Per Bunch}
+
+The \texttt{-M} option requires a single parameter which specifies the
+maximum number of IOs to store in a single bunch. As with the \texttt{-m}
+option (section~\ref{sec:c-o-m}), smaller values \emph{may} or \emph{may not}
+yield more accurate replay times.
+
+The default value is 8, with a maximum value of up to 512 being supported.
+
+\subsubsection{\label{sec:c-o-o}\texttt{-o} or
+\texttt{--output-base}\\Set Base Name for Output Files}
+
+Each output file has 3 fields:
+
+\begin{enumerate}
+ \item Device identifier (taken directly from the device name of the
+ \texttt{blktrace} output file).
+
+ \item \texttt{btrecord} base name -- by default ``replay''.
+
+ \item And the CPU number (again, taken directly from the
+ \texttt{blktrace} output file name).
+\end{enumerate}
+
+This option requires a single parameter that will override the default name
+(replay), and replace it with the specified value.
+
+\subsubsection{\label{sec:c-o-v}\texttt{-v} or
+\texttt{--verbose}\\Select Verbose Output}
+
+This option will output some simple statistics at the end of a successful
+run. Figure~\ref{fig:verb-out} (page~\pageref{fig:verb-out}) shows
+an example of some output, while figure~\ref{fig:verb-defs}
+(page~\pageref{fig:verb-defs}) shows what the fields mean.
+
+\begin{figure}[h!]
+\begin{verbatim}
+sdab:0: 580661 pkts (tot), 126030 pkts (replay), 89809 bunches, 1.4 pkts/bunch
+sdab:1: 2559775 pkts (tot), 430172 pkts (replay), 293029 bunches, 1.5 pkts/bunch
+sdab:2: 653559 pkts (tot), 136522 pkts (replay), 102288 bunches, 1.3 pkts/bunch
+sdab:3: 474773 pkts (tot), 117849 pkts (replay), 69572 bunches, 1.7 pkts/bunch
+\end{verbatim}
+\caption{\label{fig:verb-out}Verbose Output Example}
+\end{figure}
+\FloatBarrier
+
+\begin{figure}[h!]
+\begin{description}
+ \item[Field 1] The first field contains the device name and CPU
+ identifier. Thus: \texttt{sdab:0:} means the device \texttt{sdab} and
+ traces on CPU 0.
+
+ \item[Field 2] The second field contains the total number of packets
+ processed for each device file.
+
+ \item[Field 3] The next field shows the number of packets eligible for
+ replay.
+
+ \item[Field 4] The fourth field contains the total number of IO bunches.
+
+ \item[Field 5] The last field shows the average number of IOs per bunch
+ recorded.
+\end{description}
+\caption{\label{fig:verb-defs}Verbose Field Definitions}
+\end{figure}
+\FloatBarrier
+
+%---------------------
+\newpage\subsection{\texttt{btreplay} Command Line Options}
+\begin{figure}[h!]
+\begin{verbatim}
+Usage: btreplay -- version 0.9.3
+
+ [ -c <cpus> : --cpus=<cpus> ] Default: 1
+ [ -d <dir> : --input-directory=<dir> ] Default: .
+ [ -F : --find-records ] Default: Off
+ [ -h : --help ] Default: Off
+ [ -i <base> : --input-base=<base> ] Default: replay
+ [ -I <iters>: --iterations=<iters> ] Default: 1
+ [ -M <file> : --map-devs=<file> ] Default: None
+ [ -N : --no-stalls ] Default: Off
+ [ -v : --verbose ] Default: Off
+ [ -V : --version ] Default: Off
+ [ -W : --write-enable ] Default: Off
+ <dev...> Default: None
+\end{verbatim}
+\caption{\label{fig:btreplay--help}\texttt{btreplay --help} Output}
+\end{figure}
+\FloatBarrier
+
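+\subsubsection{\label{sec:p-o-c}\texttt{-c} or
+\texttt{--cpus}\\Set Number of CPUs to Use}
+
+The \texttt{-c} option requires a single parameter which specifies the
+number of CPUs on the replay system to utilize. If this is less than the
+number of CPUs on the recording system, CPU IDs are wrapped. The default
+value is 1.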
+
+\subsubsection{\label{sec:p-o-d}\texttt{-d} or
+\texttt{--input-directory}\\Set Input Directory}
+
+The \texttt{-d} option requires a single parameter providing the directory
+name for where input files are to be found. The default directory is the
+current directory (\texttt{.}).
+
+\subsubsection{\texttt{-F} or \texttt{--find-records}\\Find Record Files
+Automatically}
+
+The \texttt{-F} option instructs \texttt{btreplay} to go find all the
+record files in the directory specified (either via the \texttt{-d}
+option, or in the default directory '.').
+
+\subsubsection{\texttt{-h} or \texttt{--help}\\Display Help Message}
+\subsubsection{\texttt{-V} or \texttt{--version}\\Display
+\texttt{btreplay} Version}
+
+The \texttt{-h} option displays the command line options and
+defaults, as presented in figure~\ref{fig:btreplay--help} on
+page~\pageref{fig:btreplay--help}.
+
+The \texttt{-V} option displays the \texttt{btreplay} version, as shown here:
+
+\begin{verbatim}
+$ btreplay --version
+btreplay -- version 0.9.3
+\end{verbatim}
+
+Both commands exit immediately after processing the option.
+
+\subsubsection{\label{sec:p-o-i}\texttt{-i} or
+\texttt{--input-base}\\Set Base Name for Input Files}
+
+Each input file has 3 fields:
+
+\begin{enumerate}
+ \item Device identifier (taken directly from the device name of the
+ \texttt{blktrace} output file).
+
+ \item \texttt{btrecord} base name -- by default ``replay''.
+
+ \item And the CPU number (again, taken directly from the
+ \texttt{blktrace} output file name).
+\end{enumerate}
+
+This option requires a single parameter that will override the default name
+(replay), and replace it with the specified value.
+
+\subsubsection{\label{sec:p-o-I}\texttt{-I} or
+\texttt{--iterations}\\Set Number of Iterations to Run}
+
+This option requires a single parameter which specifies the number of times
+to run through the input files. The default value is 1.
+
+\subsubsection{\label{sec:p-o-M}\texttt{-M} or \texttt{--map-devs}\\
+Specify Device Mappings}
+
+This option requires a single parameter which specifies the name of a
+file containing device mappings. The file must be very simply managed, with
+just two pieces of data per line:
+
+\begin{enumerate}
+ \item The device name on the recorded system (with the \texttt{'/dev/'}
+ removed). Example: \texttt{/dev/sda} would just be \texttt{sda}.
+
+ \item The device name on the replay system to use (again, without the
+ \texttt{'/dev/'} path prepended).
+\end{enumerate}
+
+An example file for when one would map devices \texttt{/dev/sda} and
+\texttt{/dev/sdb} on the recorded system to \texttt{/dev/sdg} and
+\texttt{/dev/sdh} on the replay system would be:
+
+\begin{verbatim}
+sda sdg
+sdb sdh
+\end{verbatim}
+
+The only entries in the file that are allowed are these two element lines
+-- we do not (yet?) support the notion of blank lines, or comment lines, or
+the like.
+
+The utility \emph{does} allow for multiple \texttt{-M} options to be
+supplied on the command line.
+
+\subsubsection{\label{sec:o-N}\texttt{-N} or \texttt{--no-stalls}\\Disable
+Pre-bunch Stalls}
+
+When specified on the command line, all pre-bunch stall indicators will be
+ignored. IOs will be replayed without inter-bunch delays.
+
+\subsubsection{\label{sec:p-o-v}\texttt{-v} or
+\texttt{--verbose}\\Select Verbose Output}
+
+When specified on the command line, this option instructs \texttt{btreplay}
+to store information concerning each \emph{stall} and IO operation
+performed by \texttt{btreplay}. The name of each file so created will be
+the input file name used with an extension of \texttt{.rep} appended onto
+it. Thus, an input file of the name \texttt{sdab.replay.3} would generate a
+verbose output file with the name \texttt{sdab.replay.3.rep} in the
+directory specified for input files.
+
+In addition, \texttt{btreplay} will also output to \texttt{stderr} the
+names of the input files being processed.
+
+\subsubsection{\label{sec:p-o-W}\texttt{-W} or
+\texttt{--write-enable}\\Enable Writing During Replay}
+
+As a precautionary measure, by default \texttt{btreplay} will \emph{not}
+process \emph{write} requests. In order to enable \texttt{btreplay} to
+actually \emph{write} to devices one must explicitly specify the
+\texttt{-W} option.
+
+\end{document}