2 * Blktrace replay utility - Play traces back
4 * Copyright (C) 2007 Alan D. Brunelle <Alan.Brunelle@hp.com>
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 static char build_date[] = __DATE__ " at "__TIME__;
35 #include <sys/param.h>
38 #include <sys/types.h>
42 #if !defined(_GNU_SOURCE)
51 * ========================================================================
52 * ==== STRUCTURE DEFINITIONS =============================================
53 * ========================================================================
57 * Each device map has one of these:
59 * @head: Linked on to map_devs
60 * @from_dev: Device name as seen on recorded system
61 * @to_dev: Device name to be used on replay system
64 struct list_head head;
65 char *from_dev, *to_dev;
69 * Each device name specified has one of these (until threads are created)
71 * @head: Linked onto input_devs
72 * @devnm: Device name -- 'sd*'
75 struct list_head head;
80 * Per input file information
82 * @head: Used to link up on input_files
83 * @free_iocbs: List of free iocb's available for use
84 * @used_iocbs: List of iocb's currently outstanding
85 * @mutex: Mutex used with condition variable to protect volatile values
86 * @cond: Condition variable used when waiting on a volatile value change
87 * @naios_out: Current number of AIOs outstanding on this context
88 * @naios_free: Number of AIOs on the free list (short cut for list_len)
89 * @send_wait: Boolean: When true, the sub thread is waiting on free IOCBs
90 * @reap_wait: Boolean: When true, the rec thread is waiting on used IOCBs
91 * @send_done: Boolean: When true, the sub thread has completed work
92 * @reap_done: Boolean: When true, the rec thread has completed work
93 * @sub_thread: Thread used to submit IOs.
94 * @rec_thread: Thread used to reclaim IOs.
96 * @devnm: Copy of the device name being managed by this thread
97 * @file_name: Full name of the input file
98 * @cpu: CPU this thread is pinned to
99 * @ifd: Input file descriptor
100 * @ofd: Output file descriptor
101 * @iterations: Remaining iterations to process
102 * @vfp: For verbose dumping of actions performed
105 struct list_head head, free_iocbs, used_iocbs;
106 pthread_mutex_t mutex;
108 volatile long naios_out, naios_free;
109 volatile int send_wait, reap_wait, send_done, reap_done;
110 pthread_t sub_thread, rec_thread;
112 char *devnm, *file_name;
113 int cpu, ifd, ofd, iterations;
118 * Every Asynchronous IO used has one of these (naios per file/device).
120 * @iocb: IOCB sent down via io_submit
121 * @head: Linked onto file_list.free_iocbs or file_list.used_iocbs
122 * @tip: Pointer to per-thread information this IO is associated with
123 * @nbytes: Number of bytes in buffer associated with iocb
127 struct list_head head;
128 struct thr_info *tip;
133 * ========================================================================
134 * ==== GLOBAL VARIABLES ==================================================
135 * ========================================================================
138 static volatile int signal_done = 0; // Boolean: Signal'ed, need to quit
140 static char *ibase = "replay"; // Input base name
141 static char *idir = "."; // Input directory base
142 static int cpus_to_use = -1; // Number of CPUs to use
143 static int def_iterations = 1; // Default number of iterations
144 static int naios = 512; // Number of AIOs per thread
145 static int ncpus = 0; // Number of CPUs in the system
146 static int verbose = 0; // Boolean: Output some extra info
147 static int write_enabled = 0; // Boolean: Enable writing
148 static __u64 genesis = ~0; // Earliest time seen
149 static __u64 rgenesis; // Our start time
150 static size_t pgsize; // System Page size
151 static int nb_sec = 512; // Number of bytes per sector
152 static LIST_HEAD(input_devs); // List of devices to handle
153 static LIST_HEAD(input_files); // List of input files to handle
154 static LIST_HEAD(map_devs); // List of device maps
155 static int nfiles = 0; // Number of files to handle
156 static int no_stalls = 0; // Boolean: Disable pre-stalls
157 static unsigned acc_factor = 1; // Int: Acceleration factor
158 static int find_records = 0; // Boolean: Find record files auto
161 * Variables managed under control of condition variables.
163 * n_reclaims_done: Counts number of reclaim threads that have completed.
164 * n_replays_done: Counts number of replay threads that have completed.
165 * n_replays_ready: Counts number of replay threads ready to start.
166 * n_iters_done: Counts number of replay threads done one iteration.
167 * iter_start: Starts an iteration for the replay threads.
169 static volatile int n_reclaims_done = 0;
170 static pthread_mutex_t reclaim_done_mutex = PTHREAD_MUTEX_INITIALIZER;
171 static pthread_cond_t reclaim_done_cond = PTHREAD_COND_INITIALIZER;
173 static volatile int n_replays_done = 0;
174 static pthread_mutex_t replay_done_mutex = PTHREAD_MUTEX_INITIALIZER;
175 static pthread_cond_t replay_done_cond = PTHREAD_COND_INITIALIZER;
177 static volatile int n_replays_ready = 0;
178 static pthread_mutex_t replay_ready_mutex = PTHREAD_MUTEX_INITIALIZER;
179 static pthread_cond_t replay_ready_cond = PTHREAD_COND_INITIALIZER;
181 static volatile int n_iters_done = 0;
182 static pthread_mutex_t iter_done_mutex = PTHREAD_MUTEX_INITIALIZER;
183 static pthread_cond_t iter_done_cond = PTHREAD_COND_INITIALIZER;
185 static volatile int iter_start = 0;
186 static pthread_mutex_t iter_start_mutex = PTHREAD_MUTEX_INITIALIZER;
187 static pthread_cond_t iter_start_cond = PTHREAD_COND_INITIALIZER;
190 * ========================================================================
191 * ==== FORWARD REFERENCES ================================================
192 * ========================================================================
195 static void *replay_sub(void *arg);
196 static void *replay_rec(void *arg);
197 static char usage_str[];
200 * ========================================================================
201 * ==== INLINE ROUTINES ===================================================
202 * ========================================================================
206 * The 'fatal' routine will output a perror message (if errstring is !NULL)
207 * and display a string (with variable arguments) and then exit with the
208 * specified exit value.
211 #define ERR_SYSCALL 2
212 static inline void fatal(const char *errstring, const int exitval,
213 const char *fmt, ...)
221 vfprintf(stderr, fmt, ap);
/*
 * du64_to_sec - Whole-second part of a nanosecond count
 * @du64: Time value in nanoseconds
 *
 * The value is treated as unsigned and divided by one billion.
 */
static inline long long unsigned du64_to_sec(__u64 du64)
{
	long long unsigned total_ns = (long long unsigned)du64;

	return total_ns / (1000 * 1000 * 1000);
}
/*
 * du64_to_nsec - Sub-second (nanosecond) part of a nanosecond count
 * @du64: Time value in nanoseconds
 *
 * The value is reinterpreted as signed and its magnitude is taken before
 * the remainder modulo one billion is computed.
 */
static inline long long unsigned du64_to_nsec(__u64 du64)
{
	long long magnitude = llabs((long long)du64);

	return magnitude % (1000 * 1000 * 1000);
}
239 * min - Return minimum of two integers
/*
 * min - Smaller of two ints (returns @b when the two are equal)
 */
static inline int min(int a, int b)
{
	if (a < b)
		return a;

	return b;
}
247 * minl - Return minimum of two longs
/*
 * minl - Smaller of two longs (returns @b when the two are equal)
 */
static inline long minl(long a, long b)
{
	if (a < b)
		return a;

	return b;
}
255 * usage - Display usage string and version
257 static inline void usage(void)
259 fprintf(stderr, "Usage: btreplay -- version %s\n%s",
260 my_btversion, usage_str);
264 * is_send_done - Returns true if sender should quit early
265 * @tip: Per-thread information
267 static inline int is_send_done(struct thr_info *tip)
269 return signal_done || tip->send_done;
273 * is_reap_done - Returns true if reaper should quit early
274 * @tip: Per-thread information
276 static inline int is_reap_done(struct thr_info *tip)
278 return tip->send_done && tip->naios_out == 0;
282 * ts2ns - Convert timespec values to a nanosecond value
/* NS_TICKS: nanoseconds per second, computed in 64-bit arithmetic */
#define NS_TICKS		((__u64)1000 * (__u64)1000 * (__u64)1000)

/*
 * ts2ns - Flatten a timespec into a single nanosecond count
 * @ts: Time value to convert
 */
static inline __u64 ts2ns(struct timespec *ts)
{
	__u64 whole_secs = (__u64)(ts->tv_sec);
	__u64 frac_nsecs = (__u64)(ts->tv_nsec);

	return (whole_secs * NS_TICKS) + frac_nsecs;
}
291 * tv2ns - Convert timeval values to a nanosecond value
/*
 * tv2ns - Convert a timeval (seconds + microseconds) to nanoseconds
 * @tp: Time value to convert
 *
 * Seconds are scaled by one billion and microseconds by one thousand.
 * The previous version added tv_sec unscaled, so whole seconds were
 * counted as single nanoseconds and the result disagreed with ts2ns().
 */
static inline __u64 tv2ns(struct timeval *tp)
{
	const __u64 ns_per_sec = (__u64)1000 * (__u64)1000 * (__u64)1000;
	const __u64 ns_per_usec = (__u64)1000;

	return ((__u64)(tp->tv_sec) * ns_per_sec) +
	       ((__u64)(tp->tv_usec) * ns_per_usec);
}
299 * touch_memory - Force physical memory to be allocated by touching it
301 * For malloc()ed memory we need to /touch/ it to make it really
302 * exist. Otherwise, for write's (to storage) things may not work
303 * as planned - we see Linux just use a single area to /read/ from
304 * (as there isn't any memory that has been associated with the
305 * allocated virtual addresses yet).
307 static inline void touch_memory(char *buf, size_t bsize)
309 #if defined(PREP_BUFS)
310 memset(buf, 0, bsize);
314 for (i = 0; i < bsize; i += pgsize)
320 * buf_alloc - Returns a page-aligned buffer of the specified size
321 * @nbytes: Number of bytes to allocate
323 static inline void *buf_alloc(size_t nbytes)
327 if (posix_memalign(&buf, pgsize, nbytes)) {
328 fatal("posix_memalign", ERR_SYSCALL, "Allocation failed\n");
336 * gettime - Returns current time
338 static inline __u64 gettime(void)
340 static int use_clock_gettime = -1; // Which clock to use
342 if (use_clock_gettime < 0) {
343 use_clock_gettime = clock_getres(CLOCK_MONOTONIC, NULL) == 0;
344 if (use_clock_gettime) {
345 struct timespec ts = {
349 clock_settime(CLOCK_MONOTONIC, &ts);
353 if (use_clock_gettime) {
355 clock_gettime(CLOCK_MONOTONIC, &ts);
360 gettimeofday(&tp, NULL);
366 * setup_signal - Set up a signal handler for the specified signum
368 static inline void setup_signal(int signum, sighandler_t handler)
370 if (signal(signum, handler) == SIG_ERR) {
371 fatal("signal", ERR_SYSCALL, "Failed to set signal %d\n",
378 * ========================================================================
379 * ==== CONDITION VARIABLE ROUTINES =======================================
380 * ========================================================================
384 * __set_cv - Increments a variable under condition variable control.
385 * @pmp: Pointer to the associated mutex
386 * @pcp: Pointer to the associated condition variable
387 * @vp: Pointer to the variable being incremented
388 * @mxv: Max value for variable (Used only when ASSERTS are on)
390 static inline void __set_cv(pthread_mutex_t *pmp, pthread_cond_t *pcp,
392 __attribute__((__unused__))int mxv)
394 pthread_mutex_lock(pmp);
397 pthread_cond_signal(pcp);
398 pthread_mutex_unlock(pmp);
402 * __wait_cv - Waits for a variable under cond var control to hit a value
403 * @pmp: Pointer to the associated mutex
404 * @pcp: Pointer to the associated condition variable
405 * @vp: Pointer to the variable being waited on
406 * @mxv: Value to wait for
408 static inline void __wait_cv(pthread_mutex_t *pmp, pthread_cond_t *pcp,
409 volatile int *vp, int mxv)
411 pthread_mutex_lock(pmp);
413 pthread_cond_wait(pcp, pmp);
415 pthread_mutex_unlock(pmp);
418 static inline void set_reclaim_done(void)
420 __set_cv(&reclaim_done_mutex, &reclaim_done_cond, &n_reclaims_done,
424 static inline void wait_reclaims_done(void)
426 __wait_cv(&reclaim_done_mutex, &reclaim_done_cond, &n_reclaims_done,
430 static inline void set_replay_ready(void)
432 __set_cv(&replay_ready_mutex, &replay_ready_cond, &n_replays_ready,
436 static inline void wait_replays_ready(void)
438 __wait_cv(&replay_ready_mutex, &replay_ready_cond, &n_replays_ready,
442 static inline void set_replay_done(void)
444 __set_cv(&replay_done_mutex, &replay_done_cond, &n_replays_done,
448 static inline void wait_replays_done(void)
450 __wait_cv(&replay_done_mutex, &replay_done_cond, &n_replays_done,
454 static inline void set_iter_done(void)
456 __set_cv(&iter_done_mutex, &iter_done_cond, &n_iters_done,
460 static inline void wait_iters_done(void)
462 __wait_cv(&iter_done_mutex, &iter_done_cond, &n_iters_done,
467 * wait_iter_start - Wait for an iteration to start
469 * This is /slightly/ different: we are waiting for a value to become
470 * non-zero, and then we decrement it and go on.
472 static inline void wait_iter_start(void)
474 pthread_mutex_lock(&iter_start_mutex);
475 while (iter_start == 0)
476 pthread_cond_wait(&iter_start_cond, &iter_start_mutex);
477 assert(1 <= iter_start && iter_start <= nfiles);
479 pthread_mutex_unlock(&iter_start_mutex);
483 * start_iter - Start an iteration at the replay thread level
485 static inline void start_iter(void)
487 pthread_mutex_lock(&iter_start_mutex);
488 assert(iter_start == 0);
490 pthread_cond_broadcast(&iter_start_cond);
491 pthread_mutex_unlock(&iter_start_mutex);
495 * ========================================================================
496 * ==== CPU RELATED ROUTINES ==============================================
497 * ========================================================================
501 * get_ncpus - Sets up the global 'ncpus' value
503 static void get_ncpus(void)
505 #ifdef _SC_NPROCESSORS_CONF
506 ncpus = sysconf(_SC_NPROCESSORS_CONF);
511 if (sched_getaffinity(getpid(), sizeof(cpus), &cpus)) {
512 fatal("sched_getaffinity", ERR_SYSCALL, "Can't get CPU info\n");
517 for (last_cpu = 0; last_cpu < CPU_SETSIZE && CPU_ISSET(last_cpu, &cpus); last_cpu++)
518 if (CPU_ISSET( last_cpu, &cpus) )
523 fatal(NULL, ERR_SYSCALL, "Insufficient number of CPUs\n");
529 * pin_to_cpu - Pin this thread to a specific CPU
530 * @tip: Thread information
532 static void pin_to_cpu(struct thr_info *tip)
536 assert(0 <= tip->cpu && tip->cpu < ncpus);
539 CPU_SET(tip->cpu, &cpus);
540 if (sched_setaffinity(getpid(), sizeof(cpus), &cpus)) {
541 fatal("sched_setaffinity", ERR_SYSCALL, "Failed to pin CPU\n");
549 (void)sched_getaffinity(getpid(), sizeof(now), &now);
550 fprintf(tip->vfp, "Pinned to CPU %02d ", tip->cpu);
551 for (i = 0; i < ncpus; i++)
552 fprintf(tip->vfp, "%1d", CPU_ISSET(i, &now));
553 fprintf(tip->vfp, "\n");
558 * ========================================================================
559 * ==== INPUT DEVICE HANDLERS =============================================
560 * ========================================================================
564 * add_input_dev - Add a device ('sd*') to the list of devices to handle
566 static void add_input_dev(char *devnm)
569 struct dev_info *dip;
571 __list_for_each(p, &input_devs) {
572 dip = list_entry(p, struct dev_info, head);
573 if (strcmp(dip->devnm, devnm) == 0)
577 dip = malloc(sizeof(*dip));
578 dip->devnm = strdup(devnm);
579 list_add_tail(&dip->head, &input_devs);
583 * rem_input_dev - Remove resources associated with this device
585 static void rem_input_dev(struct dev_info *dip)
587 list_del(&dip->head);
592 static void find_input_devs(char *idir)
595 DIR *dir = opendir(idir);
598 fatal(idir, ERR_ARGS, "Unable to open %s\n", idir);
602 while ((ent = readdir(dir)) != NULL) {
605 if (strstr(ent->d_name, ".replay.") == NULL)
608 dsf = strdup(ent->d_name);
620 * ========================================================================
621 * ==== MAP DEVICE INTERFACES =============================================
622 * ========================================================================
626 * read_map_devs - Read in a set of device mapping from the provided file.
627 * @file_name: File containing device maps
629 * We support the notion of multiple such files being specified on the cmd line
631 static void read_map_devs(char *file_name)
634 char *from_dev, *to_dev;
636 fp = fopen(file_name, "r");
638 fatal(file_name, ERR_SYSCALL, "Could not open map devs file\n");
642 while (fscanf(fp, "%as %as", &from_dev, &to_dev) == 2) {
643 struct map_dev *mdp = malloc(sizeof(*mdp));
645 mdp->from_dev = from_dev;
646 mdp->to_dev = to_dev;
647 list_add_tail(&mdp->head, &map_devs);
654 * release_map_devs - Release resources associated with device mappings.
656 static void release_map_devs(void)
658 struct list_head *p, *q;
660 list_for_each_safe(p, q, &map_devs) {
661 struct map_dev *mdp = list_entry(p, struct map_dev, head);
663 list_del(&mdp->head);
672 * map_dev - Return the mapped device for that specified
673 * @from_dev: Device name as seen on recorded system
675 * Note: If there is no such mapping, we return the same name.
677 static char *map_dev(char *from_dev)
681 __list_for_each(p, &map_devs) {
682 struct map_dev *mdp = list_entry(p, struct map_dev, head);
684 if (strcmp(from_dev, mdp->from_dev) == 0)
692 * ========================================================================
693 * ==== IOCB MANAGEMENT ROUTINES ==========================================
694 * ========================================================================
698 * iocb_init - Initialize the fields of an IOCB
699 * @tip: Per-thread information
700 * @iocbp: IOCB pointer to update
702 static void iocb_init(struct thr_info *tip, struct iocb_pkt *iocbp)
706 iocbp->iocb.u.c.buf = NULL;
710 * iocb_setup - Set up an iocb with this AIOs information
711 * @iocbp: IOCB pointer to update
712 * @rw: Direction (0 == write, 1 == read)
713 * @n: Number of bytes to transfer
714 * @off: Offset (in bytes)
716 static void iocb_setup(struct iocb_pkt *iocbp, int rw, int n, long long off)
719 struct iocb *iop = &iocbp->iocb;
721 assert(rw == 0 || rw == 1);
722 assert(0 < n && (n % nb_sec) == 0);
726 if (iocbp->nbytes >= n) {
731 assert(iop->u.c.buf);
740 io_prep_pread(iop, iocbp->tip->ofd, buf, n, off);
742 assert(write_enabled);
743 io_prep_pwrite(iop, iocbp->tip->ofd, buf, n, off);
744 touch_memory(buf, n);
751 * ========================================================================
752 * ==== PER-THREAD SET UP & TEAR DOWN =====================================
753 * ========================================================================
757 * tip_init - Per thread initialization function
759 static void tip_init(struct thr_info *tip)
763 INIT_LIST_HEAD(&tip->free_iocbs);
764 INIT_LIST_HEAD(&tip->used_iocbs);
766 pthread_mutex_init(&tip->mutex, NULL);
767 pthread_cond_init(&tip->cond, NULL);
769 if (io_setup(naios, &tip->ctx)) {
770 fatal("io_setup", ERR_SYSCALL, "io_setup failed\n");
776 tip->send_done = tip->reap_done = 0;
777 tip->send_wait = tip->reap_wait = 0;
779 memset(&tip->sub_thread, 0, sizeof(tip->sub_thread));
780 memset(&tip->rec_thread, 0, sizeof(tip->rec_thread));
782 for (i = 0; i < naios; i++) {
783 struct iocb_pkt *iocbp = buf_alloc(sizeof(*iocbp));
785 iocb_init(tip, iocbp);
786 list_add_tail(&iocbp->head, &tip->free_iocbs);
788 tip->naios_free = naios;
793 sprintf(fn, "%s/%s.%s.%d.rep", idir, tip->devnm, ibase,
795 tip->vfp = fopen(fn, "w");
797 fatal(fn, ERR_SYSCALL, "Failed to open report\n");
801 setlinebuf(tip->vfp);
804 if (pthread_create(&tip->sub_thread, NULL, replay_sub, tip)) {
805 fatal("pthread_create", ERR_SYSCALL,
806 "thread create failed\n");
810 if (pthread_create(&tip->rec_thread, NULL, replay_rec, tip)) {
811 fatal("pthread_create", ERR_SYSCALL,
812 "thread create failed\n");
818 * tip_release - Release resources associated with this thread
820 static void tip_release(struct thr_info *tip)
822 struct list_head *p, *q;
824 assert(tip->send_done);
825 assert(tip->reap_done);
826 assert(list_len(&tip->used_iocbs) == 0);
827 assert(tip->naios_free == naios);
829 if (pthread_join(tip->sub_thread, NULL)) {
830 fatal("pthread_join", ERR_SYSCALL, "pthread sub join failed\n");
833 if (pthread_join(tip->rec_thread, NULL)) {
834 fatal("pthread_join", ERR_SYSCALL, "pthread rec join failed\n");
838 io_destroy(tip->ctx);
840 list_splice(&tip->used_iocbs, &tip->free_iocbs);
841 list_for_each_safe(p, q, &tip->free_iocbs) {
842 struct iocb_pkt *iocbp = list_entry(p, struct iocb_pkt, head);
844 list_del(&iocbp->head);
846 free(iocbp->iocb.u.c.buf);
850 pthread_cond_destroy(&tip->cond);
851 pthread_mutex_destroy(&tip->mutex);
855 * add_input_file - Allocate and initialize per-input file structure
856 * @cpu: CPU for this file
857 * @devnm: Device name for this file
858 * @file_name: Fully qualified input file name
860 static void add_input_file(int cpu, char *devnm, char *file_name)
863 struct io_file_hdr hdr;
864 struct thr_info *tip = buf_alloc(sizeof(*tip));
865 __u64 my_version = mk_btversion(btver_mjr, btver_mnr, btver_sub);
867 assert(0 <= cpu && cpu < ncpus);
869 memset(&hdr, 0, sizeof(hdr));
870 memset(tip, 0, sizeof(*tip));
871 tip->cpu = cpu % cpus_to_use;
872 tip->iterations = def_iterations;
874 tip->ifd = open(file_name, O_RDONLY);
876 fatal(file_name, ERR_ARGS, "Unable to open\n");
879 if (fstat(tip->ifd, &buf) < 0) {
880 fatal(file_name, ERR_SYSCALL, "fstat failed\n");
883 if (buf.st_size < (off_t)sizeof(hdr)) {
885 fprintf(stderr, "\t%s empty\n", file_name);
889 if (read(tip->ifd, &hdr, sizeof(hdr)) != sizeof(hdr)) {
890 fatal(file_name, ERR_ARGS, "Header read failed\n");
894 if (hdr.version != my_version) {
895 fprintf(stderr, "%llx %llx %llx %llx\n",
896 (long long unsigned)hdr.version,
897 (long long unsigned)hdr.genesis,
898 (long long unsigned)hdr.nbunches,
899 (long long unsigned)hdr.total_pkts);
900 fatal(NULL, ERR_ARGS,
901 "BT version mismatch: %lx versus my %lx\n",
902 (long)hdr.version, (long)my_version);
906 if (hdr.nbunches == 0) {
913 if (hdr.genesis < genesis) {
915 fprintf(stderr, "Setting genesis to %llu.%llu\n",
916 du64_to_sec(hdr.genesis),
917 du64_to_nsec(hdr.genesis));
918 genesis = hdr.genesis;
921 tip->devnm = strdup(devnm);
922 tip->file_name = strdup(file_name);
924 list_add_tail(&tip->head, &input_files);
927 fprintf(stderr, "Added %s %llu\n", file_name,
928 (long long)hdr.genesis);
932 * rem_input_file - Release resources associated with an input file
933 * @tip: Per-input file information
935 static void rem_input_file(struct thr_info *tip)
937 list_del(&tip->head);
943 free(tip->file_name);
949 * rem_input_files - Remove all input files
951 static void rem_input_files(void)
953 struct list_head *p, *q;
955 list_for_each_safe(p, q, &input_files) {
956 rem_input_file(list_entry(p, struct thr_info, head));
961 * __find_input_files - Find input files associated with this device (per cpu)
963 static void __find_input_files(struct dev_info *dip)
968 char full_name[MAXPATHLEN];
970 sprintf(full_name, "%s/%s.%s.%d", idir, dip->devnm, ibase, cpu);
971 if (access(full_name, R_OK) != 0)
974 add_input_file(cpu, dip->devnm, full_name);
979 fatal(NULL, ERR_ARGS, "No traces found for %s\n", dip->devnm);
988 * find_input_files - Find input files for all devices
990 static void find_input_files(void)
992 struct list_head *p, *q;
994 list_for_each_safe(p, q, &input_devs) {
995 __find_input_files(list_entry(p, struct dev_info, head));
1000 * ========================================================================
1001 * ==== RECLAIM ROUTINES ==================================================
1002 * ========================================================================
1006 * reap_wait_aios - Wait for and return number of outstanding AIOs
1008 * Will return 0 if we are done
1010 static int reap_wait_aios(struct thr_info *tip)
1014 if (!is_reap_done(tip)) {
1015 pthread_mutex_lock(&tip->mutex);
1016 while (tip->naios_out == 0) {
1018 if (pthread_cond_wait(&tip->cond, &tip->mutex)) {
1019 fatal("pthread_cond_wait", ERR_SYSCALL,
1020 "nfree_current cond wait failed\n");
1024 naios = tip->naios_out;
1025 pthread_mutex_unlock(&tip->mutex);
1027 assert(is_reap_done(tip) || naios > 0);
1029 return is_reap_done(tip) ? 0 : naios;
1033 * reclaim_ios - Reclaim AIOs completed, recycle IOCBs
1034 * @tip: Per-thread information
1035 * @naios_out: Number of AIOs we have outstanding (min)
1037 static void reclaim_ios(struct thr_info *tip, long naios_out)
1040 struct io_event *evp, events[naios_out];
1045 ndone = io_getevents(tip->ctx, 1, naios_out, events, NULL);
1049 if (errno && errno != EINTR) {
1050 fatal("io_getevents", ERR_SYSCALL,
1051 "io_getevents failed\n");
1055 assert(0 < ndone && ndone <= naios_out);
1057 pthread_mutex_lock(&tip->mutex);
1058 for (i = 0, evp = events; i < ndone; i++, evp++) {
1059 struct iocb_pkt *iocbp = evp->data;
1061 if (evp->res != iocbp->iocb.u.c.nbytes) {
1062 fatal(NULL, ERR_SYSCALL,
1063 "Event failure %ld/%ld\t(%ld + %ld)\n",
1064 (long)evp->res, (long)evp->res2,
1065 (long)iocbp->iocb.u.c.offset / nb_sec,
1066 (long)iocbp->iocb.u.c.nbytes / nb_sec);
1070 list_move_tail(&iocbp->head, &tip->free_iocbs);
1073 tip->naios_free += ndone;
1074 tip->naios_out -= ndone;
1075 naios_out = minl(naios_out, tip->naios_out);
1077 if (tip->send_wait) {
1079 pthread_cond_signal(&tip->cond);
1081 pthread_mutex_unlock(&tip->mutex);
1084 * Short cut: If we /know/ there are some more AIOs, go handle them
1091 * replay_rec - Worker thread to reclaim AIOs
1092 * @arg: Pointer to thread information
1094 static void *replay_rec(void *arg)
1097 struct thr_info *tip = arg;
1099 while ((naios_out = reap_wait_aios(tip)) > 0)
1100 reclaim_ios(tip, naios_out);
1102 assert(tip->send_done);
1110 * ========================================================================
1111 * ==== REPLAY ROUTINES ===================================================
1112 * ========================================================================
1116 * next_bunch - Retrieve next bunch of AIOs to process
1117 * @tip: Per-thread information
1118 * @bunch: Bunch information
1120 * Returns TRUE if we recovered a bunch of IOs, else hit EOF
1122 static int next_bunch(struct thr_info *tip, struct io_bunch *bunch)
1124 size_t count, result;
1126 result = read(tip->ifd, &bunch->hdr, sizeof(bunch->hdr));
1127 if (result != sizeof(bunch->hdr)) {
1131 fatal(tip->file_name, ERR_SYSCALL, "Short hdr(%ld)\n",
1135 assert(bunch->hdr.npkts <= BT_MAX_PKTS);
1137 count = bunch->hdr.npkts * sizeof(struct io_pkt);
1138 result = read(tip->ifd, &bunch->pkts, count);
1139 if (result != count) {
1140 fatal(tip->file_name, ERR_SYSCALL, "Short pkts(%ld/%ld)\n",
1141 (long)result, (long)count);
1149 * nfree_current - Returns current number of AIOs that are free
1151 * Will wait for available ones...
1153 * Returns 0 if we have some condition that causes us to exit
1155 static int nfree_current(struct thr_info *tip)
1159 pthread_mutex_lock(&tip->mutex);
1160 while (!is_send_done(tip) && ((nfree = tip->naios_free) == 0)) {
1162 if (pthread_cond_wait(&tip->cond, &tip->mutex)) {
1163 fatal("pthread_cond_wait", ERR_SYSCALL,
1164 "nfree_current cond wait failed\n");
1168 pthread_mutex_unlock(&tip->mutex);
1174 * stall - Stall for the number of nanoseconds requested
1176 * We may be late, in which case we just return.
1178 static void stall(struct thr_info *tip, long long oclock)
1180 struct timespec req;
1181 long long dreal, tclock = gettime() - rgenesis;
1183 oclock /= acc_factor;
1186 fprintf(tip->vfp, " stall(%lld.%09lld, %lld.%09lld)\n",
1187 du64_to_sec(oclock), du64_to_nsec(oclock),
1188 du64_to_sec(tclock), du64_to_nsec(tclock));
1190 while (!is_send_done(tip) && tclock < oclock) {
1191 dreal = oclock - tclock;
1192 req.tv_sec = dreal / (1000 * 1000 * 1000);
1193 req.tv_nsec = dreal % (1000 * 1000 * 1000);
1196 fprintf(tip->vfp, "++ stall(%lld.%09lld) ++\n",
1197 (long long)req.tv_sec,
1198 (long long)req.tv_nsec);
1201 if (nanosleep(&req, NULL) < 0 && signal_done)
1204 tclock = gettime() - rgenesis;
1209 * iocbs_map - Map a set of AIOs onto a set of IOCBs
1210 * @tip: Per-thread information
1211 * @list: List of AIOs created
1212 * @pkts: AIOs to map
1213 * @ntodo: Number of AIOs to map
1215 static void iocbs_map(struct thr_info *tip, struct iocb **list,
1216 struct io_pkt *pkts, int ntodo)
1221 assert(0 < ntodo && ntodo <= naios);
1223 pthread_mutex_lock(&tip->mutex);
1224 assert(ntodo <= list_len(&tip->free_iocbs));
1225 for (i = 0, pkt = pkts; i < ntodo; i++, pkt++) {
1227 struct iocb_pkt *iocbp;
1229 if (!pkt->rw && !write_enabled)
1233 fprintf(tip->vfp, "\t%10llu + %10llu %c%c\n",
1234 (unsigned long long)pkt->sector,
1235 (unsigned long long)pkt->nbytes / nb_sec,
1237 (rw == 1 && pkt->rw == 0) ? '!' : ' ');
1239 iocbp = list_entry(tip->free_iocbs.next, struct iocb_pkt, head);
1240 iocb_setup(iocbp, rw, pkt->nbytes, pkt->sector * nb_sec);
1242 list_move_tail(&iocbp->head, &tip->used_iocbs);
1243 list[i] = &iocbp->iocb;
1246 tip->naios_free -= ntodo;
1247 assert(tip->naios_free >= 0);
1248 pthread_mutex_unlock(&tip->mutex);
1252 * process_bunch - Process a bunch of requests
1253 * @tip: Per-thread information
1254 * @bunch: Bunch to process
1256 static void process_bunch(struct thr_info *tip, struct io_bunch *bunch)
1259 struct iocb *list[bunch->hdr.npkts];
1261 assert(0 < bunch->hdr.npkts && bunch->hdr.npkts <= BT_MAX_PKTS);
1262 while (!is_send_done(tip) && (i < bunch->hdr.npkts)) {
1264 int ntodo = min(nfree_current(tip), bunch->hdr.npkts - i);
1266 assert(0 < ntodo && ntodo <= naios);
1267 iocbs_map(tip, list, &bunch->pkts[i], ntodo);
1269 stall(tip, bunch->hdr.time_stamp - genesis);
1273 fprintf(tip->vfp, "submit(%d)\n", ntodo);
1274 ndone = io_submit(tip->ctx, ntodo, list);
1275 if (ndone != (long)ntodo) {
1276 fatal("io_submit", ERR_SYSCALL,
1277 "%d: io_submit(%d:%ld) failed (%s)\n",
1278 tip->cpu, ntodo, ndone,
1279 strerror(labs(ndone)));
1283 pthread_mutex_lock(&tip->mutex);
1284 tip->naios_out += ndone;
1285 assert(tip->naios_out <= naios);
1286 if (tip->reap_wait) {
1288 pthread_cond_signal(&tip->cond);
1290 pthread_mutex_unlock(&tip->mutex);
1293 assert(i <= bunch->hdr.npkts);
1299 * reset_input_file - Reset the input file for the next iteration
1300 * @tip: Thread information
1302 * We also do a dummy read of the file header to get us to the first bunch.
1304 static void reset_input_file(struct thr_info *tip)
1306 struct io_file_hdr hdr;
1308 lseek(tip->ifd, 0, 0);
1310 if (read(tip->ifd, &hdr, sizeof(hdr)) != sizeof(hdr)) {
1311 fatal(tip->file_name, ERR_ARGS, "Header reread failed\n");
1317 * replay_sub - Worker thread to submit AIOs that are being replayed
1319 static void *replay_sub(void *arg)
1323 char path[MAXPATHLEN];
1324 struct io_bunch bunch;
1325 struct thr_info *tip = arg;
1330 mdev = map_dev(tip->devnm);
1331 sprintf(path, "/dev/%s", mdev);
1333 * convert underscores to slashes to
1334 * restore device names that have larger paths
1336 for (i = 0; i < strlen(mdev); i++)
1337 if (path[strlen("/dev/") + i] == '_')
1338 path[strlen("/dev/") + i] = '/';
1344 tip->ofd = open(path, O_RDWR | O_DIRECT | oflags);
1346 fatal(path, ERR_SYSCALL, "Failed device open\n");
1351 while (!is_send_done(tip) && tip->iterations--) {
1354 fprintf(tip->vfp, "\n=== %d ===\n", tip->iterations);
1355 while (!is_send_done(tip) && next_bunch(tip, &bunch))
1356 process_bunch(tip, &bunch);
1358 reset_input_file(tip);
1367 * ========================================================================
1368 * ==== COMMAND LINE ARGUMENT HANDLING ====================================
1369 * ========================================================================
1372 static char usage_str[] = \
1374 "\t[ -c <cpus> : --cpus=<cpus> ] Default: 1\n" \
1375 "\t[ -d <dir> : --input-directory=<dir> ] Default: .\n" \
1376 "\t[ -F : --find-records ] Default: Off\n" \
1377 "\t[ -h : --help ] Default: Off\n" \
1378 "\t[ -i <base> : --input-base=<base> ] Default: replay\n" \
1379 "\t[ -I <iters>: --iterations=<iters> ] Default: 1\n" \
1380 "\t[ -M <file> : --map-devs=<file> ] Default: None\n" \
1381 "\t[ -N : --no-stalls ] Default: Off\n" \
1382 "\t[ -x : --acc-factor ] Default: 1\n" \
1383 "\t[ -v : --verbose ] Default: Off\n" \
1384 "\t[ -V : --version ] Default: Off\n" \
1385 "\t[ -W : --write-enable ] Default: Off\n" \
1386 "\t<dev...> Default: None\n" \
1389 #define S_OPTS "c:d:Fhi:I:M:Nx:t:vVW"
1390 static struct option l_opts[] = {
1393 .has_arg = required_argument,
1398 .name = "input-directory",
1399 .has_arg = required_argument,
1404 .name = "find-records",
1405 .has_arg = no_argument,
1411 .has_arg = no_argument,
1416 .name = "input-base",
1417 .has_arg = required_argument,
1422 .name = "iterations",
1423 .has_arg = required_argument,
1429 .has_arg = required_argument,
1434 .name = "no-stalls",
1435 .has_arg = no_argument,
1440 .name = "acc-factor",
1441 .has_arg = required_argument,
1447 .has_arg = no_argument,
1453 .has_arg = no_argument,
1458 .name = "write-enable",
1459 .has_arg = no_argument,
1469 * handle_args: Parse passed in argument list
1470 * @argc: Number of arguments in argv
1471 * @argv: Arguments passed in
1473 * Does rudimentary parameter verification as well.
1475 static void handle_args(int argc, char *argv[])
1480 while ((c = getopt_long(argc, argv, S_OPTS, l_opts, NULL)) != -1) {
1483 cpus_to_use = atoi(optarg);
1484 if (cpus_to_use <= 0 || cpus_to_use > ncpus) {
1485 fatal(NULL, ERR_ARGS,
1486 "Invalid number of cpus %d (0<x<%d)\n",
1487 cpus_to_use, ncpus);
1494 if (access(idir, R_OK | X_OK) != 0) {
1495 fatal(idir, ERR_ARGS,
1496 "Invalid input directory specified\n");
1515 def_iterations = atoi(optarg);
1516 if (def_iterations <= 0) {
1518 "Invalid number of iterations %d\n",
1526 read_map_devs(optarg);
1534 r = sscanf(optarg,"%u",&acc_factor);
1537 "Invalid acceleration factor\n");
1544 fprintf(stderr, "btreplay -- version %s\n",
1546 fprintf(stderr, " Built on %s\n",
1561 fatal(NULL, ERR_ARGS,
1562 "Invalid command line argument %c\n", c);
1567 while (optind < argc)
1568 add_input_dev(argv[optind++]);
1571 find_input_devs(idir);
1573 if (list_len(&input_devs) == 0) {
1574 fatal(NULL, ERR_ARGS, "Missing required input dev name(s)\n");
1578 if (cpus_to_use < 0)
1579 cpus_to_use = ncpus;
1583 * ========================================================================
1584 * ==== MAIN ROUTINE ======================================================
1585 * ========================================================================
1589 * set_signal_done - Signal handler, catches signals & sets signal_done
1591 static void set_signal_done(__attribute__((__unused__))int signum)
1598 * @argc: Number of arguments
1599 * @argv: Array of arguments
1601 int main(int argc, char *argv[])
1604 struct list_head *p;
1606 pgsize = getpagesize();
1609 setup_signal(SIGINT, set_signal_done);
1610 setup_signal(SIGTERM, set_signal_done);
1613 handle_args(argc, argv);
1616 nfiles = list_len(&input_files);
1617 __list_for_each(p, &input_files) {
1618 tip_init(list_entry(p, struct thr_info, head));
1621 wait_replays_ready();
1622 for (i = 0; i < def_iterations; i++) {
1623 rgenesis = gettime();
1626 fprintf(stderr, "I");
1630 wait_replays_done();
1631 wait_reclaims_done();
1634 fprintf(stderr, "\n");