2a1525e046620ecae71d2b8eea88b8da921c7e93
[blktrace.git] / btreplay / btreplay.c
1 /*
2  * Blktrace replay utility - Play traces back
3  *
4  * Copyright (C) 2007 Alan D. Brunelle <Alan.Brunelle@hp.com>
5  *
6  *  This program is free software; you can redistribute it and/or modify
7  *  it under the terms of the GNU General Public License as published by
8  *  the Free Software Foundation; either version 2 of the License, or
9  *  (at your option) any later version.
10  *
11  *  This program is distributed in the hope that it will be useful,
12  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
13  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  *  GNU General Public License for more details.
15  *
16  *  You should have received a copy of the GNU General Public License
17  *  along with this program; if not, write to the Free Software
18  *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
19  */
20
/* Date and time at which this translation unit was compiled. */
static char build_date[] = __DATE__ " at " __TIME__;
22
23 #include <assert.h>
24 #include <errno.h>
25 #include <fcntl.h>
26 #include <libaio.h>
27 #include <pthread.h>
28 #include <sched.h>
29 #include <signal.h>
30 #include <stdio.h>
31 #include <stdlib.h>
32 #include <string.h>
33 #include <time.h>
34 #include <unistd.h>
35 #include <sys/param.h>
36 #include <sys/stat.h>
37 #include <sys/time.h>
38 #include <sys/types.h>
39 #include <dirent.h>
40 #include <stdarg.h>
41
42 #if !defined(_GNU_SOURCE)
43 #       define _GNU_SOURCE
44 #endif
45 #include <getopt.h>
46
47 #include "list.h"
48 #include "btrecord.h"
49
50 /* 
51  * ========================================================================
52  * ==== STRUCTURE DEFINITIONS =============================================
53  * ========================================================================
54  */
55
56 /**
57  * Each device map has one of these:
58  * 
59  * @head:       Linked on to map_devs
60  * @from_dev:   Device name as seen on recorded system
61  * @to_dev:     Device name to be used on replay system
62  */
63 struct map_dev {
64         struct list_head head;
65         char *from_dev, *to_dev;
66 };
67
68 /**
69  * Each device name specified has one of these (until threads are created)
70  *
71  * @head:       Linked onto input_devs
72  * @devnm:      Device name -- 'sd*'
73  */
74 struct dev_info {
75         struct list_head head;
76         char *devnm;
77 };
78
79 /*
80  * Per input file information
81  *
82  * @head:       Used to link up on input_files
83  * @free_iocbs: List of free iocb's available for use
84  * @used_iocbs: List of iocb's currently outstanding
85  * @mutex:      Mutex used with condition variable to protect volatile values
86  * @cond:       Condition variable used when waiting on a volatile value change
87  * @naios_out:  Current number of AIOs outstanding on this context
88  * @naios_free: Number of AIOs on the free list (short cut for list_len)
89  * @send_wait:  Boolean: When true, the sub thread is waiting on free IOCBs
90  * @reap_wait:  Boolean: When true, the rec thread is waiting on used IOCBs
91  * @send_done:  Boolean: When true, the sub thread has completed work
92  * @reap_done:  Boolean: When true, the rec thread has completed work
93  * @sub_thread: Thread used to submit IOs.
94  * @rec_thread: Thread used to reclaim IOs.
95  * @ctx:        IO context
96  * @devnm:      Copy of the device name being managed by this thread
97  * @file_name:  Full name of the input file
98  * @cpu:        CPU this thread is pinned to
99  * @ifd:        Input file descriptor
100  * @ofd:        Output file descriptor
101  * @iterations: Remaining iterations to process
102  * @vfp:        For verbose dumping of actions performed
103  */
104 struct thr_info {
105         struct list_head head, free_iocbs, used_iocbs;
106         pthread_mutex_t mutex;
107         pthread_cond_t cond;
108         volatile long naios_out, naios_free;
109         volatile int send_wait, reap_wait, send_done, reap_done;
110         pthread_t sub_thread, rec_thread;
111         io_context_t ctx;
112         char *devnm, *file_name;
113         int cpu, ifd, ofd, iterations;
114         FILE *vfp;
115 };
116
117 /*
118  * Every Asynchronous IO used has one of these (naios per file/device).
119  *
120  * @iocb:       IOCB sent down via io_submit
121  * @head:       Linked onto file_list.free_iocbs or file_list.used_iocbs
122  * @tip:        Pointer to per-thread information this IO is associated with
123  * @nbytes:     Number of bytes in buffer associated with iocb
124  */
125 struct iocb_pkt {
126         struct iocb iocb;
127         struct list_head head;
128         struct thr_info *tip;
129         int nbytes;
130 };
131
132 /* 
133  * ========================================================================
134  * ==== GLOBAL VARIABLES ==================================================
135  * ========================================================================
136  */
137
138 static volatile int signal_done = 0;    // Boolean: Signal'ed, need to quit
139
140 static char *ibase = "replay";          // Input base name
141 static char *idir = ".";                // Input directory base
142 static int cpus_to_use = -1;            // Number of CPUs to use
143 static int def_iterations = 1;          // Default number of iterations
144 static int naios = 512;                 // Number of AIOs per thread
145 static int ncpus = 0;                   // Number of CPUs in the system
146 static int verbose = 0;                 // Boolean: Output some extra info
147 static int write_enabled = 0;           // Boolean: Enable writing
148 static __u64 genesis = ~0;              // Earliest time seen
149 static __u64 rgenesis;                  // Our start time
150 static size_t pgsize;                   // System Page size
151 static int nb_sec = 512;                // Number of bytes per sector
152 static LIST_HEAD(input_devs);           // List of devices to handle
153 static LIST_HEAD(input_files);          // List of input files to handle
154 static LIST_HEAD(map_devs);             // List of device maps
155 static int nfiles = 0;                  // Number of files to handle
156 static int no_stalls = 0;               // Boolean: Disable pre-stalls
157 static unsigned acc_factor = 1;         // Int: Acceleration factor
158 static int find_records = 0;            // Boolean: Find record files auto
159
/*
 * Counters guarded by a mutex / condition-variable pair each.
 *
 * n_reclaims_done:     Number of reclaim threads that have completed.
 * n_replays_done:      Number of replay threads that have completed.
 * n_replays_ready:     Number of replay threads ready to start.
 * n_iters_done:        Number of replay threads done with one iteration.
 * iter_start:          Releases the replay threads into an iteration.
 */

/* Reclaim threads finished */
static pthread_mutex_t reclaim_done_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t reclaim_done_cond = PTHREAD_COND_INITIALIZER;
static volatile int n_reclaims_done = 0;

/* Replay threads finished */
static pthread_mutex_t replay_done_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t replay_done_cond = PTHREAD_COND_INITIALIZER;
static volatile int n_replays_done = 0;

/* Replay threads ready to start */
static pthread_mutex_t replay_ready_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t replay_ready_cond = PTHREAD_COND_INITIALIZER;
static volatile int n_replays_ready = 0;

/* Replay threads finished with the current iteration */
static pthread_mutex_t iter_done_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t iter_done_cond = PTHREAD_COND_INITIALIZER;
static volatile int n_iters_done = 0;

/* Iteration start tickets */
static pthread_mutex_t iter_start_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t iter_start_cond = PTHREAD_COND_INITIALIZER;
static volatile int iter_start = 0;
188
189 /* 
190  * ========================================================================
191  * ==== FORWARD REFERENCES ================================================
192  * ========================================================================
193  */
194
/* Thread entry points handed to pthread_create(). */
static void *replay_rec(void *arg);
static void *replay_sub(void *arg);

/* Help text printed by usage(). */
static char usage_str[];
198
199 /* 
200  * ========================================================================
201  * ==== INLINE ROUTINES ===================================================
202  * ========================================================================
203  */
204
/*
 * fatal - Report an unrecoverable error and terminate the process.
 * @errstring:  When non-NULL, handed to perror() before anything else
 * @exitval:    Status passed to exit() (ERR_ARGS or ERR_SYSCALL)
 * @fmt:        printf-style format (plus arguments) written to stderr
 *
 * This routine does not return.
 */
#define ERR_ARGS			1
#define ERR_SYSCALL			2
static inline void fatal(const char *errstring, const int exitval,
			 const char *fmt, ...)
{
	va_list args;

	if (errstring != NULL)
		perror(errstring);

	va_start(args, fmt);
	vfprintf(stderr, fmt, args);
	va_end(args);

	exit(exitval);
	/*NOTREACHED*/
}
227
228 static inline long long unsigned du64_to_sec(__u64 du64)
229 {
230         return (long long unsigned)du64 / (1000 * 1000 * 1000);
231 }
232
233 static inline long long unsigned du64_to_nsec(__u64 du64)
234 {
235         return llabs((long long)du64) % (1000 * 1000 * 1000);
236 }
237
/**
 * min - Return minimum of two integers
 * @a: First value
 * @b: Second value
 *
 * When both are equal, that common value is returned.
 */
static inline int min(int a, int b)
{
	return a < b ? a : b;
}
245
/**
 * minl - Return minimum of two longs
 * @a: First value
 * @b: Second value
 *
 * When both are equal, that common value is returned.
 */
static inline long minl(long a, long b)
{
	return a < b ? a : b;
}
253
254 /**
255  * usage - Display usage string and version
256  */
257 static inline void usage(void)
258 {
259         fprintf(stderr, "Usage: btreplay -- version %s\n%s", 
260                 my_btversion, usage_str);
261 }
262
263 /**
264  * is_send_done - Returns true if sender should quit early
265  * @tip: Per-thread information
266  */
267 static inline int is_send_done(struct thr_info *tip)
268 {
269         return signal_done || tip->send_done;
270 }
271
272 /**
273  * is_reap_done - Returns true if reaper should quit early
274  * @tip: Per-thread information
275  */
276 static inline int is_reap_done(struct thr_info *tip)
277 {
278         return tip->send_done && tip->naios_out == 0;
279 }
280
281 /**
282  * ts2ns - Convert timespec values to a nanosecond value
283  */
284 #define NS_TICKS                ((__u64)1000 * (__u64)1000 * (__u64)1000)
285 static inline __u64 ts2ns(struct timespec *ts)
286 {
287         return ((__u64)(ts->tv_sec) * NS_TICKS) + (__u64)(ts->tv_nsec);
288 }
289
290 /**
291  * ts2ns - Convert timeval values to a nanosecond value
292  */
293 static inline __u64 tv2ns(struct timeval *tp)
294 {
295         return ((__u64)(tp->tv_sec)) + ((__u64)(tp->tv_usec) * (__u64)1000);
296 }
297
298 /**
299  * touch_memory - Force physical memory to be allocating it
300  * 
301  * For malloc()ed memory we need to /touch/ it to make it really
302  * exist. Otherwise, for write's (to storage) things may not work
303  * as planned - we see Linux just use a single area to /read/ from
304  * (as there isn't any memory that has been associated with the 
305  * allocated virtual addresses yet).
306  */
307 static inline void touch_memory(char *buf, size_t bsize)
308 {
309 #if defined(PREP_BUFS)
310         memset(buf, 0, bsize);
311 #else
312         size_t i;
313
314         for (i = 0; i < bsize; i += pgsize)
315                 buf[i] = 0;
316 #endif
317 }
318
319 /**
320  * buf_alloc - Returns a page-aligned buffer of the specified size
321  * @nbytes: Number of bytes to allocate
322  */
323 static inline void *buf_alloc(size_t nbytes)
324 {
325         void *buf;
326
327         if (posix_memalign(&buf, pgsize, nbytes)) {
328                 fatal("posix_memalign", ERR_SYSCALL, "Allocation failed\n");
329                 /*NOTREACHED*/
330         }
331
332         return buf;
333 }
334
335 /**
336  * gettime - Returns current time 
337  */
338 static inline __u64 gettime(void)
339 {
340         static int use_clock_gettime = -1;              // Which clock to use
341
342         if (use_clock_gettime < 0) {
343                 use_clock_gettime = clock_getres(CLOCK_MONOTONIC, NULL) == 0;
344                 if (use_clock_gettime) {
345                         struct timespec ts = {
346                                 .tv_sec = 0,
347                                 .tv_nsec = 0
348                         };
349                         clock_settime(CLOCK_MONOTONIC, &ts);
350                 }
351         }
352
353         if (use_clock_gettime) {
354                 struct timespec ts;
355                 clock_gettime(CLOCK_MONOTONIC, &ts);
356                 return ts2ns(&ts);
357         }
358         else {
359                 struct timeval tp;
360                 gettimeofday(&tp, NULL);
361                 return tv2ns(&tp);
362         }
363 }
364
365 /**
366  * setup_signal - Set up a signal handler for the specified signum
367  */
368 static inline void setup_signal(int signum, sighandler_t handler)
369 {
370         if (signal(signum, handler) == SIG_ERR) {
371                 fatal("signal", ERR_SYSCALL, "Failed to set signal %d\n",
372                         signum);
373                 /*NOTREACHED*/
374         }
375 }
376
377 /* 
378  * ========================================================================
379  * ==== CONDITION VARIABLE ROUTINES =======================================
380  * ========================================================================
381  */
382
/**
 * __set_cv - Bump a counter by one and signal its condition variable.
 * @pmp:        Mutex guarding the counter
 * @pcp:        Condition variable associated with the counter
 * @vp:         Counter to increment
 * @mxv:        Upper bound for the counter (checked only when asserts
 *              are compiled in)
 */
static inline void __set_cv(pthread_mutex_t *pmp, pthread_cond_t *pcp,
			    volatile int *vp,
			    __attribute__((__unused__))int mxv)
{
	pthread_mutex_lock(pmp);

	assert(*vp < mxv);
	(*vp)++;
	pthread_cond_signal(pcp);

	pthread_mutex_unlock(pmp);
}
400
/**
 * __wait_cv - Block until a counter reaches a value, then reset it
 * @pmp:        Mutex guarding the counter
 * @pcp:        Condition variable associated with the counter
 * @vp:         Counter being watched; set back to zero before returning
 * @mxv:        Value to wait for
 */
static inline void __wait_cv(pthread_mutex_t *pmp, pthread_cond_t *pcp,
			     volatile int *vp, int mxv)
{
	pthread_mutex_lock(pmp);

	for (;;) {
		if (*vp >= mxv)
			break;
		pthread_cond_wait(pcp, pmp);
	}
	*vp = 0;

	pthread_mutex_unlock(pmp);
}
417
418 static inline void set_reclaim_done(void)
419 {
420         __set_cv(&reclaim_done_mutex, &reclaim_done_cond, &n_reclaims_done,
421                  nfiles);
422 }
423
424 static inline void wait_reclaims_done(void)
425 {
426         __wait_cv(&reclaim_done_mutex, &reclaim_done_cond, &n_reclaims_done,
427                   nfiles);
428 }
429
430 static inline void set_replay_ready(void)
431 {
432         __set_cv(&replay_ready_mutex, &replay_ready_cond, &n_replays_ready,
433                  nfiles);
434 }
435
436 static inline void wait_replays_ready(void)
437 {
438         __wait_cv(&replay_ready_mutex, &replay_ready_cond, &n_replays_ready,
439                   nfiles);
440 }
441
442 static inline void set_replay_done(void)
443 {
444         __set_cv(&replay_done_mutex, &replay_done_cond, &n_replays_done,
445                 nfiles);
446 }
447
448 static inline void wait_replays_done(void)
449 {
450         __wait_cv(&replay_done_mutex, &replay_done_cond, &n_replays_done,
451                   nfiles);
452 }
453
454 static inline void set_iter_done(void)
455 {
456         __set_cv(&iter_done_mutex, &iter_done_cond, &n_iters_done,
457                 nfiles);
458 }
459
460 static inline void wait_iters_done(void)
461 {
462         __wait_cv(&iter_done_mutex, &iter_done_cond, &n_iters_done,
463                   nfiles);
464 }
465
466 /**
467  * wait_iter_start - Wait for an iteration to start 
468  * 
469  * This is /slightly/ different: we are waiting for a value to become
470  * non-zero, and then we decrement it and go on. 
471  */
472 static inline void wait_iter_start(void)
473 {
474         pthread_mutex_lock(&iter_start_mutex);
475         while (iter_start == 0)
476                 pthread_cond_wait(&iter_start_cond, &iter_start_mutex);
477         assert(1 <= iter_start && iter_start <= nfiles);
478         iter_start--;
479         pthread_mutex_unlock(&iter_start_mutex);
480 }
481
482 /**
483  * start_iter - Start an iteration at the replay thread level
484  */
485 static inline void start_iter(void)
486 {
487         pthread_mutex_lock(&iter_start_mutex);
488         assert(iter_start == 0);
489         iter_start = nfiles;
490         pthread_cond_broadcast(&iter_start_cond);
491         pthread_mutex_unlock(&iter_start_mutex);
492 }
493
494 /* 
495  * ========================================================================
496  * ==== CPU RELATED ROUTINES ==============================================
497  * ========================================================================
498  */
499
500 /**
501  * get_ncpus - Sets up the global 'ncpus' value
502  */
503 static void get_ncpus(void)
504 {
505 #ifdef _SC_NPROCESSORS_ONLN
506         ncpus = sysconf(_SC_NPROCESSORS_ONLN);
507 #else
508         int nrcpus = 4096;
509         cpu_set_t * cpus;
510         
511 realloc:
512         cpus = CPU_ALLOC(nrcpus);
513         size = CPU_ALLOC_SIZE(nrcpus);
514         CPU_ZERO_S(size, cpus);
515
516         if (sched_getaffinity(getpid(), size, cpus)) {
517                 if( errno == EINVAL && nrcpus < (4096<<4) ) {
518                         CPU_FREE(cpus);
519                         nrcpus <<= 1;
520                         goto realloc;
521                 }
522                 fatal("sched_getaffinity", ERR_SYSCALL, "Can't get CPU info\n");
523                 /*NOTREACHED*/
524         }
525
526         ncpus = -1;
527         for (last_cpu = 0; last_cpu < CPU_SETSIZE && CPU_ISSET(last_cpu, &cpus); last_cpu++)
528                 if (CPU_ISSET( last_cpu, &cpus) ) 
529                         ncpus = last_cpu;
530         ncpus++;
531         CPU_FREE(cpus);
532 #endif
533         if (ncpus == 0) {
534                 fatal(NULL, ERR_SYSCALL, "Insufficient number of CPUs\n");
535                 /*NOTREACHED*/
536         }
537 }
538
539 /**
540  * pin_to_cpu - Pin this thread to a specific CPU
541  * @tip: Thread information
542  */
543 static void pin_to_cpu(struct thr_info *tip)
544 {
545         cpu_set_t *cpus;
546         size_t size;
547
548         cpus = CPU_ALLOC(ncpus);
549         size = CPU_ALLOC_SIZE(ncpus);   
550
551         assert(0 <= tip->cpu && tip->cpu < ncpus);
552
553         CPU_ZERO_S(size, cpus);
554         CPU_SET_S(tip->cpu, size, cpus);
555         if (sched_setaffinity(getpid(), size, cpus)) {
556                 fatal("sched_setaffinity", ERR_SYSCALL, "Failed to pin CPU\n");
557                 /*NOTREACHED*/
558         }
559
560         if (verbose > 1) {
561                 int i;
562                 cpu_set_t *now = CPU_ALLOC(ncpus);
563
564                 (void)sched_getaffinity(getpid(), size, now);
565                 fprintf(tip->vfp, "Pinned to CPU %02d ", tip->cpu);
566                 for (i = 0; i < ncpus; i++)
567                         fprintf(tip->vfp, "%1d", CPU_ISSET_S(i, size, now));
568                 fprintf(tip->vfp, "\n");
569         }
570 }
571
572 /* 
573  * ========================================================================
574  * ==== INPUT DEVICE HANDLERS =============================================
575  * ========================================================================
576  */
577
578 /**
579  * add_input_dev - Add a device ('sd*') to the list of devices to handle
580  */
581 static void add_input_dev(char *devnm)
582 {
583         struct list_head *p;
584         struct dev_info *dip;
585
586         __list_for_each(p, &input_devs) {
587                 dip = list_entry(p, struct dev_info, head);
588                 if (strcmp(dip->devnm, devnm) == 0)
589                         return;
590         }
591
592         dip = malloc(sizeof(*dip));
593         dip->devnm = strdup(devnm);
594         list_add_tail(&dip->head, &input_devs);
595 }
596
597 /**
598  * rem_input_dev - Remove resources associated with this device
599  */
600 static void rem_input_dev(struct dev_info *dip)
601 {
602         list_del(&dip->head);
603         free(dip->devnm);
604         free(dip);
605 }
606
607 static void find_input_devs(char *idir)
608 {
609         struct dirent *ent;
610         DIR *dir = opendir(idir);
611
612         if (dir == NULL) {
613                 fatal(idir, ERR_ARGS, "Unable to open %s\n", idir);
614                 /*NOTREACHED*/
615         }
616
617         while ((ent = readdir(dir)) != NULL) {
618                 char *p, *dsf;
619
620                 if (strstr(ent->d_name, ".replay.") == NULL)
621                         continue;
622
623                 dsf = strdup(ent->d_name);
624                 p = index(dsf, '.');
625                 assert(p != NULL);
626                 *p = '\0';
627                 add_input_dev(dsf);
628                 free(dsf);
629         }
630
631         closedir(dir);
632 }
633
634 /* 
635  * ========================================================================
636  * ==== MAP DEVICE INTERFACES =============================================
637  * ========================================================================
638  */
639
640 /**
641  * read_map_devs - Read in a set of device mapping from the provided file.
642  * @file_name:  File containing device maps
643  *
644  * We support the notion of multiple such files being specifed on the cmd line
645  */
646 static void read_map_devs(char *file_name)
647 {
648         FILE *fp;
649         char from_dev[256], to_dev[256];
650
651         fp = fopen(file_name, "r");
652         if (!fp) {
653                 fatal(file_name, ERR_SYSCALL, "Could not open map devs file\n");
654                 /*NOTREACHED*/
655         }
656
657         while (fscanf(fp, "%s %s", from_dev, to_dev) == 2) {
658                 struct map_dev *mdp = malloc(sizeof(*mdp));
659
660                 mdp->from_dev = from_dev;
661                 mdp->to_dev = to_dev;
662                 list_add_tail(&mdp->head, &map_devs);
663         }
664
665         fclose(fp);
666 }
667
668 /**
669  * release_map_devs - Release resources associated with device mappings.
670  */
671 static void release_map_devs(void)
672 {
673         struct list_head *p, *q;
674
675         list_for_each_safe(p, q, &map_devs) {
676                 struct map_dev *mdp = list_entry(p, struct map_dev, head);
677
678                 list_del(&mdp->head);
679
680                 free(mdp->from_dev);
681                 free(mdp->to_dev);
682                 free(mdp);
683         }
684 }
685
686 /**
687  * map_dev - Return the mapped device for that specified
688  * @from_dev:   Device name as seen on recorded system
689  *
690  * Note: If there is no such mapping, we return the same name.
691  */
692 static char *map_dev(char *from_dev)
693 {
694         struct list_head *p;
695
696         __list_for_each(p, &map_devs) {
697                 struct map_dev *mdp = list_entry(p, struct map_dev, head);
698
699                 if (strcmp(from_dev, mdp->from_dev) == 0)
700                         return mdp->to_dev;
701         }
702
703         return from_dev;
704 }
705
706 /* 
707  * ========================================================================
708  * ==== IOCB MANAGEMENT ROUTINES ==========================================
709  * ========================================================================
710  */
711
712 /**
713  * iocb_init - Initialize the fields of an IOCB
714  * @tip: Per-thread information
715  * iocbp: IOCB pointer to update
716  */
717 static void iocb_init(struct thr_info *tip, struct iocb_pkt *iocbp)
718 {
719         iocbp->tip = tip;
720         iocbp->nbytes = 0;
721         iocbp->iocb.u.c.buf = NULL;
722 }
723
724 /**
725  * iocb_setup - Set up an iocb with this AIOs information
726  * @iocbp: IOCB pointer to update
727  * @rw: Direction (0 == write, 1 == read)
728  * @n: Number of bytes to transfer
729  * @off: Offset (in bytes)
730  */
731 static void iocb_setup(struct iocb_pkt *iocbp, int rw, int n, long long off)
732 {
733         char *buf;
734         struct iocb *iop = &iocbp->iocb;
735
736         assert(rw == 0 || rw == 1);
737         assert(0 < n && (n % nb_sec) == 0);
738         assert(0 <= off);
739
740         if (iocbp->nbytes) {
741                 if (iocbp->nbytes >= n) {
742                         buf = iop->u.c.buf;
743                         goto prep;
744                 }
745
746                 assert(iop->u.c.buf);
747                 free(iop->u.c.buf);
748         }
749
750         buf = buf_alloc(n);
751         iocbp->nbytes = n;
752
753 prep:
754         if (rw)
755                 io_prep_pread(iop, iocbp->tip->ofd, buf, n, off);
756         else {
757                 assert(write_enabled);
758                 io_prep_pwrite(iop, iocbp->tip->ofd, buf, n, off);
759                 touch_memory(buf, n);
760         }
761
762         iop->data = iocbp;
763 }
764
765 /* 
766  * ========================================================================
767  * ==== PER-THREAD SET UP & TEAR DOWN =====================================
768  * ========================================================================
769  */
770
771 /**
772  * tip_init - Per thread initialization function
773  */
774 static void tip_init(struct thr_info *tip)
775 {
776         int i;
777
778         INIT_LIST_HEAD(&tip->free_iocbs);
779         INIT_LIST_HEAD(&tip->used_iocbs);
780
781         pthread_mutex_init(&tip->mutex, NULL);
782         pthread_cond_init(&tip->cond, NULL);
783
784         if (io_setup(naios, &tip->ctx)) {
785                 fatal("io_setup", ERR_SYSCALL, "io_setup failed\n");
786                 /*NOTREACHED*/
787         }
788
789         tip->ofd = -1;
790         tip->naios_out = 0;
791         tip->send_done = tip->reap_done = 0;
792         tip->send_wait = tip->reap_wait = 0;
793
794         memset(&tip->sub_thread, 0, sizeof(tip->sub_thread));
795         memset(&tip->rec_thread, 0, sizeof(tip->rec_thread));
796
797         for (i = 0; i < naios; i++) {
798                 struct iocb_pkt *iocbp = buf_alloc(sizeof(*iocbp));
799
800                 iocb_init(tip, iocbp);
801                 list_add_tail(&iocbp->head, &tip->free_iocbs);
802         }
803         tip->naios_free = naios;
804
805         if (verbose > 1) {
806                 char fn[MAXPATHLEN];
807
808                 sprintf(fn, "%s/%s.%s.%d.rep", idir, tip->devnm, ibase, 
809                         tip->cpu);
810                 tip->vfp = fopen(fn, "w");
811                 if (!tip->vfp) {
812                         fatal(fn, ERR_SYSCALL, "Failed to open report\n");
813                         /*NOTREACHED*/
814                 }
815
816                 setlinebuf(tip->vfp);
817         }
818
819         if (pthread_create(&tip->sub_thread, NULL, replay_sub, tip)) {
820                 fatal("pthread_create", ERR_SYSCALL, 
821                         "thread create failed\n");
822                 /*NOTREACHED*/
823         }
824
825         if (pthread_create(&tip->rec_thread, NULL, replay_rec, tip)) {
826                 fatal("pthread_create", ERR_SYSCALL, 
827                         "thread create failed\n");
828                 /*NOTREACHED*/
829         }
830 }
831
832 /**
833  * tip_release - Release resources associated with this thread
834  */
835 static void tip_release(struct thr_info *tip)
836 {
837         struct list_head *p, *q;
838
839         assert(tip->send_done);
840         assert(tip->reap_done);
841         assert(list_len(&tip->used_iocbs) == 0);
842         assert(tip->naios_free == naios);
843
844         if (pthread_join(tip->sub_thread, NULL)) {
845                 fatal("pthread_join", ERR_SYSCALL, "pthread sub join failed\n");
846                 /*NOTREACHED*/
847         }
848         if (pthread_join(tip->rec_thread, NULL)) {
849                 fatal("pthread_join", ERR_SYSCALL, "pthread rec join failed\n");
850                 /*NOTREACHED*/
851         }
852
853         io_destroy(tip->ctx);
854
855         list_splice(&tip->used_iocbs, &tip->free_iocbs);
856         list_for_each_safe(p, q, &tip->free_iocbs) {
857                 struct iocb_pkt *iocbp = list_entry(p, struct iocb_pkt, head);
858
859                 list_del(&iocbp->head);
860                 if (iocbp->nbytes) 
861                         free(iocbp->iocb.u.c.buf);
862                 free(iocbp);
863         }
864
865         pthread_cond_destroy(&tip->cond);
866         pthread_mutex_destroy(&tip->mutex);
867 }
868
869 /**
870  * add_input_file - Allocate and initialize per-input file structure
871  * @cpu: CPU for this file
872  * @devnm: Device name for this file
873  * @file_name: Fully qualifed input file name
874  */
875 static void add_input_file(int cpu, char *devnm, char *file_name)
876 {
877         struct stat buf;
878         struct io_file_hdr hdr;
879         struct thr_info *tip = buf_alloc(sizeof(*tip));
880         __u64 my_version = mk_btversion(btver_mjr, btver_mnr, btver_sub);
881
882         assert(0 <= cpu && cpu < ncpus);
883
884         memset(&hdr, 0, sizeof(hdr));
885         memset(tip, 0, sizeof(*tip));
886         tip->cpu = cpu % cpus_to_use;
887         tip->iterations = def_iterations;
888
889         tip->ifd = open(file_name, O_RDONLY);
890         if (tip->ifd < 0) {
891                 fatal(file_name, ERR_ARGS, "Unable to open\n");
892                 /*NOTREACHED*/
893         }
894         if (fstat(tip->ifd, &buf) < 0) {
895                 fatal(file_name, ERR_SYSCALL, "fstat failed\n");
896                 /*NOTREACHED*/
897         }
898         if (buf.st_size < (off_t)sizeof(hdr)) {
899                 if (verbose)
900                         fprintf(stderr, "\t%s empty\n", file_name);
901                 goto empty_file;
902         }
903
904         if (read(tip->ifd, &hdr, sizeof(hdr)) != sizeof(hdr)) {
905                 fatal(file_name, ERR_ARGS, "Header read failed\n");
906                 /*NOTREACHED*/
907         }
908
909         if (hdr.version != my_version) {
910                 fprintf(stderr, "%llx %llx %llx %llx\n", 
911                         (long long unsigned)hdr.version,
912                         (long long unsigned)hdr.genesis,
913                         (long long unsigned)hdr.nbunches,
914                         (long long unsigned)hdr.total_pkts);
915                 fatal(NULL, ERR_ARGS, 
916                         "BT version mismatch: %lx versus my %lx\n",
917                         (long)hdr.version, (long)my_version);
918                         
919         }
920
921         if (hdr.nbunches == 0) {
922 empty_file:
923                 close(tip->ifd);
924                 free(tip);
925                 return;
926         }
927
928         if (hdr.genesis < genesis) {
929                 if (verbose > 1)
930                         fprintf(stderr, "Setting genesis to %llu.%llu\n",
931                                 du64_to_sec(hdr.genesis),
932                                 du64_to_nsec(hdr.genesis));
933                 genesis = hdr.genesis;
934         }
935
936         tip->devnm = strdup(devnm);
937         tip->file_name = strdup(file_name);
938
939         list_add_tail(&tip->head, &input_files);
940
941         if (verbose)
942                 fprintf(stderr, "Added %s %llu\n", file_name, 
943                         (long long)hdr.genesis);
944 }
945
946 /**
947  * rem_input_file - Release resources associated with an input file
948  * @tip: Per-input file information
949  */
950 static void rem_input_file(struct thr_info *tip)
951 {
952         list_del(&tip->head);
953
954         tip_release(tip);
955
956         close(tip->ofd);
957         close(tip->ifd);
958         free(tip->file_name);
959         free(tip->devnm);
960         free(tip);
961 }
962
963 /**
964  * rem_input_files - Remove all input files
965  */
966 static void rem_input_files(void)
967 {
968         struct list_head *p, *q;
969
970         list_for_each_safe(p, q, &input_files) {
971                 rem_input_file(list_entry(p, struct thr_info, head));
972         }
973 }
974
975 /**
976  * __find_input_files - Find input files associated with this device (per cpu)
977  */
978 static void __find_input_files(struct dev_info *dip)
979 {
980         int cpu = 0;
981
982         for (;;) {
983                 char full_name[MAXPATHLEN];
984
985                 sprintf(full_name, "%s/%s.%s.%d", idir, dip->devnm, ibase, cpu);
986                 if (access(full_name, R_OK) != 0)
987                         break;
988
989                 add_input_file(cpu, dip->devnm, full_name);
990                 cpu++;
991         }
992
993         if (!cpu) {
994                 fatal(NULL, ERR_ARGS, "No traces found for %s\n", dip->devnm);
995                 /*NOTREACHED*/
996         }
997
998         rem_input_dev(dip);
999 }
1000
1001
1002 /**
1003  * find_input_files - Find input files for all devices
1004  */
1005 static void find_input_files(void)
1006 {
1007         struct list_head *p, *q;
1008
1009         list_for_each_safe(p, q, &input_devs) {
1010                 __find_input_files(list_entry(p, struct dev_info, head));
1011         }
1012 }
1013
1014 /* 
1015  * ========================================================================
1016  * ==== RECLAIM ROUTINES ==================================================
1017  * ========================================================================
1018  */
1019
1020 /**
1021  * reap_wait_aios - Wait for and return number of outstanding AIOs
1022  *
1023  * Will return 0 if we are done
1024  */
1025 static int reap_wait_aios(struct thr_info *tip)
1026 {
1027         int naios = 0;
1028
1029         if (!is_reap_done(tip)) {
1030                 pthread_mutex_lock(&tip->mutex);
1031                 while (tip->naios_out == 0) {
1032                         tip->reap_wait = 1;
1033                         if (pthread_cond_wait(&tip->cond, &tip->mutex)) {
1034                                 fatal("pthread_cond_wait", ERR_SYSCALL, 
1035                                         "nfree_current cond wait failed\n");
1036                                 /*NOTREACHED*/
1037                         }
1038                 }
1039                 naios = tip->naios_out;
1040                 pthread_mutex_unlock(&tip->mutex);
1041         }
1042         assert(is_reap_done(tip) || naios > 0);
1043
1044         return is_reap_done(tip) ? 0 : naios;
1045 }
1046
1047 /**
1048  * reclaim_ios - Reclaim AIOs completed, recycle IOCBs
1049  * @tip: Per-thread information
1050  * @naios_out: Number of AIOs we have outstanding (min)
1051  */
1052 static void reclaim_ios(struct thr_info *tip, long naios_out)
1053 {
1054         long i, ndone;
1055         struct io_event *evp, events[naios_out];
1056
1057 again:
1058         assert(naios > 0);
1059         for (;;) {
1060                 ndone = io_getevents(tip->ctx, 1, naios_out, events, NULL);
1061                 if (ndone > 0)
1062                         break;
1063
1064                 if (errno && errno != EINTR) {
1065                         fatal("io_getevents", ERR_SYSCALL, 
1066                                 "io_getevents failed\n");
1067                         /*NOTREACHED*/
1068                 }
1069         }
1070         assert(0 < ndone && ndone <= naios_out);
1071
1072         pthread_mutex_lock(&tip->mutex);
1073         for (i = 0, evp = events; i < ndone; i++, evp++) {
1074                 struct iocb_pkt *iocbp = evp->data;
1075
1076                 if (evp->res != iocbp->iocb.u.c.nbytes) {
1077                         fatal(NULL, ERR_SYSCALL,
1078                               "Event failure %ld/%ld\t(%ld + %ld)\n",
1079                               (long)evp->res, (long)evp->res2,
1080                               (long)iocbp->iocb.u.c.offset / nb_sec, 
1081                               (long)iocbp->iocb.u.c.nbytes / nb_sec);
1082                         /*NOTREACHED*/
1083                 }
1084
1085                 list_move_tail(&iocbp->head, &tip->free_iocbs);
1086         }
1087
1088         tip->naios_free += ndone;
1089         tip->naios_out -= ndone;
1090         naios_out = minl(naios_out, tip->naios_out);
1091
1092         if (tip->send_wait) {
1093                 tip->send_wait = 0;
1094                 pthread_cond_signal(&tip->cond);
1095         }
1096         pthread_mutex_unlock(&tip->mutex);
1097
1098         /*
1099          * Short cut: If we /know/ there are some more AIOs, go handle them
1100          */
1101         if (naios_out)
1102                 goto again;
1103 }
1104
1105 /**
1106  * replay_rec - Worker thread to reclaim AIOs
1107  * @arg: Pointer to thread information
1108  */
1109 static void *replay_rec(void *arg)
1110 {
1111         long naios_out;
1112         struct thr_info *tip = arg;
1113
1114         while ((naios_out = reap_wait_aios(tip)) > 0) 
1115                 reclaim_ios(tip, naios_out);
1116
1117         assert(tip->send_done);
1118         tip->reap_done = 1;
1119         set_reclaim_done();
1120
1121         return NULL;
1122 }
1123
1124 /* 
1125  * ========================================================================
1126  * ==== REPLAY ROUTINES ===================================================
1127  * ========================================================================
1128  */
1129
1130 /**
1131  * next_bunch - Retrieve next bunch of AIOs to process
1132  * @tip: Per-thread information
1133  * @bunch: Bunch information
1134  *
1135  * Returns TRUE if we recovered a bunch of IOs, else hit EOF
1136  */
1137 static int next_bunch(struct thr_info *tip, struct io_bunch *bunch)
1138 {
1139         size_t count, result;
1140         
1141         result = read(tip->ifd, &bunch->hdr, sizeof(bunch->hdr));
1142         if (result != sizeof(bunch->hdr)) {
1143                 if (result == 0)
1144                         return 0;
1145
1146                 fatal(tip->file_name, ERR_SYSCALL, "Short hdr(%ld)\n", 
1147                         (long)result);
1148                 /*NOTREACHED*/
1149         }
1150         assert(bunch->hdr.npkts <= BT_MAX_PKTS);
1151
1152         count = bunch->hdr.npkts * sizeof(struct io_pkt);
1153         result = read(tip->ifd, &bunch->pkts, count);
1154         if (result != count) {
1155                 fatal(tip->file_name, ERR_SYSCALL, "Short pkts(%ld/%ld)\n", 
1156                         (long)result, (long)count);
1157                 /*NOTREACHED*/
1158         }
1159
1160         return 1;
1161 }
1162
1163 /**
1164  * nfree_current - Returns current number of AIOs that are free
1165  *
1166  * Will wait for available ones...
1167  *
1168  * Returns 0 if we have some condition that causes us to exit
1169  */
1170 static int nfree_current(struct thr_info *tip)
1171 {
1172         int nfree = 0;
1173
1174         pthread_mutex_lock(&tip->mutex);
1175         while (!is_send_done(tip) && ((nfree = tip->naios_free) == 0)) {
1176                 tip->send_wait = 1;
1177                 if (pthread_cond_wait(&tip->cond, &tip->mutex)) {
1178                         fatal("pthread_cond_wait", ERR_SYSCALL, 
1179                                 "nfree_current cond wait failed\n");
1180                         /*NOTREACHED*/
1181                 }
1182         }
1183         pthread_mutex_unlock(&tip->mutex);
1184
1185         return nfree;
1186 }
1187
1188 /**
1189  * stall - Stall for the number of nanoseconds requested
1190  *
1191  * We may be late, in which case we just return.
1192  */
1193 static void stall(struct thr_info *tip, long long oclock)
1194 {
1195         struct timespec req;
1196         long long dreal, tclock = gettime() - rgenesis;
1197
1198         oclock /= acc_factor;
1199         
1200         if (verbose > 1)
1201                 fprintf(tip->vfp, "   stall(%lld.%09lld, %lld.%09lld)\n",
1202                         du64_to_sec(oclock), du64_to_nsec(oclock),
1203                         du64_to_sec(tclock), du64_to_nsec(tclock));
1204
1205         while (!is_send_done(tip) && tclock < oclock) {
1206                 dreal = oclock - tclock;
1207                 req.tv_sec = dreal / (1000 * 1000 * 1000);
1208                 req.tv_nsec = dreal % (1000 * 1000 * 1000);
1209
1210                 if (verbose > 1) {
1211                         fprintf(tip->vfp, "++ stall(%lld.%09lld) ++\n",
1212                                 (long long)req.tv_sec,
1213                                 (long long)req.tv_nsec);
1214                 }
1215
1216                 if (nanosleep(&req, NULL) < 0 && signal_done)
1217                         break;
1218
1219                 tclock = gettime() - rgenesis;
1220         }
1221 }
1222
1223 /**
1224  * iocbs_map - Map a set of AIOs onto a set of IOCBs
1225  * @tip: Per-thread information
1226  * @list: List of AIOs created
1227  * @pkts: AIOs to map
1228  * @ntodo: Number of AIOs to map
1229  */
1230 static void iocbs_map(struct thr_info *tip, struct iocb **list, 
1231                                              struct io_pkt *pkts, int ntodo)
1232 {
1233         int i;
1234         struct io_pkt *pkt;
1235
1236         assert(0 < ntodo && ntodo <= naios);
1237
1238         pthread_mutex_lock(&tip->mutex);
1239         assert(ntodo <= list_len(&tip->free_iocbs));
1240         for (i = 0, pkt = pkts; i < ntodo; i++, pkt++) {
1241                 __u32 rw = pkt->rw;
1242                 struct iocb_pkt *iocbp;
1243
1244                 if (!pkt->rw && !write_enabled)
1245                         rw = 1;
1246
1247                 if (verbose > 1)
1248                         fprintf(tip->vfp, "\t%10llu + %10llu %c%c\n",
1249                                 (unsigned long long)pkt->sector, 
1250                                 (unsigned long long)pkt->nbytes / nb_sec,
1251                                 rw ? 'R' : 'W', 
1252                                 (rw == 1 && pkt->rw == 0) ? '!' : ' ');
1253                 
1254                 iocbp = list_entry(tip->free_iocbs.next, struct iocb_pkt, head);
1255                 iocb_setup(iocbp, rw, pkt->nbytes, pkt->sector * nb_sec);
1256
1257                 list_move_tail(&iocbp->head, &tip->used_iocbs);
1258                 list[i] = &iocbp->iocb;
1259         }
1260
1261         tip->naios_free -= ntodo;
1262         assert(tip->naios_free >= 0);
1263         pthread_mutex_unlock(&tip->mutex);
1264 }
1265
1266 /**
1267  * process_bunch - Process a bunch of requests
1268  * @tip: Per-thread information
1269  * @bunch: Bunch to process
1270  */
1271 static void process_bunch(struct thr_info *tip, struct io_bunch *bunch)
1272 {
1273         __u64 i = 0;
1274         struct iocb *list[bunch->hdr.npkts];
1275
1276         assert(0 < bunch->hdr.npkts && bunch->hdr.npkts <= BT_MAX_PKTS);
1277         while (!is_send_done(tip) && (i < bunch->hdr.npkts)) {
1278                 long ndone;
1279                 int ntodo = min(nfree_current(tip), bunch->hdr.npkts - i);
1280
1281                 assert(0 < ntodo && ntodo <= naios);
1282                 iocbs_map(tip, list, &bunch->pkts[i], ntodo);
1283                 if (!no_stalls)
1284                         stall(tip, bunch->hdr.time_stamp - genesis);
1285
1286                 if (ntodo) {
1287                         if (verbose > 1)
1288                                 fprintf(tip->vfp, "submit(%d)\n", ntodo);
1289                         ndone = io_submit(tip->ctx, ntodo, list);
1290                         if (ndone != (long)ntodo) {
1291                                 fatal("io_submit", ERR_SYSCALL,
1292                                         "%d: io_submit(%d:%ld) failed (%s)\n", 
1293                                         tip->cpu, ntodo, ndone, 
1294                                         strerror(labs(ndone)));
1295                                 /*NOTREACHED*/
1296                         }
1297
1298                         pthread_mutex_lock(&tip->mutex);
1299                         tip->naios_out += ndone;
1300                         assert(tip->naios_out <= naios);
1301                         if (tip->reap_wait) {
1302                                 tip->reap_wait = 0;
1303                                 pthread_cond_signal(&tip->cond);
1304                         }
1305                         pthread_mutex_unlock(&tip->mutex);
1306
1307                         i += ndone;
1308                         assert(i <= bunch->hdr.npkts);
1309                 }
1310         }
1311 }
1312
1313 /**
1314  * reset_input_file - Reset the input file for the next iteration
1315  * @tip: Thread information
1316  *
1317  * We also do a dummy read of the file header to get us to the first bunch.
1318  */
1319 static void reset_input_file(struct thr_info *tip)
1320 {
1321         struct io_file_hdr hdr;
1322
1323         lseek(tip->ifd, 0, 0);
1324
1325         if (read(tip->ifd, &hdr, sizeof(hdr)) != sizeof(hdr)) {
1326                 fatal(tip->file_name, ERR_ARGS, "Header reread failed\n");
1327                 /*NOTREACHED*/
1328         }
1329 }
1330
1331 /**
1332  * replay_sub - Worker thread to submit AIOs that are being replayed
1333  */
1334 static void *replay_sub(void *arg)
1335 {
1336         unsigned int i;
1337         char *mdev;
1338         char path[MAXPATHLEN];
1339         struct io_bunch bunch;
1340         struct thr_info *tip = arg;
1341         int oflags;
1342
1343         pin_to_cpu(tip);
1344
1345         mdev = map_dev(tip->devnm);
1346         sprintf(path, "/dev/%s", mdev);
1347         /*
1348          * convert underscores to slashes to
1349          * restore device names that have larger paths
1350          */
1351         for (i = 0; i < strlen(mdev); i++)
1352                 if (path[strlen("/dev/") + i] == '_')
1353                         path[strlen("/dev/") + i] = '/';
1354 #ifdef O_NOATIME
1355         oflags = O_NOATIME;
1356 #else
1357         oflags = 0;
1358 #endif
1359         tip->ofd = open(path, O_RDWR | O_DIRECT | oflags);
1360         if (tip->ofd < 0) {
1361                 fatal(path, ERR_SYSCALL, "Failed device open\n");
1362                 /*NOTREACHED*/
1363         }
1364
1365         set_replay_ready();
1366         while (!is_send_done(tip) && tip->iterations--) {
1367                 wait_iter_start();
1368                 if (verbose > 1)
1369                         fprintf(tip->vfp, "\n=== %d ===\n", tip->iterations);
1370                 while (!is_send_done(tip) && next_bunch(tip, &bunch))
1371                         process_bunch(tip, &bunch);
1372                 set_iter_done();
1373                 reset_input_file(tip);
1374         }
1375         tip->send_done = 1;
1376         set_replay_done();
1377
1378         return NULL;
1379 }
1380
1381 /* 
1382  * ========================================================================
1383  * ==== COMMAND LINE ARGUMENT HANDLING ====================================
1384  * ========================================================================
1385  */
1386
/*
 * Option summary for the usage message.  Options that take a value show
 * it in angle brackets; keep this in step with S_OPTS and l_opts.
 */
static char usage_str[] =
        "\n"
        "\t[ -c <cpus> : --cpus=<cpus>           ] Default: 1\n"
        "\t[ -d <dir>  : --input-directory=<dir> ] Default: .\n"
        "\t[ -F        : --find-records          ] Default: Off\n"
        "\t[ -h        : --help                  ] Default: Off\n"
        "\t[ -i <base> : --input-base=<base>     ] Default: replay\n"
        "\t[ -I <iters>: --iterations=<iters>    ] Default: 1\n"
        "\t[ -M <file> : --map-devs=<file>       ] Default: None\n"
        "\t[ -N        : --no-stalls             ] Default: Off\n"
        "\t[ -x <fact> : --acc-factor=<fact>     ] Default: 1\n"
        "\t[ -v        : --verbose               ] Default: Off\n"
        "\t[ -V        : --version               ] Default: Off\n"
        "\t[ -W        : --write-enable          ] Default: Off\n"
        "\t<dev...>                                Default: None\n"
        "\n";
1403
/*
 * Short option string handed to getopt_long(); a trailing ':' marks an
 * option that requires an argument.
 *
 * NOTE(review): "t:" is accepted here but has neither a long form below
 * nor a case in handle_args(), so it ends up in the invalid-argument
 * path - confirm whether it is a leftover.
 */
#define S_OPTS  "c:d:Fhi:I:M:Nx:t:vVW"

/*
 * Long options; each one maps onto the short option character in .val.
 * Members not named (.flag) are zero-initialised, i.e. NULL.
 */
static struct option l_opts[] = {
        { .name = "cpus",            .has_arg = required_argument, .val = 'c' },
        { .name = "input-directory", .has_arg = required_argument, .val = 'd' },
        { .name = "find-records",    .has_arg = no_argument,       .val = 'F' },
        { .name = "help",            .has_arg = no_argument,       .val = 'h' },
        { .name = "input-base",      .has_arg = required_argument, .val = 'i' },
        { .name = "iterations",      .has_arg = required_argument, .val = 'I' },
        { .name = "map-devs",        .has_arg = required_argument, .val = 'M' },
        { .name = "no-stalls",       .has_arg = no_argument,       .val = 'N' },
        { .name = "acc-factor",      .has_arg = required_argument, .val = 'x' },
        { .name = "verbose",         .has_arg = no_argument,       .val = 'v' },
        { .name = "version",         .has_arg = no_argument,       .val = 'V' },
        { .name = "write-enable",    .has_arg = no_argument,       .val = 'W' },
        { .name = NULL }        /* end-of-table marker */
};
1482
1483 /**
1484  * handle_args: Parse passed in argument list
1485  * @argc: Number of arguments in argv
1486  * @argv: Arguments passed in
1487  *
1488  * Does rudimentary parameter verification as well.
1489  */
1490 static void handle_args(int argc, char *argv[])
1491 {
1492         int c;
1493         int r;
1494
1495         while ((c = getopt_long(argc, argv, S_OPTS, l_opts, NULL)) != -1) {
1496                 switch (c) {
1497                 case 'c': 
1498                         cpus_to_use = atoi(optarg);
1499                         if (cpus_to_use <= 0 || cpus_to_use > ncpus) {
1500                                 fatal(NULL, ERR_ARGS, 
1501                                       "Invalid number of cpus %d (0<x<%d)\n",
1502                                       cpus_to_use, ncpus);
1503                                 /*NOTREACHED*/
1504                         }
1505                         break;
1506
1507                 case 'd':
1508                         idir = optarg;
1509                         if (access(idir, R_OK | X_OK) != 0) {
1510                                 fatal(idir, ERR_ARGS, 
1511                                       "Invalid input directory specified\n");
1512                                 /*NOTREACHED*/
1513                         }
1514                         break;
1515
1516                 case 'F': 
1517                         find_records = 1;
1518                         break;
1519
1520                 case 'h': 
1521                         usage(); 
1522                         exit(0);
1523                         /*NOTREACHED*/
1524
1525                 case 'i': 
1526                         ibase = optarg;
1527                         break;
1528
1529                 case 'I':
1530                         def_iterations = atoi(optarg);
1531                         if (def_iterations <= 0) {
1532                                 fprintf(stderr, 
1533                                         "Invalid number of iterations %d\n",
1534                                         def_iterations);
1535                                 exit(ERR_ARGS);
1536                                 /*NOTREACHED*/
1537                         }
1538                         break;
1539
1540                 case 'M':
1541                         read_map_devs(optarg);
1542                         break;
1543
1544                 case 'N':
1545                         no_stalls = 1;
1546                         break;
1547
1548                 case 'x':
1549                         r = sscanf(optarg,"%u",&acc_factor);
1550                         if (r!=1) {
1551                                 fprintf(stderr,
1552                                         "Invalid acceleration factor\n");
1553                                 exit(ERR_ARGS);
1554                                 /*NOTREACHED*/
1555                         }
1556                         break;
1557
1558                 case 'V':
1559                         fprintf(stderr, "btreplay -- version %s\n", 
1560                                 my_btversion);
1561                         fprintf(stderr, "            Built on %s\n", 
1562                                 build_date);
1563                         exit(0);
1564                         /*NOTREACHED*/
1565
1566                 case 'v':
1567                         verbose++;
1568                         break;
1569
1570                 case 'W':
1571                         write_enabled = 1;
1572                         break;
1573
1574                 default:
1575                         usage();
1576                         fatal(NULL, ERR_ARGS, 
1577                               "Invalid command line argument %c\n", c);
1578                         /*NOTREACHED*/
1579                 }
1580         }
1581
1582         while (optind < argc)
1583                 add_input_dev(argv[optind++]);
1584
1585         if (find_records)
1586                 find_input_devs(idir);
1587
1588         if (list_len(&input_devs) == 0) {
1589                 fatal(NULL, ERR_ARGS, "Missing required input dev name(s)\n");
1590                 /*NOTREACHED*/
1591         }
1592
1593         if (cpus_to_use < 0)
1594                 cpus_to_use = ncpus;
1595 }
1596
1597 /* 
1598  * ========================================================================
1599  * ==== MAIN ROUTINE ======================================================
1600  * ========================================================================
1601  */
1602
/**
 * set_signal_done - Signal handler, catches signals & sets signal_done
 * @signum: Number of the signal being delivered (unused)
 *
 * main() installs this for SIGINT and SIGTERM.  It does nothing but raise
 * the signal_done flag, which stall() consults to abandon an interrupted
 * sleep.
 */
static void set_signal_done(__attribute__((__unused__))int signum)
{
        signal_done = 1;
}
1610
1611 /**
1612  * main - 
1613  * @argc: Number of arguments
1614  * @argv: Array of arguments
1615  */
1616 int main(int argc, char *argv[])
1617 {
1618         int i;
1619         struct list_head *p;
1620
1621         pgsize = getpagesize();
1622         assert(pgsize > 0);
1623
1624         setup_signal(SIGINT, set_signal_done);
1625         setup_signal(SIGTERM, set_signal_done);
1626
1627         get_ncpus();
1628         handle_args(argc, argv);
1629         find_input_files();
1630
1631         nfiles = list_len(&input_files);
1632         __list_for_each(p, &input_files) {
1633                 tip_init(list_entry(p, struct thr_info, head));
1634         }
1635
1636         wait_replays_ready();
1637         for (i = 0; i < def_iterations; i++) {
1638                 rgenesis = gettime();
1639                 start_iter();
1640                 if (verbose)
1641                         fprintf(stderr, "I");
1642                 wait_iters_done();
1643         }
1644
1645         wait_replays_done();
1646         wait_reclaims_done();
1647
1648         if (verbose)
1649                 fprintf(stderr, "\n");
1650
1651         rem_input_files();
1652         release_map_devs();
1653
1654         return 0;
1655 }